diff --git a/experiments/speedrun/hackable_transformer_attn_gate/hackable_transformer_attn_gate.py b/experiments/speedrun/hackable_transformer_attn_gate/hackable_transformer_attn_gate.py
new file mode 100644
index 0000000000..2b767f2c96
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/hackable_transformer_attn_gate.py
@@ -0,0 +1,566 @@
+# Copyright 2025 The Marin Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Hackable transformer training speedrun sweep
+
+This file is intentionally self-contained:
+- Defines a compact, Llama-ish transformer that implements Levanter's LmHeadModel
+- Provides a ready-to-run speedrun sweep across multiple model sizes
+
+(This variant lets you compare runs with and without gated attention.)
+
+How to run (GPU or TPU):
+  1) Set env vars (WANDB_API_KEY, HF_TOKEN, etc.) as in the tutorial:
+     https://marin.readthedocs.io/en/latest/tutorials/submitting-speedrun/
+  2) From repo root:
+       python marin/run/ray_run.py -- \
+         python -m experiments.speedrun.hackable_transformer_attn_gate.hackable_transformer_attn_gate
+  3) Optional: SR_USE_GPU=1 to use GPU resource presets.
+
+The transformer is a pared-down version of levanter.models.llama; you can refer to it if you wish to
+add back functionality (like inference, HF exports)
+
+To edit this file for your speedrun:
+  1) Copy and rename the file in your location under experiments.speedrun
+  2) Make changes to the architecture or configurations
+  3) Add your author information
+  4) Submit (see "How to run" above)
+"""
+
+# nodryrun
+import sys
+import os
+import dataclasses
+import logging
+from dataclasses import dataclass
+from collections.abc import Callable
+from typing import Literal
+
+import equinox as eqx
+import numpy as np
+import jax.random as jrandom
+from jaxtyping import PRNGKeyArray
+
+import haliax as hax
+import haliax.nn as hnn
+from haliax import Axis, AxisSpec, NamedArray
+from haliax.jax_utils import maybe_rng_split, named_call, shaped_rng_split
+from haliax.nn.scan import ScanCheckpointPolicy, Stacked
+from haliax.state_dict import ModuleWithStateDictSerialization
+from levanter.utils.types import BlockFoldable
+
+from levanter.layers import RmsNormConfig, LayerNormConfigBase
+from levanter.layers.attention import Attention, AttentionConfig, AttentionMask, AttentionBackend
+from levanter.layers.rotary import DefaultRotaryEmbeddingsConfig, RotaryEmbeddingsConfig
+from levanter.models.lm_model import LmConfig, LmHeadModel
+from levanter.utils.activation import ActivationFunctionEnum
+from levanter.utils.flop_utils import lm_flops_per_token
+from levanter.utils.logging import silence_transformer_nag
+
+from marin.speedrun.speedrun import Author, SpeedrunConfig, default_speedrun
+from marin.execution.executor import executor_main
+from fray.cluster import ResourceConfig
+from experiments.simple_train_config import SimpleTrainConfig
+
+# Optional: Muon optimizer configs
+from levanter.optim import MuonConfig
+from experiments.llama import llama3_tokenizer_vocab_size
+
+logger = logging.getLogger("ray")
+
+_IMPORT_PATH = getattr(__spec__, "name", __name__)  # spec name if __spec__ has one, otherwise __name__
+
+silence_transformer_nag()
+
+# =========================
+# Hackable config & modules
+# =========================
+
+
+@LmConfig.register_subclass("hackable_transformer")
+@dataclass(frozen=True)
+class HackableTransformerConfig(LmConfig["HackableLMHeadModel"]):  # Llama-style LM config with a gate option
+    # Core dims
+    seq_len: int = 2048  # NOTE(review): _size_presets sets max_seq_len, not this field - confirm which is used
+    hidden_dim: int = 4096
+    intermediate_dim: int = 11008
+    num_layers: int = 32
+    num_heads: int = 32
+    num_kv_heads: int = 32
+    head_dim: int | None = None  # None derives the head size as hidden_dim // num_heads
+
+    activation_function: ActivationFunctionEnum = ActivationFunctionEnum.silu
+    use_bias: bool = False
+    use_layer_norm_weight: bool = True
+    layer_norm_epsilon: float = 1e-5
+    tie_word_embeddings: bool = False
+    input_embedding_norm: bool = False
+
+    # Attention
+    use_gated_attention: Literal["none", "headwise", "elementwise"] = "none"  # forwarded as AttentionConfig.gated
+    upcast_attn: bool = False
+    attn_backend: AttentionBackend | None = None
+    flash_attention_block_size: int | None = None
+    rope: RotaryEmbeddingsConfig = dataclasses.field(default_factory=DefaultRotaryEmbeddingsConfig)
+    qk_norm: LayerNormConfigBase | None = None  # set to RmsNormConfig(...) to enable
+
+    gradient_checkpointing: bool | ScanCheckpointPolicy | str = True
+    initializer_range: float = 0.02
+    reference_checkpoint: str = "NousResearch/Llama-2-7b-hf"
+    tokenizer: str | None = None
+
+    def __post_init__(self):
+        assert self.num_heads % self.num_kv_heads == 0, "num_heads must be divisible by num_kv_heads"
+        if self.head_dim is None:
+            assert self.hidden_dim % self.num_heads == 0, "hidden_dim % num_heads must be 0 when head_dim=None"
+
+    # ---- LmConfig API ----
+    @property
+    def model_type(self) -> type["HackableLMHeadModel"]:
+        return HackableLMHeadModel
+
+    Pos = property(lambda self: Axis("position", self.seq_len))
+    KeyPos = property(lambda self: self.Pos.alias("key_position"))
+    Embed = property(lambda self: Axis("embed", self.hidden_dim))
+    Layers = property(lambda self: Axis("layers", self.num_layers))
+    Mlp = property(lambda self: Axis("mlp", self.intermediate_dim))
+
+    @property
+    def norm_config(self) -> LayerNormConfigBase:
+        return RmsNormConfig(use_weight=self.use_layer_norm_weight, use_bias=self.use_bias, eps=self.layer_norm_epsilon)
+
+    def mk_LayerNorm(self, axis: AxisSpec):
+        return self.norm_config.build(axis)
+
+    def attention_config(self) -> AttentionConfig:
+        return AttentionConfig(
+            Embed=self.Embed,
+            num_heads=self.num_heads,
+            num_kv_heads=self.num_kv_heads,
+            head_dim=self.head_dim,
+            use_bias=self.use_bias,
+            upcast_attn=self.upcast_attn,
+            attn_backend=self.attn_backend,
+            flash_attention_block_size=self.flash_attention_block_size,
+            rope=self.rope,
+            qk_norm=self.qk_norm,
+            gated=self.use_gated_attention,
+        )
+
+    @property
+    def actual_head_size(self) -> int:
+        return self.head_dim or (self.hidden_dim // self.num_heads)
+
+    def flops_per_token(self, vocab_size: int, context_length: int) -> float | None:
+        return lm_flops_per_token(
+            hidden_dim=self.hidden_dim,
+            intermediate_dim=self.intermediate_dim,
+            num_layers=self.num_layers,
+            num_kv_heads=self.num_kv_heads,
+            num_heads=self.num_heads,
+            seq_len=context_length,
+            vocab_size=vocab_size,
+            glu=True,  # HackableMlp multiplies act(gate_proj(x)) by up_proj(x)
+        )
+
+    def total_trainable_params(self, vocab_size: int) -> int:
+        """Count trainable parameters (embeddings, blocks, final norm, LM head) for `vocab_size`."""
+        token_embedding = vocab_size * self.hidden_dim
+        hs = self.actual_head_size
+        q_and_o = 2 * self.hidden_dim * hs * self.num_heads  # query + output projections
+        k_and_v = 2 * self.hidden_dim * hs * self.num_kv_heads  # key + value projections
+        attn = q_and_o + k_and_v
+        if self.use_gated_attention == "headwise":
+            attn += self.hidden_dim * self.num_heads
+        elif self.use_gated_attention == "elementwise":
+            attn += self.hidden_dim * hs * self.num_heads
+        elif self.use_gated_attention != "none":  # "none" is a valid mode that adds no gate parameters
+            raise ValueError(f"Unknown gated attention mode: {self.use_gated_attention}")
+        mlp = 3 * self.hidden_dim * self.intermediate_dim
+        # Each block carries two norms; the trailing hidden_dim is the final norm after the stack.
+        transformer = self.num_layers * (attn + mlp + 2 * self.hidden_dim) + self.hidden_dim
+        if self.input_embedding_norm:
+            transformer += self.hidden_dim
+        head = 0 if self.tie_word_embeddings else token_embedding
+        return int(transformer + token_embedding + head)
+
+
+class HackableMlp(eqx.Module):
+    """Gated MLP: down_proj(act(gate_proj(x)) * up_proj(x))."""
+
+    gate_proj: hnn.Linear
+    up_proj: hnn.Linear
+    down_proj: hnn.Linear
+    act: Callable = eqx.field(static=True)
+
+    @staticmethod
+    def init(Embed: AxisSpec, Mlp: AxisSpec, activation_fn: ActivationFunctionEnum | Callable, *, key, use_bias=False):
+        k_fc, k_up_proj, k_down_proj = jrandom.split(key, 3)  # one key per projection
+        gate_proj = hnn.Linear.init(Out=Mlp, In=Embed, key=k_fc, use_bias=use_bias, out_first=True)
+        up_proj = hnn.Linear.init(Out=Mlp, In=Embed, key=k_up_proj, use_bias=use_bias, out_first=True)
+        down_proj = hnn.Linear.init(Out=Embed, In=Mlp, key=k_down_proj, use_bias=use_bias, out_first=True)
+        if isinstance(activation_fn, ActivationFunctionEnum):
+            activation_fn = activation_fn.to_fn()  # enum member -> callable
+        elif isinstance(activation_fn, str):
+            activation_fn = ActivationFunctionEnum(activation_fn).to_fn()  # string name -> enum -> callable
+        return HackableMlp(gate_proj, up_proj, down_proj, activation_fn)
+
+    @named_call
+    def __call__(self, x: NamedArray, *, key=None) -> NamedArray:
+        k_gate, k_up, k_down = maybe_rng_split(key, 3)
+        h = self.act(self.gate_proj(x, key=k_gate)) * self.up_proj(x, key=k_up)  # activated gate times up branch
+        return self.down_proj(h, key=k_down)
+
+
+class HackableDecoderLayer(eqx.Module):
+    """One transformer block: normed self-attention, then a normed MLP, each added back to its input."""
+
+    config: HackableTransformerConfig = eqx.field(static=True)
+    self_attn: Attention
+    mlp: HackableMlp
+    input_layernorm: hnn.RmsNorm  # applied to the block input before attention
+    post_attention_layernorm: hnn.RmsNorm  # applied before the MLP, after the attention residual add
+    post_attn_layernorm: hnn.RmsNorm | None = None  # optional norm on the attention output; init() leaves it None
+    post_mlp_layernorm: hnn.RmsNorm | None = None  # optional norm on the MLP output; init() leaves it None
+
+    @staticmethod
+    def init(config: HackableTransformerConfig, *, key) -> "HackableDecoderLayer":
+        k_attn, k_mlp = jrandom.split(key, 2)
+        attn_cfg = config.attention_config()
+        attn = Attention.init(attn_cfg, key=k_attn)
+        mlp = HackableMlp.init(config.Embed, config.Mlp, config.activation_function, key=k_mlp, use_bias=config.use_bias)
+        ln1 = config.mk_LayerNorm(config.Embed)
+        ln2 = config.mk_LayerNorm(config.Embed)
+        return HackableDecoderLayer(config, attn, mlp, ln1, ln2)  # the two optional norms keep their None default
+
+    @named_call
+    def __call__(
+        self, x: NamedArray, mask: NamedArray | AttentionMask | None, *, key=None, pos_ids: NamedArray | None = None
+    ):
+        k_attn, k_mlp = maybe_rng_split(key, 2)
+        # self attention and skip connection
+        residual = x
+        x = self.input_layernorm(x)
+        attn_output = self.self_attn(x=x, mask=mask, key=k_attn, pos_ids=pos_ids)
+        if self.post_attn_layernorm is not None:
+            attn_output = self.post_attn_layernorm(attn_output)
+        x = residual + attn_output
+
+        # MLP and skip connection
+        residual = x
+        x = self.post_attention_layernorm(x)
+        mlp_output = self.mlp(x, key=k_mlp)
+        if self.post_mlp_layernorm is not None:
+            mlp_output = self.post_mlp_layernorm(mlp_output)
+        output = residual + mlp_output
+        return output
+
+
+class HackableTransformer(eqx.Module):  # stacked decoder layers followed by a final norm
+    config: HackableTransformerConfig = eqx.field(static=True)
+    layers: BlockFoldable[HackableDecoderLayer]
+    norm: hnn.RmsNorm  # applied once to the output of the layer stack
+
+    @staticmethod
+    def init(config: HackableTransformerConfig, *, key):
+        S = Stacked  # use BlockSeq for non-homogeneous layers
+        layers = S.init(config.Layers, HackableDecoderLayer, gradient_checkpointing=config.gradient_checkpointing)(
+            config, key=shaped_rng_split(key, config.num_layers)
+        )
+        return HackableTransformer(config, layers, config.mk_LayerNorm(config.Embed))
+
+    @named_call
+    def __call__(
+        self, x: NamedArray, attn_mask: NamedArray | AttentionMask | None, *, key=None, pos_ids: NamedArray | None = None
+    ) -> NamedArray:
+        keys = maybe_rng_split(key, self.config.num_layers) if key is not None else None  # one key per layer
+        x = self.layers.fold(x, mask=attn_mask, key=keys, pos_ids=pos_ids)
+        return self.norm(x)
+
+
+class HackableEmbedding(ModuleWithStateDictSerialization, eqx.Module):  # token embedding with optional norm
+    token_embeddings: hnn.Embedding
+    norm: hnn.RmsNorm | None = None  # only built when config.input_embedding_norm is set
+
+    @staticmethod
+    def init(Vocab: Axis, config: HackableTransformerConfig, *, key):
+        emb = hnn.Embedding.init(Vocab, config.Embed, key=key)
+        ln = config.mk_LayerNorm(config.Embed) if config.input_embedding_norm else None
+        return HackableEmbedding(emb, ln)
+
+    @property
+    def Vocab(self) -> Axis:
+        return self.token_embeddings.Vocab
+
+    @named_call
+    def embed(self, input_ids: NamedArray):
+        x = self.token_embeddings(input_ids)
+        return self.norm(x) if self.norm is not None else x  # skip the norm when it was not built
+
+
+class HackableLMHeadModel(
+    ModuleWithStateDictSerialization,
+    LmHeadModel[HackableTransformerConfig],
+):
+    """Minimal Llama-like LM: token embeddings, a transformer stack, and an optional untied LM head."""
+
+    transformer: HackableTransformer
+    embeddings: HackableEmbedding
+    lm_head: hnn.Linear | None  # None when config.tie_word_embeddings is set
+
+    @property
+    def config(self) -> HackableTransformerConfig:
+        return self.transformer.config
+
+    @property
+    def Vocab(self) -> Axis:
+        return self.embeddings.Vocab
+
+    @classmethod
+    def init(cls, Vocab: Axis, config: HackableTransformerConfig, *, key) -> "HackableLMHeadModel":
+        k_t, k_e = jrandom.split(key, 2)
+        transformer = HackableTransformer.init(config, key=k_t)
+        embeddings = HackableEmbedding.init(Vocab, config, key=k_e)
+        lm_head = (  # NOTE(review): the untied head reuses k_e, the embedding key - confirm this is intended
+            None
+            if config.tie_word_embeddings
+            else hnn.Linear.init(In=config.Embed, Out=Vocab, key=k_e, use_bias=False, out_first=True)
+        )
+        return HackableLMHeadModel(transformer, embeddings, lm_head)
+
+    def activations(
+        self,
+        input_ids: NamedArray,
+        attn_mask: AttentionMask | NamedArray | None = None,
+        *,
+        key=None,
+        pos_ids: NamedArray | None = None,
+    ) -> NamedArray:
+        return self.transformer(self.embeddings.embed(input_ids), attn_mask=attn_mask, key=key, pos_ids=pos_ids)
+
+    def get_lm_head(self) -> hax.NamedArray:
+        return self.embeddings.token_embeddings.weight if self.lm_head is None else self.lm_head.weight
+
+    def resize_vocab(self, new_size: int, key: PRNGKeyArray | None = None) -> "HackableLMHeadModel":
+        raise NotImplementedError("resize_vocab is not implemented for HackableLMHeadModel")
+
+
+# =========================
+# Speedrun sweep definition
+# =========================
+
+AUTHOR = Author(name="Calvin Xu", affiliation="Stanford University", url="https://pinlinxu.com")  # set to your own info
+
+
+def _get_num_train_steps(param_count: int, batch_size: int, seq_len: int, tpp: int = 20) -> int:
+    """Return the step count that covers `tpp` tokens per parameter, rounded down but never below 1."""
+    return max(1, (param_count * tpp) // (batch_size * seq_len))
+
+
+def _size_presets() -> dict[str, HackableTransformerConfig]:
+    """Map each size label to its model config; all sizes share the same non-dimension settings."""
+    shared = {
+        "max_seq_len": 4096,
+        "rope": DefaultRotaryEmbeddingsConfig(),  # e.g., Llama3RotaryEmbeddingsConfig()
+        "attn_backend": None,
+        "qk_norm": None,  # e.g. RmsNormConfig(use_weight=True, eps=1e-5)
+        "tie_word_embeddings": False,
+    }
+    # size label -> (hidden_dim, intermediate_dim, num_layers, num_heads, num_kv_heads)
+    dims = {
+        "130m": (512, 1792, 6, 8, 8),
+        "300m": (768, 2688, 12, 12, 12),
+        "520m": (1024, 3584, 24, 16, 8),
+        "1_2b": (2048, 7168, 16, 16, 8),
+    }
+    return {
+        label: HackableTransformerConfig(
+            hidden_dim=h, intermediate_dim=i, num_layers=n_l, num_heads=n_h, num_kv_heads=n_kv, **shared
+        )
+        for label, (h, i, n_l, n_h, n_kv) in dims.items()
+    }
+
+
+def _muon_presets() -> dict[str, MuonConfig]:  # Muon optimizer settings keyed by model-size label
+    return {
+        "130m": MuonConfig(
+            learning_rate=0.016,
+            adam_lr=0.0032,
+            weight_decay=0.1,
+            min_lr_ratio=0,
+            warmup=0,
+            momentum=0.95,
+            beta1=0.8,
+            beta2=0.98,
+            epsilon=1e-15,
+            muon_epsilon=1e-5,
+            max_grad_norm=1,
+            lr_schedule="linear",
+            decay=0.8,
+        ),
+        "300m": MuonConfig(
+            learning_rate=0.008,
+            adam_lr=0.0024,
+            weight_decay=0.1,
+            min_lr_ratio=0,
+            warmup=0,
+            momentum=0.98,
+            beta1=0.8,
+            beta2=0.98,
+            epsilon=1e-15,
+            muon_epsilon=1e-5,
+            max_grad_norm=1,
+            lr_schedule="linear",
+            decay=0.8,
+        ),
+        "520m": MuonConfig(
+            learning_rate=0.008,
+            adam_lr=0.0024,
+            weight_decay=0.1,
+            min_lr_ratio=0,
+            warmup=0,
+            momentum=0.98,
+            beta1=0.8,
+            beta2=0.98,
+            epsilon=1e-25,  # NOTE(review): every other size uses 1e-15 - confirm this is not a typo
+            muon_epsilon=1e-5,
+            max_grad_norm=1,
+            lr_schedule="linear",
+            decay=1,
+        ),
+        "1_2b": MuonConfig(
+            learning_rate=0.004,
+            adam_lr=0.0012,
+            weight_decay=0.1,
+            min_lr_ratio=0,
+            warmup=0,
+            momentum=0.98,
+            beta1=0.8,
+            beta2=0.98,
+            epsilon=1e-15,
+            muon_epsilon=1e-5,
+            max_grad_norm=2,
+            lr_schedule="linear",
+            decay=1,
+        ),
+    }
+
+
+def _resource_presets(use_gpu: bool = False):
+    """Return the per-size ResourceConfig table.
+
+    With `use_gpu` the presets request A100-80G GPUs (more for the larger sizes);
+    otherwise every size is configured with the "v5p-32" TPU type.
+    """
+    size_labels = ("130m", "300m", "520m", "1_2b")
+    if not use_gpu:
+        # Build a separate config object for each size rather than sharing one instance.
+        return {label: ResourceConfig.with_tpu("v5p-32") for label in size_labels}
+    gpu_counts = (1, 1, 2, 4)  # A100-80G devices requested per size, in size_labels order
+    return {
+        label: ResourceConfig.with_gpu("A100-80G", count=n) for label, n in zip(size_labels, gpu_counts)
+    }
+
+
+def _batch_sizes() -> dict[str, int]:
+    return {**dict.fromkeys(("130m", "300m", "520m"), 128), "1_2b": 256}  # only 1_2b uses the larger batch
+
+
+def _lr_multipliers(start: float = 1.0, stop: float = 2.5, step: float = 0.5) -> list[float]:
+    """Inclusive grid of LR multipliers for the sweep (the paper suggests training with increased LR)."""
+    upper = stop + step / 2  # overshoot by half a step so `stop` itself is kept
+    return np.arange(start, upper, step).tolist()
+
+
+def _format_multiplier_label(mult: float) -> str:
+    """Render `mult` with up to 6 significant digits as a name-safe label, using "_" for the decimal point."""
+    text = f"{mult:.6g}"
+    return (text.rstrip("0").rstrip(".") if "." in text else text).replace(".", "_")
+
+
+def build_run(
+    size: str,
+    use_gate: bool | str,
+    *,
+    use_gpu: bool = False,
+    lr_multiplier: float | None = None,
+) -> tuple[str, SpeedrunConfig]:
+    """Build the run name and SpeedrunConfig for one (size, gate mode, LR multiplier) combination.
+
+    `use_gate` is stored as `use_gated_attention` on the model config; falsy values and "none" are
+    labelled as standard attention. Raises ValueError if `size` is not a known preset.
+    """
+    sizes = _size_presets()
+    if size not in sizes:
+        raise ValueError(f"Unknown size: {size}")
+    model_cfg = dataclasses.replace(sizes[size], use_gated_attention=use_gate)
+    gated = bool(use_gate) and use_gate != "none"  # the string "none" is truthy but means no gate
+
+    batch = _batch_sizes()[size]
+    # The presets set max_seq_len, so budget steps with it; fall back to seq_len if it is absent.
+    seq_len = getattr(model_cfg, "max_seq_len", model_cfg.seq_len)
+    params = int(model_cfg.total_trainable_params(llama3_tokenizer_vocab_size))
+    steps = _get_num_train_steps(params, batch, seq_len, tpp=20)
+
+    muon = _muon_presets()[size]
+    if lr_multiplier is not None:
+        scaled = {"learning_rate": muon.learning_rate * lr_multiplier, "adam_lr": muon.adam_lr * lr_multiplier}
+        muon = dataclasses.replace(muon, **scaled)
+    train = SimpleTrainConfig(
+        _resource_presets(use_gpu=use_gpu)[size],
+        train_batch_size=batch,
+        num_train_steps=steps,
+        learning_rate=muon.learning_rate,
+        optimizer_config=muon,
+        steps_per_hf_export=-1,  # disable checkpointing
+    )
+
+    lr_tag = f"_lr_x{_format_multiplier_label(lr_multiplier)}" if lr_multiplier is not None else ""
+    gate_tag, gate_desc = ("attngate", "Gated Attention") if gated else ("stdattn", "Std Attention")
+    run_name = f"hacktx_{size}_{gate_tag}_{seq_len}_splash_lr_sweep{lr_tag}_v5p32"
+    lr_desc = f"LR sweep multiplier={lr_multiplier if lr_multiplier is not None else 1.0:g}"
+    desc = f"Hackable Transformer ({size}); {gate_desc} (Splash); {lr_desc}"
+    cfg = SpeedrunConfig(author=AUTHOR, description=desc, model_config=model_cfg, train_config=train)
+    return run_name, cfg
+
+
+if __name__ == "__main__":
+    ###
+    # register this __main__ module in sys.modules under _IMPORT_PATH as well
+    sys.modules[_IMPORT_PATH] = sys.modules[__name__]
+    # point each model class at that module path so the workers can import the classes
+    for _cls in (
+        HackableTransformerConfig,
+        HackableMlp,
+        HackableDecoderLayer,
+        HackableTransformer,
+        HackableEmbedding,
+        HackableLMHeadModel,
+    ):
+        _cls.__module__ = _IMPORT_PATH
+    ###
+
+    # all available presets: sizes = ["130m", "300m", "520m", "1_2b"]
+    sizes = ["1_2b"]
+    use_gpu = bool(int(os.environ.get("SR_USE_GPU", "0")))  # SR_USE_GPU=1 selects the GPU resource presets
+    use_gate = "elementwise"  # gate mode passed to build_run for every run in the sweep
+    steps = []
+    # Sweep LR from 1x to 4x at 0.5x increments (paper suggests higher LR for gated attention)
+    lr_mults = _lr_multipliers(start=1.0, stop=4.0, step=0.5)
+    for s in sizes:
+        for m in lr_mults:
+            name, cfg = build_run(s, use_gate, use_gpu=use_gpu, lr_multiplier=m)
+            steps.extend(default_speedrun(name, cfg))
+    executor_main(steps=steps, description="Hackable transformer gated-attention LR sweep")
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x1/130m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x1/130m/speedrun_results.json
new file mode 100644
index 0000000000..eb3f224ba6
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/lr_x1/130m/speedrun_results.json
@@ -0,0 +1,141 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (130m); Gated Attention (Splash); LR sweep multiplier=1",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 1.1556898355484009,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 512,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 1792,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 8,
+          "num_kv_heads": 8,
+          "num_layers": 6,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": true,
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 2.1289341996446515e+18,
+        "model_flops_per_token": 227868672.0,
+        "model_size": 155720192,
+        "num_chips": 16,
+        "num_devices": 16,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "topology": null,
+            "type": "v5p-32"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2025-12-09 10:26:57 UTC",
+        "tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 3114270720,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.016,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 5940,
+          "optimizer_config": {
+            "adam_lr": 0.0032,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 0.8,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-15,
+            "haps": null,
+            "learning_rate": 0.016,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 1,
+            "min_lr_ratio": 0,
+            "momentum": 0.95,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 128,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 1.1670497124689082e+19,
+        "training_time": 1589.1199788519991,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_130m_attngate_4096_splash_lr_sweep_lr_x1-88f232"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x1/1_2b/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x1/1_2b/speedrun_results.json
new file mode 100644
index 0000000000..fd5f073b43
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/lr_x1/1_2b/speedrun_results.json
@@ -0,0 +1,144 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (1_2b); Gated Attention (Splash); LR sweep multiplier=1",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 0.9160435795783997,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 2048,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 7168,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 16,
+          "num_kv_heads": 8,
+          "num_layers": 16,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "seq_len": 2048,
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": "elementwise",
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 5.1738353514618185e+20,
+        "model_flops_per_token": 2877292544.0,
+        "model_size": 1498482688,
+        "num_chips": 16,
+        "num_devices": 16,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "kind": "tpu",
+            "topology": null,
+            "variant": "v5p-32"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2026-01-07 23:16:03 UTC",
+        "tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 59938701312,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "explicit_mesh_axes": false,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.004,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 57162,
+          "optimizer_config": {
+            "adam_lr": 0.0012,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 1,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-15,
+            "haps": null,
+            "learning_rate": 0.004,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 2,
+            "min_lr_ratio": 0,
+            "momentum": 0.98,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 256,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 1.1808594941354136e+21,
+        "training_time": 160792.41477878726,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_1_2b_attngate_2048_splash_lr_sweep_lr_x1_v5p32-ec656c"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x1/1_2b/speedrun_results_v5p64.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x1/1_2b/speedrun_results_v5p64.json
new file mode 100644
index 0000000000..9629799af5
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/lr_x1/1_2b/speedrun_results_v5p64.json
@@ -0,0 +1,143 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (1_2b); Gated Attention (Splash); LR sweep multiplier=1",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 0.9160401225090027,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 2048,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 7168,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 16,
+          "num_kv_heads": 8,
+          "num_layers": 16,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "seq_len": 2048,
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": true,
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 5.1738353514618185e+20,
+        "model_flops_per_token": 2877292544.0,
+        "model_size": 1498482688,
+        "num_chips": 32,
+        "num_devices": 32,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "kind": "tpu",
+            "topology": null,
+            "variant": "v5p-64"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2025-12-17 04:43:09 UTC",
+        "tokenized_dataset": "gs://marin-us-east5/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 59938701312,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.004,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 57162,
+          "optimizer_config": {
+            "adam_lr": 0.0012,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 1,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-15,
+            "haps": null,
+            "learning_rate": 0.004,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 2,
+            "min_lr_ratio": 0,
+            "momentum": 0.98,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 256,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 1.80638282968129e+21,
+        "training_time": 122983.58045215753,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_1_2b_attngate_2048_splash_lr_sweep_lr_x1-ecb416"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x1/300m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x1/300m/speedrun_results.json
new file mode 100644
index 0000000000..1f74395a36
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/lr_x1/300m/speedrun_results.json
@@ -0,0 +1,141 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (300m); Gated Attention (Splash); LR sweep multiplier=1",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 1.0535272359848022,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 768,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 2688,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 12,
+          "num_kv_heads": 12,
+          "num_layers": 12,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": true,
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 1.021384111077458e+19,
+        "model_flops_per_token": 555024384.0,
+        "model_size": 306727680,
+        "num_chips": 16,
+        "num_devices": 16,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "topology": null,
+            "type": "v5p-32"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2025-12-09 12:44:00 UTC",
+        "tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 6134169600,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.008,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 11700,
+          "optimizer_config": {
+            "adam_lr": 0.0024,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 0.8,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-15,
+            "haps": null,
+            "learning_rate": 0.008,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 1,
+            "min_lr_ratio": 0,
+            "momentum": 0.98,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 128,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 5.068078886783368e+19,
+        "training_time": 6900.97887633901,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_300m_attngate_4096_splash_lr_sweep_lr_x1-25ee3b"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x1/520m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x1/520m/speedrun_results.json
new file mode 100644
index 0000000000..b53ac37350
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/lr_x1/520m/speedrun_results.json
@@ -0,0 +1,141 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (520m); Gated Attention (Splash); LR sweep multiplier=1",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 0.9799903035163879,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 1024,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 3584,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 16,
+          "num_kv_heads": 8,
+          "num_layers": 24,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": true,
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 5.081733891346976e+19,
+        "model_flops_per_token": 1349517312.0,
+        "model_size": 627622912,
+        "num_chips": 16,
+        "num_devices": 16,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "topology": null,
+            "type": "v5p-32"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2025-12-09 19:07:36 UTC",
+        "tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 12551979008,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.008,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 23941,
+          "optimizer_config": {
+            "adam_lr": 0.0024,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 1,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-25,
+            "haps": null,
+            "learning_rate": 0.008,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 1,
+            "min_lr_ratio": 0,
+            "momentum": 0.98,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 128,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 2.2386170836914753e+20,
+        "training_time": 30482.258764862137,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_520m_attngate_4096_splash_lr_sweep_lr_x1-bca683"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x1_5/130m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x1_5/130m/speedrun_results.json
new file mode 100644
index 0000000000..3a9e7c6474
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/lr_x1_5/130m/speedrun_results.json
@@ -0,0 +1,141 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (130m); Gated Attention (Splash); LR sweep multiplier=1.5",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 1.1563303470611572,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 512,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 1792,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 8,
+          "num_kv_heads": 8,
+          "num_layers": 6,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": true,
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 2.1289341996446515e+18,
+        "model_flops_per_token": 227868672.0,
+        "model_size": 155720192,
+        "num_chips": 16,
+        "num_devices": 16,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "topology": null,
+            "type": "v5p-32"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2025-12-09 11:00:02 UTC",
+        "tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 3114270720,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.024,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 5940,
+          "optimizer_config": {
+            "adam_lr": 0.0048000000000000004,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 0.8,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-15,
+            "haps": null,
+            "learning_rate": 0.024,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 1,
+            "min_lr_ratio": 0,
+            "momentum": 0.95,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 128,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 1.1669741753874995e+19,
+        "training_time": 1589.0171233489916,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_130m_attngate_4096_splash_lr_sweep_lr_x1_5-c1c1a5"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x1_5/1_2b/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x1_5/1_2b/speedrun_results.json
new file mode 100644
index 0000000000..10977b1ea4
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/lr_x1_5/1_2b/speedrun_results.json
@@ -0,0 +1,144 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (1_2b); Gated Attention (Splash); LR sweep multiplier=1.5",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 0.9129290580749512,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 2048,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 7168,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 16,
+          "num_kv_heads": 8,
+          "num_layers": 16,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "seq_len": 2048,
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": "elementwise",
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 5.1738353514618185e+20,
+        "model_flops_per_token": 2877292544.0,
+        "model_size": 1498482688,
+        "num_chips": 16,
+        "num_devices": 16,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "kind": "tpu",
+            "topology": null,
+            "variant": "v5p-32"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2026-01-10 22:53:32 UTC",
+        "tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 59938701312,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "explicit_mesh_axes": false,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.006,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 57162,
+          "optimizer_config": {
+            "adam_lr": 0.0018,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 1,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-15,
+            "haps": null,
+            "learning_rate": 0.006,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 2,
+            "min_lr_ratio": 0,
+            "momentum": 0.98,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 256,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 1.1933404335361132e+21,
+        "training_time": 162491.88909805464,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_1_2b_attngate_2048_splash_lr_sweep_lr_x1_5_v5p32-f366f3"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x1_5/1_2b/speedrun_results_v5p64.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x1_5/1_2b/speedrun_results_v5p64.json
new file mode 100644
index 0000000000..a7a662d547
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/lr_x1_5/1_2b/speedrun_results_v5p64.json
@@ -0,0 +1,143 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (1_2b); Gated Attention (Splash); LR sweep multiplier=1.5",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 0.9122039675712585,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 2048,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 7168,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 16,
+          "num_kv_heads": 8,
+          "num_layers": 16,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "seq_len": 2048,
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": true,
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 5.1738353514618185e+20,
+        "model_flops_per_token": 2877292544.0,
+        "model_size": 1498482688,
+        "num_chips": 32,
+        "num_devices": 32,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "kind": "tpu",
+            "topology": null,
+            "variant": "v5p-64"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2025-12-17 00:31:25 UTC",
+        "tokenized_dataset": "gs://marin-us-east5/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 59938701312,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.006,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 57162,
+          "optimizer_config": {
+            "adam_lr": 0.0018,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 1,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-15,
+            "haps": null,
+            "learning_rate": 0.006,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 2,
+            "min_lr_ratio": 0,
+            "momentum": 0.98,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 256,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 1.8044518078036129e+21,
+        "training_time": 122852.11109774053,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_1_2b_attngate_2048_splash_lr_sweep_lr_x1_5-e5d647"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x1_5/300m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x1_5/300m/speedrun_results.json
new file mode 100644
index 0000000000..59fc868322
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/lr_x1_5/300m/speedrun_results.json
@@ -0,0 +1,141 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (300m); Gated Attention (Splash); LR sweep multiplier=1.5",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 1.0536423921585083,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 768,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 2688,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 12,
+          "num_kv_heads": 12,
+          "num_layers": 12,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": true,
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 1.021384111077458e+19,
+        "model_flops_per_token": 555024384.0,
+        "model_size": 306727680,
+        "num_chips": 16,
+        "num_devices": 16,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "topology": null,
+            "type": "v5p-32"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2025-12-09 12:24:57 UTC",
+        "tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 6134169600,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.012,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 11700,
+          "optimizer_config": {
+            "adam_lr": 0.0036,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 0.8,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-15,
+            "haps": null,
+            "learning_rate": 0.012,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 1,
+            "min_lr_ratio": 0,
+            "momentum": 0.98,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 128,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 5.0759425868293325e+19,
+        "training_time": 6911.686528907043,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_300m_attngate_4096_splash_lr_sweep_lr_x1_5-1394e4"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x1_5/520m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x1_5/520m/speedrun_results.json
new file mode 100644
index 0000000000..01540fd63b
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/lr_x1_5/520m/speedrun_results.json
@@ -0,0 +1,141 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (520m); Gated Attention (Splash); LR sweep multiplier=1.5",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 0.9817151427268982,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 1024,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 3584,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 16,
+          "num_kv_heads": 8,
+          "num_layers": 24,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": true,
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 5.081733891346976e+19,
+        "model_flops_per_token": 1349517312.0,
+        "model_size": 627622912,
+        "num_chips": 16,
+        "num_devices": 16,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "topology": null,
+            "type": "v5p-32"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2025-12-09 19:12:38 UTC",
+        "tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 12551979008,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.012,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 23941,
+          "optimizer_config": {
+            "adam_lr": 0.0036,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 1,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-25,
+            "haps": null,
+            "learning_rate": 0.012,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 1,
+            "min_lr_ratio": 0,
+            "momentum": 0.98,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 128,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 2.2333461727415042e+20,
+        "training_time": 30410.487101600003,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_520m_attngate_4096_splash_lr_sweep_lr_x1_5-76e777"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x2/130m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x2/130m/speedrun_results.json
new file mode 100644
index 0000000000..cd1d1c9195
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/lr_x2/130m/speedrun_results.json
@@ -0,0 +1,141 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (130m); Gated Attention (Splash); LR sweep multiplier=2",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 1.1576088666915894,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 512,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 1792,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 8,
+          "num_kv_heads": 8,
+          "num_layers": 6,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": true,
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 2.1289341996446515e+18,
+        "model_flops_per_token": 227868672.0,
+        "model_size": 155720192,
+        "num_chips": 16,
+        "num_devices": 16,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "topology": null,
+            "type": "v5p-32"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2025-12-09 10:57:26 UTC",
+        "tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 3114270720,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.032,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 5940,
+          "optimizer_config": {
+            "adam_lr": 0.0064,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 0.8,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-15,
+            "haps": null,
+            "learning_rate": 0.032,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 1,
+            "min_lr_ratio": 0,
+            "momentum": 0.95,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 128,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 1.1642901007026072e+19,
+        "training_time": 1585.3623375580164,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_130m_attngate_4096_splash_lr_sweep_lr_x2-8abf41"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x2/1_2b/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x2/1_2b/speedrun_results.json
new file mode 100644
index 0000000000..3f3bbec3e4
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/lr_x2/1_2b/speedrun_results.json
@@ -0,0 +1,144 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (1_2b); Gated Attention (Splash); LR sweep multiplier=2",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 0.911785364151001,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 2048,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 7168,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 16,
+          "num_kv_heads": 8,
+          "num_layers": 16,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "seq_len": 2048,
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": "elementwise",
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 5.1738353514618185e+20,
+        "model_flops_per_token": 2877292544.0,
+        "model_size": 1498482688,
+        "num_chips": 16,
+        "num_devices": 16,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "kind": "tpu",
+            "topology": null,
+            "variant": "v5p-32"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2026-01-07 23:37:29 UTC",
+        "tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 59938701312,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "explicit_mesh_axes": false,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.008,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 57162,
+          "optimizer_config": {
+            "adam_lr": 0.0024,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 1,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-15,
+            "haps": null,
+            "learning_rate": 0.008,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 2,
+            "min_lr_ratio": 0,
+            "momentum": 0.98,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 256,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 1.18325574649179e+21,
+        "training_time": 161118.70186435047,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_1_2b_attngate_2048_splash_lr_sweep_lr_x2_v5p32-0b6010"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x2/1_2b/speedrun_results_v5p64.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x2/1_2b/speedrun_results_v5p64.json
new file mode 100644
index 0000000000..bc85070a0a
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/lr_x2/1_2b/speedrun_results_v5p64.json
@@ -0,0 +1,143 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (1_2b); Gated Attention (Splash); LR sweep multiplier=2",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 0.9119555354118347,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 2048,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 7168,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 16,
+          "num_kv_heads": 8,
+          "num_layers": 16,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "seq_len": 2048,
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": true,
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 5.1738353514618185e+20,
+        "model_flops_per_token": 2877292544.0,
+        "model_size": 1498482688,
+        "num_chips": 32,
+        "num_devices": 32,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "kind": "tpu",
+            "topology": null,
+            "variant": "v5p-64"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2025-12-17 01:10:26 UTC",
+        "tokenized_dataset": "gs://marin-us-east5/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 59938701312,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.008,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 57162,
+          "optimizer_config": {
+            "adam_lr": 0.0024,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 1,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-15,
+            "haps": null,
+            "learning_rate": 0.008,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 2,
+            "min_lr_ratio": 0,
+            "momentum": 0.98,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 256,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 1.8017056211590617e+21,
+        "training_time": 122665.14305276837,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_1_2b_attngate_2048_splash_lr_sweep_lr_x2-be36f3"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x2/300m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x2/300m/speedrun_results.json
new file mode 100644
index 0000000000..5c89c5e54e
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/lr_x2/300m/speedrun_results.json
@@ -0,0 +1,141 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (300m); Gated Attention (Splash); LR sweep multiplier=2",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 1.0551481246948242,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 768,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 2688,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 12,
+          "num_kv_heads": 12,
+          "num_layers": 12,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": true,
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 1.021384111077458e+19,
+        "model_flops_per_token": 555024384.0,
+        "model_size": 306727680,
+        "num_chips": 16,
+        "num_devices": 16,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "topology": null,
+            "type": "v5p-32"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2025-12-09 12:26:21 UTC",
+        "tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 6134169600,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.016,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 11700,
+          "optimizer_config": {
+            "adam_lr": 0.0048,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 0.8,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-15,
+            "haps": null,
+            "learning_rate": 0.016,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 1,
+            "min_lr_ratio": 0,
+            "momentum": 0.98,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 128,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 5.0833465300533e+19,
+        "training_time": 6921.768150944036,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_300m_attngate_4096_splash_lr_sweep_lr_x2-03a06d"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x2/520m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x2/520m/speedrun_results.json
new file mode 100644
index 0000000000..065735371c
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/lr_x2/520m/speedrun_results.json
@@ -0,0 +1,141 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (520m); Gated Attention (Splash); LR sweep multiplier=2",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 0.9846972823143005,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 1024,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 3584,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 16,
+          "num_kv_heads": 8,
+          "num_layers": 24,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": true,
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 5.081733891346976e+19,
+        "model_flops_per_token": 1349517312.0,
+        "model_size": 627622912,
+        "num_chips": 16,
+        "num_devices": 16,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "topology": null,
+            "type": "v5p-32"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2025-12-09 19:47:23 UTC",
+        "tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 12551979008,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.016,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 23941,
+          "optimizer_config": {
+            "adam_lr": 0.0048,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 1,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-25,
+            "haps": null,
+            "learning_rate": 0.016,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 1,
+            "min_lr_ratio": 0,
+            "momentum": 0.98,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 128,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 2.2363047585747508e+20,
+        "training_time": 30450.772856410007,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_520m_attngate_4096_splash_lr_sweep_lr_x2-083666"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x2_5/130m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x2_5/130m/speedrun_results.json
new file mode 100644
index 0000000000..29236ac9b2
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/lr_x2_5/130m/speedrun_results.json
@@ -0,0 +1,141 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (130m); Gated Attention (Splash); LR sweep multiplier=2.5",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 1.1613965034484863,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 512,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 1792,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 8,
+          "num_kv_heads": 8,
+          "num_layers": 6,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": true,
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 2.1289341996446515e+18,
+        "model_flops_per_token": 227868672.0,
+        "model_size": 155720192,
+        "num_chips": 16,
+        "num_devices": 16,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "topology": null,
+            "type": "v5p-32"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2025-12-09 10:57:45 UTC",
+        "tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 3114270720,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.04,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 5940,
+          "optimizer_config": {
+            "adam_lr": 0.008,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 0.8,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-15,
+            "haps": null,
+            "learning_rate": 0.04,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 1,
+            "min_lr_ratio": 0,
+            "momentum": 0.95,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 128,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 1.1718583294415854e+19,
+        "training_time": 1595.6676599150128,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_130m_attngate_4096_splash_lr_sweep_lr_x2_5-01984c"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x2_5/1_2b/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x2_5/1_2b/speedrun_results.json
new file mode 100644
index 0000000000..35a23973f5
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/lr_x2_5/1_2b/speedrun_results.json
@@ -0,0 +1,144 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (1_2b); Gated Attention (Splash); LR sweep multiplier=2.5",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 0.9130340218544006,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 2048,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 7168,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 16,
+          "num_kv_heads": 8,
+          "num_layers": 16,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "seq_len": 2048,
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": "elementwise",
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 5.1738353514618185e+20,
+        "model_flops_per_token": 2877292544.0,
+        "model_size": 1498482688,
+        "num_chips": 16,
+        "num_devices": 16,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "kind": "tpu",
+            "topology": null,
+            "variant": "v5p-32"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2026-01-07 23:35:44 UTC",
+        "tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 59938701312,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "explicit_mesh_axes": false,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.01,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 57162,
+          "optimizer_config": {
+            "adam_lr": 0.0029999999999999996,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 1,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-15,
+            "haps": null,
+            "learning_rate": 0.01,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 2,
+            "min_lr_ratio": 0,
+            "momentum": 0.98,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 256,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 1.18302728568909e+21,
+        "training_time": 161087.5933672508,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_1_2b_attngate_2048_splash_lr_sweep_lr_x2_5_v5p32-909ccf"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x2_5/1_2b/speedrun_results_v5p64.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x2_5/1_2b/speedrun_results_v5p64.json
new file mode 100644
index 0000000000..1e3262390d
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/lr_x2_5/1_2b/speedrun_results_v5p64.json
@@ -0,0 +1,143 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (1_2b); Gated Attention (Splash); LR sweep multiplier=2.5",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 0.9125918745994568,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 2048,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 7168,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 16,
+          "num_kv_heads": 8,
+          "num_layers": 16,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "seq_len": 2048,
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": true,
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 5.1738353514618185e+20,
+        "model_flops_per_token": 2877292544.0,
+        "model_size": 1498482688,
+        "num_chips": 32,
+        "num_devices": 32,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "kind": "tpu",
+            "topology": null,
+            "variant": "v5p-64"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2025-12-16 23:49:35 UTC",
+        "tokenized_dataset": "gs://marin-us-east5/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 59938701312,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.01,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 57162,
+          "optimizer_config": {
+            "adam_lr": 0.0029999999999999996,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 1,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-15,
+            "haps": null,
+            "learning_rate": 0.01,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 2,
+            "min_lr_ratio": 0,
+            "momentum": 0.98,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 256,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 1.8052209610227266e+21,
+        "training_time": 122904.47719381309,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_1_2b_attngate_2048_splash_lr_sweep_lr_x2_5-2f4194"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x2_5/300m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x2_5/300m/speedrun_results.json
new file mode 100644
index 0000000000..3396ba42e9
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/lr_x2_5/300m/speedrun_results.json
@@ -0,0 +1,141 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (300m); Gated Attention (Splash); LR sweep multiplier=2.5",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 1.05768620967865,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 768,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 2688,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 12,
+          "num_kv_heads": 12,
+          "num_layers": 12,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": true,
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 1.021384111077458e+19,
+        "model_flops_per_token": 555024384.0,
+        "model_size": 306727680,
+        "num_chips": 16,
+        "num_devices": 16,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "topology": null,
+            "type": "v5p-32"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2025-12-09 12:42:55 UTC",
+        "tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 6134169600,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.02,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 11700,
+          "optimizer_config": {
+            "adam_lr": 0.005999999999999999,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 0.8,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-15,
+            "haps": null,
+            "learning_rate": 0.02,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 1,
+            "min_lr_ratio": 0,
+            "momentum": 0.98,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 128,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 5.071905773521359e+19,
+        "training_time": 6906.18977876002,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_300m_attngate_4096_splash_lr_sweep_lr_x2_5-13d2c4"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x2_5/520m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x2_5/520m/speedrun_results.json
new file mode 100644
index 0000000000..0996663372
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/lr_x2_5/520m/speedrun_results.json
@@ -0,0 +1,143 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (520m); Gated Attention (Splash); LR sweep multiplier=2.5",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 0.9764590263366699,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 1024,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 3584,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 16,
+          "num_kv_heads": 8,
+          "num_layers": 24,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "seq_len": 2048,
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": true,
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 1.0163680043413694e+20,
+        "model_flops_per_token": 1349517312.0,
+        "model_size": 627622912,
+        "num_chips": 16,
+        "num_devices": 16,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "kind": "tpu",
+            "topology": null,
+            "variant": "v5p-32"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2025-12-16 02:37:19 UTC",
+        "tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 25104482304,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.02,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 47883,
+          "optimizer_config": {
+            "adam_lr": 0.005999999999999999,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 1,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-25,
+            "haps": null,
+            "learning_rate": 0.02,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 1,
+            "min_lr_ratio": 0,
+            "momentum": 0.98,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 128,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 4.474616058782678e+20,
+        "training_time": 60928.86790281424,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_520m_attngate_2048_splash_lr_sweep_lr_x2_5-71c9e9"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x3/130m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x3/130m/speedrun_results.json
new file mode 100644
index 0000000000..d89452adff
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/lr_x3/130m/speedrun_results.json
@@ -0,0 +1,141 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (130m); Gated Attention (Splash); LR sweep multiplier=3",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 1.1638695001602173,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 512,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 1792,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 8,
+          "num_kv_heads": 8,
+          "num_layers": 6,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": true,
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 2.1289341996446515e+18,
+        "model_flops_per_token": 227868672.0,
+        "model_size": 155720192,
+        "num_chips": 16,
+        "num_devices": 16,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "topology": null,
+            "type": "v5p-32"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2025-12-09 10:45:13 UTC",
+        "tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 3114270720,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.048,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 5940,
+          "optimizer_config": {
+            "adam_lr": 0.009600000000000001,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 0.8,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-15,
+            "haps": null,
+            "learning_rate": 0.048,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 1,
+            "min_lr_ratio": 0,
+            "momentum": 0.95,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 128,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 1.1665914456968133e+19,
+        "training_time": 1588.4959772560094,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_130m_attngate_4096_splash_lr_sweep_lr_x3-70d7ec"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x3/1_2b/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x3/1_2b/speedrun_results.json
new file mode 100644
index 0000000000..5069a17d45
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/lr_x3/1_2b/speedrun_results.json
@@ -0,0 +1,144 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (1_2b); Gated Attention (Splash); LR sweep multiplier=3",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 0.9175548553466797,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 2048,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 7168,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 16,
+          "num_kv_heads": 8,
+          "num_layers": 16,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "seq_len": 2048,
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": "elementwise",
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 5.1738353514618185e+20,
+        "model_flops_per_token": 2877292544.0,
+        "model_size": 1498482688,
+        "num_chips": 16,
+        "num_devices": 16,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "kind": "tpu",
+            "topology": null,
+            "variant": "v5p-32"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2026-01-07 06:17:21 UTC",
+        "tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 59938701312,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "explicit_mesh_axes": false,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.012,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 57162,
+          "optimizer_config": {
+            "adam_lr": 0.0036,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 1,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-15,
+            "haps": null,
+            "learning_rate": 0.012,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 2,
+            "min_lr_ratio": 0,
+            "momentum": 0.98,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 256,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 1.178280086680337e+21,
+        "training_time": 160441.18827346637,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_1_2b_attngate_2048_splash_lr_sweep_lr_x3_v5p32-4286b4"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x3/1_2b/speedrun_results_v5p64.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x3/1_2b/speedrun_results_v5p64.json
new file mode 100644
index 0000000000..2539d30ffa
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/lr_x3/1_2b/speedrun_results_v5p64.json
@@ -0,0 +1,143 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (1_2b); Gated Attention (Splash); LR sweep multiplier=3",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 0.9134978652000427,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 2048,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 7168,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 16,
+          "num_kv_heads": 8,
+          "num_layers": 16,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "seq_len": 2048,
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": true,
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 5.1738353514618185e+20,
+        "model_flops_per_token": 2877292544.0,
+        "model_size": 1498482688,
+        "num_chips": 32,
+        "num_devices": 32,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "kind": "tpu",
+            "topology": null,
+            "variant": "v5p-64"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2025-12-17 00:34:36 UTC",
+        "tokenized_dataset": "gs://marin-us-east5/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 59938701312,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.012,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 57162,
+          "optimizer_config": {
+            "adam_lr": 0.0036,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 1,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-15,
+            "haps": null,
+            "learning_rate": 0.012,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 2,
+            "min_lr_ratio": 0,
+            "momentum": 0.98,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 256,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 1.81031625465692e+21,
+        "training_time": 123251.37899352668,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_1_2b_attngate_2048_splash_lr_sweep_lr_x3-e8942d"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x3/300m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x3/300m/speedrun_results.json
new file mode 100644
index 0000000000..d89eee804d
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/lr_x3/300m/speedrun_results.json
@@ -0,0 +1,141 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (300m); Gated Attention (Splash); LR sweep multiplier=3",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 1.060373306274414,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 768,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 2688,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 12,
+          "num_kv_heads": 12,
+          "num_layers": 12,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": true,
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 1.021384111077458e+19,
+        "model_flops_per_token": 555024384.0,
+        "model_size": 306727680,
+        "num_chips": 16,
+        "num_devices": 16,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "topology": null,
+            "type": "v5p-32"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2025-12-09 12:42:27 UTC",
+        "tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 6134169600,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.024,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 11700,
+          "optimizer_config": {
+            "adam_lr": 0.0072,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 0.8,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-15,
+            "haps": null,
+            "learning_rate": 0.024,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 1,
+            "min_lr_ratio": 0,
+            "momentum": 0.98,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 128,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 5.078015592931944e+19,
+        "training_time": 6914.509249635,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_300m_attngate_4096_splash_lr_sweep_lr_x3-6cc06b"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x3/520m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x3/520m/speedrun_results.json
new file mode 100644
index 0000000000..27112f0710
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/lr_x3/520m/speedrun_results.json
@@ -0,0 +1,141 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (520m); Gated Attention (Splash); LR sweep multiplier=3",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 0.9944517016410828,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 1024,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 3584,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 16,
+          "num_kv_heads": 8,
+          "num_layers": 24,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": true,
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 5.081733891346976e+19,
+        "model_flops_per_token": 1349517312.0,
+        "model_size": 627622912,
+        "num_chips": 16,
+        "num_devices": 16,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "topology": null,
+            "type": "v5p-32"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2025-12-09 19:40:08 UTC",
+        "tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 12551979008,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.024,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 23941,
+          "optimizer_config": {
+            "adam_lr": 0.0072,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 1,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-25,
+            "haps": null,
+            "learning_rate": 0.024,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 1,
+            "min_lr_ratio": 0,
+            "momentum": 0.98,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 128,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 2.2342612489445596e+20,
+        "training_time": 30422.947289550102,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_520m_attngate_4096_splash_lr_sweep_lr_x3-325c68"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x3_5/130m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x3_5/130m/speedrun_results.json
new file mode 100644
index 0000000000..5b6ee02988
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/lr_x3_5/130m/speedrun_results.json
@@ -0,0 +1,141 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (130m); Gated Attention (Splash); LR sweep multiplier=3.5",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 1.16841721534729,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 512,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 1792,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 8,
+          "num_kv_heads": 8,
+          "num_layers": 6,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": true,
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 2.1289341996446515e+18,
+        "model_flops_per_token": 227868672.0,
+        "model_size": 155720192,
+        "num_chips": 16,
+        "num_devices": 16,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "topology": null,
+            "type": "v5p-32"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2025-12-09 10:53:41 UTC",
+        "tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 3114270720,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.056,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 5940,
+          "optimizer_config": {
+            "adam_lr": 0.0112,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 0.8,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-15,
+            "haps": null,
+            "learning_rate": 0.056,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 1,
+            "min_lr_ratio": 0,
+            "momentum": 0.95,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 128,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 1.1712847361037679e+19,
+        "training_time": 1594.886623234978,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_130m_attngate_4096_splash_lr_sweep_lr_x3_5-4faf11"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x3_5/1_2b/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x3_5/1_2b/speedrun_results.json
new file mode 100644
index 0000000000..fef8dae8a7
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/lr_x3_5/1_2b/speedrun_results.json
@@ -0,0 +1,144 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (1_2b); Gated Attention (Splash); LR sweep multiplier=3.5",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 0.9142972230911255,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 2048,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 7168,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 16,
+          "num_kv_heads": 8,
+          "num_layers": 16,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "seq_len": 2048,
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": "elementwise",
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 5.1738353514618185e+20,
+        "model_flops_per_token": 2877292544.0,
+        "model_size": 1498482688,
+        "num_chips": 16,
+        "num_devices": 16,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "kind": "tpu",
+            "topology": null,
+            "variant": "v5p-32"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2026-01-07 06:32:05 UTC",
+        "tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 59938701312,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "explicit_mesh_axes": false,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.014,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 57162,
+          "optimizer_config": {
+            "adam_lr": 0.0042,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 1,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-15,
+            "haps": null,
+            "learning_rate": 0.014,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 2,
+            "min_lr_ratio": 0,
+            "momentum": 0.98,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 256,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 1.1912866285370585e+21,
+        "training_time": 162212.23155461036,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_1_2b_attngate_2048_splash_lr_sweep_lr_x3_5_v5p32-1038f8"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x3_5/1_2b/speedrun_results_v5p64.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x3_5/1_2b/speedrun_results_v5p64.json
new file mode 100644
index 0000000000..79c7c4fa23
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/lr_x3_5/1_2b/speedrun_results_v5p64.json
@@ -0,0 +1,143 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (1_2b); Gated Attention (Splash); LR sweep multiplier=3.5",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 0.9148565530776978,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 2048,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 7168,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 16,
+          "num_kv_heads": 8,
+          "num_layers": 16,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "seq_len": 2048,
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": true,
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 5.1738353514618185e+20,
+        "model_flops_per_token": 2877292544.0,
+        "model_size": 1498482688,
+        "num_chips": 32,
+        "num_devices": 32,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "kind": "tpu",
+            "topology": null,
+            "variant": "v5p-64"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2025-12-16 22:33:13 UTC",
+        "tokenized_dataset": "gs://marin-us-east5/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 59938701312,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.014,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 57162,
+          "optimizer_config": {
+            "adam_lr": 0.0042,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 1,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-15,
+            "haps": null,
+            "learning_rate": 0.014,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 2,
+            "min_lr_ratio": 0,
+            "momentum": 0.98,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 256,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 1.8114652296478665e+21,
+        "training_time": 123329.6044150236,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_1_2b_attngate_2048_splash_lr_sweep_lr_x3_5-b0a3b2"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x3_5/300m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x3_5/300m/speedrun_results.json
new file mode 100644
index 0000000000..898ff7c6fe
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/lr_x3_5/300m/speedrun_results.json
@@ -0,0 +1,141 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (300m); Gated Attention (Splash); LR sweep multiplier=3.5",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 1.0644729137420654,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 768,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 2688,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 12,
+          "num_kv_heads": 12,
+          "num_layers": 12,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": true,
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 1.021384111077458e+19,
+        "model_flops_per_token": 555024384.0,
+        "model_size": 306727680,
+        "num_chips": 16,
+        "num_devices": 16,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "topology": null,
+            "type": "v5p-32"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2025-12-09 12:02:21 UTC",
+        "tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 6134169600,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.028,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 11700,
+          "optimizer_config": {
+            "adam_lr": 0.0084,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 0.8,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-15,
+            "haps": null,
+            "learning_rate": 0.028,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 1,
+            "min_lr_ratio": 0,
+            "momentum": 0.98,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 128,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 5.073143302509889e+19,
+        "training_time": 6907.874867252028,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_300m_attngate_4096_splash_lr_sweep_lr_x3_5-1a6ee3"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x3_5/520m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x3_5/520m/speedrun_results.json
new file mode 100644
index 0000000000..2d42e75484
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/lr_x3_5/520m/speedrun_results.json
@@ -0,0 +1,141 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (520m); Gated Attention (Splash); LR sweep multiplier=3.5",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 0.9976317882537842,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 1024,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 3584,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 16,
+          "num_kv_heads": 8,
+          "num_layers": 24,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": true,
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 5.081733891346976e+19,
+        "model_flops_per_token": 1349517312.0,
+        "model_size": 627622912,
+        "num_chips": 16,
+        "num_devices": 16,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "topology": null,
+            "type": "v5p-32"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2025-12-09 19:33:19 UTC",
+        "tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 12551979008,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.028,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 23941,
+          "optimizer_config": {
+            "adam_lr": 0.0084,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 1,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-25,
+            "haps": null,
+            "learning_rate": 0.028,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 1,
+            "min_lr_ratio": 0,
+            "momentum": 0.98,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 128,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 2.2274891561232957e+20,
+        "training_time": 30330.73469666797,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_520m_attngate_4096_splash_lr_sweep_lr_x3_5-ea774d"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x4/130m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x4/130m/speedrun_results.json
new file mode 100644
index 0000000000..c5015256b4
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/lr_x4/130m/speedrun_results.json
@@ -0,0 +1,141 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (130m); Gated Attention (Splash); LR sweep multiplier=4",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 1.171983242034912,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 512,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 1792,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 8,
+          "num_kv_heads": 8,
+          "num_layers": 6,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": true,
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 2.1289341996446515e+18,
+        "model_flops_per_token": 227868672.0,
+        "model_size": 155720192,
+        "num_chips": 16,
+        "num_devices": 16,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "topology": null,
+            "type": "v5p-32"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2025-12-09 10:44:46 UTC",
+        "tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 3114270720,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.064,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 5940,
+          "optimizer_config": {
+            "adam_lr": 0.0128,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 0.8,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-15,
+            "haps": null,
+            "learning_rate": 0.064,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 1,
+            "min_lr_ratio": 0,
+            "momentum": 0.95,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 128,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 1.1647166759460198e+19,
+        "training_time": 1585.9431862010074,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_130m_attngate_4096_splash_lr_sweep_lr_x4-b1cb5b"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x4/1_2b/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x4/1_2b/speedrun_results.json
new file mode 100644
index 0000000000..2192aa2862
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/lr_x4/1_2b/speedrun_results.json
@@ -0,0 +1,144 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (1_2b); Gated Attention (Splash); LR sweep multiplier=4",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 0.9157812595367432,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 2048,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 7168,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 16,
+          "num_kv_heads": 8,
+          "num_layers": 16,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "seq_len": 2048,
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": "elementwise",
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 5.1738353514618185e+20,
+        "model_flops_per_token": 2877292544.0,
+        "model_size": 1498482688,
+        "num_chips": 16,
+        "num_devices": 16,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "kind": "tpu",
+            "topology": null,
+            "variant": "v5p-32"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2026-01-07 23:47:52 UTC",
+        "tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 59938701312,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "explicit_mesh_axes": false,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.016,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 57162,
+          "optimizer_config": {
+            "adam_lr": 0.0048,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 1,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-15,
+            "haps": null,
+            "learning_rate": 0.016,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 2,
+            "min_lr_ratio": 0,
+            "momentum": 0.98,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 256,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 1.1860119667024e+21,
+        "training_time": 161494.00418061003,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_1_2b_attngate_2048_splash_lr_sweep_lr_x4_v5p32-0d60e6"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x4/1_2b/speedrun_results_v5p64.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x4/1_2b/speedrun_results_v5p64.json
new file mode 100644
index 0000000000..817bcd9636
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/lr_x4/1_2b/speedrun_results_v5p64.json
@@ -0,0 +1,143 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (1_2b); Gated Attention (Splash); LR sweep multiplier=4",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 0.9164798259735107,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 2048,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 7168,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 16,
+          "num_kv_heads": 8,
+          "num_layers": 16,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "seq_len": 2048,
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": true,
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 5.1738353514618185e+20,
+        "model_flops_per_token": 2877292544.0,
+        "model_size": 1498482688,
+        "num_chips": 32,
+        "num_devices": 32,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "kind": "tpu",
+            "topology": null,
+            "variant": "v5p-64"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2025-12-17 02:26:48 UTC",
+        "tokenized_dataset": "gs://marin-us-east5/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 59938701312,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.016,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 57162,
+          "optimizer_config": {
+            "adam_lr": 0.0048,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 1,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-15,
+            "haps": null,
+            "learning_rate": 0.016,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 2,
+            "min_lr_ratio": 0,
+            "momentum": 0.98,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 256,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 2.051479930331274e+21,
+        "training_time": 139670.4745595911,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_1_2b_attngate_2048_splash_lr_sweep_lr_x4-f80807"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x4/300m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x4/300m/speedrun_results.json
new file mode 100644
index 0000000000..f1344d0d7d
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/lr_x4/300m/speedrun_results.json
@@ -0,0 +1,141 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (300m); Gated Attention (Splash); LR sweep multiplier=4",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 1.0660113096237183,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 768,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 2688,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 12,
+          "num_kv_heads": 12,
+          "num_layers": 12,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": true,
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 1.021384111077458e+19,
+        "model_flops_per_token": 555024384.0,
+        "model_size": 306727680,
+        "num_chips": 16,
+        "num_devices": 16,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "topology": null,
+            "type": "v5p-32"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2025-12-09 12:26:35 UTC",
+        "tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 6134169600,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.032,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 11700,
+          "optimizer_config": {
+            "adam_lr": 0.0096,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 0.8,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-15,
+            "haps": null,
+            "learning_rate": 0.032,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 1,
+            "min_lr_ratio": 0,
+            "momentum": 0.98,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 128,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 5.059991395179448e+19,
+        "training_time": 6889.966496704042,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_300m_attngate_4096_splash_lr_sweep_lr_x4-85c5fd"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x4/520m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x4/520m/speedrun_results.json
new file mode 100644
index 0000000000..0be866a11f
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/lr_x4/520m/speedrun_results.json
@@ -0,0 +1,141 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (520m); Gated Attention (Splash); LR sweep multiplier=4",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 1.0015015602111816,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 1024,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 3584,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 16,
+          "num_kv_heads": 8,
+          "num_layers": 24,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": true,
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 5.081733891346976e+19,
+        "model_flops_per_token": 1349517312.0,
+        "model_size": 627622912,
+        "num_chips": 16,
+        "num_devices": 16,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "topology": null,
+            "type": "v5p-32"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2025-12-09 19:01:46 UTC",
+        "tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 12551979008,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.032,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 23941,
+          "optimizer_config": {
+            "adam_lr": 0.0096,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 1,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-25,
+            "haps": null,
+            "learning_rate": 0.032,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 1,
+            "min_lr_ratio": 0,
+            "momentum": 0.98,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 128,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 2.236153294934872e+20,
+        "training_time": 30448.710443012962,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_520m_attngate_4096_splash_lr_sweep_lr_x4-f42244"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/naive_no_fuse_q/130m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_gate/naive_no_fuse_q/130m/speedrun_results.json
new file mode 100644
index 0000000000..6ef86d1991
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/naive_no_fuse_q/130m/speedrun_results.json
@@ -0,0 +1,140 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (130m); Gated Attention (Splash)",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 1.1564404964447021,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 512,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 1792,
+          "layer_norm_epsilon": 1e-05,
+          "num_heads": 8,
+          "num_kv_heads": 8,
+          "num_layers": 6,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "seq_len": 4096,
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": true,
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 2.1289341996446515e+18,
+        "model_flops_per_token": 227868672.0,
+        "model_size": 155720192,
+        "num_chips": 16,
+        "num_devices": 16,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "topology": null,
+            "type": "v5p-32"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2025-12-06 16:35:18 UTC",
+        "tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 3114270720,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.016,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 5940,
+          "optimizer_config": {
+            "adam_lr": 0.0032,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 0.8,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-15,
+            "haps": null,
+            "learning_rate": 0.016,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 1,
+            "min_lr_ratio": 0,
+            "momentum": 0.95,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 128,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 1.2199985674433722e+19,
+        "training_time": 1661.2180929239819,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_130m_attngate_4096_splash-3c6cbd"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/naive_no_fuse_q/1_2b/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_gate/naive_no_fuse_q/1_2b/speedrun_results.json
new file mode 100644
index 0000000000..c4b81a736f
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/naive_no_fuse_q/1_2b/speedrun_results.json
@@ -0,0 +1,140 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (1_2b); Gated Attention (Splash)",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 0.9263789653778076,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 2048,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 7168,
+          "layer_norm_epsilon": 1e-05,
+          "num_heads": 16,
+          "num_kv_heads": 8,
+          "num_layers": 16,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "seq_len": 4096,
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": true,
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 2.5869176757309093e+20,
+        "model_flops_per_token": 2877292544.0,
+        "model_size": 1498482688,
+        "num_chips": 16,
+        "num_devices": 16,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "topology": null,
+            "type": "v5p-32"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2025-12-07 15:35:49 UTC",
+        "tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 29969350656,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.004,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 28581,
+          "optimizer_config": {
+            "adam_lr": 0.0012,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 1,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-15,
+            "haps": null,
+            "learning_rate": 0.004,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 2,
+            "min_lr_ratio": 0,
+            "momentum": 0.98,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 256,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 5.928678262278687e+20,
+        "training_time": 80728.18984584269,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_1_2b_attngate_4096_splash-1544b5"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/naive_no_fuse_q/300m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_gate/naive_no_fuse_q/300m/speedrun_results.json
new file mode 100644
index 0000000000..9e2fe8f2ea
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/naive_no_fuse_q/300m/speedrun_results.json
@@ -0,0 +1,140 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (300m); Gated Attention (Splash)",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 1.0539624691009521,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 768,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 2688,
+          "layer_norm_epsilon": 1e-05,
+          "num_heads": 12,
+          "num_kv_heads": 12,
+          "num_layers": 12,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "seq_len": 4096,
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": true,
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 1.021384111077458e+19,
+        "model_flops_per_token": 555024384.0,
+        "model_size": 306727680,
+        "num_chips": 16,
+        "num_devices": 16,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "topology": null,
+            "type": "v5p-32"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2025-12-06 18:26:07 UTC",
+        "tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 6134169600,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.008,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 11700,
+          "optimizer_config": {
+            "adam_lr": 0.0024,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 0.8,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-15,
+            "haps": null,
+            "learning_rate": 0.008,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 1,
+            "min_lr_ratio": 0,
+            "momentum": 0.98,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 128,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 5.2696079184663675e+19,
+        "training_time": 7175.392045842003,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_300m_attngate_4096_splash-a5e290"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/naive_no_fuse_q/520m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_gate/naive_no_fuse_q/520m/speedrun_results.json
new file mode 100644
index 0000000000..b2960ec991
--- /dev/null
+++ b/experiments/speedrun/hackable_transformer_attn_gate/naive_no_fuse_q/520m/speedrun_results.json
@@ -0,0 +1,140 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (520m); Gated Attention (Splash)",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 0.9801320433616638,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 1024,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 3584,
+          "layer_norm_epsilon": 1e-05,
+          "num_heads": 16,
+          "num_kv_heads": 8,
+          "num_layers": 24,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "seq_len": 4096,
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": true,
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 5.081733891346976e+19,
+        "model_flops_per_token": 1349517312.0,
+        "model_size": 627622912,
+        "num_chips": 16,
+        "num_devices": 16,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "topology": null,
+            "type": "v5p-32"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2025-12-07 01:18:57 UTC",
+        "tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 12551979008,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.008,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 23941,
+          "optimizer_config": {
+            "adam_lr": 0.0024,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 1,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-25,
+            "haps": null,
+            "learning_rate": 0.008,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 1,
+            "min_lr_ratio": 0,
+            "momentum": 0.98,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 128,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 2.2878030460560127e+20,
+        "training_time": 31152.002261111284,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_520m_attngate_4096_splash-5794f3"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_starter/attnsink/jax/130m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_sink/attnsink/jax/130m/speedrun_results.json
similarity index 100%
rename from experiments/speedrun/hackable_transformer_starter/attnsink/jax/130m/speedrun_results.json
rename to experiments/speedrun/hackable_transformer_attn_sink/attnsink/jax/130m/speedrun_results.json
diff --git a/experiments/speedrun/hackable_transformer_starter/attnsink/jax/1_2b/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_sink/attnsink/jax/1_2b/speedrun_results.json
similarity index 100%
rename from experiments/speedrun/hackable_transformer_starter/attnsink/jax/1_2b/speedrun_results.json
rename to experiments/speedrun/hackable_transformer_attn_sink/attnsink/jax/1_2b/speedrun_results.json
diff --git a/experiments/speedrun/hackable_transformer_starter/attnsink/jax/300m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_sink/attnsink/jax/300m/speedrun_results.json
similarity index 100%
rename from experiments/speedrun/hackable_transformer_starter/attnsink/jax/300m/speedrun_results.json
rename to experiments/speedrun/hackable_transformer_attn_sink/attnsink/jax/300m/speedrun_results.json
diff --git a/experiments/speedrun/hackable_transformer_starter/attnsink/jax/520m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_sink/attnsink/jax/520m/speedrun_results.json
similarity index 100%
rename from experiments/speedrun/hackable_transformer_starter/attnsink/jax/520m/speedrun_results.json
rename to experiments/speedrun/hackable_transformer_attn_sink/attnsink/jax/520m/speedrun_results.json
diff --git a/experiments/speedrun/hackable_transformer_starter/attnsink/splash/130m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/130m/speedrun_results.json
similarity index 100%
rename from experiments/speedrun/hackable_transformer_starter/attnsink/splash/130m/speedrun_results.json
rename to experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/130m/speedrun_results.json
diff --git a/experiments/speedrun/hackable_transformer_starter/attnsink/splash/1_2b/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/1_2b/speedrun_results.json
similarity index 100%
rename from experiments/speedrun/hackable_transformer_starter/attnsink/splash/1_2b/speedrun_results.json
rename to experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/1_2b/speedrun_results.json
diff --git a/experiments/speedrun/hackable_transformer_starter/attnsink/splash/300m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/300m/speedrun_results.json
similarity index 100%
rename from experiments/speedrun/hackable_transformer_starter/attnsink/splash/300m/speedrun_results.json
rename to experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/300m/speedrun_results.json
diff --git a/experiments/speedrun/hackable_transformer_starter/attnsink/splash/520m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/520m/speedrun_results.json
similarity index 100%
rename from experiments/speedrun/hackable_transformer_starter/attnsink/splash/520m/speedrun_results.json
rename to experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/520m/speedrun_results.json
diff --git a/experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_lr/130m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_lr/130m/speedrun_results.json
similarity index 100%
rename from experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_lr/130m/speedrun_results.json
rename to experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_lr/130m/speedrun_results.json
diff --git a/experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_lr/1_2b/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_lr/1_2b/speedrun_results.json
similarity index 100%
rename from experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_lr/1_2b/speedrun_results.json
rename to experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_lr/1_2b/speedrun_results.json
diff --git a/experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_lr/300m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_lr/300m/speedrun_results.json
similarity index 100%
rename from experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_lr/300m/speedrun_results.json
rename to experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_lr/300m/speedrun_results.json
diff --git a/experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_lr/520m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_lr/520m/speedrun_results.json
similarity index 100%
rename from experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_lr/520m/speedrun_results.json
rename to experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_lr/520m/speedrun_results.json
diff --git a/experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_x0.25/130m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_x0.25/130m/speedrun_results.json
similarity index 100%
rename from experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_x0.25/130m/speedrun_results.json
rename to experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_x0.25/130m/speedrun_results.json
diff --git a/experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_x0.25/1_2b/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_x0.25/1_2b/speedrun_results.json
similarity index 100%
rename from experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_x0.25/1_2b/speedrun_results.json
rename to experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_x0.25/1_2b/speedrun_results.json
diff --git a/experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_x0.25/300m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_x0.25/300m/speedrun_results.json
similarity index 100%
rename from experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_x0.25/300m/speedrun_results.json
rename to experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_x0.25/300m/speedrun_results.json
diff --git a/experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_x0.25/520m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_x0.25/520m/speedrun_results.json
similarity index 100%
rename from experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_x0.25/520m/speedrun_results.json
rename to experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_x0.25/520m/speedrun_results.json
diff --git a/experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_x0.5/130m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_x0.5/130m/speedrun_results.json
similarity index 100%
rename from experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_x0.5/130m/speedrun_results.json
rename to experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_x0.5/130m/speedrun_results.json
diff --git a/experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_x0.5/1_2b/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_x0.5/1_2b/speedrun_results.json
similarity index 100%
rename from experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_x0.5/1_2b/speedrun_results.json
rename to experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_x0.5/1_2b/speedrun_results.json
diff --git a/experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_x0.5/300m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_x0.5/300m/speedrun_results.json
similarity index 100%
rename from experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_x0.5/300m/speedrun_results.json
rename to experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_x0.5/300m/speedrun_results.json
diff --git a/experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_x0.5/520m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_x0.5/520m/speedrun_results.json
similarity index 100%
rename from experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_x0.5/520m/speedrun_results.json
rename to experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_x0.5/520m/speedrun_results.json
diff --git a/experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_x2/130m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_x2/130m/speedrun_results.json
similarity index 100%
rename from experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_x2/130m/speedrun_results.json
rename to experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_x2/130m/speedrun_results.json
diff --git a/experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_x2/1_2b/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_x2/1_2b/speedrun_results.json
similarity index 100%
rename from experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_x2/1_2b/speedrun_results.json
rename to experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_x2/1_2b/speedrun_results.json
diff --git a/experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_x2/300m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_x2/300m/speedrun_results.json
similarity index 100%
rename from experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_x2/300m/speedrun_results.json
rename to experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_x2/300m/speedrun_results.json
diff --git a/experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_x2/520m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_x2/520m/speedrun_results.json
similarity index 100%
rename from experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_x2/520m/speedrun_results.json
rename to experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_x2/520m/speedrun_results.json
diff --git a/experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_x4/130m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_x4/130m/speedrun_results.json
similarity index 100%
rename from experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_x4/130m/speedrun_results.json
rename to experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_x4/130m/speedrun_results.json
diff --git a/experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_x4/1_2b/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_x4/1_2b/speedrun_results.json
similarity index 100%
rename from experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_x4/1_2b/speedrun_results.json
rename to experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_x4/1_2b/speedrun_results.json
diff --git a/experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_x4/300m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_x4/300m/speedrun_results.json
similarity index 100%
rename from experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_x4/300m/speedrun_results.json
rename to experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_x4/300m/speedrun_results.json
diff --git a/experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_x4/520m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_x4/520m/speedrun_results.json
similarity index 100%
rename from experiments/speedrun/hackable_transformer_starter/attnsink/splash/default_x4/520m/speedrun_results.json
rename to experiments/speedrun/hackable_transformer_attn_sink/attnsink/splash/default_x4/520m/speedrun_results.json
diff --git a/experiments/speedrun/hackable_transformer_starter/hackable_transformer_attn_sink.py b/experiments/speedrun/hackable_transformer_attn_sink/hackable_transformer_attn_sink.py
similarity index 100%
rename from experiments/speedrun/hackable_transformer_starter/hackable_transformer_attn_sink.py
rename to experiments/speedrun/hackable_transformer_attn_sink/hackable_transformer_attn_sink.py
diff --git a/experiments/speedrun/hackable_transformer_starter/std_attn/1.2b/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_sink/std_attn/1.2b/speedrun_results.json
similarity index 100%
rename from experiments/speedrun/hackable_transformer_starter/std_attn/1.2b/speedrun_results.json
rename to experiments/speedrun/hackable_transformer_attn_sink/std_attn/1.2b/speedrun_results.json
diff --git a/experiments/speedrun/hackable_transformer_starter/std_attn/130m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_sink/std_attn/130m/speedrun_results.json
similarity index 100%
rename from experiments/speedrun/hackable_transformer_starter/std_attn/130m/speedrun_results.json
rename to experiments/speedrun/hackable_transformer_attn_sink/std_attn/130m/speedrun_results.json
diff --git a/experiments/speedrun/hackable_transformer_starter/std_attn/300m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_sink/std_attn/300m/speedrun_results.json
similarity index 100%
rename from experiments/speedrun/hackable_transformer_starter/std_attn/300m/speedrun_results.json
rename to experiments/speedrun/hackable_transformer_attn_sink/std_attn/300m/speedrun_results.json
diff --git a/experiments/speedrun/hackable_transformer_starter/std_attn/520m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_sink/std_attn/520m/speedrun_results.json
similarity index 100%
rename from experiments/speedrun/hackable_transformer_starter/std_attn/520m/speedrun_results.json
rename to experiments/speedrun/hackable_transformer_attn_sink/std_attn/520m/speedrun_results.json
diff --git a/lib/levanter/src/levanter/layers/attention.py b/lib/levanter/src/levanter/layers/attention.py
index d65c449aca..3c102adad9 100644
--- a/lib/levanter/src/levanter/layers/attention.py
+++ b/lib/levanter/src/levanter/layers/attention.py
@@ -9,7 +9,7 @@
 from dataclasses import dataclass
 from enum import StrEnum
 from numbers import Integral
-from typing import Optional, Union, cast, overload
+from typing import Literal, Optional, Union, cast, overload
 
 import equinox as eqx
 import jax
@@ -1574,7 +1574,7 @@ class AttentionConfig:
     scaling_factor: Optional[float] = None
     logits_soft_cap: Optional[float] = None
     qk_norm: Optional[LayerNormConfigBase] = None
-    """Configuration for QK normalization. If None, no normalization is applied."""
+    gated: Literal["none", "headwise", "elementwise"] = "none"
 
     def __post_init__(self):
         assert (
@@ -1615,6 +1615,20 @@ def use_flash_attention(self) -> bool:
             return default_attention_type() != AttentionBackend.VANILLA
         return self.attn_backend != AttentionBackend.VANILLA
 
+    @property
+    def GateSize(self) -> Axis:
+        """Axis for the gate output size based on gating mode.
+
+        For headwise gating, returns an axis of size 1 (one scalar per head).
+        Any other mode (elementwise, and also "none") returns an axis of size head_size.
+
+        The axis is always named "gate_size", whichever branch is taken.
+        """
+        if self.gated == "headwise":
+            return Axis("gate_size", 1)
+        else:  # elementwise; also reached when gated == "none"
+            return Axis("gate_size", self.head_size)
+
 
 class Attention(eqx.Module):
     """A multi-head attention layer that uses dot product attention.
@@ -1622,7 +1636,7 @@ class Attention(eqx.Module):
     This is a general-purpose attention layer that can be used in various transformer architectures.
     It supports multi-head attention (MHA), multi-query attention (MQA), and grouped-query attention (GQA).
 
-    Supports ROPE and QK normalization. We should probably not add much more stuff.
+    Supports ROPE, QK normalization, and gated attention (headwise or elementwise).
     """
 
     config: AttentionConfig = eqx.field(static=True)
@@ -1633,12 +1647,14 @@ class Attention(eqx.Module):
     q_norm: Optional[LayerNormBase] = None
     k_norm: Optional[LayerNormBase] = None
     rot_embs: Optional[RotaryEmbeddings] = None
+    gate_proj: Optional[hnn.Linear] = None
 
     @staticmethod
     def init(config: AttentionConfig, *, key) -> "Attention":
         use_bias = config.use_bias
         use_output_bias = config.use_output_bias if config.use_output_bias is not None else use_bias
-        k_q, k_k, k_v, k_o = jrandom.split(key, 4)
+        k_q, k_k, k_v, k_o, k_g = jrandom.split(key, 5)
+
         q_proj = hnn.Linear.init(
             In=config.Embed,
             Out=(config.KVHeads, config.QHeadsPerGroup, config.HeadSize),
@@ -1668,6 +1684,19 @@ def init(config: AttentionConfig, *, key) -> "Attention":
             out_first=True,
         )
 
+        # For gated attention, create a separate gate projection.
+        # For headwise gating: GateSize = 1 (one scalar per head)
+        # For elementwise gating: GateSize = HeadSize (one value per element)
+        gate_proj = None
+        if config.gated != "none":
+            gate_proj = hnn.Linear.init(
+                In=config.Embed,
+                Out=(config.KVHeads, config.QHeadsPerGroup, config.GateSize),
+                key=k_g,
+                use_bias=use_bias,
+                out_first=True,
+            )
+
         q_norm = None
         k_norm = None
         if config.qk_norm is not None:
@@ -1677,7 +1706,7 @@ def init(config: AttentionConfig, *, key) -> "Attention":
         # Build rotary embeddings once during initialization if configured
         rot_embs = config.rope.build(config.HeadSize) if config.rope is not None else None
 
-        return Attention(config, q_proj, k_proj, v_proj, o_proj, q_norm, k_norm, rot_embs)
+        return Attention(config, q_proj, k_proj, v_proj, o_proj, q_norm, k_norm, rot_embs, gate_proj)
 
     def empty_page_cache(self, spec: PageTableSpec, *, dtype) -> "KvPageCache":
         return KvPageCache.init(spec, self.config.KVHeads, self.config.HeadSize, dtype=dtype)
@@ -1693,8 +1722,8 @@ def __call__(
     ) -> NamedArray:
         key_proj, key_o = maybe_rng_split(key, 2)
 
-        # Shared computation of q, k, v
-        q, k, v = self._compute_qkv(x, key=key_proj, pos_ids=pos_ids)
+        # Shared computation of q, k, v (and gate if gated)
+        q, k, v, gate = self._compute_qkv(x, key=key_proj, pos_ids=pos_ids)
 
         # Reshape for attention kernels (convert embed → heads/head_size)
         q = q.rearrange((..., "kv_head", "q_heads_per_group", "position", "head_size"))
@@ -1726,6 +1755,12 @@ def __call__(
             prng=key,
         )
 
+        if gate is not None:
+            gate = gate.rearrange((..., "kv_head", "q_heads_per_group", "position", "gate_size"))
+            gate = hax.nn.sigmoid(gate)
+            gate = gate.rename({"gate_size": "head_size"})
+            attn_output = attn_output * gate
+
         # Flatten heads and apply output projection
         attn_output = attn_output.flatten_axes(("kv_head", "q_heads_per_group"), "heads")
         attn_output = attn_output.astype(x.dtype)
@@ -1752,10 +1787,9 @@ def paged_decode(
         describes where the new keys and values should be written in ``kv_cache``.
         Currently only causal masks are supported.
         """
-
         key_proj, key_o = maybe_rng_split(key, 2)
 
-        q, k, v = self._compute_qkv(x, key=key_proj, pos_ids=pos_ids)
+        q, k, v, gate = self._compute_qkv(x, key=key_proj, pos_ids=pos_ids)
 
         kv_cache = kv_cache.update(batch_info, k, v)
 
@@ -1776,6 +1810,12 @@ def paged_decode(
             soft_cap=self.config.logits_soft_cap,
         )
 
+        if gate is not None:
+            gate = gate.rearrange((..., "kv_head", "q_heads_per_group", "position", "gate_size"))
+            gate = hax.nn.sigmoid(gate)
+            gate = gate.rename({"gate_size": "head_size"})
+            attn_tokens = attn_tokens * gate
+
         attn_output = attn_tokens.flatten_axes(("kv_head", "q_heads_per_group"), "heads")
         attn_output = attn_output.astype(x.dtype)
         attn_output = self.o_proj(attn_output, key=key_o)
@@ -1789,30 +1829,39 @@ def _compute_qkv(
         *,
         key,
         pos_ids: NamedArray | None = None,
-    ) -> tuple[NamedArray, NamedArray, NamedArray]:
-        """Project *x* to Q, K and V and apply all per-head processing."""
+    ) -> tuple[NamedArray, NamedArray, NamedArray, NamedArray | None]:
+        """Project *x* to Q, K and V (and gate if gated) and apply all per-head processing.
 
-        # Split the projection key into three – one for each of Q, K, V
-        key_q, key_k, key_v = maybe_rng_split(key, 3)
+        Returns:
+            A tuple of (q, k, v, gate) where gate is None if gating is disabled.
+        """
+
+        # Split the projection key into four – one for each of Q, K, V, and gate
+        key_q, key_k, key_v, key_g = maybe_rng_split(key, 4)
 
         # Linear projections
         q = self.q_proj(x, key=key_q)
         k = self.k_proj(x, key=key_k)
         v = self.v_proj(x, key=key_v)
 
-        # Optional QK layer-norm
+        # Compute gate if gated attention is enabled
+        gate = None
+        if self.gate_proj is not None:
+            gate = self.gate_proj(x, key=key_g)
+
+        # Optional QK layer-norm (applied to Q and K, never to the gate)
         if self.config.qk_norm is not None:
             q = self.q_norm(q)  # type: ignore[misc]
             k = self.k_norm(k)  # type: ignore[misc]
 
-        # Apply rotary embeddings if configured
+        # Apply rotary embeddings if configured (applied to Q and K, never to the gate)
         if self.rot_embs is not None:
             if pos_ids is None:
                 pos_ids = hax.arange(x.resolve_axis("position"))
             q = self.rot_embs(q, pos_ids).astype(q.dtype)
             k = self.rot_embs(k, pos_ids).astype(k.dtype)
 
-        return q, k, v
+        return q, k, v, gate
 
 
 @named_call
@@ -2351,6 +2400,7 @@ def init(config: AttentionConfig, *, key) -> "AttentionWithSink":
             base.q_norm,
             base.k_norm,
             base.rot_embs,
+            base.gate_proj,
             sinks,
         )
 
@@ -2363,29 +2413,15 @@ def __call__(
         key=None,
         pos_ids: NamedArray | None = None,
     ) -> NamedArray:
-        key_q, key_k, key_v, key_o = maybe_rng_split(key, 4)
+        key_proj, key_o = maybe_rng_split(key, 2)
 
-        q_proj = self.q_proj(x, key=key_q)
-        k_proj = self.k_proj(x, key=key_k)
-        v = self.v_proj(x, key=key_v)
-
-        if self.config.qk_norm is not None:
-            q = self.q_norm(q_proj)  # type: ignore[misc]
-            k = self.k_norm(k_proj)  # type: ignore[misc]
-        else:
-            q = q_proj
-            k = k_proj
+        # Compute q, k, v (and gate if gated)
+        q, k, v, gate = self._compute_qkv(x, key=key_proj, pos_ids=pos_ids)
 
         q = q.rearrange((..., "kv_head", "q_heads_per_group", "position", "head_size"))
         k = k.rearrange((..., "kv_head", "position", "head_size"))
         v = v.rearrange((..., "kv_head", "position", "head_size"))
 
-        if self.rot_embs is not None:
-            if pos_ids is None:
-                pos_ids = hax.arange(x.resolve_axis("position"), dtype=jnp.int32)
-            q = self.rot_embs(q, pos_ids)
-            k = self.rot_embs(k, pos_ids)
-
         k = k.rename({"position": "key_position"})
         v = v.rename({"position": "key_position"})
 
@@ -2411,6 +2447,13 @@ def __call__(
             attn_sink=self.sinks,
         )
 
+        if gate is not None:
+            gate = gate.rearrange((..., "kv_head", "q_heads_per_group", "position", "gate_size"))
+            gate = hax.nn.sigmoid(gate)
+            # Rename gate_size to head_size for proper broadcasting/multiplication
+            gate = gate.rename({"gate_size": "head_size"})
+            attn_output = attn_output * gate
+
         attn_output = attn_output.flatten_axes(("kv_head", "q_heads_per_group"), "heads")
         attn_output = attn_output.astype(x.dtype)
         attn_output = self.o_proj(attn_output, key=key_o)
diff --git a/lib/levanter/tests/test_attention.py b/lib/levanter/tests/test_attention.py
index 50406c25df..5bcd5eb9aa 100644
--- a/lib/levanter/tests/test_attention.py
+++ b/lib/levanter/tests/test_attention.py
@@ -21,6 +21,7 @@
 from levanter.utils.mesh import create_mesh_from_axis_specs
 
 from levanter.layers.attention import (
+    Attention,
     AttentionBackend,
     AttentionConfig,
     AttentionMask,
@@ -131,6 +132,79 @@ def test_attention_with_sink_module():
     assert_trees_all_close(out.array, expected)
 
 
+def test_attention_with_gating_module():
+    """Check that elementwise gating scales the attention output by sigmoid(gate).
+
+    When gated="elementwise", a separate gate_proj outputs [kv_head, q_heads_per_group, head_size].
+
+    q_proj, k_proj and gate_proj are fully zeroed, so the gate is sigmoid(0) = 0.5 everywhere.
+    With v_proj (weight=0, bias=1) and o_proj (weight=1, bias=0), the output before gating is 1.
+    After gating: 1 * 0.5 = 0.5
+    """
+    Pos = hax.Axis("position", 2)
+    Embed = hax.Axis("embed", 1)
+
+    config = AttentionConfig(Embed=Embed, num_heads=1, num_kv_heads=1, use_bias=True, gated="elementwise")
+    attn = Attention.init(config, key=jrandom.PRNGKey(0))
+
+    # Pin every projection to a constant so the expected output can be derived by hand:
+    # q and k become 0, v becomes 1 (bias only), and o_proj has weight 1 and bias 0.
+    attn = eqx.tree_at(lambda a: a.q_proj.weight, attn, hax.zeros(attn.q_proj.weight.axes))
+    attn = eqx.tree_at(lambda a: a.q_proj.bias, attn, hax.zeros(attn.q_proj.bias.axes))
+    attn = eqx.tree_at(lambda a: a.k_proj.weight, attn, hax.zeros(attn.k_proj.weight.axes))
+    attn = eqx.tree_at(lambda a: a.k_proj.bias, attn, hax.zeros(attn.k_proj.bias.axes))
+    attn = eqx.tree_at(lambda a: a.v_proj.weight, attn, hax.zeros(attn.v_proj.weight.axes))
+    attn = eqx.tree_at(lambda a: a.v_proj.bias, attn, hax.ones(attn.v_proj.bias.axes))
+    attn = eqx.tree_at(lambda a: a.o_proj.weight, attn, hax.ones(attn.o_proj.weight.axes))
+    attn = eqx.tree_at(lambda a: a.o_proj.bias, attn, hax.zeros(attn.o_proj.bias.axes))
+    # Zero out gate_proj (weight and bias) so the gate is sigmoid(0) = 0.5
+    attn = eqx.tree_at(lambda a: a.gate_proj.weight, attn, hax.zeros(attn.gate_proj.weight.axes))
+    attn = eqx.tree_at(lambda a: a.gate_proj.bias, attn, hax.zeros(attn.gate_proj.bias.axes))
+
+    x = hax.zeros((Pos, Embed))
+    out = attn(x, None)
+
+    expected = np.full((2, 1), 0.5)  # shape (2, 1) matches Pos=2, Embed=1
+    assert_trees_all_close(out.array, expected)
+
+
+def test_attention_with_headwise_gating_module():
+    """Check that headwise gating scales the attention output by sigmoid(gate).
+
+    When gated="headwise", a separate gate_proj outputs [kv_head, q_heads_per_group, 1]
+    (one scalar per head).
+
+    q_proj, k_proj and gate_proj are fully zeroed, so the gate is sigmoid(0) = 0.5 for the head.
+    With v_proj (weight=0, bias=1) and o_proj (weight=1, bias=0), the output before gating is 1.
+    After gating: 1 * 0.5 = 0.5
+    """
+    Pos = hax.Axis("position", 2)
+    Embed = hax.Axis("embed", 1)  # NOTE(review): presumably head_size == 1, so gate broadcasting is not exercised
+
+    config = AttentionConfig(Embed=Embed, num_heads=1, num_kv_heads=1, use_bias=True, gated="headwise")
+    attn = Attention.init(config, key=jrandom.PRNGKey(0))
+
+    # Pin every projection to a constant so the expected output can be derived by hand:
+    # q and k become 0, v becomes 1 (bias only), and o_proj has weight 1 and bias 0.
+    attn = eqx.tree_at(lambda a: a.q_proj.weight, attn, hax.zeros(attn.q_proj.weight.axes))
+    attn = eqx.tree_at(lambda a: a.q_proj.bias, attn, hax.zeros(attn.q_proj.bias.axes))
+    attn = eqx.tree_at(lambda a: a.k_proj.weight, attn, hax.zeros(attn.k_proj.weight.axes))
+    attn = eqx.tree_at(lambda a: a.k_proj.bias, attn, hax.zeros(attn.k_proj.bias.axes))
+    attn = eqx.tree_at(lambda a: a.v_proj.weight, attn, hax.zeros(attn.v_proj.weight.axes))
+    attn = eqx.tree_at(lambda a: a.v_proj.bias, attn, hax.ones(attn.v_proj.bias.axes))
+    attn = eqx.tree_at(lambda a: a.o_proj.weight, attn, hax.ones(attn.o_proj.weight.axes))
+    attn = eqx.tree_at(lambda a: a.o_proj.bias, attn, hax.zeros(attn.o_proj.bias.axes))
+    # Zero out gate_proj (weight and bias) so the gate is sigmoid(0) = 0.5
+    attn = eqx.tree_at(lambda a: a.gate_proj.weight, attn, hax.zeros(attn.gate_proj.weight.axes))
+    attn = eqx.tree_at(lambda a: a.gate_proj.bias, attn, hax.zeros(attn.gate_proj.bias.axes))
+
+    x = hax.zeros((Pos, Embed))
+    out = attn(x, None)
+
+    expected = np.full((2, 1), 0.5)  # shape (2, 1) matches Pos=2, Embed=1
+    assert_trees_all_close(out.array, expected)
+
+
 def test_te_bin_and_group_axes_by_function():
     QPos = hax.Axis("QPos", 128)
     KPos = hax.Axis("KPos", 128)
diff --git a/lib/marin/src/marin/speedrun/paloma_local_download.py b/lib/marin/src/marin/speedrun/paloma_local_download.py
index 9628f33345..1a7a74b39e 100644
--- a/lib/marin/src/marin/speedrun/paloma_local_download.py
+++ b/lib/marin/src/marin/speedrun/paloma_local_download.py
@@ -43,4 +43,4 @@ def speedrun_paloma_tokenized(tokenizer: str = llama3_tokenizer):
 
 
 if __name__ == "__main__":
-    executor_main(steps=[paloma_speedrun, *speedrun_paloma_tokenized])
+    executor_main(steps=[paloma_speedrun, *speedrun_paloma_tokenized().values()])