marin-community · Calvin-Xu · Dec 6, 2025 · Dec 9, 2025 · Dec 9, 2025 · Dec 9, 2025
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/hackable_transformer_attn_gate.py b/experiments/speedrun/hackable_transformer_attn_gate/hackable_transformer_attn_gate.py
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x1/130m/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x1/130m/speedrun_results.json
@@ -0,0 +1,141 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (130m); Gated Attention (Splash); LR sweep multiplier=1",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 1.1556898355484009,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 512,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 1792,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 8,
+          "num_kv_heads": 8,
+          "num_layers": 6,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": true,
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 2.1289341996446515e+18,
+        "model_flops_per_token": 227868672.0,
+        "model_size": 155720192,
+        "num_chips": 16,
+        "num_devices": 16,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "topology": null,
+            "type": "v5p-32"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2025-12-09 10:26:57 UTC",
+        "tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 3114270720,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.016,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 5940,
+          "optimizer_config": {
+            "adam_lr": 0.0032,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 0.8,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-15,
+            "haps": null,
+            "learning_rate": 0.016,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 1,
+            "min_lr_ratio": 0,
+            "momentum": 0.95,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 128,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 1.1670497124689082e+19,
+        "training_time": 1589.1199788519991,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_130m_attngate_4096_splash_lr_sweep_lr_x1-88f232"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/hackable_transformer_attn_gate/lr_x1/1_2b/speedrun_results.json b/experiments/speedrun/hackable_transformer_attn_gate/lr_x1/1_2b/speedrun_results.json
@@ -0,0 +1,144 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Stanford University",
+          "name": "Calvin Xu",
+          "url": "https://pinlinxu.com"
+        },
+        "description": "Hackable Transformer (1_2b); Gated Attention (Splash); LR sweep multiplier=1",
+        "device_flops": 459000000000000.0,
+        "eval/paloma/c4_en/bpb": 0.9160435795783997,
+        "model_config": {
+          "activation_function": "silu",
+          "attn_backend": null,
+          "cross_entropy_block_size": null,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 2048,
+          "initializer_range": 0.02,
+          "input_embedding_norm": false,
+          "intermediate_dim": 7168,
+          "layer_norm_epsilon": 1e-05,
+          "max_seq_len": 4096,
+          "num_heads": 16,
+          "num_kv_heads": 8,
+          "num_layers": 16,
+          "qk_norm": null,
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "rope": {
+            "factor": 1.0,
+            "theta": 10000
+          },
+          "seq_len": 2048,
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_bias": false,
+          "use_gated_attention": "elementwise",
+          "use_layer_norm_weight": true
+        },
+        "model_flops": 5.1738353514618185e+20,
+        "model_flops_per_token": 2877292544.0,
+        "model_size": 1498482688,
+        "num_chips": 16,
+        "num_devices": 16,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "kind": "tpu",
+            "topology": null,
+            "variant": "v5p-32"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2026-01-07 23:16:03 UTC",
+        "tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 59938701312,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "explicit_mesh_axes": false,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.004,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 57162,
+          "optimizer_config": {
+            "adam_lr": 0.0012,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.98,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 1,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-15,
+            "haps": null,
+            "learning_rate": 0.004,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 2,
+            "min_lr_ratio": 0,
+            "momentum": 0.98,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0.1,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 256,
+          "train_seq_len": null,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 1.1808594941354136e+21,
+        "training_time": 160792.41477878726,
+        "wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_1_2b_attngate_2048_splash_lr_sweep_lr_x1_v5p32-ec656c"
+      }
+    }
+  ]
+}