marin-community · ClassicLarry · Dec 7, 2025 · Dec 12, 2025
diff --git a/experiments/speedrun/nanogpt_features_v0/150m/speedrun_results.json b/experiments/speedrun/nanogpt_features_v0/150m/speedrun_results.json
@@ -0,0 +1,139 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Independent",
+          "name": "Larry Dial",
+          "url": "https://github.com/ClassicLarry"
+        },
+        "description": "Includes subset of features from Modded-NanoGPT: Partial RoPE, QK Norm, 2.5 TPP, Relu^2 MLP, X0 Skip, exponential decay of resid, backout lambda, reduced head counts, rms_norm, 0 init out projections, boosted attn scale (150m)",
+        "device_flops": 989500000000000.0,
+        "eval/paloma/c4_en/bpb": 1.2709236145019531,
+        "model_config": {
+          "attn_backend": "JAX_FLASH",
+          "cross_entropy_block_size": 2048,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 512,
+          "initializer_range": 0.02,
+          "input_embedding_norm": true,
+          "intermediate_dim": 2048,
+          "layer_norm_epsilon": 1e-05,
+          "num_heads": 4,
+          "num_kv_heads": 4,
+          "num_layers": 6,
+          "qk_norm": {
+            "eps": 1e-05,
+            "use_bias": false,
+            "use_weight": false
+          },
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "seq_len": 2048,
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_attention_sink": false,
+          "use_bias": false,
+          "use_layer_norm_weight": false
+        },
+        "model_flops": 2.189996844908544e+17,
+        "model_flops_per_token": 194396160.0,
+        "model_size": 150208512,
+        "num_chips": 1,
+        "num_devices": 1,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "count": 1,
+            "type": "H100"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2025-12-07 19:46:51 UTC",
+        "tokenized_dataset": "/root/marin/local_store/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 375521280,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.02,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 955,
+          "optimizer_config": {
+            "adam_lr": 0.0064,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.95,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 0.5,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-15,
+            "haps": null,
+            "learning_rate": 0.02,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 1,
+            "min_lr_ratio": 0.1,
+            "momentum": 0.95,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 192,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 1.1208567897015704e+18,
+        "training_time": 1132.750671754998,
+        "wandb_run_link": "https://wandb.ai/larrydial/marin/runs/nanogpt_features_v0_150m-5c83f9"
+      }
+    }
+  ]
+}
diff --git a/experiments/speedrun/nanogpt_features_v0/270m/speedrun_results.json b/experiments/speedrun/nanogpt_features_v0/270m/speedrun_results.json
@@ -0,0 +1,139 @@
+{
+  "runs": [
+    {
+      "run_info": {
+        "author": {
+          "affiliation": "Independent",
+          "name": "Larry Dial",
+          "url": "https://github.com/ClassicLarry"
+        },
+        "description": "Includes subset of features from Modded-NanoGPT: Partial RoPE, QK Norm, 2.5 TPP, Relu^2 MLP, X0 Skip, exponential decay of resid, backout lambda, reduced head counts, rms_norm, 0 init out projections, boosted attn scale (270m)",
+        "device_flops": 989500000000000.0,
+        "eval/paloma/c4_en/bpb": 1.1750104427337646,
+        "model_config": {
+          "attn_backend": "JAX_FLASH",
+          "cross_entropy_block_size": 2048,
+          "flash_attention_block_size": null,
+          "gradient_checkpointing": true,
+          "head_dim": null,
+          "hidden_dim": 768,
+          "initializer_range": 0.02,
+          "input_embedding_norm": true,
+          "intermediate_dim": 3072,
+          "layer_norm_epsilon": 1e-05,
+          "num_heads": 6,
+          "num_kv_heads": 6,
+          "num_layers": 11,
+          "qk_norm": {
+            "eps": 1e-05,
+            "use_bias": false,
+            "use_weight": false
+          },
+          "reference_checkpoint": "NousResearch/Llama-2-7b-hf",
+          "seq_len": 2048,
+          "tie_word_embeddings": false,
+          "tokenizer": null,
+          "upcast_attn": false,
+          "use_attention_sink": false,
+          "use_bias": false,
+          "use_layer_norm_weight": false
+        },
+        "model_flops": 8.703490096701112e+17,
+        "model_flops_per_token": 422326272.0,
+        "model_size": 274857984,
+        "num_chips": 1,
+        "num_devices": 1,
+        "resources": {
+          "cpu": 1,
+          "device": {
+            "count": 1,
+            "type": "H100"
+          },
+          "disk": "1g",
+          "preemptible": true,
+          "ram": "128m",
+          "regions": null,
+          "replicas": 1
+        },
+        "run_completion_timestamp": "2025-12-07 20:38:36 UTC",
+        "tokenized_dataset": "/root/marin/local_store/tokenized/subcache/fineweb-edu-10B-ac65f6",
+        "total_tokens": 686948352,
+        "train_config": {
+          "allow_partial_checkpoint": false,
+          "beta1": null,
+          "beta2": null,
+          "cycle_length": null,
+          "data_seed": null,
+          "decay": null,
+          "ema_beta": null,
+          "epsilon": null,
+          "initialize_from_checkpoint_path": null,
+          "initialize_from_hf": null,
+          "int8": false,
+          "learning_rate": 0.02,
+          "lr_schedule": null,
+          "max_eval_batches": null,
+          "max_grad_norm": null,
+          "min_lr_ratio": null,
+          "num_train_steps": 1747,
+          "optimizer_config": {
+            "adam_lr": 0.0064,
+            "adam_weight_decay": null,
+            "backend_steps": 5,
+            "beta1": 0.8,
+            "beta2": 0.95,
+            "cooldown": null,
+            "cycle_length": null,
+            "cycles": null,
+            "decay": 0.5,
+            "default_weight_decay_mask": null,
+            "epsilon": 1e-15,
+            "haps": null,
+            "learning_rate": 0.02,
+            "lr": 0.02,
+            "lr_schedule": "linear",
+            "max_grad_norm": 1,
+            "min_lr_ratio": 0.1,
+            "momentum": 0.95,
+            "muon_epsilon": 1e-05,
+            "nesterov": true,
+            "rewarmup": 0.0,
+            "use_kimi_scaling": false,
+            "warmup": 0,
+            "weight_decay": 0,
+            "weight_decay_modules": null
+          },
+          "per_device_eval_parallelism": null,
+          "profiler": false,
+          "profiler_num_steps": 100,
+          "profiler_start_step": 5,
+          "reset_data_loader_on_init": true,
+          "rewarmup": null,
+          "skip_bad_steps": false,
+          "steps_per_eval": null,
+          "steps_per_export": 10000,
+          "steps_per_hf_export": -1,
+          "steps_per_task_eval": null,
+          "train_batch_size": 192,
+          "warmup": null,
+          "watch": {
+            "include_histograms": false,
+            "include_norms": true,
+            "include_per_parameter_norms": true,
+            "interval": 10,
+            "split_scan_layers": true,
+            "watch_targets": [
+              "grads",
+              "params"
+            ]
+          },
+          "weight_decay": null,
+          "z_loss_weight": null
+        },
+        "training_hardware_flops": 3.918996717548913e+18,
+        "training_time": 3960.5828373409936,
+        "wandb_run_link": "https://wandb.ai/larrydial/marin/runs/nanogpt_features_v0_270m-21987d"
+      }
+    }
+  ]
+}