Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
35eac90
rename attn_sink speedrun dir
Calvin-Xu Dec 6, 2025
e399064
Add Gated Attention
Calvin-Xu Dec 9, 2025
6f51c0c
Merge branch 'main' of https://github.com/marin-community/marin into …
Calvin-Xu Dec 9, 2025
2d8b1c0
fix Paloma local download
Calvin-Xu Dec 9, 2025
4dab1b1
Add Gated Attention sweep results
Calvin-Xu Dec 9, 2025
59ec26c
Improved Gated Attention impl + LR sweep
Calvin-Xu Dec 9, 2025
cb4ef2f
Merge branch 'main' of https://github.com/marin-community/marin into …
Calvin-Xu Dec 9, 2025
66cc474
Merge branch 'main' of https://github.com/marin-community/marin into …
Calvin-Xu Dec 15, 2025
b5758ab
Merge branch 'main' of https://github.com/marin-community/marin into …
Calvin-Xu Dec 16, 2025
848821b
Merge branch 'main' of https://github.com/marin-community/marin into …
Calvin-Xu Dec 18, 2025
bd9983a
Merge branch 'main' of https://github.com/marin-community/marin into …
Calvin-Xu Dec 19, 2025
97487ef
Merge branch 'main' of https://github.com/marin-community/marin into …
Calvin-Xu Dec 20, 2025
98bd063
update w/ main
Calvin-Xu Dec 21, 2025
c1b4399
Merge branch 'main' of https://github.com/marin-community/marin into …
Calvin-Xu Dec 26, 2025
c283d2b
Merge branch 'main' of https://github.com/marin-community/marin into …
Calvin-Xu Dec 27, 2025
25e5a94
Initial LR sweep results
Calvin-Xu Dec 28, 2025
7d38157
tweak
Calvin-Xu Dec 30, 2025
eb16388
Merge branch 'main' of https://github.com/marin-community/marin into …
Calvin-Xu Jan 4, 2026
03da330
precommit
Calvin-Xu Jan 5, 2026
313b9fa
Merge branch 'main' of https://github.com/marin-community/marin into …
Calvin-Xu Jan 5, 2026
5588a9f
Merge branch 'main' of https://github.com/marin-community/marin into …
Calvin-Xu Jan 8, 2026
f136350
Merge branch 'main' of https://github.com/marin-community/marin into …
Calvin-Xu Jan 9, 2026
d0adcc7
Merge branch 'main' of https://github.com/marin-community/marin into …
Calvin-Xu Jan 12, 2026
9156692
Check in results on v5p32
Calvin-Xu Jan 20, 2026
0cdf99e
Merge branch 'main' of https://github.com/marin-community/marin into …
Calvin-Xu Jan 20, 2026
7b4b128
revert back to separate gate proj
Calvin-Xu Jan 20, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
{
"runs": [
{
"run_info": {
"author": {
"affiliation": "Stanford University",
"name": "Calvin Xu",
"url": "https://pinlinxu.com"
},
"description": "Hackable Transformer (130m); Gated Attention (Splash); LR sweep multiplier=1",
"device_flops": 459000000000000.0,
"eval/paloma/c4_en/bpb": 1.1556898355484009,
"model_config": {
"activation_function": "silu",
"attn_backend": null,
"cross_entropy_block_size": null,
"flash_attention_block_size": null,
"gradient_checkpointing": true,
"head_dim": null,
"hidden_dim": 512,
"initializer_range": 0.02,
"input_embedding_norm": false,
"intermediate_dim": 1792,
"layer_norm_epsilon": 1e-05,
"max_seq_len": 4096,
"num_heads": 8,
"num_kv_heads": 8,
"num_layers": 6,
"qk_norm": null,
"reference_checkpoint": "NousResearch/Llama-2-7b-hf",
"rope": {
"factor": 1.0,
"theta": 10000
},
"tie_word_embeddings": false,
"tokenizer": null,
"upcast_attn": false,
"use_bias": false,
"use_gated_attention": true,
"use_layer_norm_weight": true
},
"model_flops": 2.1289341996446515e+18,
"model_flops_per_token": 227868672.0,
"model_size": 155720192,
"num_chips": 16,
"num_devices": 16,
"resources": {
"cpu": 1,
"device": {
"topology": null,
"type": "v5p-32"
},
"disk": "1g",
"preemptible": true,
"ram": "128m",
"regions": null,
"replicas": 1
},
"run_completion_timestamp": "2025-12-09 10:26:57 UTC",
"tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
"total_tokens": 3114270720,
"train_config": {
"allow_partial_checkpoint": false,
"beta1": null,
"beta2": null,
"cycle_length": null,
"data_seed": null,
"decay": null,
"ema_beta": null,
"epsilon": null,
"initialize_from_checkpoint_path": null,
"initialize_from_hf": null,
"int8": false,
"learning_rate": 0.016,
"lr_schedule": null,
"max_eval_batches": null,
"max_grad_norm": null,
"min_lr_ratio": null,
"num_train_steps": 5940,
"optimizer_config": {
"adam_lr": 0.0032,
"adam_weight_decay": null,
"backend_steps": 5,
"beta1": 0.8,
"beta2": 0.98,
"cooldown": null,
"cycle_length": null,
"cycles": null,
"decay": 0.8,
"default_weight_decay_mask": null,
"epsilon": 1e-15,
"haps": null,
"learning_rate": 0.016,
"lr": 0.02,
"lr_schedule": "linear",
"max_grad_norm": 1,
"min_lr_ratio": 0,
"momentum": 0.95,
"muon_epsilon": 1e-05,
"nesterov": true,
"rewarmup": 0.0,
"use_kimi_scaling": false,
"warmup": 0,
"weight_decay": 0.1,
"weight_decay_modules": null
},
"per_device_eval_parallelism": null,
"profiler": false,
"profiler_num_steps": 100,
"profiler_start_step": 5,
"reset_data_loader_on_init": true,
"rewarmup": null,
"skip_bad_steps": false,
"steps_per_eval": null,
"steps_per_export": 10000,
"steps_per_hf_export": -1,
"steps_per_task_eval": null,
"train_batch_size": 128,
"train_seq_len": null,
"warmup": null,
"watch": {
"include_histograms": false,
"include_norms": true,
"include_per_parameter_norms": true,
"interval": 10,
"split_scan_layers": true,
"watch_targets": [
"grads",
"params"
]
},
"weight_decay": null,
"z_loss_weight": null
},
"training_hardware_flops": 1.1670497124689082e+19,
"training_time": 1589.1199788519991,
"wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_130m_attngate_4096_splash_lr_sweep_lr_x1-88f232"
}
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
{
"runs": [
{
"run_info": {
"author": {
"affiliation": "Stanford University",
"name": "Calvin Xu",
"url": "https://pinlinxu.com"
},
"description": "Hackable Transformer (1_2b); Gated Attention (Splash); LR sweep multiplier=1",
"device_flops": 459000000000000.0,
"eval/paloma/c4_en/bpb": 0.9160435795783997,
"model_config": {
"activation_function": "silu",
"attn_backend": null,
"cross_entropy_block_size": null,
"flash_attention_block_size": null,
"gradient_checkpointing": true,
"head_dim": null,
"hidden_dim": 2048,
"initializer_range": 0.02,
"input_embedding_norm": false,
"intermediate_dim": 7168,
"layer_norm_epsilon": 1e-05,
"max_seq_len": 4096,
"num_heads": 16,
"num_kv_heads": 8,
"num_layers": 16,
"qk_norm": null,
"reference_checkpoint": "NousResearch/Llama-2-7b-hf",
"rope": {
"factor": 1.0,
"theta": 10000
},
"seq_len": 2048,
"tie_word_embeddings": false,
"tokenizer": null,
"upcast_attn": false,
"use_bias": false,
"use_gated_attention": "elementwise",
"use_layer_norm_weight": true
},
"model_flops": 5.1738353514618185e+20,
"model_flops_per_token": 2877292544.0,
"model_size": 1498482688,
"num_chips": 16,
"num_devices": 16,
"resources": {
"cpu": 1,
"device": {
"kind": "tpu",
"topology": null,
"variant": "v5p-32"
},
"disk": "1g",
"preemptible": true,
"ram": "128m",
"regions": null,
"replicas": 1
},
"run_completion_timestamp": "2026-01-07 23:16:03 UTC",
"tokenized_dataset": "gs://marin-us-central1/tokenized/subcache/fineweb-edu-10B-ac65f6",
"total_tokens": 59938701312,
"train_config": {
"allow_partial_checkpoint": false,
"beta1": null,
"beta2": null,
"cycle_length": null,
"data_seed": null,
"decay": null,
"ema_beta": null,
"epsilon": null,
"explicit_mesh_axes": false,
"initialize_from_checkpoint_path": null,
"initialize_from_hf": null,
"int8": false,
"learning_rate": 0.004,
"lr_schedule": null,
"max_eval_batches": null,
"max_grad_norm": null,
"min_lr_ratio": null,
"num_train_steps": 57162,
"optimizer_config": {
"adam_lr": 0.0012,
"adam_weight_decay": null,
"backend_steps": 5,
"beta1": 0.8,
"beta2": 0.98,
"cooldown": null,
"cycle_length": null,
"cycles": null,
"decay": 1,
"default_weight_decay_mask": null,
"epsilon": 1e-15,
"haps": null,
"learning_rate": 0.004,
"lr": 0.02,
"lr_schedule": "linear",
"max_grad_norm": 2,
"min_lr_ratio": 0,
"momentum": 0.98,
"muon_epsilon": 1e-05,
"nesterov": true,
"rewarmup": 0.0,
"use_kimi_scaling": false,
"warmup": 0,
"weight_decay": 0.1,
"weight_decay_modules": null
},
"per_device_eval_parallelism": null,
"profiler": false,
"profiler_num_steps": 100,
"profiler_start_step": 5,
"reset_data_loader_on_init": true,
"rewarmup": null,
"skip_bad_steps": false,
"steps_per_eval": null,
"steps_per_export": 10000,
"steps_per_hf_export": -1,
"steps_per_task_eval": null,
"train_batch_size": 256,
"train_seq_len": null,
"warmup": null,
"watch": {
"include_histograms": false,
"include_norms": true,
"include_per_parameter_norms": true,
"interval": 10,
"split_scan_layers": true,
"watch_targets": [
"grads",
"params"
]
},
"weight_decay": null,
"z_loss_weight": null
},
"training_hardware_flops": 1.1808594941354136e+21,
"training_time": 160792.41477878726,
"wandb_run_link": "https://wandb.ai/marin-community/marin/runs/hacktx_1_2b_attngate_2048_splash_lr_sweep_lr_x1_v5p32-ec656c"
}
}
]
}
Loading
Loading