Skip to content

Latest commit

 

History

History
3 lines (2 loc) · 934 Bytes

File metadata and controls

3 lines (2 loc) · 934 Bytes

# Launch train_gpt.py through torchrun in --standalone mode with 4 processes
# per node (--nproc_per_node=4).
#
# Every NAME=value pair below is a command-prefix assignment: it is placed in
# the environment of this one torchrun invocation only and does not persist in
# the calling shell. How train_gpt.py interprets each variable is defined in
# that script, not here.
#
# Do not insert comments or blank lines between the continued lines: each
# trailing backslash must be the last character on its line.
COMPILE_MODE=off \
COMPUTE_DTYPE=auto \
EVAL_BATCH_SEQS=128 \
EVAL_MODE=sliding \
EVAL_SEQ_LEN=2048 \
EVAL_STRIDE=64 \
KEEP_FLOAT_EXTRA=tok_emb.weight \
MATRIX_LR=0.032 \
MAX_WALLCLOCK_SECONDS=600 \
MLP_HIDDEN=992 \
SCALAR_LR=0.032 \
SDPA_BACKEND=auto \
TIED_EMBED_LR=0.04 \
TRAIN_BATCH_TOKENS=524288 \
TRAIN_LOG_EVERY=50 \
TRAIN_SEQ_LEN=2048 \
VAL_LOSS_EVERY=1000 \
WARMDOWN_ITERS=3600 \
  torchrun \
    --standalone \
    --nproc_per_node=4 \
    train_gpt.py

# Launch train_gpt.py through torchrun in --standalone mode with 4 processes
# per node (--nproc_per_node=4).
#
# Every NAME=value pair below is a command-prefix assignment: it is placed in
# the environment of this one torchrun invocation only and does not persist in
# the calling shell. How train_gpt.py interprets each variable is defined in
# that script, not here.
#
# Do not insert comments or blank lines between the continued lines: each
# trailing backslash must be the last character on its line.
ADAMW_WEIGHT_DECAY=0.01 \
COMPILE_MODE=off \
COMPUTE_DTYPE=auto \
EVAL_BATCH_SEQS=128 \
EVAL_MODE=sliding \
EVAL_SEQ_LEN=1024 \
EVAL_STRIDE=64 \
KEEP_FLOAT_EXTRA=tok_emb.weight \
MATRIX_LR=0.04 \
MAX_WALLCLOCK_SECONDS=600 \
MLP_HIDDEN=0 \
MUON_WEIGHT_DECAY=0.02 \
NUM_LAYERS=10 \
RESID_MIX_INIT=phase_transition \
SCALAR_LR=0.04 \
SDPA_BACKEND=auto \
TIED_EMBED_INIT_MODE=overtone \
TIED_EMBED_LR=0.10 \
TRAIN_BATCH_TOKENS=524288 \
TRAIN_LOG_EVERY=50 \
TRAIN_SEQ_LEN=1024 \
VAL_LOSS_EVERY=1000 \
WARMDOWN_ITERS=2500 \
  torchrun \
    --standalone \
    --nproc_per_node=4 \
    train_gpt.py