# Review notes. Text ahead of the first "diff --git" line is not part of the
# patch, so the hunk below and the resulting file are unchanged.
#
# - Adds one new 23-line file: configs/10B/H100_simple.toml.
# - Layout: top-level keys name_model and project, followed by the tables
#   [train], [optim] and [data].
# - batch_size (128) x seq_length (8192) = 1,048,576, which lines up with the
#   inline "1M tokens bs" note, assuming batch_size counts sequences per
#   optimizer step -- TODO confirm against the config loader.
# - NOTE(review): total_steps = 1_000_000_000_000 is presumably a stand-in for
#   "no fixed end"; confirm how the "wsd-sqrt" schedule consumes it.
# - NOTE(review): project is "debug_10B_zero_band"; confirm a debug project
#   name is what this config is meant to ship with.
diff --git a/configs/10B/H100_simple.toml b/configs/10B/H100_simple.toml
new file mode 100644
index 0000000..dafa64b
--- /dev/null
+++ b/configs/10B/H100_simple.toml
@@ -0,0 +1,23 @@
+name_model = "10B"
+project = "debug_10B_zero_band"
+
+[train]
+micro_bs = 1
+ac_ckpt = true
+
+[optim]
+sched_type = "wsd-sqrt"
+batch_size = 128 #1M tokens bs
+warmup_steps = 1000
+total_steps = 1_000_000_000_000
+lr = 7.5e-5
+
+adam_betas1 = 0.9
+adam_betas2 = 0.95
+weight_decay = 0.1
+
+z_loss = true
+
+[data]
+seq_length = 8192
+num_workers = 4