diff --git a/configs/10B/H100.toml b/configs/10B/H100.toml index d58e609..a36a0ca 100644 --- a/configs/10B/H100.toml +++ b/configs/10B/H100.toml @@ -5,6 +5,7 @@ wandb_resume = false [train] micro_bs = 1 ac_ckpt = true +torch_compile_mode = "max-autotune-no-cudagraphs" [optim] sched_type = "wsd-sqrt" diff --git a/configs/10B/H100_cooldown.toml b/configs/10B/H100_cooldown.toml index 9132b1e..be35241 100644 --- a/configs/10B/H100_cooldown.toml +++ b/configs/10B/H100_cooldown.toml @@ -5,6 +5,7 @@ wandb_resume = false [train] micro_bs = 1 ac_ckpt = true +torch_compile_mode = "max-autotune-no-cudagraphs" [optim] sched_type = "wsd-sqrt" diff --git a/configs/150M_short/H100.toml b/configs/150M_short/H100.toml index af7582e..9924389 100644 --- a/configs/150M_short/H100.toml +++ b/configs/150M_short/H100.toml @@ -5,6 +5,7 @@ type_model = "llama2" [train] micro_bs = 64 # change this base on the gpu reshard_after_forward = true +torch_compile_mode = "max-autotune-no-cudagraphs" [optim] batch_size = 512