diff --git a/configs/10B/H100.toml b/configs/10B/H100.toml
index d58e609..a36a0ca 100644
--- a/configs/10B/H100.toml
+++ b/configs/10B/H100.toml
@@ -5,6 +5,7 @@ wandb_resume = false
 [train]
 micro_bs = 1
 ac_ckpt = true
+torch_compile_mode = "max-autotune-no-cudagraphs" # NOTE(review): presumably forwarded to torch.compile(mode=...), i.e. max-autotune without CUDA graphs; confirm against the config schema
 
 [optim]
 sched_type = "wsd-sqrt"
diff --git a/configs/10B/H100_cooldown.toml b/configs/10B/H100_cooldown.toml
index 9132b1e..be35241 100644
--- a/configs/10B/H100_cooldown.toml
+++ b/configs/10B/H100_cooldown.toml
@@ -5,6 +5,7 @@ wandb_resume = false
 [train]
 micro_bs = 1
 ac_ckpt = true
+torch_compile_mode = "max-autotune-no-cudagraphs" # NOTE(review): presumably forwarded to torch.compile(mode=...), i.e. max-autotune without CUDA graphs; confirm against the config schema
 
 [optim]
 sched_type = "wsd-sqrt"
diff --git a/configs/150M_short/H100.toml b/configs/150M_short/H100.toml
index af7582e..9924389 100644
--- a/configs/150M_short/H100.toml
+++ b/configs/150M_short/H100.toml
@@ -5,6 +5,7 @@ type_model = "llama2"
 [train]
 micro_bs = 64 # change this base on the gpu
 reshard_after_forward = true
+torch_compile_mode = "max-autotune-no-cudagraphs" # NOTE(review): presumably forwarded to torch.compile(mode=...), i.e. max-autotune without CUDA graphs; confirm against the config schema
 
 [optim]
 batch_size = 512