Use max-autotune-no-cudagraphs for max perf #176

Closed · wants to merge 2 commits
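This pull request makes the torch.compile mode configurable and enables "max-autotune-no-cudagraphs" on H100: a new torch_compile_mode field is added to TrainConfig and passed through to torch.compile, and the 10B and 150M_short H100 configs opt in. In PyTorch, the "max-autotune-no-cudagraphs" mode runs Inductor's max-autotune kernel search (slower to compile, typically faster kernels) without capturing CUDA graphs.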
configs/10B/H100.toml (1 addition, 0 deletions)

@@ -5,6 +5,7 @@ wandb_resume = false
 [train]
 micro_bs = 1
 ac_ckpt = true
+torch_compile_mode = "max-autotune-no-cudagraphs"
 
 [optim]
 sched_type = "wsd-sqrt"
configs/10B/H100_cooldown.toml (1 addition, 0 deletions)

@@ -5,6 +5,7 @@ wandb_resume = false
 [train]
 micro_bs = 1
 ac_ckpt = true
+torch_compile_mode = "max-autotune-no-cudagraphs"
 
 [optim]
 sched_type = "wsd-sqrt"
configs/10B/H100_simple.toml (1 addition, 0 deletions)

@@ -4,6 +4,7 @@ project = "debug_10B_zero_band"
 [train]
 micro_bs = 1
 ac_ckpt = true
+torch_compile_mode = "max-autotune-no-cudagraphs"
 
 [optim]
 sched_type = "wsd-sqrt"
configs/150M_short/H100.toml (1 addition, 0 deletions)

@@ -5,6 +5,7 @@ type_model = "llama2"
 [train]
 micro_bs = 64 # change this base on the gpu
 reshard_after_forward = true
+torch_compile_mode = "max-autotune-no-cudagraphs"
 
 [optim]
 batch_size = 512
src/zeroband/train.py (2 additions, 1 deletion)

@@ -66,6 +66,7 @@ class MemoryProfilerConfig(BaseConfig):
 class TrainConfig(BaseConfig):
     micro_bs: int
     torch_compile: bool = True
+    torch_compile_mode: Literal["max-autotune-no-cudagraphs"] | None = None
     ac_ckpt: bool | int = False
     reshard_after_forward: bool = True # old shard grad op True mean full shard
 
@@ -246,7 +247,7 @@ def train(config: Config):
 
     if config.train.torch_compile:
         # we need to compile AFTER creating the CKPT manager, DON'T ASK ME WHY
-        model = torch.compile(model)
+        model = torch.compile(model, mode=config.train.torch_compile_mode)
         logger.debug("model compiled")
 
     if config.ckpt.resume is not None:
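For reference, a minimal self-contained sketch of the compile call this PR makes configurable; the toy model, layer sizes, and variable names are illustrative, not from the repo, and a CUDA-enabled PyTorch build is assumed:

import torch
import torch.nn as nn

# Toy stand-in for the real model; the layer sizes are arbitrary.
model = nn.Sequential(nn.Linear(1024, 4096), nn.GELU(), nn.Linear(4096, 1024)).cuda()

# mode=None (the new TrainConfig default) keeps torch.compile's default
# behavior, so torch.compile(model, mode=None) matches the old
# torch.compile(model) call. "max-autotune-no-cudagraphs" autotunes
# Inductor/Triton kernels without capturing CUDA graphs.
compiled = torch.compile(model, mode="max-autotune-no-cudagraphs")

x = torch.randn(8, 1024, device="cuda")
out = compiled(x)  # the first call triggers compilation and autotuning

Because the field is typed Literal["max-autotune-no-cudagraphs"] | None, any other string is rejected at config-validation time (assuming BaseConfig is a pydantic model), and omitting it reproduces the previous behavior.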