Ability to force a memory fraction to be unused in OOMptimizer
Signed-off-by: Piotr Żelasko <[email protected]>
pzelasko committed Jul 19, 2024
1 parent 9e632e4 commit 4b009bd
Showing 1 changed file with 9 additions and 1 deletion.
10 changes: 9 additions & 1 deletion scripts/speech_recognition/oomptimizer.py
@@ -221,6 +221,14 @@ def type_cast_value(self, ctx, value):
     "By default we force 5% memory to be unused to account for non-training-loop related CUDA memory usage"
     "in actual training scripts.",
 )
+@click.option(
+    "-d",
+    "--device",
+    default="cuda:0",
+    help="Device string to be passed to torch.device; due to MEMORY_FRACTION option, "
+    "it must specify the device index (e.g. cuda:0). "
+    "You can also leave the default index and select a specific GPU using env var CUDA_VISIBLE_DEVICES=<idx>",
+)
 def oomptimizer(
     pretrained_name: str | None,
     module_name: str | None,
@@ -231,6 +239,7 @@ def oomptimizer(
     start_batch_size: int,
     labels_per_second: int,
     memory_fraction: float,
+    device: str,
 ):
     """
     OOMptimizer finds the optimal batch sizes for training your model with bucketing dataloading.
@@ -258,7 +267,6 @@ def oomptimizer(
         )
         sys.exit(1)
     logging.setLevel(logging.CRITICAL)
-    device = "cuda"
     torch.cuda.set_per_process_memory_fraction(memory_fraction, device)
 
     trainer = pl.Trainer(barebones=True)
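For context, a minimal sketch (not part of this commit) of what the change enables: torch.cuda.set_per_process_memory_fraction caps the calling process's allocations on one specific device, which is why the new --device option must carry an explicit index such as cuda:0. The fraction value and GPU selection below are illustrative assumptions, not values taken from the script.

# Illustrative sketch only (not from the commit): caps this process's CUDA
# allocations on a single device, mirroring how oomptimizer.py now applies the
# user-supplied device and memory fraction. Values are assumptions for demo.
import os
import torch

# Optionally pick a physical GPU before CUDA is initialized; inside this
# process it then appears as index 0, so "cuda:0" still refers to it.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")

device = torch.device("cuda:0")  # must include an index for the memory cap
memory_fraction = 0.95           # leave ~5% headroom for non-training CUDA usage

# All allocations made by this process on `device` are limited to the given
# fraction of total GPU memory; exceeding it raises a CUDA out-of-memory error.
torch.cuda.set_per_process_memory_fraction(memory_fraction, device)

x = torch.zeros(1024, 1024, device=device)  # allocations now respect the cap

On the command line, the same selection would presumably look like CUDA_VISIBLE_DEVICES=1 python scripts/speech_recognition/oomptimizer.py --device cuda:0 (other required options omitted here).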
