
Commit 2db72cf

feat: log disk space usage info, warn if close to exhaustion
Signed-off-by: Ihar Hrachyshka <[email protected]>
1 parent e94f8ab commit 2db72cf

2 files changed (+91, -15 lines)

src/instructlab/training/model.py

Lines changed: 9 additions & 0 deletions
@@ -55,6 +55,7 @@ def __init__(
         self.noise_alpha = noise_alpha
         self.tokenizer = tokenizer
         self.distributed_framework = distributed_framework
+        self._last_checkpoint_size: int | None = None
         bnb_config = None
         if lora_config and lora_config.r > 0 and lora_quant_bits == 4:
             # Third Party
@@ -76,6 +77,14 @@ def __init__(
         if flash_enabled:
             self.base_model_args["attn_implementation"] = "flash_attention_2"

+    @property
+    def last_checkpoint_size(self) -> int | None:
+        return self._last_checkpoint_size
+
+    @last_checkpoint_size.setter
+    def last_checkpoint_size(self, value: int):
+        self._last_checkpoint_size = value
+
     def _post_model_init(self):
         """Common initialization steps that should happen after model initialization."""
         self.reconcile_tokenizer()
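
Note: the attribute added above is a simple cache. It stays None until a checkpoint directory has actually been measured, which is what lets the disk-space check in utils.py silently skip the very first save. A minimal sketch of that contract (the SizedModel stub below is hypothetical; only the property names follow the diff):

class SizedModel:
    """Hypothetical stand-in illustrating the last_checkpoint_size contract."""

    def __init__(self) -> None:
        self._last_checkpoint_size: int | None = None

    @property
    def last_checkpoint_size(self) -> int | None:
        return self._last_checkpoint_size

    @last_checkpoint_size.setter
    def last_checkpoint_size(self, value: int) -> None:
        self._last_checkpoint_size = value


m = SizedModel()
assert m.last_checkpoint_size is None  # nothing measured yet -> disk check is a no-op
m.last_checkpoint_size = 4 * 1024**3   # recorded once a checkpoint has been written
assert m.last_checkpoint_size == 4 * 1024**3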

src/instructlab/training/utils.py

Lines changed: 82 additions & 15 deletions
@@ -624,15 +624,29 @@ def get_caller(num_frames=1):
     return f"In {file_name}, line {line_number}"


-def log_rank_0(msg, include_caller=False, rank=None, to_print=False):
+def log_rank_0(
+    msg, include_caller=False, rank=None, to_print=False, level=logging.INFO
+) -> None:
     if rank is None:
         rank = get_rank() if is_initialized() else 0
-    if rank <= 0:
-        if include_caller:
-            msg = f"{get_caller(num_frames=2)}: {msg}"
-        if to_print:
-            print(msg)
-        else:
+    if rank > 0:
+        return
+
+    if include_caller:
+        msg = f"{get_caller(num_frames=2)}: {msg}"
+
+    if to_print:
+        print(msg)
+        return
+
+    match level:
+        case logging.WARNING:
+            logger.warning(msg)
+        case logging.ERROR:
+            logger.error(msg)
+        case logging.DEBUG:
+            logger.debug(msg)
+        case _:
             logger.info(msg)

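Note: with the new level parameter, rank-0 callers can route a message to the warning, error, or debug channel; anything else falls through to logger.info, and to_print=True still bypasses the logger entirely. A short usage sketch under those assumptions (the import path mirrors the file being edited; logger configuration is elided):

import logging

# Assumes the package layout implied by src/instructlab/training/utils.py.
from instructlab.training.utils import log_rank_0

log_rank_0("Saving model in huggingface format", to_print=True)      # printed, not logged
log_rank_0("Disk space info: free=1024.00 MB")                       # logger.info (default)
log_rank_0("Free disk space is running low", level=logging.WARNING)  # logger.warning
log_rank_0("Could not check disk space", level=logging.ERROR)        # logger.error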

@@ -673,6 +687,13 @@ def skip_precheck_loops():
     accelerator.get_state_dict = old_get_state


+def _get_checkpoint_dir(args, samples_seen) -> Path:
+    subdir = (
+        "last_epoch" if args.keep_last_checkpoint_only else f"samples_{samples_seen}"
+    )
+    return Path(args.output_dir) / "hf_format" / subdir
+
+
 def save_hf_format_accelerate(
     args,
     model,
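
Note: factoring the path logic into _get_checkpoint_dir lets save_checkpoint recompute the same directory later (see below) instead of duplicating the naming rule. A self-contained illustration of the two naming modes (the SimpleNamespace args object is a stand-in for the real training arguments):

from pathlib import Path
from types import SimpleNamespace


def _get_checkpoint_dir(args, samples_seen) -> Path:
    # Same rule as the helper added above.
    subdir = (
        "last_epoch" if args.keep_last_checkpoint_only else f"samples_{samples_seen}"
    )
    return Path(args.output_dir) / "hf_format" / subdir


args = SimpleNamespace(output_dir="/tmp/run", keep_last_checkpoint_only=False)
print(_get_checkpoint_dir(args, samples_seen=1000))  # /tmp/run/hf_format/samples_1000

args.keep_last_checkpoint_only = True
print(_get_checkpoint_dir(args, samples_seen=1000))  # /tmp/run/hf_format/last_epoch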
@@ -681,13 +702,11 @@
     samples_seen,
     is_lora=False,
 ):
-    # Build the subdirectory name
-    subdir = (
-        "last_epoch" if args.keep_last_checkpoint_only else f"samples_{samples_seen}"
-    )
+    # Build the final output directory path
+    final_output_dir = _get_checkpoint_dir(args, samples_seen)

     log_rank_0(
-        f"\033[93mSaving model in huggingface format at: {subdir}\033[0m",
+        f"\033[93mSaving model in huggingface format at: {final_output_dir}\033[0m",
         to_print=True,
     )
     start = time.time()
@@ -697,9 +716,6 @@
     else:
         convert_dolomite = True

-    # Build the final output directory path
-    final_output_dir = Path(args.output_dir) / "hf_format" / subdir
-
     if args.use_dolomite and convert_dolomite:
         tmpdir = TemporaryDirectory("w") # pylint: disable=consider-using-with
         output_dir = Path(tmpdir.name)
@@ -797,6 +813,48 @@ def set_random_seed(seed):
     torch.cuda.manual_seed_all(seed)


+def _get_checkpoint_dir_size(checkpoint_dir) -> int:
+    total = 0
+    for dirpath, _, filenames in os.walk(checkpoint_dir):
+        for f in filenames:
+            fp = os.path.join(dirpath, f)
+            if os.path.isfile(fp):
+                total += os.path.getsize(fp)
+    return total
+
+
+def check_disk_space_for_next_checkpoint(
+    model: Model, output_dir: Path, warn_steps_ahead: int = 3
+) -> None:
+    checkpoint_size = model.last_checkpoint_size
+    if checkpoint_size is None:
+        # No previous checkpoint size to estimate, do nothing.
+        return
+
+    def _mb_size(num_bytes):
+        return f"{num_bytes / 1024 / 1024:.2f} MB"
+
+    try:
+        stat = shutil.disk_usage(output_dir)
+        free_bytes = stat.free
+        needed_bytes = checkpoint_size * warn_steps_ahead
+
+        log_rank_0(
+            f"Disk space info: free={_mb_size(free_bytes)}, last_checkpoint_size={_mb_size(checkpoint_size)} (output_dir={output_dir})"
+        )
+        if free_bytes < needed_bytes:
+            log_rank_0(
+                f"Estimated free disk space ({_mb_size(free_bytes)}) is less than the estimated size of the next {warn_steps_ahead} checkpoints ({_mb_size(needed_bytes)}). "
+                "The next checkpoint(s) may fail due to insufficient disk space.",
+                level=logging.WARNING,
+            )
+    except Exception as e:
+        log_rank_0(
+            f"Could not check disk space after checkpoint: {e}",
+            level=logging.ERROR,
+        )
+
+
 def save_checkpoint(
     args,
     accelerator: Accelerator,
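
Note: the warning threshold is simply last_checkpoint_size * warn_steps_ahead compared against shutil.disk_usage(output_dir).free. With a 4 GiB previous checkpoint and the default warn_steps_ahead=3, the warning fires once free space drops below 12 GiB. A standalone sketch of that arithmetic (sizes and path are illustrative only):

import shutil

warn_steps_ahead = 3
last_checkpoint_size = 4 * 1024**3                      # 4 GiB, measured from the previous save
needed_bytes = last_checkpoint_size * warn_steps_ahead  # ~12 GiB of headroom expected

free_bytes = shutil.disk_usage(".").free                # any existing path works for the demo
if free_bytes < needed_bytes:
    print(
        f"warning: {free_bytes / 1024 / 1024:.2f} MB free, "
        f"next {warn_steps_ahead} checkpoints need ~{needed_bytes / 1024 / 1024:.2f} MB"
    )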
@@ -827,6 +885,15 @@ def save_checkpoint(
             samples_seen=samples_seen,
         )

+    # Track checkpoint size and warn if disk space is low
+    output_dir = Path(args.output_dir)
+    check_disk_space_for_next_checkpoint(model, output_dir, warn_steps_ahead=3)
+
+    if hf_format:
+        checkpoint_dir = _get_checkpoint_dir(args, samples_seen)
+        if checkpoint_dir.exists():
+            model.last_checkpoint_size = _get_checkpoint_dir_size(checkpoint_dir)
+

 def save_full_state(args, accelerator, is_lora: bool, epoch: int, samples_seen: int):
     """
