
Commit aace707

Author: amaurya (committed)

Update DataStates as decoupled checkpoint engine

2 parents 7d9a2f2 + e2cf199 · commit aace707

13 files changed: +198 -4 lines changed


deepspeed/datastates/README.md

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
# DataStates-LLM checkpointing engine

This feature is not enabled by default. To enable it, install the [DataStates-LLM checkpointing library](https://github.com/DataStates/datastates-llm/) and set the following options in ds_config.json. A detailed tutorial is available [here](../../docs/_tutorials/datastates-async-checkpointing.md).

```
{
    ... other deepspeed config options,
    "datastates_ckpt": {
        "host_cache_size": 16
    }
}
```
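For illustration, the same options can also be passed programmatically as a config dict. The sketch below assumes the standard `deepspeed.initialize` entry point; the toy model and the other config keys are placeholders and the snippet is not part of this commit:

```python
import torch
import deepspeed

# Placeholder model; the only DataStates-specific piece is the "datastates_ckpt" block.
model = torch.nn.Linear(1024, 1024)

ds_config = {
    "train_micro_batch_size_per_gpu": 1,
    "optimizer": {"type": "Adam", "params": {"lr": 1e-4}},
    "datastates_ckpt": {
        "host_cache_size": 16  # pinned host memory reserved for async flushing, in GB
    },
}

# Checkpoints saved via engine.save_checkpoint(...) are then routed through DataStates.
engine, optimizer, _, _ = deepspeed.initialize(model=model,
                                               model_parameters=model.parameters(),
                                               config=ds_config)
```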

deepspeed/datastates/__init__.py

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
```python
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# Apache-2.0 License Copyright (c) UChicago Argonne LLC, operator of Argonne National Laboratory.

# DeepSpeed Team
```

deepspeed/datastates/config.py

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
```python
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# Apache-2.0 License Copyright (c) UChicago Argonne LLC, operator of Argonne National Laboratory.

# DeepSpeed Team

from deepspeed.runtime.config_utils import DeepSpeedConfigObject
import copy

DATASTATES_CHECKPOINTING = "datastates_ckpt"
DATASTATES_CHECKPOINTING_ENABLED = False


class DeepSpeedDataStatesConfig(DeepSpeedConfigObject):

    def __init__(self, param_dict):
        super(DeepSpeedDataStatesConfig, self).__init__()

        self.enabled = param_dict.get(DATASTATES_CHECKPOINTING, DATASTATES_CHECKPOINTING_ENABLED) is not False
        self.config = copy.deepcopy(param_dict.get(DATASTATES_CHECKPOINTING, None))
```
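As a quick illustration of how this config object behaves (a minimal sketch assuming DeepSpeed with this new module is importable; the snippet itself is not part of the commit):

```python
from deepspeed.datastates.config import DeepSpeedDataStatesConfig

# A "datastates_ckpt" block enables the feature and is kept verbatim in .config.
cfg = DeepSpeedDataStatesConfig({"datastates_ckpt": {"host_cache_size": 16}})
print(cfg.enabled)  # True
print(cfg.config)   # {'host_cache_size': 16}

# Absent or explicitly disabled, the feature stays off and the default engine is used.
print(DeepSpeedDataStatesConfig({}).enabled)                          # False
print(DeepSpeedDataStatesConfig({"datastates_ckpt": False}).enabled)  # False
```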

deepspeed/runtime/checkpoint_engine/README.md

Lines changed: 13 additions & 0 deletions
@@ -35,3 +35,16 @@ class CheckpointEngine(object):
### Asynchronous Lazy Checkpointing using DataStates-LLM

DataStates-LLM is an asynchronous checkpointing approach optimized for LLM pre-training; it can be obtained at https://github.com/DataStates/datastates-llm. A detailed tutorial is available [here](../../../docs/_tutorials/datastates-async-checkpointing.md). To enable datastates-llm checkpointing, specify `host_cache_size` (in gigabytes), which reserves pinned host memory for asynchronous checkpoint flushing, with the following lines in the config.json supplied at launch:

```
{
    ... other deepspeed config options,
    "datastates_ckpt": {
        "host_cache_size": 16
    }
}
```
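To clarify what reserving pinned host memory buys here, the rough sketch below stages a GPU tensor into a pinned buffer with a non-blocking copy. It is illustrative only and is not the DataStates allocation code; the buffer size is a small stand-in rather than the configured number of gigabytes:

```python
import torch

if torch.cuda.is_available():
    # Pinned (page-locked) host memory permits asynchronous device-to-host copies,
    # which lets checkpoint data be staged off the GPU and flushed in the background.
    cache_bytes = 64 * (1 << 20)  # 64 MiB stand-in; "host_cache_size" is whole GiB
    pinned_cache = torch.empty(cache_bytes, dtype=torch.uint8, pin_memory=True)

    gpu_tensor = torch.randn(1024, 1024, device="cuda")
    num_bytes = gpu_tensor.numel() * gpu_tensor.element_size()
    staging = pinned_cache[:num_bytes].view(torch.float32)
    staging.copy_(gpu_tensor.flatten(), non_blocking=True)  # can overlap with compute
    torch.cuda.synchronize()  # ensure the copy finished before the staged data is used
```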

deepspeed/runtime/checkpoint_engine/checkpoint_engine.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -58,3 +58,6 @@ def get_commit_info(self):

     def cleanup(self):
         pass
+
+    def preserves_storage_sharing(self):
+        return True
```
deepspeed/runtime/checkpoint_engine/datastates_checkpoint_engine.py

Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
```python
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# Apache-2.0 License Copyright (c) UChicago Argonne LLC, operator of Argonne National Laboratory.

# DeepSpeed Team

from deepspeed.runtime.checkpoint_engine.checkpoint_engine import \
    CheckpointEngine, CheckpointCommitInfo
from datastates import CheckpointEngine as DataStatesEngine

ENGINE_NAME = "DataStatesCheckpointEngine"


class DataStatesCheckpointEngine(CheckpointEngine):

    def __init__(self, deepspeed_config, rank):
        super().__init__(deepspeed_config)
        self.commit_info = None
        self.ckpt_engine = DataStatesEngine(deepspeed_config, rank)

    def __del__(self):
        self.cleanup()

    def create(self, info: CheckpointCommitInfo):
        self.commit_info = info
        return None

    def save(self, state_dict, path: str):
        return self.ckpt_engine.save(state_dict, path)

    def load(self, path: str, map_location=None):
        return self.ckpt_engine.load(path, map_location)

    def commit(self, info: CheckpointCommitInfo):
        assert info == self.commit_info
        self.ckpt_engine.wait()
        return self.ckpt_engine.commit(info.tag)

    def cleanup(self):
        self.commit(self.commit_info)
        self.ckpt_engine.wait(True)
        del self.ckpt_engine

    def is_decoupled(self):
        return True

    def preserves_storage_sharing(self):
        return False
```
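To make the decoupled flow concrete, here is a minimal, self-contained toy engine that only illustrates the idea behind the interface above: save() returns quickly while the flush happens in the background, and commit() waits for it. The class is invented for illustration and is not how the DataStates library is implemented:

```python
import threading
import torch


class ToyAsyncCheckpointEngine:
    """Illustrative stand-in for a decoupled (asynchronous) checkpoint engine."""

    def __init__(self):
        self._pending = []

    def save(self, state_dict, path):
        # Snapshot tensors on the caller's thread, then flush them in the background.
        snapshot = {k: v.detach().clone() if torch.is_tensor(v) else v for k, v in state_dict.items()}
        t = threading.Thread(target=torch.save, args=(snapshot, path))
        t.start()
        self._pending.append(t)

    def wait(self):
        for t in self._pending:
            t.join()
        self._pending.clear()

    def commit(self, tag):
        # The checkpoint only counts as complete once all background writes are done.
        self.wait()
        return True


engine = ToyAsyncCheckpointEngine()
engine.save({"weight": torch.randn(4, 4)}, "/tmp/toy_ckpt.pt")
# ... training could continue here while the flush proceeds ...
engine.commit(tag="step_100")
```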

deepspeed/runtime/checkpoint_engine/utils.py

Lines changed: 11 additions & 0 deletions
```diff
@@ -6,6 +6,7 @@
 from deepspeed.runtime.model_checkpointing.constants import *
 from deepspeed.runtime.model_checkpointing.utils import create_data_parallel_writer_config
 from deepspeed.utils import logger
+from deepspeed import comm as dist

 from .decoupled_checkpoint_engine import DecoupledCheckpointEngine
 from .fast_checkpoint_engine import FastCheckpointEngine
@@ -35,4 +36,14 @@ def create_checkpoint_engine(config_params, groups, zero_stage, has_moe_layers,
         else:
             return NebulaCheckpointEngine(config_params=config_params.nebula_config)

+    if config_params.datastates_config.enabled:
+        try:
+            from deepspeed.runtime.checkpoint_engine.datastates_checkpoint_engine import DataStatesCheckpointEngine
+            return DataStatesCheckpointEngine(deepspeed_config=config_params, rank=dist.get_rank())
+        except ImportError as err:
+            logger.error(
+                f"No datastates engine found! Install from https://github.com/DataStates/datastates-llm. Will fall back to torch.save. Details: {err}"
+            )
+            return TorchCheckpointEngine(config_params)
+
     return TorchCheckpointEngine(config_params)
```

deepspeed/runtime/config.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -52,6 +52,7 @@
 from ..profiling.config import DeepSpeedFlopsProfilerConfig
 from ..autotuning.config import DeepSpeedAutotuningConfig
 from ..nebula.config import DeepSpeedNebulaConfig
+from ..datastates.config import DeepSpeedDataStatesConfig

 from ..compression.config import get_compression_config, get_quantize_enabled
 from ..compression.constants import *
@@ -859,6 +860,7 @@ def _initialize_params(self, param_dict):
         self.dataloader_drop_last = get_dataloader_drop_last(param_dict)

         self.nebula_config = DeepSpeedNebulaConfig(param_dict)
+        self.datastates_config = DeepSpeedDataStatesConfig(param_dict)
         self.checkpoint_config = get_checkpoint_config(param_dict)

         self.weight_quantization_config = WeightQuantConfig(
```

deepspeed/runtime/engine.py

Lines changed: 10 additions & 3 deletions
```diff
@@ -2409,6 +2409,7 @@ def _take_model_step(self, lr_kwargs, block_eigenvalue={}):
                 # https://nvidia.github.io/apex/advanced.html#gradient-clipping
                 master_params = amp.master_params(self.optimizer)
                 clip_grad_norm_(parameters=master_params, max_norm=self.gradient_clipping(), mpu=self.mpu)
+
         self.optimizer.step()

         if hasattr(self.optimizer, '_global_grad_norm'):
@@ -3594,7 +3595,9 @@ def _save_moe_checkpoint(self, save_dir, tag, client_state={}, exclude_frozen_pa
                 moe_save_path = self._get_expert_ckpt_name(save_dir, moe_layer_id, global_expert_id, tag, self.mpu)
                 if self.random_ltd_enabled():
                     expert_state_dict = remove_random_ltd_state_dict(expert_state_dict)
-                saveable_state_dict = clone_tensors_for_torch_save(expert_state_dict)
+                saveable_state_dict = expert_state_dict
+                if self.checkpoint_engine.preserves_storage_sharing():
+                    saveable_state_dict = clone_tensors_for_torch_save(expert_state_dict)
                 self.checkpoint_engine.save(saveable_state_dict, moe_save_path)
             moe_layer_id += 1

@@ -3616,7 +3619,9 @@ def _save_moe_checkpoint(self, save_dir, tag, client_state={}, exclude_frozen_pa
         }
         # TODO: why use BufferedWriter not the path
         file_path = self._get_optimizer_ckpt_name(save_dir, tag, expp_rank)
-        saveable_state_dict = clone_tensors_for_torch_save(optimizer_state)
+        saveable_state_dict = optimizer_state
+        if self.checkpoint_engine.preserves_storage_sharing():
+            saveable_state_dict = clone_tensors_for_torch_save(optimizer_state)
         self.checkpoint_engine.save(saveable_state_dict, file_path)

         # Load flow uses below saved file for model parameters, RNG and more
@@ -3656,7 +3661,9 @@ def _save_moe_checkpoint(self, save_dir, tag, client_state={}, exclude_frozen_pa
             }
             state.update(client_state)
             logger.info(f'Saving model checkpoint: {save_path}')
-            saveable_state_dict = clone_tensors_for_torch_save(state)
+            saveable_state_dict = state
+            if self.checkpoint_engine.preserves_storage_sharing():
+                saveable_state_dict = clone_tensors_for_torch_save(state)
             self.checkpoint_engine.save(saveable_state_dict, save_path)

     def _create_checkpoint_file(self, save_dir, tag, zero_checkpoint):
```
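Some context on why these hunks gate the cloning step: with an engine that preserves storage sharing the way plain `torch.save` does, a tensor that is a view into a large shared buffer would drag the whole underlying storage into the checkpoint, so the state dict is cloned first; an engine such as DataStates copies tensors itself and reports `preserves_storage_sharing()` as False, so the extra clone and its host-memory cost are skipped. A rough, self-contained sketch of the pattern, where the helper is a simplified stand-in and not DeepSpeed's actual `clone_tensors_for_torch_save`:

```python
import torch


def clone_tensors_for_save(obj):
    # Simplified stand-in: detach and clone tensors recursively so the saved copies
    # no longer share storage with large flattened training buffers.
    if torch.is_tensor(obj):
        return obj.detach().clone()
    if isinstance(obj, dict):
        return {k: clone_tensors_for_save(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return type(obj)(clone_tensors_for_save(v) for v in obj)
    return obj


def save_with_engine(engine, state_dict, path):
    # Mirrors the pattern used in the hunks above: clone only when the engine
    # would otherwise serialize shared storages wholesale.
    saveable = clone_tensors_for_save(state_dict) if engine.preserves_storage_sharing() else state_dict
    engine.save(saveable, path)
```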

deepspeed/runtime/pipe/module.py

Lines changed: 4 additions & 1 deletion
```diff
@@ -621,6 +621,7 @@ def save_state_dict(self, save_dir, checkpoint_engine, exclude_frozen_params=Fal
         layer_list = self.forward_funcs[start:end]

         checkpoint_engine.makedirs(save_dir, exist_ok=True)
+        should_clone = checkpoint_engine.preserves_storage_sharing()
         for idx, layer in enumerate(layer_list):
             model_ckpt_path = self.ckpt_layer_path(save_dir, start + idx)
             if not hasattr(layer, 'state_dict'):
@@ -630,7 +631,9 @@ def save_state_dict(self, save_dir, checkpoint_engine, exclude_frozen_params=Fal
             if exclude_frozen_params:
                 for n in self._get_frozen_parameter_names(layer):
                     del orig_state_dict[n]
-            final_state_dict = clone_tensors_for_torch_save(orig_state_dict)
+            final_state_dict = orig_state_dict
+            if should_clone:
+                final_state_dict = clone_tensors_for_torch_save(orig_state_dict)
             checkpoint_engine.save(state_dict=final_state_dict, path=model_ckpt_path)

     def load_state_dir(self, load_dir, checkpoint_engine, strict=True):
```
