Commit 0e26179

Merge remote-tracking branch 'upstream/main' into diff

2 parents: 4ff51f4 + 0bae6e0
File tree: 13 files changed, +233 / -164 lines

README.md (8 additions, 7 deletions)

@@ -11,25 +11,26 @@ For detailed documentation and usage information about each component, please re

 ## Core Components and Capabilities

-- **[Fault Tolerance](https://github.com/NVIDIA/nvidia-resiliency-ext/blob/main/docs/source/fault_tolerance/index.rst)**
+- **[Fault Tolerance](https://nvidia.github.io/nvidia-resiliency-ext/fault_tolerance/index.html)**
   - Detection of hung ranks.
   - Restarting training in-job, without the need to reallocate SLURM nodes.

-- **[In-Process Restarting](https://github.com/NVIDIA/nvidia-resiliency-ext/blob/main/docs/source/inprocess/index.rst)**
+- **[In-Process Restarting](https://nvidia.github.io/nvidia-resiliency-ext/inprocess/index.html)**
   - Detecting failures and enabling quick recovery.

-- **[Async Checkpointing](https://github.com/NVIDIA/nvidia-resiliency-ext/blob/main/docs/source/checkpointing/async/index.rst)**
+- **[Async Checkpointing](https://nvidia.github.io/nvidia-resiliency-ext/checkpointing/async/index.html)**
   - Providing an efficient framework for asynchronous checkpointing.

-- **[Local Checkpointing](https://github.com/NVIDIA/nvidia-resiliency-ext/blob/main/docs/source/checkpointing/local/index.rst)**
+- **[Local Checkpointing](https://nvidia.github.io/nvidia-resiliency-ext/checkpointing/local/index.html)**
   - Providing an efficient framework for local checkpointing.

-- **[Straggler Detection](https://github.com/NVIDIA/nvidia-resiliency-ext/blob/main/docs/source/straggler_det/index.rst)**
+- **[Straggler Detection](https://nvidia.github.io/nvidia-resiliency-ext/straggler_det/index.html)**
   - Monitoring GPU and CPU performance of ranks.
   - Identifying slower ranks that may impede overall training efficiency.

-- **[PyTorch Lightning Callbacks](https://github.com/NVIDIA/nvidia-resiliency-ext/blob/main/docs/source/fault_tolerance/integration/ptl.rst)**
-  - Facilitating seamless NVRx integration with PyTorch Lightning.
+- **Framework Integration**
+  - Facilitating seamless [fault tolerance](https://nvidia.github.io/nvidia-resiliency-ext/fault_tolerance/integration/ptl.html) and [straggler detection](https://nvidia.github.io/nvidia-resiliency-ext/straggler_det/usage_guide.html#integration-guide) integration with PyTorch Lightning based workloads.
+  - Providing integration with NVIDIA [NeMo](https://docs.nvidia.com/nemo-framework/user-guide/latest/resiliency.html) framework, a scalable and cloud-native generative AI framework built for researchers and developers working on Large Language Models, Multimodal, and Speech AI (e.g. Automatic Speech Recognition and Text-to-Speech).

 ## Installation

cupti_build.py (52 additions, 4 deletions)

@@ -15,7 +15,9 @@

 import glob
 import os
-
+import re
+import shutil
+import subprocess
 from pybind11.setup_helpers import Pybind11Extension, build_ext


@@ -43,6 +45,54 @@ def _skip_ext_build():
     return ans.lower() in ['1', 'on', 'yes', 'true']


+def get_cuda_path():
+    """
+    Determines the path to the CUDA installation.
+
+    Find the CUDA root directory under standard paths or using nvcc,
+    as it's typically done in build systems like CMake.
+
+    1. Check if $CUDA_PATH is set or /usr/local/cuda exists; return it if so.
+    2. If not, check if nvcc is in PATH. If yes, run "nvcc -v test.cu" and parse output for CUDA root.
+    3. If neither method works, raise FileNotFoundError.
+
+    Returns:
+        str: The path to the CUDA installation directory.
+
+    Raises:
+        FileNotFoundError: If the CUDA installation cannot be found.
+    """
+    cuda_path = os.environ.get("CUDA_PATH", "/usr/local/cuda")
+    if os.path.isdir(cuda_path):
+        return cuda_path
+
+    nvcc_path = shutil.which("nvcc")
+    if nvcc_path:
+        try:
+            # try to extract CUDA root from nvcc output
+            result = subprocess.run(
+                [nvcc_path, "-v", "test.cu"],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.STDOUT,
+                check=False,
+                universal_newlines=True,
+            )
+            # match "#$ TOP=..." in output
+            match = re.search(r'#\$ TOP=([^\r\n]*)', result.stdout)
+            if match and os.path.isdir(match.group(1)):
+                return match.group(1)
+            else:
+                # fallback: get directory where nvcc is located
+                return os.path.dirname(os.path.dirname(nvcc_path))
+        except Exception:
+            pass
+
+    raise FileNotFoundError(
+        "CUDA installation not found in /usr/local/cuda or $CUDA_PATH, "
+        "and could not determine CUDA path from nvcc"
+    )
+
+
 def build(setup_kwargs):

     if _skip_ext_build():
@@ -54,9 +104,7 @@ def build(setup_kwargs):
     include_dirs = None
     library_dirs = None

-    cuda_path = os.environ.get("CUDA_PATH", "/usr/local/cuda")
-    if not os.path.isdir(cuda_path):
-        raise FileNotFoundError("cuda installation not found in /usr/local/cuda or $CUDA_PATH")
+    cuda_path = get_cuda_path()

     cupti_h = "cupti.h"
     libcupti_so = "libcupti.so"
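
A quick way to sanity-check the new lookup logic is to confirm that the CUPTI header and library named in build() are discoverable under the returned root. This is a minimal sketch, not part of the commit; the import of get_cuda_path and the recursive glob patterns are illustrative assumptions, since CUPTI's location varies between CUDA layouts.

    # Minimal sketch (assumed usage): verify cupti.h / libcupti.so exist under get_cuda_path().
    import glob
    import os

    from cupti_build import get_cuda_path  # assuming the module above is importable as cupti_build

    cuda_root = get_cuda_path()
    headers = glob.glob(os.path.join(cuda_root, "**", "cupti.h"), recursive=True)
    libs = glob.glob(os.path.join(cuda_root, "**", "libcupti.so*"), recursive=True)
    print("cupti.h found at:", headers)
    print("libcupti.so found at:", libs)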

docs/source/checkpointing/async/usage_guide.rst (42 additions, 10 deletions)

@@ -7,19 +7,41 @@ which defines checkpoint routine, its args/kwargs and finalization steps when th
 :py:class:`nvidia_resiliency_ext.checkpointing.async_ckpt.torch_ckpt.TorchAsyncCheckpoint`
 is an instantiation of the core utilities to make `torch.save` run asynchronously.

+:py:class:`nvidia_resiliency_ext.checkpointing.async_ckpt.state_dict_saver.save_state_dict_async_plan` is an instantiation of the core utilities to make `torch.distributed.save_state_dict` run asynchronously.

-The implementation assumes all training ranks creates :py:class:`core.AsyncCallsQueue` and synchronize with :py:class:`core.AsyncCallsQueue.maybe_finalize_async_calls` by default.
+The implementation assumes all training ranks create :py:class:`~nvidia_resiliency_ext.checkpointing.async_ckpt.core.AsyncCallsQueue` and synchronize with :py:class:`~nvidia_resiliency_ext.checkpointing.async_ckpt.core.AsyncCallsQueue.maybe_finalize_async_calls` by default.


-Requirements
-------------
-:py:class:`nvidia_resiliency_ext.checkpointing.utils` includes a couple of routines used for :py:class:`nvidia_resiliency_ext.checkpointing.async_ckpt.core`
-:py:class:`nvidia_resiliency_ext.checkpointing.utils.wrap_for_async` disables garbage collection in a forked process to run user's checkpoint routine
-to prevent failures incurred by GC, which tries to deallocate CUDA tensors in a forked process.
-This routine requires the first argument of the passed user fn should be state dictionary containing tensors or objects for checkpoint
-
-The current implementation uses a forked process to run pre-staged tensors in host memory by pinned memcpy.
-So, the routine should include :py:class:`nvidia_resiliency_ext.checkpointing.utils.preload_tensors` to stage GPU tensors in a state dictionary to host memory before it's passed to `AsyncCallsQueue`
+Implementation Changes and Evolution
+------------------------------------
+* We have deprecated our initial implementation of async checkpointing, :py:class:`~nvidia_resiliency_ext.checkpointing.async_ckpt.core.TemporalAsyncCaller`, which used a forked process to run the checkpointing in the background.
+
+* :py:class:`~nvidia_resiliency_ext.checkpointing.async_ckpt.core.AsyncCallsQueue` is now initialized by default to use :py:class:`~nvidia_resiliency_ext.checkpointing.async_ckpt.core.PersistentAsyncCaller` instead of :py:class:`~nvidia_resiliency_ext.checkpointing.async_ckpt.core.TemporalAsyncCaller`.
+
+* :py:class:`~nvidia_resiliency_ext.checkpointing.async_ckpt.core.PersistentAsyncCaller` spawns a persistent process that runs in a separate CUDA context and optionally forks processes for intra-node parallelism.
+
+* :py:func:`~nvidia_resiliency_ext.checkpointing.utils.wrap_for_async` is no longer needed, because :py:class:`~nvidia_resiliency_ext.checkpointing.async_ckpt.core.PersistentAsyncCaller` can safely run garbage collection in the spawned process.
+
+* :py:class:`~nvidia_resiliency_ext.checkpointing.async_ckpt.core.PersistentAsyncCaller` runs :py:func:`~nvidia_resiliency_ext.checkpointing.async_ckpt.filesystem_async.FileSystemWriterAsync.preload_tensors` in the spawned process.
+  A new field, :py:attr:`~nvidia_resiliency_ext.checkpointing.async_ckpt.core.AsyncRequest.preload_fn`, has therefore been added to pass the preload function (preload_fn) to the spawned process.
+
+  * The preload_fn should be self-contained, with a proper list of arguments bound via :py:class:`functools.partial`.
+
+  * The preload_fn should be a function that takes a state dictionary and returns a state dictionary.
+
+* :py:class:`~nvidia_resiliency_ext.checkpointing.async_ckpt.core.PersistentAsyncCaller` receives GPU tensor IPC handles and pre-stages them to host memory through the preload_fn,
+  so references to GPU tensors should be dropped promptly inside `preload_fn` if possible.
+
+* Proper termination of the persistent process is required for graceful shutdown.
+
+  * Job schedulers (e.g. Slurm, torchrun) should clean up the persistent process and its child workers when the job step is terminated.
+
+* The following changes to the implementation of :py:class:`~nvidia_resiliency_ext.checkpointing.async_ckpt.core.PersistentAsyncCaller` will be made in the next release:
+
+  * The persistent process will be terminated when the main process is terminated.
+
+  * Optional child workers created by :py:class:`~nvidia_resiliency_ext.checkpointing.async_ckpt.filesystem_async.FileSystemWriterAsync` will be terminated when the persistent process is terminated.


 Synchronization of Asynchronous Checkpoint Requests
@@ -218,3 +240,13 @@ The following example demonstrates a complete workflow for saving and loading ch

     # Load checkpoint synchronously
     loaded_state_dict = load_checkpoint(checkpoint_path, state_dict.copy())
+
+
+Best Practices
+--------------
+* Use process binding to pin the checkpointing process to the NUMA node closest to its GPU. This is important for pre-staging tensors to host memory.
+
+  .. code-block:: bash
+
+     # Example for 8 GPUs on a 2-socket CPU node with SLURM
+     numactl --cpunodebind=$((SLURM_LOCALID / 4)) --membind=$((SLURM_LOCALID / 4)) python train.py
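
To make the preload_fn contract from the usage-guide changes above concrete, here is a minimal sketch that is not part of the commit. It relies only on the documented contract (the callable maps a state dictionary to a state dictionary and is bound with functools.partial); the helper name and the flat-dictionary assumption are illustrative, not the library's API.

    # Minimal sketch (assumed helper, not the library API): bind everything except the
    # state dict with functools.partial so the callable is self-contained, and drop GPU
    # references as soon as tensors are copied to host memory.
    from functools import partial


    def _preload_to_host(state_dict, non_blocking=True):
        # Flat state dict assumed for brevity; real checkpoints are usually nested.
        host_state_dict = {}
        for key, value in state_dict.items():
            if hasattr(value, "is_cuda") and value.is_cuda:
                host_state_dict[key] = value.to("cpu", non_blocking=non_blocking)
            else:
                host_state_dict[key] = value
        return host_state_dict


    # Self-contained callable taking only the state dictionary, suitable for passing
    # through the AsyncRequest preload_fn field described in the guide above.
    preload_fn = partial(_preload_to_host, non_blocking=True)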

docs/source/checkpointing/local/usage_guide.rst (7 additions, 2 deletions)

@@ -37,7 +37,8 @@ Requirements for `BasicTensorAwareStateDict`

 Restrictions
 ------------
-Currently under review - no documented restrictions at this time.
+- `AsyncCallsQueue` must be initialized with `persistent=False`, because some local checkpointing routines
+  are not pickleable. This restriction may be lifted in the future.

 Functionality Overview
 ----------------------
@@ -108,13 +109,17 @@ controlled by the `is_async` parameter in the `save(...)` method.
   performs a blocking save operation, ensuring all data is written before returning.
 - Asynchronous Save: When `is_async` is set to `True`, the `save(...)` method
   initiates a non-blocking save and returns an `AsyncRequest` object.
-  This class is fully compatible with the `nvidia_resiliency_ext.checkpointing.async_ckpt` module.
+  This class is compatible with the `nvidia_resiliency_ext.checkpointing.async_ckpt` module.

 The returned `AsyncRequest` can then be submitted to an `AsyncCallsQueue`,
 enabling advanced asynchronous processing.
 The usage of `AsyncRequest` with `AsyncCallsQueue` is demonstrated in the provided example,
 showcasing how to efficiently manage non-blocking saves within your workflow.

+.. note::
+   Per the Restrictions and the included example, `AsyncCallsQueue` must be initialized with
+   `persistent=False`. This is because some local checkpointing routines are not pickleable.
+
 Logging
 ~~~~~~~
 The :py:class:`LocalCheckpointManager` uses Python's logging module to generate output messages.
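
For reference, the non-persistent queue pattern described by the new restriction looks roughly like the following sketch. It is not part of the commit; the `save(...)` arguments and the schedule/finalize method names are assumptions to be checked against examples/checkpointing/local_ckpt.py and the API reference.

    # Minimal sketch (assumed usage): local checkpointing with a non-persistent queue.
    from nvidia_resiliency_ext.checkpointing.async_ckpt.core import AsyncCallsQueue

    async_queue = AsyncCallsQueue(persistent=False)  # persistent workers cannot pickle local-ckpt routines

    # `ckpt_manager` is a LocalCheckpointManager created as in the example script;
    # the argument names below are illustrative.
    async_request = ckpt_manager.save(state_dict, iteration, is_async=True)
    async_queue.schedule_async_request(async_request)

    # Later in the training loop, finalize any completed asynchronous saves.
    async_queue.maybe_finalize_async_calls(blocking=False)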

docs/source/index.rst (2 additions, 2 deletions)

@@ -1,5 +1,5 @@
-nvidia-resiliency-ext v0.4.0
-=============================
+nvidia-resiliency-ext
+=====================

 **nvidia-resiliency-ext** is a set of tools developed by NVIDIA to improve large-scale distributed training resiliency.

docs/source/release-notes.md (23 additions, 1 deletion)

@@ -2,14 +2,36 @@

 NVIDIA Resiliency Extension is a Python package for framework developers and users to implement fault-tolerant features. It improves effective training time by minimizing downtime due to failures and interruptions.

+## NVIDIA Resiliency Extension v0.4.1
+
+### Highlights
+
+This hotfix release includes important bug fixes, performance improvements, and minor updates to enhance stability.
+
+- Checkpointing
+  - [PR 104](https://github.com/NVIDIA/nvidia-resiliency-ext/pull/104), [PR 106](https://github.com/NVIDIA/nvidia-resiliency-ext/pull/106), [PR 108](https://github.com/NVIDIA/nvidia-resiliency-ext/pull/108), [PR 111](https://github.com/NVIDIA/nvidia-resiliency-ext/pull/111) and [PR 116](https://github.com/NVIDIA/nvidia-resiliency-ext/pull/116) switch the asynchronous checkpointing module from the temporal worker to the persistent worker, which uses `spawn` instead of `fork`.
+    - This release is an intermediate milestone toward deprecating the use of `fork` in favor of `spawn` for asynchronous checkpointing. The complete transition to `spawn` still has the following dependencies on `fork`, which will be eliminated in an upcoming release:
+      - Local checkpointing must continue to use `fork`-based asynchronous checkpointing, as clarified in the usage guide.
+      - File I/O operations that use multiprocessing can still trigger a `fork`.
+
+- In-process restart
+  - [PR 103](https://github.com/NVIDIA/nvidia-resiliency-ext/pull/103) fixes a case where extra CUDA contexts were created on local rank 0 after restart, consuming extra GPU memory on that rank.
+  - [PR 112](https://github.com/NVIDIA/nvidia-resiliency-ext/pull/112) fixes workload state leaks across the restart boundary. The fix addresses a case where objects created in the wrapped function could not be garbage collected after a restart, manifesting as a memory leak.
+
+### Known Issues & Limitations
+
+- In a future release, the persistent process will be terminated automatically when the main process terminates.
+  - Until this change is implemented, job schedulers must ensure proper termination of the persistent process and its child workers for a graceful shutdown.
+
+
 ## NVIDIA Resiliency Extension v0.4.0

 ### Highlights

 - Checkpointing
   - [PR 29](https://github.com/NVIDIA/nvidia-resiliency-ext/pull/29) - Support for storing checkpoints to cloud object stores
     - Leverage cloud storage provider's multithreaded SDK for rapid loading and saving checkpoints to object stores such as AWS S3, Azure Blob
-      Storage, Google Cloud Storage and more using NVIDIA Multi-storage Client.
+      Storage, Google Cloud Storage and more using NVIDIA Multi-storage Client
     - Provide scalable, reliable, cheaper, single source of truth across clouds/regions
     - Provide opt-out configuration when creating FileSystemWriterAsync class instance to allow users to passthrough to the filesystem
   - [PR 36](https://github.com/NVIDIA/nvidia-resiliency-ext/pull/36) - Critical bug fix to enable async checkpoint loading without errors

examples/checkpointing/local_ckpt.py (2 additions, 17 deletions)

@@ -41,16 +41,6 @@ def parse_args():
         help="If set, replication of local checkpoints is enabled"
         "Needs to be enabled on all ranks.",
     )
-    parser.add_argument(
-        '--no_persistent_queue',
-        action='store_false',
-        default=True,
-        dest='persistent_queue',
-        help=(
-            "Disables a persistent version of AsyncCallsQueue. "
-            "Effective only when --async_save is set."
-        ),
-    )
     parser.add_argument(
         '--replication_jump',
         default=4,
@@ -146,12 +136,6 @@ def load(args, ckpt_manager):

 def main():
     args = parse_args()
-    assert (
-        not args.persistent_queue or args.async_save
-    ), "--persistent_queue requires --async_save to be enabled."
-    assert (
-        not args.persistent_queue or not args.replication
-    ), "persistent_queue is currently incompatible with replication due to object pickling issues."
     logging.info(f'{args}')

     # Initialize the distributed backend
@@ -162,7 +146,8 @@ def main():

     # Instantiate checkpointing classess needed for local checkpointing
     ckpt_manager = create_checkpoint_manager(args)
-    async_queue = AsyncCallsQueue(persistent=args.persistent_queue) if args.async_save else None
+    # Persistent queue is incompatible with local checkpointing because some routines are not pickleable.
+    async_queue = AsyncCallsQueue(persistent=False) if args.async_save else None

     iteration = 123  # training iteration (used as training state id)

pyproject.toml (2 additions, 2 deletions)

@@ -4,7 +4,7 @@
 [tool.poetry]
 name = "nvidia-resiliency-ext"
 repository = "https://github.com/NVIDIA/nvidia-resiliency-ext"
-version = "0.4.0"
+version = "0.4.1"
 description = "NVIDIA Resiliency Package"
 authors = ["NVIDIA Corporation"]
 readme = "README.md"
@@ -20,7 +20,7 @@ packages = [
 ]

 exclude = [
-    "src/nvidia_resiliency_ext/straggler/cupti_src"
+    "src/nvidia_resiliency_ext/attribution/straggler/cupti_src"
 ]

 [tool.poetry.build]

src/nvidia_resiliency_ext/__init__.py (22 additions, 0 deletions)

@@ -0,0 +1,22 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from importlib.metadata import PackageNotFoundError, version
+
+try:
+    __version__ = version("nvidia-resiliency-ext")
+except PackageNotFoundError:
+    __version__ = "unknown"
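
With the new `__init__.py`, the installed package exposes its version at runtime. A quick usage sketch, assuming the package is installed under its distribution name nvidia-resiliency-ext:

    # Prints the installed distribution version ("0.4.1" for this release), or "unknown"
    # when the package metadata is unavailable (e.g. running from an uninstalled source tree).
    import nvidia_resiliency_ext

    print(nvidia_resiliency_ext.__version__)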
