Commit 0e26179

Merge remote-tracking branch 'upstream/main' into diff

2 parents: 4ff51f4 + 0bae6e0
File tree: 13 files changed, +233 / -164 lines

README.md (8 additions, 7 deletions)

@@ -11,25 +11,26 @@ For detailed documentation and usage information about each component, please re

 ## Core Components and Capabilities

-- **[Fault Tolerance](https://github.com/NVIDIA/nvidia-resiliency-ext/blob/main/docs/source/fault_tolerance/index.rst)**
+- **[Fault Tolerance](https://nvidia.github.io/nvidia-resiliency-ext/fault_tolerance/index.html)**
   - Detection of hung ranks.
   - Restarting training in-job, without the need to reallocate SLURM nodes.

-- **[In-Process Restarting](https://github.com/NVIDIA/nvidia-resiliency-ext/blob/main/docs/source/inprocess/index.rst)**
+- **[In-Process Restarting](https://nvidia.github.io/nvidia-resiliency-ext/inprocess/index.html)**
   - Detecting failures and enabling quick recovery.

-- **[Async Checkpointing](https://github.com/NVIDIA/nvidia-resiliency-ext/blob/main/docs/source/checkpointing/async/index.rst)**
+- **[Async Checkpointing](https://nvidia.github.io/nvidia-resiliency-ext/checkpointing/async/index.html)**
   - Providing an efficient framework for asynchronous checkpointing.

-- **[Local Checkpointing](https://github.com/NVIDIA/nvidia-resiliency-ext/blob/main/docs/source/checkpointing/local/index.rst)**
+- **[Local Checkpointing](https://nvidia.github.io/nvidia-resiliency-ext/checkpointing/local/index.html)**
   - Providing an efficient framework for local checkpointing.

-- **[Straggler Detection](https://github.com/NVIDIA/nvidia-resiliency-ext/blob/main/docs/source/straggler_det/index.rst)**
+- **[Straggler Detection](https://nvidia.github.io/nvidia-resiliency-ext/straggler_det/index.html)**
   - Monitoring GPU and CPU performance of ranks.
   - Identifying slower ranks that may impede overall training efficiency.

-- **[PyTorch Lightning Callbacks](https://github.com/NVIDIA/nvidia-resiliency-ext/blob/main/docs/source/fault_tolerance/integration/ptl.rst)**
-  - Facilitating seamless NVRx integration with PyTorch Lightning.
+- **Framework Integration**
+  - Facilitating seamless [fault tolerance](https://nvidia.github.io/nvidia-resiliency-ext/fault_tolerance/integration/ptl.html) and [straggler detection](https://nvidia.github.io/nvidia-resiliency-ext/straggler_det/usage_guide.html#integration-guide) integration with PyTorch Lightning based workloads.
+  - Providing integration with NVIDIA [NeMo](https://docs.nvidia.com/nemo-framework/user-guide/latest/resiliency.html) framework, a scalable and cloud-native generative AI framework built for researchers and developers working on Large Language Models, Multimodal, and Speech AI (e.g. Automatic Speech Recognition and Text-to-Speech).

 ## Installation

cupti_build.py (52 additions, 4 deletions)

@@ -15,7 +15,9 @@

 import glob
 import os
-
+import re
+import shutil
+import subprocess
 from pybind11.setup_helpers import Pybind11Extension, build_ext


@@ -43,6 +45,54 @@ def _skip_ext_build():
     return ans.lower() in ['1', 'on', 'yes', 'true']


+def get_cuda_path():
+    """
+    Determines the path to the CUDA installation.
+
+    Find the CUDA root directory under standard paths or using nvcc,
+    as it's typically done in build systems like CMake.
+
+    1. Check if $CUDA_PATH is set or /usr/local/cuda exists; return it if so.
+    2. If not, check if nvcc is in PATH. If yes, run "nvcc -v test.cu" and parse output for CUDA root.
+    3. If neither method works, raise FileNotFoundError.
+
+    Returns:
+        str: The path to the CUDA installation directory.
+
+    Raises:
+        FileNotFoundError: If the CUDA installation cannot be found.
+    """
+    cuda_path = os.environ.get("CUDA_PATH", "/usr/local/cuda")
+    if os.path.isdir(cuda_path):
+        return cuda_path
+
+    nvcc_path = shutil.which("nvcc")
+    if nvcc_path:
+        try:
+            # try to extract CUDA root from nvcc output
+            result = subprocess.run(
+                [nvcc_path, "-v", "test.cu"],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.STDOUT,
+                check=False,
+                universal_newlines=True,
+            )
+            # match "#$ TOP=..." in output
+            match = re.search(r'#\$ TOP=([^\r\n]*)', result.stdout)
+            if match and os.path.isdir(match.group(1)):
+                return match.group(1)
+            else:
+                # fallback: get directory where nvcc is located
+                return os.path.dirname(os.path.dirname(nvcc_path))
+        except Exception:
+            pass
+
+    raise FileNotFoundError(
+        "CUDA installation not found in /usr/local/cuda or $CUDA_PATH, "
+        "and could not determine CUDA path from nvcc"
+    )
+
+
 def build(setup_kwargs):

     if _skip_ext_build():
@@ -54,9 +104,7 @@ def build(setup_kwargs):
     include_dirs = None
     library_dirs = None

-    cuda_path = os.environ.get("CUDA_PATH", "/usr/local/cuda")
-    if not os.path.isdir(cuda_path):
-        raise FileNotFoundError("cuda installation not found in /usr/local/cuda or $CUDA_PATH")
+    cuda_path = get_cuda_path()

     cupti_h = "cupti.h"
     libcupti_so = "libcupti.so"
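
A quick way to sanity-check the new lookup logic is to confirm that the CUPTI header and library named in build() are discoverable under the returned root. This is a minimal sketch, not part of the commit; the import of get_cuda_path and the recursive glob patterns are illustrative assumptions, since CUPTI's location varies between CUDA layouts.

    # Minimal sketch (assumed usage): verify cupti.h / libcupti.so exist under get_cuda_path().
    import glob
    import os

    from cupti_build import get_cuda_path  # assuming the module above is importable as cupti_build

    cuda_root = get_cuda_path()
    headers = glob.glob(os.path.join(cuda_root, "**", "cupti.h"), recursive=True)
    libs = glob.glob(os.path.join(cuda_root, "**", "libcupti.so*"), recursive=True)
    print("cupti.h found at:", headers)
    print("libcupti.so found at:", libs)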

docs/source/checkpointing/async/usage_guide.rst (42 additions, 10 deletions)

@@ -7,19 +7,41 @@ which defines checkpoint routine, its args/kwargs and finalization steps when th
 :py:class:`nvidia_resiliency_ext.checkpointing.async_ckpt.torch_ckpt.TorchAsyncCheckpoint`
 is an instantiation of the core utilities to make `torch.save` run asynchronously.

+:py:class:`nvidia_resiliency_ext.checkpointing.async_ckpt.state_dict_saver.save_state_dict_async_plan` is an instantiation of the core utilities to make `torch.distributed.save_state_dict` run asynchronously.

-The implementation assumes all training ranks creates :py:class:`core.AsyncCallsQueue` and synchronize with :py:class:`core.AsyncCallsQueue.maybe_finalize_async_calls` by default.
+The implementation assumes all training ranks create :py:class:`~nvidia_resiliency_ext.checkpointing.async_ckpt.core.AsyncCallsQueue` and synchronize with :py:class:`~nvidia_resiliency_ext.checkpointing.async_ckpt.core.AsyncCallsQueue.maybe_finalize_async_calls` by default.


-Requirements
-------------
-:py:class:`nvidia_resiliency_ext.checkpointing.utils` includes a couple of routines used for :py:class:`nvidia_resiliency_ext.checkpointing.async_ckpt.core`
-:py:class:`nvidia_resiliency_ext.checkpointing.utils.wrap_for_async` disables garbage collection in a forked process to run user's checkpoint routine
-to prevent failures incurred by GC, which tries to deallocate CUDA tensors in a forked process.
-This routine requires the first argument of the passed user fn should be state dictionary containing tensors or objects for checkpoint
-
-The current implementation uses a forked process to run pre-staged tensors in host memory by pinned memcpy.
-So, the routine should include :py:class:`nvidia_resiliency_ext.checkpointing.utils.preload_tensors` to stage GPU tensors in a state dictionary to host memory before it's passed to `AsyncCallsQueue`
+Implementation Changes and Evolution
+------------------------------------
+* We have deprecated our initial implementation of async checkpointing, :py:class:`~nvidia_resiliency_ext.checkpointing.async_ckpt.core.TemporalAsyncCaller`, which used a forked process to run the checkpointing in the background.
+
+* :py:class:`~nvidia_resiliency_ext.checkpointing.async_ckpt.core.AsyncCallsQueue` is now initialized by default to use :py:class:`~nvidia_resiliency_ext.checkpointing.async_ckpt.core.PersistentAsyncCaller` instead of :py:class:`~nvidia_resiliency_ext.checkpointing.async_ckpt.core.TemporalAsyncCaller`.
+
+* :py:class:`~nvidia_resiliency_ext.checkpointing.async_ckpt.core.PersistentAsyncCaller` spawns a persistent process that runs in a separate CUDA context and optionally forks processes for intra-node parallelism.
+
+* :py:func:`~nvidia_resiliency_ext.checkpointing.utils.wrap_for_async` is no longer needed, because :py:class:`~nvidia_resiliency_ext.checkpointing.async_ckpt.core.PersistentAsyncCaller` can safely run garbage collection in the spawned process.
+
+* :py:class:`~nvidia_resiliency_ext.checkpointing.async_ckpt.core.PersistentAsyncCaller` runs :py:func:`~nvidia_resiliency_ext.checkpointing.async_ckpt.filesystem_async.FileSystemWriterAsync.preload_tensors` in the spawned process.
+  A new field, :py:attr:`~nvidia_resiliency_ext.checkpointing.async_ckpt.core.AsyncRequest.preload_fn`, has therefore been added to pass the preload function (preload_fn) to the spawned process.
+
+  * The preload_fn should be self-contained, with a proper list of arguments bound via :py:class:`functools.partial`.
+
+  * The preload_fn should be a function that takes a state dictionary and returns a state dictionary.
+
+* :py:class:`~nvidia_resiliency_ext.checkpointing.async_ckpt.core.PersistentAsyncCaller` receives GPU tensor IPC handles and pre-stages them to host memory through the preload_fn,
+  so references to GPU tensors should be dropped promptly inside `preload_fn` if possible.
+
+* Proper termination of the persistent process is required for graceful shutdown.
+
+  * Job schedulers (e.g. Slurm, torchrun) should clean up the persistent process and its child workers when the job step is terminated.
+
+* The following changes to the implementation of :py:class:`~nvidia_resiliency_ext.checkpointing.async_ckpt.core.PersistentAsyncCaller` will be made in the next release:
+
+  * The persistent process will be terminated when the main process is terminated.
+
+  * Optional child workers created by :py:class:`~nvidia_resiliency_ext.checkpointing.async_ckpt.filesystem_async.FileSystemWriterAsync` will be terminated when the persistent process is terminated.


 Synchronization of Asynchronous Checkpoint Requests
@@ -218,3 +240,13 @@ The following example demonstrates a complete workflow for saving and loading ch

     # Load checkpoint synchronously
     loaded_state_dict = load_checkpoint(checkpoint_path, state_dict.copy())
+
+
+Best Practices
+--------------
+* Use process binding to pin the checkpointing process to the NUMA node closest to its GPU. This is important for pre-staging tensors to host memory.
+
+  .. code-block:: bash
+
+     # Example for 8 GPUs on a 2-socket CPU node with SLURM
+     numactl --cpunodebind=$((SLURM_LOCALID / 4)) --membind=$((SLURM_LOCALID / 4)) python train.py
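
To make the preload_fn contract from the usage-guide changes above concrete, here is a minimal sketch that is not part of the commit. It relies only on the documented contract (the callable maps a state dictionary to a state dictionary and is bound with functools.partial); the helper name and the flat-dictionary assumption are illustrative, not the library's API.

    # Minimal sketch (assumed helper, not the library API): bind everything except the
    # state dict with functools.partial so the callable is self-contained, and drop GPU
    # references as soon as tensors are copied to host memory.
    from functools import partial


    def _preload_to_host(state_dict, non_blocking=True):
        # Flat state dict assumed for brevity; real checkpoints are usually nested.
        host_state_dict = {}
        for key, value in state_dict.items():
            if hasattr(value, "is_cuda") and value.is_cuda:
                host_state_dict[key] = value.to("cpu", non_blocking=non_blocking)
            else:
                host_state_dict[key] = value
        return host_state_dict


    # Self-contained callable taking only the state dictionary, suitable for passing
    # through the AsyncRequest preload_fn field described in the guide above.
    preload_fn = partial(_preload_to_host, non_blocking=True)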

docs/source/checkpointing/local/usage_guide.rst (7 additions, 2 deletions)

@@ -37,7 +37,8 @@ Requirements for `BasicTensorAwareStateDict`

 Restrictions
 ------------
-Currently under review - no documented restrictions at this time.
+- `AsyncCallsQueue` must be initialized with `persistent=False`, because some local checkpointing routines
+  are not pickleable. This restriction may be lifted in the future.

 Functionality Overview
 ----------------------
@@ -108,13 +109,17 @@ controlled by the `is_async` parameter in the `save(...)` method.
   performs a blocking save operation, ensuring all data is written before returning.
 - Asynchronous Save: When `is_async` is set to `True`, the `save(...)` method
   initiates a non-blocking save and returns an `AsyncRequest` object.
-  This class is fully compatible with the `nvidia_resiliency_ext.checkpointing.async_ckpt` module.
+  This class is compatible with the `nvidia_resiliency_ext.checkpointing.async_ckpt` module.

 The returned `AsyncRequest` can then be submitted to an `AsyncCallsQueue`,
 enabling advanced asynchronous processing.
 The usage of `AsyncRequest` with `AsyncCallsQueue` is demonstrated in the provided example,
 showcasing how to efficiently manage non-blocking saves within your workflow.

+.. note::
+   Per the Restrictions and the included example, `AsyncCallsQueue` must be initialized with
+   `persistent=False`. This is because some local checkpointing routines are not pickleable.
+
 Logging
 ~~~~~~~
 The :py:class:`LocalCheckpointManager` uses Python's logging module to generate output messages.
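
For reference, the non-persistent queue pattern described by the new restriction looks roughly like the following sketch. It is not part of the commit; the `save(...)` arguments and the schedule/finalize method names are assumptions to be checked against examples/checkpointing/local_ckpt.py and the API reference.

    # Minimal sketch (assumed usage): local checkpointing with a non-persistent queue.
    from nvidia_resiliency_ext.checkpointing.async_ckpt.core import AsyncCallsQueue

    async_queue = AsyncCallsQueue(persistent=False)  # persistent workers cannot pickle local-ckpt routines

    # `ckpt_manager` is a LocalCheckpointManager created as in the example script;
    # the argument names below are illustrative.
    async_request = ckpt_manager.save(state_dict, iteration, is_async=True)
    async_queue.schedule_async_request(async_request)

    # Later in the training loop, finalize any completed asynchronous saves.
    async_queue.maybe_finalize_async_calls(blocking=False)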

docs/source/index.rst (2 additions, 2 deletions)

@@ -1,5 +1,5 @@
-nvidia-resiliency-ext v0.4.0
-=============================
+nvidia-resiliency-ext
+=====================

 **nvidia-resiliency-ext** is a set of tools developed by NVIDIA to improve large-scale distributed training resiliency.

docs/source/release-notes.md (23 additions, 1 deletion)

@@ -2,14 +2,36 @@

 NVIDIA Resiliency Extension is a Python package for framework developers and users to implement fault-tolerant features. It improves effective training time by minimizing downtime due to failures and interruptions.

+## NVIDIA Resiliency Extension v0.4.1
+
+### Highlights
+
+This hotfix release includes important bug fixes, performance improvements, and minor updates to enhance stability.
+
+- Checkpointing
+  - [PR 104](https://github.com/NVIDIA/nvidia-resiliency-ext/pull/104), [PR 106](https://github.com/NVIDIA/nvidia-resiliency-ext/pull/106), [PR 108](https://github.com/NVIDIA/nvidia-resiliency-ext/pull/108), [PR 111](https://github.com/NVIDIA/nvidia-resiliency-ext/pull/111) and [PR 116](https://github.com/NVIDIA/nvidia-resiliency-ext/pull/116) switch the asynchronous checkpointing module from the temporal worker to the persistent worker, which uses `spawn` instead of `fork`.
+    - This release is an intermediate milestone toward deprecating the use of `fork` in favor of `spawn` for asynchronous checkpointing. The complete transition to `spawn` still has the following dependencies on `fork`, which will be eliminated in an upcoming release:
+      - Local checkpointing must continue to use `fork`-based asynchronous checkpointing, as clarified in the usage guide.
+      - File I/O operations that use multiprocessing can still trigger a `fork`.
+
+- In-process restart
+  - [PR 103](https://github.com/NVIDIA/nvidia-resiliency-ext/pull/103) fixes a case where extra CUDA contexts were created on local rank 0 after restart, consuming extra GPU memory on that rank.
+  - [PR 112](https://github.com/NVIDIA/nvidia-resiliency-ext/pull/112) fixes workload state leaks across the restart boundary. The fix addresses a case where objects created in the wrapped function could not be garbage collected after a restart, manifesting as a memory leak.
+
+### Known Issues & Limitations
+
+- In a future release, the persistent process will be terminated automatically when the main process terminates.
+  - Until this change is implemented, job schedulers must ensure proper termination of the persistent process and its child workers for a graceful shutdown.
+
+
 ## NVIDIA Resiliency Extension v0.4.0

 ### Highlights

 - Checkpointing
   - [PR 29](https://github.com/NVIDIA/nvidia-resiliency-ext/pull/29) - Support for storing checkpoints to cloud object stores
     - Leverage cloud storage provider's multithreaded SDK for rapid loading and saving checkpoints to object stores such as AWS S3, Azure Blob
-      Storage, Google Cloud Storage and more using NVIDIA Multi-storage Client.
+      Storage, Google Cloud Storage and more using NVIDIA Multi-storage Client
     - Provide scalable, reliable, cheaper, single source of truth across clouds/regions
     - Provide opt-out configuration when creating FileSystemWriterAsync class instance to allow users to passthrough to the filesystem
   - [PR 36](https://github.com/NVIDIA/nvidia-resiliency-ext/pull/36) - Critical bug fix to enable async checkpoint loading without errors

examples/checkpointing/local_ckpt.py (2 additions, 17 deletions)

@@ -41,16 +41,6 @@ def parse_args():
         help="If set, replication of local checkpoints is enabled"
         "Needs to be enabled on all ranks.",
     )
-    parser.add_argument(
-        '--no_persistent_queue',
-        action='store_false',
-        default=True,
-        dest='persistent_queue',
-        help=(
-            "Disables a persistent version of AsyncCallsQueue. "
-            "Effective only when --async_save is set."
-        ),
-    )
     parser.add_argument(
         '--replication_jump',
         default=4,
@@ -146,12 +136,6 @@ def load(args, ckpt_manager):

 def main():
     args = parse_args()
-    assert (
-        not args.persistent_queue or args.async_save
-    ), "--persistent_queue requires --async_save to be enabled."
-    assert (
-        not args.persistent_queue or not args.replication
-    ), "persistent_queue is currently incompatible with replication due to object pickling issues."
     logging.info(f'{args}')

     # Initialize the distributed backend
@@ -162,7 +146,8 @@ def main():

     # Instantiate checkpointing classess needed for local checkpointing
     ckpt_manager = create_checkpoint_manager(args)
-    async_queue = AsyncCallsQueue(persistent=args.persistent_queue) if args.async_save else None
+    # Persistent queue is incompatible with local checkpointing because some routines are not pickleable.
+    async_queue = AsyncCallsQueue(persistent=False) if args.async_save else None

     iteration = 123  # training iteration (used as training state id)

pyproject.toml (2 additions, 2 deletions)

@@ -4,7 +4,7 @@
 [tool.poetry]
 name = "nvidia-resiliency-ext"
 repository = "https://github.com/NVIDIA/nvidia-resiliency-ext"
-version = "0.4.0"
+version = "0.4.1"
 description = "NVIDIA Resiliency Package"
 authors = ["NVIDIA Corporation"]
 readme = "README.md"
@@ -20,7 +20,7 @@ packages = [
 ]

 exclude = [
-    "src/nvidia_resiliency_ext/straggler/cupti_src"
+    "src/nvidia_resiliency_ext/attribution/straggler/cupti_src"
 ]

 [tool.poetry.build]

src/nvidia_resiliency_ext/__init__.py (22 additions, 0 deletions)

@@ -0,0 +1,22 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from importlib.metadata import PackageNotFoundError, version
+
+try:
+    __version__ = version("nvidia-resiliency-ext")
+except PackageNotFoundError:
+    __version__ = "unknown"
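
With the new `__init__.py`, the installed package exposes its version at runtime. A quick usage sketch, assuming the package is installed under its distribution name nvidia-resiliency-ext:

    # Prints the installed distribution version ("0.4.1" for this release), or "unknown"
    # when the package metadata is unavailable (e.g. running from an uninstalled source tree).
    import nvidia_resiliency_ext

    print(nvidia_resiliency_ext.__version__)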
