[train v2][doc] Update persistent storage guide #51202

Open · wants to merge 5 commits into base: master
221 changes: 114 additions & 107 deletions doc/source/train/user-guides/persistent-storage.rst
@@ -5,9 +5,7 @@
Configuring Persistent Storage
==============================

A Ray Train run produces a history of :ref:`reported metrics <train-monitoring-and-logging>`,
:ref:`checkpoints <train-checkpointing>`, and :ref:`other artifacts <train-artifacts>`.
You can configure these to be saved to a persistent storage location.
A Ray Train run produces :ref:`checkpoints <train-checkpointing>` that can be saved to a persistent storage location.

.. figure:: ../images/persistent_storage_checkpoint.png
:align: center
@@ -25,8 +23,8 @@ Here are some capabilities that persistent storage enables:
- **Checkpointing and fault tolerance**: Saving checkpoints to a persistent storage location
allows you to resume training from the last checkpoint in case of a node failure, as shown in the sketch after this list.
See :ref:`train-checkpointing` for a detailed guide on how to set up checkpointing.
- **Post-experiment analysis**: A consolidated location storing data from all trials is useful for post-experiment analysis
such as accessing the best checkpoints and hyperparameter configs after the cluster has already been terminated.
- **Post-experiment analysis**: A consolidated storage location keeps data such as the best checkpoints and
hyperparameter configs accessible after the Ray cluster has been terminated.
- **Bridge training/fine-tuning with downstream serving and batch inference tasks**: You can easily access the models
and artifacts to share them with others or use them in downstream tasks.
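
As a brief illustration of the fault-tolerance capability, here's a minimal sketch that combines a persistent ``storage_path`` with automatic retries. The bucket name and retry count are placeholder values; see :ref:`train-checkpointing` for the full pattern.

.. code-block:: python

    import ray.train

    def train_fn(config):
        # On restart after a failure, the latest checkpoint reported to
        # persistent storage is available via `ray.train.get_checkpoint()`.
        checkpoint = ray.train.get_checkpoint()
        if checkpoint:
            ...  # Restore model/optimizer state from the checkpoint.
        ...  # Training loop that periodically reports checkpoints.

    run_config = ray.train.RunConfig(
        storage_path="s3://bucket-name/sub-path",  # Placeholder bucket.
        # Retry the run up to 3 times on worker or node failures.
        failure_config=ray.train.FailureConfig(max_failures=3),
    )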

@@ -179,7 +177,7 @@ Implement custom storage upload and download logic by providing an implementatio
run_config=train.RunConfig(
storage_filesystem=fs,
storage_path="bucket-name/sub-path",
name="experiment_name",
name="unique-run-id",
)
)

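For reference, here's a minimal sketch of how the ``fs`` object above could be created with a custom ``pyarrow.fs.S3FileSystem``. The credentials and endpoint are placeholder values:

.. code-block:: python

    import pyarrow.fs

    # A minimal sketch: an S3 filesystem with explicit, placeholder credentials.
    fs = pyarrow.fs.S3FileSystem(
        access_key="YOUR_ACCESS_KEY",
        secret_key="YOUR_SECRET_KEY",
        endpoint_override="https://s3-compatible-storage.example.com",
    )
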
@@ -235,7 +233,7 @@ Note that including these as query parameters in the ``storage_path`` URI direct
...,
run_config=train.RunConfig(
storage_path="s3://bucket-name/sub-path?endpoint_override=http://localhost:9000",
name="experiment_name",
name="unique-run-id",
)
)

@@ -257,21 +255,15 @@ and how they're structured in storage.
import os
import tempfile

from ray import train
import ray.train
from ray.train import Checkpoint
from ray.train.torch import TorchTrainer

def train_fn(config):
for i in range(10):
# Training logic here

metrics = {"loss": ...}

# Save arbitrary artifacts to the working directory
rank = train.get_context().get_world_rank()
with open(f"artifact-rank={rank}-iter={i}.txt", "w") as f:
f.write("data")

with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
torch.save(..., os.path.join(temp_checkpoint_dir, "checkpoint.pt"))
train.report(
@@ -281,11 +273,10 @@

trainer = TorchTrainer(
train_fn,
scaling_config=train.ScalingConfig(num_workers=2),
run_config=train.RunConfig(
scaling_config=ray.train.ScalingConfig(num_workers=2),
run_config=ray.train.RunConfig(
storage_path="s3://bucket-name/sub-path/",
name="experiment_name",
sync_config=train.SyncConfig(sync_artifacts=True),
name="unique-run-id",
)
)
result: train.Result = trainer.fit()
@@ -295,21 +286,12 @@ Here's a rundown of all files that will be persisted to storage:

.. code-block:: text

s3://bucket-name/sub-path (RunConfig.storage_path)
└── experiment_name (RunConfig.name) <- The "experiment directory"
├── experiment_state-*.json
├── basic-variant-state-*.json
├── trainer.pkl
├── tuner.pkl
└── TorchTrainer_46367_00000_0_... <- The "trial directory"
├── events.out.tfevents... <- Tensorboard logs of reported metrics
├── result.json <- JSON log file of reported metrics
├── checkpoint_000000/ <- Checkpoints
├── checkpoint_000001/
├── ...
├── artifact-rank=0-iter=0.txt <- Worker artifacts (see the next section)
├── artifact-rank=1-iter=0.txt
└── ...
{RunConfig.storage_path} (ex: "s3://bucket-name/sub-path/")
└── {RunConfig.name} (ex: "unique-run-id") <- Train run output directory
├── *_snapshot.json <- Train run metadata files (DeveloperAPI)
├── checkpoint_epoch=0/ <- Checkpoints
├── checkpoint_epoch=1/
└── ...

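To inspect these persisted files directly, for example from a separate script after training, one option is the following minimal sketch using ``pyarrow.fs``, with the placeholder bucket and run name from the example above:

.. code-block:: python

    import pyarrow.fs

    # A minimal sketch: list everything Ray Train persisted for this run.
    # The bucket and run name are the placeholder values used above.
    fs, base_path = pyarrow.fs.FileSystem.from_uri("s3://bucket-name/sub-path")
    selector = pyarrow.fs.FileSelector(f"{base_path}/unique-run-id", recursive=True)
    for info in fs.get_file_info(selector):
        print(info.path)
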
The :class:`~ray.train.Result` and :class:`~ray.train.Checkpoint` objects returned by
``trainer.fit`` are the easiest way to access the data in these files:
@@ -318,19 +300,90 @@ The :class:`~ray.train.Result` and :class:`~ray.train.Checkpoint` objects return
:skipif: True

result.filesystem, result.path
# S3FileSystem, "bucket-name/sub-path/experiment_name/TorchTrainer_46367_00000_0_..."
# S3FileSystem, "bucket-name/sub-path/unique-run-id"

result.checkpoint.filesystem, result.checkpoint.path
# S3FileSystem, "bucket-name/sub-path/experiment_name/TorchTrainer_46367_00000_0_.../checkpoint_000009"
# S3FileSystem, "bucket-name/sub-path/unique-run-id/checkpoint_epoch=0"

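For example, here's a minimal sketch of loading the checkpoint contents on the driver, assuming the ``checkpoint.pt`` filename used in the training function above:

.. code-block:: python

    import os
    import torch

    # A minimal sketch: download the checkpoint from persistent storage to a
    # local directory and load it. "checkpoint.pt" matches the filename saved
    # in the training function above.
    with result.checkpoint.as_directory() as checkpoint_dir:
        state = torch.load(os.path.join(checkpoint_dir, "checkpoint.pt"))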

See :ref:`train-inspect-results` for a full guide on interacting with training :class:`Results <ray.train.Result>`.


.. _train-artifacts:
.. _train-storage-advanced:

Advanced configuration
----------------------

.. _train-working-directory:

Keep the original current working directory
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

To disable the default behavior of Ray Train changing the current working directory,
set the ``RAY_CHDIR_TO_TRIAL_DIR=0`` environment variable.

This is useful if you want your training workers to access files by relative paths from the
directory where you launched the training script.

.. tip::

When running in a distributed cluster, you will need to make sure that all workers
have a mirrored working directory to access the same relative paths.

One way to achieve this is setting the
:ref:`working directory in the Ray runtime environment <workflow-local-files>`.

.. testcode::

import os

import ray
import ray.train
from ray.train.torch import TorchTrainer

os.environ["RAY_CHDIR_TO_TRIAL_DIR"] = "0"

# Write some file in the current working directory
with open("./data.txt", "w") as f:
f.write("some data")

# Set the working directory in the Ray runtime environment
ray.init(runtime_env={"working_dir": "."})

def train_fn_per_worker(config):
# Check that each worker can access the working directory
# NOTE: The working directory is copied to each worker and is read only.
assert os.path.exists("./data.txt"), os.getcwd()

trainer = TorchTrainer(
train_fn_per_worker,
scaling_config=ray.train.ScalingConfig(num_workers=2),
run_config=ray.train.RunConfig(
# storage_path=...,
),
)
trainer.fit()


Deprecated
----------

The following sections describe behavior that is deprecated as of Ray 2.43 and will not be supported in Ray Train V2,
which is an overhaul of Ray Train's implementation and select APIs.

Persisting training artifacts
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
See the following resources for more information:

* `Train V2 REP <https://github.com/ray-project/enhancements/blob/main/reps/2024-10-18-train-tune-api-revamp/2024-10-18-train-tune-api-revamp.md>`_: Technical details about the API change
* `Train V2 Migration Guide <https://github.com/ray-project/ray/issues/49454>`_: Full migration guide for Train V2

(Deprecated) Persisting training artifacts
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. note::
This feature of persisting training worker artifacts is deprecated as of Ray 2.43.
The feature relied on Ray Tune's local working directory abstraction,
where the local files of each worker would be copied to storage.
Ray Train V2 decouples the two libraries, so this API, which already provided limited value, has been deprecated.

In the example above, we saved some artifacts within the training loop to the worker's
*current working directory*.
@@ -349,12 +402,25 @@ all artifacts saved in this directory will be persisted to storage.
The frequency of artifact syncing can be configured via :class:`SyncConfig <ray.train.SyncConfig>`.
Note that this behavior is off by default.
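
A minimal sketch of enabling this deprecated behavior, with placeholder storage settings:

.. code-block:: python

    import ray.train

    # A minimal sketch of the deprecated artifact syncing configuration.
    # The storage path and run name are placeholder values.
    run_config = ray.train.RunConfig(
        storage_path="s3://bucket-name/sub-path",
        name="experiment_name",
        sync_config=ray.train.SyncConfig(sync_artifacts=True),
    )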

.. figure:: ../images/persistent_storage_artifacts.png
:align: center
:width: 600px
Here's an example of what the Train run output directory looks like, with the worker artifacts:

Multiple workers spread across multiple nodes save artifacts to their local
working directory, which is then persisted to storage.
.. code-block:: text

s3://bucket-name/sub-path (RunConfig.storage_path)
└── experiment_name (RunConfig.name) <- The "experiment directory"
├── experiment_state-*.json
├── basic-variant-state-*.json
├── trainer.pkl
├── tuner.pkl
└── TorchTrainer_46367_00000_0_... <- The "trial directory"
├── events.out.tfevents... <- Tensorboard logs of reported metrics
├── result.json <- JSON log file of reported metrics
├── checkpoint_000000/ <- Checkpoints
├── checkpoint_000001/
├── ...
├── artifact-rank=0-iter=0.txt <- Worker artifacts
├── artifact-rank=1-iter=0.txt
└── ...

.. warning::

@@ -378,16 +444,13 @@
# Every local rank 0 worker saves artifacts.
...


.. _train-storage-advanced:

Advanced configuration
----------------------

.. _train-local-staging-dir:

Setting the local staging directory
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
(Deprecated) Setting the local staging directory
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. note::
This section describes behavior that depends on Ray Tune implementation details and no longer applies to Ray Train V2.

.. warning::

@@ -429,59 +492,3 @@ Here's an example of what the local staging directory looks like:

The structure of the local staging directory is subject to change
in future versions of Ray Train -- do not rely on these local staging files in your application.


.. _train-working-directory:

Keep the original current working directory
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

To disable the default behavior of Ray Train changing the current working directory,
set the ``RAY_CHDIR_TO_TRIAL_DIR=0`` environment variable.

This is useful if you want your training workers to access relative paths from the
directory you launched the training script from.

.. tip::

When running in a distributed cluster, you will need to make sure that all workers
have a mirrored working directory to access the same relative paths.

One way to achieve this is setting the
:ref:`working directory in the Ray runtime environment <workflow-local-files>`.

.. testcode::

import os

import ray
import ray.train
from ray.train.torch import TorchTrainer

os.environ["RAY_CHDIR_TO_TRIAL_DIR"] = "0"

# Write some file in the current working directory
with open("./data.txt", "w") as f:
f.write("some data")

# Set the working directory in the Ray runtime environment
ray.init(runtime_env={"working_dir": "."})

def train_fn_per_worker(config):
# Check that each worker can access the working directory
# NOTE: The working directory is copied to each worker and is read only.
assert os.path.exists("./data.txt"), os.getcwd()

# To use artifact syncing with `SyncConfig(sync_artifacts=True)`,
# write artifacts here, instead of the current working directory:
ray.train.get_context().get_trial_dir()

trainer = TorchTrainer(
train_fn_per_worker,
scaling_config=ray.train.ScalingConfig(num_workers=2),
run_config=ray.train.RunConfig(
# storage_path=...,
sync_config=ray.train.SyncConfig(sync_artifacts=True),
),
)
trainer.fit()