From bbd81fd49b75373008638e9c9ed0109166bac427 Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Mon, 27 Jan 2025 13:38:54 -0500 Subject: [PATCH 01/29] Remove last references to from training --- recipes/lora_finetune_distributed_multi_dataset.py | 2 +- torchtune/training/checkpointing/_checkpoint_client.py | 2 +- torchtune/training/checkpointing/_checkpointer.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/recipes/lora_finetune_distributed_multi_dataset.py b/recipes/lora_finetune_distributed_multi_dataset.py index 7d0d442c6c..25367a4d7d 100644 --- a/recipes/lora_finetune_distributed_multi_dataset.py +++ b/recipes/lora_finetune_distributed_multi_dataset.py @@ -138,7 +138,7 @@ def __init__(self, cfg: DictConfig) -> None: "full fp16 training is not supported with this recipe. Please use bf16 or fp32 instead." ) - _, rank = training.get_world_size_and_rank() + _, rank = utils.get_world_size_and_rank() self._is_rank_zero = rank == 0 diff --git a/torchtune/training/checkpointing/_checkpoint_client.py b/torchtune/training/checkpointing/_checkpoint_client.py index a87d59c4d0..4b9e11d1c3 100644 --- a/torchtune/training/checkpointing/_checkpoint_client.py +++ b/torchtune/training/checkpointing/_checkpoint_client.py @@ -72,7 +72,7 @@ def __init__( self._optimizer_in_bwd = self._cfg.get("optimizer_in_bwd", False) self._device = utils.get_device(device=self._cfg.device) - _, self._rank = training.get_world_size_and_rank() + _, self._rank = utils.get_world_size_and_rank() self._is_rank_zero = self._rank == 0 def _get_checkpointer(self): diff --git a/torchtune/training/checkpointing/_checkpointer.py b/torchtune/training/checkpointing/_checkpointer.py index 7c8d2b0bed..8d28ab22e1 100644 --- a/torchtune/training/checkpointing/_checkpointer.py +++ b/torchtune/training/checkpointing/_checkpointer.py @@ -1193,7 +1193,7 @@ def __init__( self._checkpoint_future = None self._checkpoint_dir_prefix = "dist_epoch" self._metadata_file = ".metadata" - _, self._rank = training.get_world_size_and_rank() + _, self._rank = utils.get_world_size_and_rank() self._process_group: Optional[dist.ProcessGroup] = process_group def _get_latest_intermediate_checkpoint(self) -> Optional[str]: From c04ebaf753e72cf98f5f98b1322304f8a4af0980 Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Mon, 27 Jan 2025 14:23:56 -0500 Subject: [PATCH 02/29] Deprecate and use new function --- recipes/full_finetune_distributed.py | 30 ++++++++++---------- tests/torchtune/training/test_distributed.py | 10 ++----- torchtune/training/_distributed.py | 14 +++++++-- 3 files changed, 29 insertions(+), 25 deletions(-) diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py index 34ad48e938..9afd238711 100644 --- a/recipes/full_finetune_distributed.py +++ b/recipes/full_finetune_distributed.py @@ -118,7 +118,8 @@ class FullFinetuneRecipeDistributed(FTRecipeInterface): """ def __init__(self, cfg: DictConfig) -> None: - self._device = utils.get_device(device=cfg.device) + device_type = cfg.device + self._device = utils.get_device(device=device_type) self._dtype = training.get_dtype(cfg.dtype, device=self._device) if self._dtype == torch.float16: @@ -126,7 +127,7 @@ def __init__(self, cfg: DictConfig) -> None: "full fp16 training is not supported with this recipe. Please use bf16 or fp32 instead." 
) - # logging attributes + # Logging attributes self._output_dir = cfg.output_dir self._log_every_n_steps = cfg.get("log_every_n_steps", 1) self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False) @@ -147,6 +148,10 @@ def __init__(self, cfg: DictConfig) -> None: self._optimizer_in_bwd = cfg.get("optimizer_in_bwd", False) self._clip_grad_norm = cfg.get("clip_grad_norm", None) self._checkpoint_client = CheckpointClient(cfg) + self.fsdp_cpu_offload = cfg.get("fsdp_cpu_offload", False) + self.distributed_backend = get_distributed_backend( + device_type, enable_cpu_offload=self.fsdp_cpu_offload + ) # Optimizer in backward is not compatible with gradient accumulation or gradient clipping if self._optimizer_in_bwd: @@ -240,9 +245,16 @@ def setup(self, cfg: DictConfig) -> None: Setup the recipe. This includes training state (if resume_from_checkpoint is True), model, tokenizer, loss, optimizer, lr scheduler, sampler, and dataloader. """ + # Set up the backend for distributed training (NCCL, GLOO, etc.) + init_process_group(self.distributed_backend) + + if self.fsdp_cpu_offload: + # Utilize all available CPU cores for intra-op parallelism. This provides ~2x + # speed up when benchmarking fused AdamW on CPU + training.set_torch_num_threads() + if self._is_rank_zero: self._metric_logger = config.instantiate(cfg.metric_logger) - # log config with parameter override self._metric_logger.log_config(cfg) @@ -890,19 +902,7 @@ def recipe_main(cfg: DictConfig) -> None: - Parameters specified in config (see available configs through ``tune ls``) - Overwritten by arguments from the command-line """ - if not training.is_distributed(): - raise RuntimeError( - "Distributed finetune recipe should be run via a distributed launcher." - "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]" - ) - init_process_group("cuda:nccl,cpu:gloo") - if cfg.get("fsdp_cpu_offload", False): - # Utilize all available CPU cores for intra-op parallelism. 
This provides ~2x - # speed up when benchmarking fused AdamW on CPU - training.set_torch_num_threads() - config.log_config(recipe_name="FullFinetuneRecipeDistributed", cfg=cfg) - recipe = FullFinetuneRecipeDistributed(cfg=cfg) recipe.setup(cfg=cfg) recipe.train() diff --git a/tests/torchtune/training/test_distributed.py b/tests/torchtune/training/test_distributed.py index 3fe2dd340d..960339fc1b 100644 --- a/tests/torchtune/training/test_distributed.py +++ b/tests/torchtune/training/test_distributed.py @@ -14,7 +14,7 @@ import torch.nn as nn from packaging import version from tests.test_utils import gpu_test -from torch.distributed import launcher +from torch.distributed import init_process_group, launcher from torch.distributed._composable.fsdp import fully_shard from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( CheckpointWrapper, @@ -37,12 +37,6 @@ class TestDistributed: - def test_init_distributed(self) -> None: - """Integration test to confirm consistency across device initialization utilities.""" - distributed = training.init_distributed() - assert ( - not distributed - ), "Should return False as there are no distributed environment variables" @staticmethod def _test_worker_fn(init_pg_explicit: bool) -> None: @@ -52,7 +46,7 @@ def _test_worker_fn(init_pg_explicit: bool) -> None: if init_pg_explicit: torch.distributed.init_process_group(backend="gloo") if not torch.distributed.is_initialized(): - training.init_distributed(backend="gloo") + init_process_group(backend="gloo") if not torch.distributed.is_initialized(): raise AssertionError("Expected torch.distributed to be initialized") pg_backend = torch.distributed.get_backend() diff --git a/torchtune/training/_distributed.py b/torchtune/training/_distributed.py index ff959c5f23..b87ccdff20 100644 --- a/torchtune/training/_distributed.py +++ b/torchtune/training/_distributed.py @@ -41,8 +41,6 @@ _log: logging.Logger = get_logger() -_valid_distributed_single_node_nnodes = ["1:1", "1"] - torch_version = torch.__version__ _DISTRIBUTED_STATE_DICT_API_IS_AVAILABLE = ( "dev" not in torch_version and torch_version_ge("2.6.0") @@ -97,6 +95,18 @@ def _broadcast_tensor(tensor: torch.Tensor, src: int = 0) -> torch.Tensor: return tensor +def get_distributed_backend(device_type: str, enable_cpu_offload: bool = False): + backend = "nccl" + if device_type in dist.Backend.default_device_backend_map.keys(): + backend = dist.default_device_backend_map.get(device_type) + if enable_cpu_offload: + backend = f"{device_type}:{backend},cpu:gloo" + return backend + + +@deprecated( + msg="The functionality of `init_distributed` is covered by `torch.distributed.init_process_group`. " +) def init_distributed(**kwargs: Dict[str, Any]) -> bool: """Initialize process group required for ``torch.distributed``. 
From e02d39b534a2aa1307bb5961cdcff9b10b44ea87 Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Mon, 27 Jan 2025 14:32:02 -0500 Subject: [PATCH 03/29] Expose --- recipes/full_finetune_distributed.py | 6 +++--- torchtune/training/__init__.py | 2 ++ torchtune/training/_distributed.py | 20 +++++++++++++++++++- 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py index 9afd238711..93e5a208cb 100644 --- a/recipes/full_finetune_distributed.py +++ b/recipes/full_finetune_distributed.py @@ -132,7 +132,7 @@ def __init__(self, cfg: DictConfig) -> None: self._log_every_n_steps = cfg.get("log_every_n_steps", 1) self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False) - if self._log_peak_memory_stats and self._device.type != "cuda": + if self._log_peak_memory_stats and device_type != "cuda": log.info( "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False." ) @@ -149,7 +149,7 @@ def __init__(self, cfg: DictConfig) -> None: self._clip_grad_norm = cfg.get("clip_grad_norm", None) self._checkpoint_client = CheckpointClient(cfg) self.fsdp_cpu_offload = cfg.get("fsdp_cpu_offload", False) - self.distributed_backend = get_distributed_backend( + self.distributed_backend = training.get_distributed_backend( device_type, enable_cpu_offload=self.fsdp_cpu_offload ) @@ -174,7 +174,7 @@ def __init__(self, cfg: DictConfig) -> None: "enable_activation_offloading", False ) if self._enable_activation_offloading: - if self._device.type != "cuda": + if device_type != "cuda": raise RuntimeError( "enable_activation_offloading should only be True when training on CUDA" ) diff --git a/torchtune/training/__init__.py b/torchtune/training/__init__.py index d461d84dc4..06795d4bb8 100644 --- a/torchtune/training/__init__.py +++ b/torchtune/training/__init__.py @@ -11,6 +11,7 @@ from torchtune.training._compile import compile_loss, compile_model from torchtune.training._distributed import ( gather_cpu_state_dict, + get_distributed_backend, get_full_optimizer_state_dict, get_shard_conditions, get_world_size_and_rank, @@ -99,6 +100,7 @@ "TOTAL_EPOCHS_KEY", "get_quantizer_mode", "get_cosine_schedule_with_warmup", + "get_distributed_backend", "get_lr", "cleanup_before_training", "create_optim_in_bwd_wrapper", diff --git a/torchtune/training/_distributed.py b/torchtune/training/_distributed.py index b87ccdff20..8627d644a6 100644 --- a/torchtune/training/_distributed.py +++ b/torchtune/training/_distributed.py @@ -95,7 +95,25 @@ def _broadcast_tensor(tensor: torch.Tensor, src: int = 0) -> torch.Tensor: return tensor -def get_distributed_backend(device_type: str, enable_cpu_offload: bool = False): +def get_distributed_backend(device_type: str, enable_cpu_offload: bool = False) -> str: + """Gets the PyTorch Distributed backend based on device type. + + Args: + device_type (str): Device type to get backend for. + enable_cpu_offload (bool): Flag to check if offload to CPU is enabled. If it is, we will add a GLOO + backend to handle CPU training. + + Example: + >>> get_distributed_backend("cuda") + 'nccl' + >>> get_distributed_backend("cpu") + 'gloo' + >>> get_distributed_backend("cuda", enable_cpu_offload=True) + 'cuda:nccl,cpu:gloo' + + Returns: + str: Distributed backend for use in ``torch.distributed.init_process_group``. 
+ """ backend = "nccl" if device_type in dist.Backend.default_device_backend_map.keys(): backend = dist.default_device_backend_map.get(device_type) From c558f27dad08301eee181ef277bf0e21d287b3b6 Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Mon, 27 Jan 2025 14:32:52 -0500 Subject: [PATCH 04/29] Update API docs --- docs/source/api_ref_training.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/api_ref_training.rst b/docs/source/api_ref_training.rst index 9cba6fb9ea..747f312447 100644 --- a/docs/source/api_ref_training.rst +++ b/docs/source/api_ref_training.rst @@ -53,6 +53,7 @@ Utilities for enabling and working with distributed training. init_distributed is_distributed gather_cpu_state_dict + get_distributed_backend .. _ac_label: From 454536c6182bbf1ec6649f4ef941bac5e4f3031d Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Mon, 27 Jan 2025 14:37:40 -0500 Subject: [PATCH 05/29] Add tests --- recipes/full_finetune_distributed.py | 2 +- tests/torchtune/training/test_distributed.py | 8 ++++++++ torchtune/training/_distributed.py | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py index 93e5a208cb..c8a6429033 100644 --- a/recipes/full_finetune_distributed.py +++ b/recipes/full_finetune_distributed.py @@ -267,7 +267,7 @@ def setup(self, cfg: DictConfig) -> None: enable_activation_checkpointing=self._enable_activation_checkpointing, enable_activation_offloading=self._enable_activation_offloading, custom_sharded_layers=cfg.get("custom_sharded_layers", None), - fsdp_cpu_offload=cfg.get("fsdp_cpu_offload", False), + fsdp_cpu_offload=self.fsdp_cpu_offload, reshard_after_forward=cfg.get("fsdp_reshard_after_forward", True), model_state_dict=checkpoint_dict[training.MODEL_KEY], ac_mode=cfg.get("ac_mode", None), diff --git a/tests/torchtune/training/test_distributed.py b/tests/torchtune/training/test_distributed.py index 960339fc1b..c693e1918b 100644 --- a/tests/torchtune/training/test_distributed.py +++ b/tests/torchtune/training/test_distributed.py @@ -88,6 +88,14 @@ def test_validate_no_params_on_meta_device(self) -> None: with pytest.raises(RuntimeError, match="Unexpected param or buffer"): training.validate_no_params_on_meta_device(model) + def test_get_distributed_backend(self) -> None: + assert training.get_distributed_backend("cuda") == "nccl" + assert training.get_distributed_backend("cpu") == "gloo" + assert ( + training.get_distributed_backend("cuda", enable_cpu_offload=True) + == "cuda:nccl,cpu:gloo" + ) + N_LAYERS = 3 IN_DIM = 5 diff --git a/torchtune/training/_distributed.py b/torchtune/training/_distributed.py index 8627d644a6..c2179a4e80 100644 --- a/torchtune/training/_distributed.py +++ b/torchtune/training/_distributed.py @@ -101,7 +101,7 @@ def get_distributed_backend(device_type: str, enable_cpu_offload: bool = False) Args: device_type (str): Device type to get backend for. enable_cpu_offload (bool): Flag to check if offload to CPU is enabled. If it is, we will add a GLOO - backend to handle CPU training. + backend to handle CPU training. Default is False. 
Example: >>> get_distributed_backend("cuda") From 66b06e1983c862f81bb42aa663a17a4f7cbaf774 Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Mon, 27 Jan 2025 14:40:37 -0500 Subject: [PATCH 06/29] Lint --- tests/torchtune/training/test_distributed.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/torchtune/training/test_distributed.py b/tests/torchtune/training/test_distributed.py index c693e1918b..d4821348f7 100644 --- a/tests/torchtune/training/test_distributed.py +++ b/tests/torchtune/training/test_distributed.py @@ -37,7 +37,6 @@ class TestDistributed: - @staticmethod def _test_worker_fn(init_pg_explicit: bool) -> None: """ From 0d5aeb4496aca37b9f2a654fbad6c5c15a0993c1 Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Mon, 27 Jan 2025 15:03:45 -0500 Subject: [PATCH 07/29] Add multinode recipe and sbatch script --- .../configs/llama3_3/70B_full_multinode.yaml | 104 ++++++++++++++++++ recipes/full_finetune_multinode.slurm | 34 ++++++ 2 files changed, 138 insertions(+) create mode 100644 recipes/configs/llama3_3/70B_full_multinode.yaml create mode 100644 recipes/full_finetune_multinode.slurm diff --git a/recipes/configs/llama3_3/70B_full_multinode.yaml b/recipes/configs/llama3_3/70B_full_multinode.yaml new file mode 100644 index 0000000000..4572792661 --- /dev/null +++ b/recipes/configs/llama3_3/70B_full_multinode.yaml @@ -0,0 +1,104 @@ +# Config for multi-node full finetuning in full_finetune_distributed.py +# using a Llama3.3 70B Instruct model +# +# This config assumes that you've run the following command before launching: +# tune download meta-llama/Llama-3.3-70B-Instruct --ignore-patterns "original/consolidated*" --output-dir SHARED_CLUSTER_FS +# +# To launch on 2 nodes w/ 8 devices on a SLURM cluster, run the following command: +# sbatch full_finetune_multinode.slurm +# +# This config is only tested on 2 nodes w/ 8 H100 machines. + +output_dir: /tmp/torchtune/llama3_3_70B/full + +# Tokenizer +tokenizer: + _component_: torchtune.models.llama3.llama3_tokenizer + path: /tmp/Llama-3.3-70B-Instruct/original/tokenizer.model + max_seq_len: 1024 + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_dataset + packed: True # True increases speed +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.llama3_3.llama3_3_70b + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Llama-3.3-70B-Instruct/ + checkpoint_files: + filename_format: model-{}-of-{}.safetensors + max_filename: "00030" + recipe_checkpoint: null + output_dir: ${output_dir} + model_type: LLAMA3 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 4 +epochs: 1 + +optimizer: + _component_: torch.optim.AdamW + lr: 2e-5 + # Note: highly recommended to use fused=True optimizer flag + # with CPU offload for faster optimizer step. + fused: True + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 1 # Use to increase effective batch size + + +# Training env +device: cuda + +# Memory management +enable_activation_checkpointing: True # True reduces memory +enable_activation_offloading: False # True reduces memory +custom_sharded_layers: ['tok_embeddings', 'output'] # Layers to shard separately (useful for large vocab size models). Lower Memory, but lower speed. +fsdp_cpu_offload: False +clip_grad_norm: null +compile: True # torch.compile the model + loss, True increases speed + decreases memory +optimizer_in_bwd: False # True saves memory. 
Requires gradient_accumulation_steps=1 + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir}/logs +log_every_n_steps: 1 +log_peak_memory_stats: True + +# Profiler (disabled) +profiler: + _component_: torchtune.training.setup_torch_profiler + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 3 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/full_finetune_multinode.slurm b/recipes/full_finetune_multinode.slurm new file mode 100644 index 0000000000..a57ee35537 --- /dev/null +++ b/recipes/full_finetune_multinode.slurm @@ -0,0 +1,34 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# ---------- SBATCH commands ---------- # +#SBATCH --job-name=torchtune-multi-node +#SBATCH --ntasks=2 +#SBATCH --nodes=2 +#SBATCH --gpus-per-task=8 +#SBATCH --cpus-per-task=96 +#SBATCH --partition=train + +# ---------- Set env variables ---------- # +# Grab the IP for head node: +nodes=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) ) +nodes_array=($nodes) +head_node=${nodes_array[0]} +head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) +echo Node IP: $head_node_ip +export LOGLEVEL=INFO + +# You might need to explicitly set the network interface: +# export NCCL_SOCKET_IFNAME=... +# export GLOO_SOCKET_IFNAME=... + +export TORCH_DIST_INIT_BARRIER=1 + +# ---------- Launch training ---------- # +# Adjust sbatch --ntasks and sbatch --nodes above and --nnodes below to your specific node count +srun tune run --nnodes 2 --nproc_per_node 8 --rdzv_id 101 --rdzv_backend c10d --rdzv_endpoint "$head_node_ip:29500" \ + full_finetune_distributed --config ${CONFIG_FILE} From afc9c2e2a331279ee0f90de27e009e0af1596b08 Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Mon, 27 Jan 2025 15:12:49 -0500 Subject: [PATCH 08/29] Update launch commands --- recipes/full_finetune_multinode.slurm | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/recipes/full_finetune_multinode.slurm b/recipes/full_finetune_multinode.slurm index a57ee35537..b8f7fbedef 100644 --- a/recipes/full_finetune_multinode.slurm +++ b/recipes/full_finetune_multinode.slurm @@ -29,6 +29,16 @@ export LOGLEVEL=INFO export TORCH_DIST_INIT_BARRIER=1 # ---------- Launch training ---------- # +# You probably want to load in a virtual env w/ conda... 
+# module load conda +# conda activate torchtune +# ...or venv +# source torchtune/bin/activate + +SHARED_FS=/mnt/slurm # <-- Replace w/ your filesystem +CHECKPOINT_DIR="$SHARED_FS/Llama-3.3-70B-Instruct" +OUTPUT_DIR="$SHARED_FS/Llama3.3-70B-fft-output" + # Adjust sbatch --ntasks and sbatch --nodes above and --nnodes below to your specific node count srun tune run --nnodes 2 --nproc_per_node 8 --rdzv_id 101 --rdzv_backend c10d --rdzv_endpoint "$head_node_ip:29500" \ - full_finetune_distributed --config ${CONFIG_FILE} + full_finetune_distributed --config llama3_3/70B_full_multinode checkpoint_dir=$CHECKPOINT_DIR output_dir=$OUTPUT_DIR From c4748a52e51eba79180fdd41211f07b1c5613f2c Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Mon, 27 Jan 2025 15:17:14 -0500 Subject: [PATCH 09/29] Move env variables around --- recipes/full_finetune_multinode.slurm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/recipes/full_finetune_multinode.slurm b/recipes/full_finetune_multinode.slurm index b8f7fbedef..6e83ba4f62 100644 --- a/recipes/full_finetune_multinode.slurm +++ b/recipes/full_finetune_multinode.slurm @@ -20,13 +20,13 @@ nodes_array=($nodes) head_node=${nodes_array[0]} head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) echo Node IP: $head_node_ip -export LOGLEVEL=INFO -# You might need to explicitly set the network interface: +# You might need to explicitly set the network interface for distributed backends: # export NCCL_SOCKET_IFNAME=... # export GLOO_SOCKET_IFNAME=... export TORCH_DIST_INIT_BARRIER=1 +export LOGLEVEL=INFO # ---------- Launch training ---------- # # You probably want to load in a virtual env w/ conda... From 94440f99bbcd9d1c60359bef1d166121be5eecb4 Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Mon, 27 Jan 2025 16:06:42 -0500 Subject: [PATCH 10/29] Multi-node tutorial --- docs/source/tutorials/multinode.rst | 422 ++++++++++++++++++++++++++++ 1 file changed, 422 insertions(+) create mode 100644 docs/source/tutorials/multinode.rst diff --git a/docs/source/tutorials/multinode.rst b/docs/source/tutorials/multinode.rst new file mode 100644 index 0000000000..380788faaf --- /dev/null +++ b/docs/source/tutorials/multinode.rst @@ -0,0 +1,422 @@ +.. _multinode_tutorial: + +===================== +Multi-node finetuning +===================== + +Congratulations! After years of being "GPU poor", you've worked hard, saved your hard earned Bitcoin and +now have access to a proper multi-node cluster. You're part of the so-called "GPU middle class". In many ways, +your worries of yesteryear are gone. Memory efficient training? Not anymore! But in so many other ways, your problems +are just starting because multi-node is a whole new beast. Come with me as I take you through your new life, complete with +a big backyard, new car, and of course - a nice rack of H100s. + +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` You will learn: + + * How to set up the torchtune package on a SLURM cluster + * How to fine-tune a Llama3.3 70B model w/ full parameter updates (not LoRA) + * What common errors to lookout for + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + + * Be familiar with distributed training in torchtune + * Already know basic SLURM commands + + +Advantages of multi-node training +--------------------------------- + +It's likely that if you're reading this tutorial, you don't need a refresher on the advantages of having +MORE compute, but let's go over it again so you can appreciate how lucky you are. 
Let's consider how big +a 70B model is in memory. + +70 * 2 = 140 GB. So it definitely can't fit in a single GPU. Maybe a few GPUs? But then you have to consider the optimizer. +Bring down the optimizer, then you have to checkpoint or offload the activations. Now it fits, but training is slow and/or +just an approximation of "true" training. + +Multi-node allows you to fit larger models in memory and utilize bigger batch sizes, potentially reducing the overall training time. + +> Aside on FSDP on multi-node. Need all gather, might not be faster, etc. + +Training Llama3.3 70B on 2 nodes +-------------------------------- + +First, we need to install torchtune on your cluster. Although pretty much as straightforward as the normal install instructions, +it's recommended that you install into a virtual environment that is accessible from nodes in your cluster - something like a shared filesystem. +*You should know best how to go about this as it is your cluster.* + +Next, using the same idea as above, we need to download the Llama3.3 70B model to the shared fs. (You'll need to make sure you have the correct +credentials as noted before.) You'll also need to ensure you have internet access from your cluster (not a given) + +.. code-block:: bash + tune download meta-llama/Llama- + +Now that we have a downloaded model, we can launch training. + +.. code-block:: bash + tune cp full_finetune_multinode . + +And let's open it up to see what's inside: + + +SHOW THE file + +we just need to point to our checkpoint and output dir and lets train! This uses the full_finetune_distributed file under the hood + +> You may need to set your interface which you can find with ipconfig + +Once we've trained, we can follow the instructions [here] in order to upload our beautiful new model to the Hugging Face Hub. + +Future development +------------------ + +2D parallelism + +Longer context (ring attention, etc) + +What else do you want? + +BLAH BLHAH BALSHD 很好 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.. code-block:: text + + [INST] <> + You are a helpful, respectful, and honest assistant. + <> + + Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant + +Llama3 Instruct `overhauled `_ +the template from Llama2 to better support multiturn conversations. The same text +in the Llama3 Instruct format would look like this: + +.. code-block:: text + + <|begin_of_text|><|start_header_id|>system<|end_header_id|> + + You are a helpful, respectful, and honest assistant.<|eot_id|><|start_header_id|>user<|end_header_id|> + + Hi! I am a human.<|eot_id|><|start_header_id|>assistant<|end_header_id|> + + Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant<|eot_id|> + +The tags are entirely different, and they are actually encoded differently than in +Llama2. Let's walk through tokenizing an example with the Llama2 template and the +Llama3 template to understand how. + +.. note:: + The Llama3 Base model uses a `different prompt template + `_ than Llama3 Instruct + because it has not yet been instruct tuned and the extra special tokens are untrained. If you + are running inference on the Llama3 Base model without fine-tuning we recommend the base + template for optimal performance. Generally, for instruct and chat data, we recommend using + Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using + Llama3 Instruct. + +.. 
_prompt_template_vs_special_tokens: + +Tokenizing prompt templates & special tokens +-------------------------------------------- + +Let's say I have a sample of a single user-assistant turn accompanied with a system +prompt: + +.. code-block:: python + + sample = [ + { + "role": "system", + "content": "You are a helpful, respectful, and honest assistant.", + }, + { + "role": "user", + "content": "Who are the most influential hip-hop artists of all time?", + }, + { + "role": "assistant", + "content": "Here is a list of some of the most influential hip-hop " + "artists of all time: 2Pac, Rakim, N.W.A., Run-D.M.C., and Nas.", + }, + ] + +Now, let's format this with the :class:`~torchtune.models.llama2.Llama2ChatTemplate` class and +see how it gets tokenized. The Llama2ChatTemplate is an example of a **prompt template**, +which simply structures a prompt with flavor text to indicate a certain task. + +.. code-block:: python + + from torchtune.data import Llama2ChatTemplate, Message + + messages = [Message.from_dict(msg) for msg in sample] + formatted_messages = Llama2ChatTemplate.format(messages) + print(formatted_messages) + # [ + # Message( + # role='user', + # content='[INST] <>\nYou are a helpful, respectful, and honest assistant.\n<>\n\nWho are the most influential hip-hop artists of all time? [/INST] ', + # ..., + # ), + # Message( + # role='assistant', + # content='Here is a list of some of the most influential hip-hop artists of all time: 2Pac, Rakim, N.W.A., Run-D.M.C., and Nas.', + # ..., + # ), + # ] + +There are also special tokens used by Llama2, which are not in the prompt template. +If you look at our :class:`~torchtune.models.llama2.Llama2ChatTemplate` class, you'll notice that +we don't include the :code:`` and :code:`` tokens. These are the beginning-of-sequence +(BOS) and end-of-sequence (EOS) tokens that are represented differently in the tokenizer +than the rest of the prompt template. Let's tokenize this example with the +:func:`~torchtune.models.llama2.llama2_tokenizer` used by Llama2 to see +why. + +.. code-block:: python + + from torchtune.models.llama2 import llama2_tokenizer + + tokenizer = llama2_tokenizer("/tmp/Llama-2-7b-hf/tokenizer.model") + user_message = formatted_messages[0].text_content + tokens = tokenizer.encode(user_message, add_bos=True, add_eos=True) + print(tokens) + # [1, 518, 25580, 29962, 3532, 14816, 29903, 6778, ..., 2] + +We've added the BOS and EOS tokens when encoding our example text. This shows up +as IDs 1 and 2. We can verify that these are our BOS and EOS tokens. + +.. code-block:: python + + print(tokenizer._spm_model.spm_model.piece_to_id("")) + # 1 + print(tokenizer._spm_model.spm_model.piece_to_id("")) + # 2 + +The BOS and EOS tokens are what we call special tokens, because they have their own +reserved token IDs. This means that they will index to their own individual vectors in +the model's learnt embedding table. The rest of the prompt template tags, :code:`[INST]` +and :code:`<>` are tokenized as normal text and not their own IDs. + +.. code-block:: python + + print(tokenizer.decode(518)) + # '[' + print(tokenizer.decode(25580)) + # 'INST' + print(tokenizer.decode(29962)) + # ']' + print(tokenizer.decode([3532, 14816, 29903, 6778])) + # '<>' + +It's important to note that you should not place the special reserved tokens in your +input prompts manually, as it will be treated as normal text and not as a special +token. + +.. 
code-block:: python + + print(tokenizer.encode("", add_bos=False, add_eos=False)) + # [529, 29879, 29958] + +Now let's take a look at Llama3's formatting to see how it's tokenized differently +than Llama2. + +.. code-block:: python + + from torchtune.models.llama3 import llama3_tokenizer + + tokenizer = llama3_tokenizer("/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model") + messages = [Message.from_dict(msg) for msg in sample] + tokens, mask = tokenizer.tokenize_messages(messages) + print(tokenizer.decode(tokens)) + # '<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful, respectful, + # and honest assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho + # are the most influential hip-hop artists of all time?<|eot_id|><|start_header_id|> + # assistant<|end_header_id|>\n\nHere is a list of some of the most influential hip-hop + # artists of all time: 2Pac, Rakim, N.W.A., Run-D.M.C., and Nas.<|eot_id|>' + +.. note:: + We used the ``tokenize_messages`` API for Llama3, which is different than + encode. It simply manages adding all the special tokens in the correct + places after encoding the individual messages. + +We can see that the tokenizer handled all the formatting without us specifying a prompt +template. It turns out that all of the additional tags are special tokens, and we don't require +a separate prompt template. We can verify this by checking if the tags get encoded +as their own token IDs. + +.. code-block:: python + + print(tokenizer.special_tokens["<|begin_of_text|>"]) + # 128000 + print(tokenizer.special_tokens["<|eot_id|>"]) + # 128009 + +The best part is - all these special tokens are handled purely by the tokenizer. +That means you won't have to worry about messing up any required prompt templates! + + +When should I use a prompt template? +------------------------------------ + +Whether or not to use a prompt template is governed by what your desired inference +behavior is. You should use a prompt template if you are running inference on the +base model and it was pre-trained with a prompt template, or you want to prime a +fine-tuned model to expect a certain prompt structure on inference for a specific task. + +It is not strictly necessary to fine-tune with a prompt template, but generally +specific tasks will require specific templates. For example, the :class:`~torchtune.data.SummarizeTemplate` +provides a lightweight structure to prime your fine-tuned model for prompts asking to summarize text. +This would wrap around the user message, with the assistant message untouched. + +.. code-block:: python + + f"Summarize this dialogue:\n{dialogue}\n---\nSummary:\n" + +You can fine-tune Llama2 with this template even though the model was originally pre-trained +with the :class:`~torchtune.models.llama2.Llama2ChatTemplate`, as long as this is what the model +sees during inference. The model should be robust enough to adapt to a new template. + + +Fine-tuning on a custom chat dataset +------------------------------------ + +Let's test our understanding by trying to fine-tune the Llama3-8B instruct model with a custom +chat dataset. We'll walk through how to set up our data so that it can be tokenized +correctly and fed into our model. + +Let's say we have a local dataset saved as a JSON file that contains conversations +with an AI model. How can we get something like this into a format +Llama3 understands and tokenizes correctly? + +.. 
code-block:: python + + # data/my_data.json + [ + { + "dialogue": [ + { + "from": "human", + "value": "What is your name?" + }, + { + "from": "gpt", + "value": "I am an AI assistant, I don't have a name." + }, + { + "from": "human", + "value": "Pretend you have a name." + }, + { + "from": "gpt", + "value": "My name is Mark Zuckerberg." + } + ] + }, + ] + +Let's first take a look at the :ref:`dataset_builders` and see which fits our use case. Since we +have conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any +custom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset +builder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify +``conversation_column`` and ``conversation_style``. Our data follows the ``"sharegpt"`` format, so +we can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should +look like so: + +.. code-block:: python + + from torchtune.datasets import chat_dataset + from torchtune.models.llama3 import llama3_tokenizer + + tokenizer = llama3_tokenizer("/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model") + ds = chat_dataset( + tokenizer=tokenizer, + source="json", + data_files="data/my_data.json", + split="train", + conversation_column="dialogue", + conversation_style="sharegpt", + ) + +.. code-block:: yaml + + # In config + tokenizer: + _component_: torchtune.models.llama3.llama3_tokenizer + path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model + + dataset: + _component_: torchtune.datasets.chat_dataset + source: json + data_files: data/my_data.json + split: train + conversation_column: dialogue + conversation_style: sharegpt + +.. note:: + You can pass in any keyword argument for `load_dataset `_ into all our + Dataset classes and they will honor them. This is useful for common parameters + such as specifying the data split with :code:`split` or configuration with + :code:`name` + +If you needed to add a prompt template, you would simply pass it into the tokenizer. +Since we're fine-tuning Llama3, the tokenizer will handle all formatting for +us and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`, +use a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format +all messages according to their `recommendations `_. + +Now we're ready to start fine-tuning! We'll use the built-in LoRA single device recipe. +Use the :ref:`tune cp ` command to get a copy of the :code:`8B_lora_single_device.yaml` +config and update it with your dataset configuration. + +Launch the fine-tune! + +.. code-block:: bash + + $ tune run lora_finetune_single_device --config custom_8B_lora_single_device.yaml epochs=15 From deffecaa398ff67abd1e35768cf24c7f4daa8931 Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Tue, 28 Jan 2025 13:48:07 -0500 Subject: [PATCH 11/29] Updates --- docs/source/tutorials/multinode.rst | 48 +++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/docs/source/tutorials/multinode.rst b/docs/source/tutorials/multinode.rst index 380788faaf..512ff5537a 100644 --- a/docs/source/tutorials/multinode.rst +++ b/docs/source/tutorials/multinode.rst @@ -6,7 +6,7 @@ Multi-node finetuning Congratulations! After years of being "GPU poor", you've worked hard, saved your hard earned Bitcoin and now have access to a proper multi-node cluster. 
You're part of the so-called "GPU middle class". In many ways, -your worries of yesteryear are gone. Memory efficient training? Not anymore! But in so many other ways, your problems +your worries of yesteryear are gone: memory efficient training? Who cares! But in many other ways, your problems are just starting because multi-node is a whole new beast. Come with me as I take you through your new life, complete with a big backyard, new car, and of course - a nice rack of H100s. @@ -28,41 +28,63 @@ Advantages of multi-node training --------------------------------- It's likely that if you're reading this tutorial, you don't need a refresher on the advantages of having -MORE compute, but let's go over it again so you can appreciate how lucky you are. Let's consider how big -a 70B model is in memory. +MORE compute, but let's go over it again so you can appreciate how lucky you are. Let's consider a simplified calculation +on how much memory is required to train a 70B parameter model in bfloat16. -70 * 2 = 140 GB. So it definitely can't fit in a single GPU. Maybe a few GPUs? But then you have to consider the optimizer. -Bring down the optimizer, then you have to checkpoint or offload the activations. Now it fits, but training is slow and/or -just an approximation of "true" training. +.. code-block:: text + Weights: 140 GB + + Optim state (AdamW): 280 GB + + Activations (bsz=8,seq_len=2048): XX + ------------------------------------------ + 280 GB + +Right now the average GPU has 80GB of VRAM so definitely can't fit on a single GPU and even multiple GPUs won't be up to the task. +We have a ton of memory optimizations in torchtune that allow you to fit larger models in less resource. -Multi-node allows you to fit larger models in memory and utilize bigger batch sizes, potentially reducing the overall training time. +Why might you want to use multi-node then? +* Larger models (like Llama 405B, Deepseek, etc) +* Potentially faster training via larger batch sizes, no activation checkpointing +* Potentially more accurate training with full parameter updates and non-approximate optimizers, etc -> Aside on FSDP on multi-node. Need all gather, might not be faster, etc. +.. note:: + **Low inter-node bandwidth & FSDP** + We utilize to distribute models over multiple devices. In order to distribute training, FSDP runs an all-gather operation for each forward pass and an all-gather plus a scatter-reduce + operation for each backwards pass. These operations (usually) block training from continuing until completed and with a slow inter-node connection, training speed may be reduced. Training Llama3.3 70B on 2 nodes -------------------------------- +With that background out of the way, let's get training! We'll be utilizing a common cluster setup called SLURM and we assume you have a decent working knowledge for this tutorial. First, we need to install torchtune on your cluster. Although pretty much as straightforward as the normal install instructions, it's recommended that you install into a virtual environment that is accessible from nodes in your cluster - something like a shared filesystem. -*You should know best how to go about this as it is your cluster.* Next, using the same idea as above, we need to download the Llama3.3 70B model to the shared fs. (You'll need to make sure you have the correct -credentials as noted before.) You'll also need to ensure you have internet access from your cluster (not a given) +credentials as noted before.) .. 
code-block:: bash tune download meta-llama/Llama- -Now that we have a downloaded model, we can launch training. +Now that we have a downloaded model, we can launch training. Although you can *technically* launch the multinode bash script from the tune CLI, +it's recommended that you copy the file to your machine. .. code-block:: bash tune cp full_finetune_multinode . And let's open it up to see what's inside: +.. only:: builder_html or PyTorchdoc + + Copy the recipe directly into your own script or notebook to modify and edit for yourself. + +.. literalinclude:: ../../../recipes/full_finetune_multinode.slurm + :pyobject: recipe -SHOW THE file +What are the high level parts? +* Uses `full_finetune_distributed` to launch training +* Can specify number of nodes, tasks, CPUs available, etc +* Should consider several cluster-specific environment variables -we just need to point to our checkpoint and output dir and lets train! This uses the full_finetune_distributed file under the hood +We just need to point to our checkpoint and output dir and get training! > You may need to set your interface which you can find with ipconfig From f4417218b183f0ef7bc4263d2e47928e2917e1ac Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Tue, 28 Jan 2025 13:54:12 -0500 Subject: [PATCH 12/29] Update code block --- docs/source/tutorials/multinode.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/source/tutorials/multinode.rst b/docs/source/tutorials/multinode.rst index 512ff5537a..085a80a35a 100644 --- a/docs/source/tutorials/multinode.rst +++ b/docs/source/tutorials/multinode.rst @@ -32,9 +32,10 @@ MORE compute, but let's go over it again so you can appreciate how lucky you are on how much memory is required to train a 70B parameter model in bfloat16. .. code-block:: text - Weights: 140 GB - + Optim state (AdamW): 280 GB - + Activations (bsz=8,seq_len=2048): XX + + Weights 140 GB + + Optim state (AdamW) 280 GB + + Activations (bsz=8,seq_len=2048) XX ------------------------------------------ 280 GB From 9ba9e240508774903f1072cf1e163b8aa8854c33 Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Tue, 28 Jan 2025 13:57:24 -0500 Subject: [PATCH 13/29] asdf --- docs/source/tutorials/multinode.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/source/tutorials/multinode.rst b/docs/source/tutorials/multinode.rst index 085a80a35a..643beb0c4e 100644 --- a/docs/source/tutorials/multinode.rst +++ b/docs/source/tutorials/multinode.rst @@ -48,6 +48,7 @@ Why might you want to use multi-node then? * Potentially more accurate training with full parameter updates and non-approximate optimizers, etc .. note:: + **Low inter-node bandwidth & FSDP** We utilize to distribute models over multiple devices. In order to distribute training, FSDP runs an all-gather operation for each forward pass and an all-gather plus a scatter-reduce operation for each backwards pass. These operations (usually) block training from continuing until completed and with a slow inter-node connection, training speed may be reduced. @@ -63,12 +64,14 @@ Next, using the same idea as above, we need to download the Llama3.3 70B model t credentials as noted before.) .. code-block:: bash + tune download meta-llama/Llama- Now that we have a downloaded model, we can launch training. Although you can *technically* launch the multinode bash script from the tune CLI, it's recommended that you copy the file to your machine. .. code-block:: bash + tune cp full_finetune_multinode . 
And let's open it up to see what's inside: From b36325a5e2ea9e93ef0c51d5c1713d62b8394578 Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Tue, 28 Jan 2025 14:10:46 -0500 Subject: [PATCH 14/29] Fix linting errors --- docs/source/index.rst | 1 + docs/source/tutorials/multinode.rst | 351 +--------------------------- 2 files changed, 3 insertions(+), 349 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index d62ad77b63..621457c083 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -149,6 +149,7 @@ torchtune tutorials. tutorials/e2e_flow tutorials/llama_kd_tutorial tutorials/memory_optimizations + tutorials/multinode .. toctree:: :glob: diff --git a/docs/source/tutorials/multinode.rst b/docs/source/tutorials/multinode.rst index 643beb0c4e..06dc2f52cc 100644 --- a/docs/source/tutorials/multinode.rst +++ b/docs/source/tutorials/multinode.rst @@ -65,23 +65,18 @@ credentials as noted before.) .. code-block:: bash - tune download meta-llama/Llama- + $ tune download meta-llama/Llama-3.3-70B-Instruct --ignore-patterns "consolidated/*.pth" --output-dir SHARED_FS/Llama-3.3-70B-Instruct Now that we have a downloaded model, we can launch training. Although you can *technically* launch the multinode bash script from the tune CLI, it's recommended that you copy the file to your machine. .. code-block:: bash - tune cp full_finetune_multinode . + $ tune cp full_finetune_multinode . And let's open it up to see what's inside: -.. only:: builder_html or PyTorchdoc - - Copy the recipe directly into your own script or notebook to modify and edit for yourself. - .. literalinclude:: ../../../recipes/full_finetune_multinode.slurm - :pyobject: recipe What are the high level parts? * Uses `full_finetune_distributed` to launch training @@ -104,345 +99,3 @@ Longer context (ring attention, etc) What else do you want? BLAH BLHAH BALSHD 很好 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. code-block:: text - - [INST] <> - You are a helpful, respectful, and honest assistant. - <> - - Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant - -Llama3 Instruct `overhauled `_ -the template from Llama2 to better support multiturn conversations. The same text -in the Llama3 Instruct format would look like this: - -.. code-block:: text - - <|begin_of_text|><|start_header_id|>system<|end_header_id|> - - You are a helpful, respectful, and honest assistant.<|eot_id|><|start_header_id|>user<|end_header_id|> - - Hi! I am a human.<|eot_id|><|start_header_id|>assistant<|end_header_id|> - - Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant<|eot_id|> - -The tags are entirely different, and they are actually encoded differently than in -Llama2. Let's walk through tokenizing an example with the Llama2 template and the -Llama3 template to understand how. - -.. note:: - The Llama3 Base model uses a `different prompt template - `_ than Llama3 Instruct - because it has not yet been instruct tuned and the extra special tokens are untrained. If you - are running inference on the Llama3 Base model without fine-tuning we recommend the base - template for optimal performance. Generally, for instruct and chat data, we recommend using - Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using - Llama3 Instruct. - -.. 
_prompt_template_vs_special_tokens: - -Tokenizing prompt templates & special tokens --------------------------------------------- - -Let's say I have a sample of a single user-assistant turn accompanied with a system -prompt: - -.. code-block:: python - - sample = [ - { - "role": "system", - "content": "You are a helpful, respectful, and honest assistant.", - }, - { - "role": "user", - "content": "Who are the most influential hip-hop artists of all time?", - }, - { - "role": "assistant", - "content": "Here is a list of some of the most influential hip-hop " - "artists of all time: 2Pac, Rakim, N.W.A., Run-D.M.C., and Nas.", - }, - ] - -Now, let's format this with the :class:`~torchtune.models.llama2.Llama2ChatTemplate` class and -see how it gets tokenized. The Llama2ChatTemplate is an example of a **prompt template**, -which simply structures a prompt with flavor text to indicate a certain task. - -.. code-block:: python - - from torchtune.data import Llama2ChatTemplate, Message - - messages = [Message.from_dict(msg) for msg in sample] - formatted_messages = Llama2ChatTemplate.format(messages) - print(formatted_messages) - # [ - # Message( - # role='user', - # content='[INST] <>\nYou are a helpful, respectful, and honest assistant.\n<>\n\nWho are the most influential hip-hop artists of all time? [/INST] ', - # ..., - # ), - # Message( - # role='assistant', - # content='Here is a list of some of the most influential hip-hop artists of all time: 2Pac, Rakim, N.W.A., Run-D.M.C., and Nas.', - # ..., - # ), - # ] - -There are also special tokens used by Llama2, which are not in the prompt template. -If you look at our :class:`~torchtune.models.llama2.Llama2ChatTemplate` class, you'll notice that -we don't include the :code:`` and :code:`` tokens. These are the beginning-of-sequence -(BOS) and end-of-sequence (EOS) tokens that are represented differently in the tokenizer -than the rest of the prompt template. Let's tokenize this example with the -:func:`~torchtune.models.llama2.llama2_tokenizer` used by Llama2 to see -why. - -.. code-block:: python - - from torchtune.models.llama2 import llama2_tokenizer - - tokenizer = llama2_tokenizer("/tmp/Llama-2-7b-hf/tokenizer.model") - user_message = formatted_messages[0].text_content - tokens = tokenizer.encode(user_message, add_bos=True, add_eos=True) - print(tokens) - # [1, 518, 25580, 29962, 3532, 14816, 29903, 6778, ..., 2] - -We've added the BOS and EOS tokens when encoding our example text. This shows up -as IDs 1 and 2. We can verify that these are our BOS and EOS tokens. - -.. code-block:: python - - print(tokenizer._spm_model.spm_model.piece_to_id("")) - # 1 - print(tokenizer._spm_model.spm_model.piece_to_id("")) - # 2 - -The BOS and EOS tokens are what we call special tokens, because they have their own -reserved token IDs. This means that they will index to their own individual vectors in -the model's learnt embedding table. The rest of the prompt template tags, :code:`[INST]` -and :code:`<>` are tokenized as normal text and not their own IDs. - -.. code-block:: python - - print(tokenizer.decode(518)) - # '[' - print(tokenizer.decode(25580)) - # 'INST' - print(tokenizer.decode(29962)) - # ']' - print(tokenizer.decode([3532, 14816, 29903, 6778])) - # '<>' - -It's important to note that you should not place the special reserved tokens in your -input prompts manually, as it will be treated as normal text and not as a special -token. - -.. 
code-block:: python - - print(tokenizer.encode("", add_bos=False, add_eos=False)) - # [529, 29879, 29958] - -Now let's take a look at Llama3's formatting to see how it's tokenized differently -than Llama2. - -.. code-block:: python - - from torchtune.models.llama3 import llama3_tokenizer - - tokenizer = llama3_tokenizer("/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model") - messages = [Message.from_dict(msg) for msg in sample] - tokens, mask = tokenizer.tokenize_messages(messages) - print(tokenizer.decode(tokens)) - # '<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful, respectful, - # and honest assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho - # are the most influential hip-hop artists of all time?<|eot_id|><|start_header_id|> - # assistant<|end_header_id|>\n\nHere is a list of some of the most influential hip-hop - # artists of all time: 2Pac, Rakim, N.W.A., Run-D.M.C., and Nas.<|eot_id|>' - -.. note:: - We used the ``tokenize_messages`` API for Llama3, which is different than - encode. It simply manages adding all the special tokens in the correct - places after encoding the individual messages. - -We can see that the tokenizer handled all the formatting without us specifying a prompt -template. It turns out that all of the additional tags are special tokens, and we don't require -a separate prompt template. We can verify this by checking if the tags get encoded -as their own token IDs. - -.. code-block:: python - - print(tokenizer.special_tokens["<|begin_of_text|>"]) - # 128000 - print(tokenizer.special_tokens["<|eot_id|>"]) - # 128009 - -The best part is - all these special tokens are handled purely by the tokenizer. -That means you won't have to worry about messing up any required prompt templates! - - -When should I use a prompt template? ------------------------------------- - -Whether or not to use a prompt template is governed by what your desired inference -behavior is. You should use a prompt template if you are running inference on the -base model and it was pre-trained with a prompt template, or you want to prime a -fine-tuned model to expect a certain prompt structure on inference for a specific task. - -It is not strictly necessary to fine-tune with a prompt template, but generally -specific tasks will require specific templates. For example, the :class:`~torchtune.data.SummarizeTemplate` -provides a lightweight structure to prime your fine-tuned model for prompts asking to summarize text. -This would wrap around the user message, with the assistant message untouched. - -.. code-block:: python - - f"Summarize this dialogue:\n{dialogue}\n---\nSummary:\n" - -You can fine-tune Llama2 with this template even though the model was originally pre-trained -with the :class:`~torchtune.models.llama2.Llama2ChatTemplate`, as long as this is what the model -sees during inference. The model should be robust enough to adapt to a new template. - - -Fine-tuning on a custom chat dataset ------------------------------------- - -Let's test our understanding by trying to fine-tune the Llama3-8B instruct model with a custom -chat dataset. We'll walk through how to set up our data so that it can be tokenized -correctly and fed into our model. - -Let's say we have a local dataset saved as a JSON file that contains conversations -with an AI model. How can we get something like this into a format -Llama3 understands and tokenizes correctly? - -.. 
code-block:: python - - # data/my_data.json - [ - { - "dialogue": [ - { - "from": "human", - "value": "What is your name?" - }, - { - "from": "gpt", - "value": "I am an AI assistant, I don't have a name." - }, - { - "from": "human", - "value": "Pretend you have a name." - }, - { - "from": "gpt", - "value": "My name is Mark Zuckerberg." - } - ] - }, - ] - -Let's first take a look at the :ref:`dataset_builders` and see which fits our use case. Since we -have conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any -custom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset -builder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify -``conversation_column`` and ``conversation_style``. Our data follows the ``"sharegpt"`` format, so -we can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should -look like so: - -.. code-block:: python - - from torchtune.datasets import chat_dataset - from torchtune.models.llama3 import llama3_tokenizer - - tokenizer = llama3_tokenizer("/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model") - ds = chat_dataset( - tokenizer=tokenizer, - source="json", - data_files="data/my_data.json", - split="train", - conversation_column="dialogue", - conversation_style="sharegpt", - ) - -.. code-block:: yaml - - # In config - tokenizer: - _component_: torchtune.models.llama3.llama3_tokenizer - path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model - - dataset: - _component_: torchtune.datasets.chat_dataset - source: json - data_files: data/my_data.json - split: train - conversation_column: dialogue - conversation_style: sharegpt - -.. note:: - You can pass in any keyword argument for `load_dataset `_ into all our - Dataset classes and they will honor them. This is useful for common parameters - such as specifying the data split with :code:`split` or configuration with - :code:`name` - -If you needed to add a prompt template, you would simply pass it into the tokenizer. -Since we're fine-tuning Llama3, the tokenizer will handle all formatting for -us and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`, -use a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format -all messages according to their `recommendations `_. - -Now we're ready to start fine-tuning! We'll use the built-in LoRA single device recipe. -Use the :ref:`tune cp ` command to get a copy of the :code:`8B_lora_single_device.yaml` -config and update it with your dataset configuration. - -Launch the fine-tune! - -.. code-block:: bash - - $ tune run lora_finetune_single_device --config custom_8B_lora_single_device.yaml epochs=15 From fc9afbd308098cae1f4376719715ce05252c980d Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Tue, 28 Jan 2025 19:07:14 -0500 Subject: [PATCH 15/29] Updates --- docs/source/tutorials/multinode.rst | 93 ++++++++++--------- torchtune/_recipe_registry.py | 6 ++ torchtune/training/_distributed.py | 12 ++- .../training/checkpointing/_checkpointer.py | 4 +- 4 files changed, 65 insertions(+), 50 deletions(-) diff --git a/docs/source/tutorials/multinode.rst b/docs/source/tutorials/multinode.rst index 06dc2f52cc..2c0c58186c 100644 --- a/docs/source/tutorials/multinode.rst +++ b/docs/source/tutorials/multinode.rst @@ -4,19 +4,18 @@ Multi-node finetuning ===================== -Congratulations! 
After years of being "GPU poor", you've worked hard, saved your hard earned Bitcoin and -now have access to a proper multi-node cluster. You're part of the so-called "GPU middle class". In many ways, -your worries of yesteryear are gone: memory efficient training? Who cares! But in many other ways, your problems -are just starting because multi-node is a whole new beast. Come with me as I take you through your new life, complete with -a big backyard, new car, and of course - a nice rack of H100s. +Congratulations! After years of being "GPU poor", you've worked hard, saved your hard earned Bitcoin and graduated to the +so-called **"GPU middle class"**. In many ways, your worries of yesteryear are gone (memory efficient training, who??). +But, new problems are on the horizon for you because multi-node is a whole new beast. Come with me as I take you +through your new life, complete with a big backyard, new car, and of course - a nice rack of H100s. .. grid:: 2 .. grid-item-card:: :octicon:`mortar-board;1em;` You will learn: + * Why multi-node training is useful * How to set up the torchtune package on a SLURM cluster * How to fine-tune a Llama3.3 70B model w/ full parameter updates (not LoRA) - * What common errors to lookout for .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites @@ -27,75 +26,77 @@ a big backyard, new car, and of course - a nice rack of H100s. Advantages of multi-node training --------------------------------- -It's likely that if you're reading this tutorial, you don't need a refresher on the advantages of having -MORE compute, but let's go over it again so you can appreciate how lucky you are. Let's consider a simplified calculation -on how much memory is required to train a 70B parameter model in bfloat16. +More machines means more memory! This is cool for several reasons: -.. code-block:: text - - Weights 140 GB - + Optim state (AdamW) 280 GB - + Activations (bsz=8,seq_len=2048) XX - ------------------------------------------ - 280 GB - -Right now the average GPU has 80GB of VRAM so definitely can't fit on a single GPU and even multiple GPUs won't be up to the task. -We have a ton of memory optimizations in torchtune that allow you to fit larger models in less resource. - -Why might you want to use multi-node then? -* Larger models (like Llama 405B, Deepseek, etc) -* Potentially faster training via larger batch sizes, no activation checkpointing -* Potentially more accurate training with full parameter updates and non-approximate optimizers, etc +1. **Bigger models**: With more memory, you can train larger models such as `Llama3.1 405B `_, Deepseek-V3, and more. +2. **Longer data**: More many tasks like writing code, it's helpful to have long context lengths; however longer context length means more memory needed for activations. +3. **Higher quality**: With more memory, you can do full parameter updates (not LoRA) and use optimizers like AdamW (not low-precision optimizers),both of which can potentially improve the quality of your training. +4. **Faster training**: With the ability to fit more data in memory, you can use higher batch sizes *and* turn off memory optimizations like :ref:`activation checkpointing` thereby decreasing the time it takes for training to complete. .. note:: - **Low inter-node bandwidth & FSDP** - We utilize to distribute models over multiple devices. In order to distribute training, FSDP runs an all-gather operation for each forward pass and an all-gather plus a scatter-reduce - operation for each backwards pass. 
These operations (usually) block training from continuing until completed and with a slow inter-node connection, training speed may be reduced. + **Low inter-node bandwidth & FSDP** We utilize Fully Sharded Data Parallel to distribute models over multiple devices. In order to distribute training, FSDP runs an all-gather operation + for each forward pass and an all-gather plus a scatter-reduce operation for each backwards pass. These operations (usually) block training from continuing until completed and with a slow + inter-node connection, training speed may be reduced. Training Llama3.3 70B on 2 nodes -------------------------------- -With that background out of the way, let's get training! We'll be utilizing a common cluster setup called SLURM and we assume you have a decent working knowledge for this tutorial. -First, we need to install torchtune on your cluster. Although pretty much as straightforward as the normal install instructions, -it's recommended that you install into a virtual environment that is accessible from nodes in your cluster - something like a shared filesystem. +Let's get training! We'll be utilizing a common cluster setup called SLURM and assume you have a decent working knowledge of SLURM for this tutorial. +First, we need to install torchtune. Although pretty much as straightforward as the normal install instructions, +it's recommended that you install the package into a virtual environment that is accessible from all nodes in your cluster like a shared filesystem. -Next, using the same idea as above, we need to download the Llama3.3 70B model to the shared fs. (You'll need to make sure you have the correct -credentials as noted before.) +Next, we need to download the Llama3.3 70B model to the shared fs. (You'll need to make sure you have the correct credentials as noted before.) .. code-block:: bash $ tune download meta-llama/Llama-3.3-70B-Instruct --ignore-patterns "consolidated/*.pth" --output-dir SHARED_FS/Llama-3.3-70B-Instruct -Now that we have a downloaded model, we can launch training. Although you can *technically* launch the multinode bash script from the tune CLI, -it's recommended that you copy the file to your machine. +Now that we have a downloaded model, let's check out the bash script. .. code-block:: bash $ tune cp full_finetune_multinode . -And let's open it up to see what's inside: - .. literalinclude:: ../../../recipes/full_finetune_multinode.slurm -What are the high level parts? -* Uses `full_finetune_distributed` to launch training -* Can specify number of nodes, tasks, CPUs available, etc +**There's a lot of information in this script but here are the high-level parts:** + +* We utilize SLURM specific commands like number of nodes, tasks, CPUs available, etc. +* We are using `torchrun` and the `full_finetune_distributed` recipe to train just like on single node * Should consider several cluster-specific environment variables -We just need to point to our checkpoint and output dir and get training! +.. note:: + + We may need to explicitly set the network interface for distributed backends. You can read more about that [here] + but it's also helpful to know that you can find your network interface by running `ipconfig` from a specific node. + You'll see the output. + +Once we update the shared filesystem in the bash script, we can launch using sbatch. + +.. code-block:: bash + + sbatch full_finetune_multinode.slurm + +And the output of `squeue` should show our job running: + +.. 
code-block:: bash -> You may need to set your interface which you can find with ipconfig + $ squeue + JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) + 1 train torchtun slurm R 0:03 2 slurm-worker-[1-2] -Once we've trained, we can follow the instructions [here] in order to upload our beautiful new model to the Hugging Face Hub. +Once training has completed, we can follow the instructions [here] in order to upload our beautiful new model to the Hugging Face Hub! Future development ------------------ -2D parallelism +We've covered the basics of how to launch a fine-tuning job with SLURM on two nodes with FSDP. There's still more things we're cooking up, +including... -Longer context (ring attention, etc) +**2D parallelism**: Utilizing both FSDP *and* tensor parallelism will decrease memory requirements even further, allowing us to lean even harder +into the advantages listed . -What else do you want? +**Longer context (ring attention, etc)**: -BLAH BLHAH BALSHD 很好 +**Want other optimizations?** Feel free to let us know by opening up a Github Issue on our repo or dropping us a line in Discord! diff --git a/torchtune/_recipe_registry.py b/torchtune/_recipe_registry.py index 1c41519712..177eed45c4 100644 --- a/torchtune/_recipe_registry.py +++ b/torchtune/_recipe_registry.py @@ -561,6 +561,12 @@ class Recipe: ], supports_distributed=True, ), + Recipe( + name="full_finetune_multinode", + file_path="full_finetune_multinode.slurm", + configs=[], + supports_distributed=True, + ), ] diff --git a/torchtune/training/_distributed.py b/torchtune/training/_distributed.py index c2179a4e80..9acd7b8730 100644 --- a/torchtune/training/_distributed.py +++ b/torchtune/training/_distributed.py @@ -114,9 +114,17 @@ def get_distributed_backend(device_type: str, enable_cpu_offload: bool = False) Returns: str: Distributed backend for use in ``torch.distributed.init_process_group``. 
""" + # Copied from https://github.com/pytorch/pytorch/blame/4f949f282dc66c3e4c6b41322167641a60a8593a/torch/distributed/distributed_c10d.py#L267 + default_device_backend_map = { + "cuda": "nccl", + "cpu": "gloo", + "xpu": "xccl", + } + # TODO: Uncomment the following line once PyTorch 2.6 is released + # default_device_backend_map = dist.Backend.default_device_backend_map backend = "nccl" - if device_type in dist.Backend.default_device_backend_map.keys(): - backend = dist.default_device_backend_map.get(device_type) + if device_type in default_device_backend_map: + backend = default_device_backend_map[device_type] if enable_cpu_offload: backend = f"{device_type}:{backend},cpu:gloo" return backend diff --git a/torchtune/training/checkpointing/_checkpointer.py b/torchtune/training/checkpointing/_checkpointer.py index 8d28ab22e1..13a71a120e 100644 --- a/torchtune/training/checkpointing/_checkpointer.py +++ b/torchtune/training/checkpointing/_checkpointer.py @@ -43,7 +43,7 @@ SUFFIXES_TO_NOT_COPY, TORCH_INDEX_FNAME, ) -from torchtune.utils._logging import get_logger, log_rank_zero +from torchtune.utils import get_logger, log_rank_zero, get_world_size_and_rank logger = get_logger("DEBUG") @@ -1193,7 +1193,7 @@ def __init__( self._checkpoint_future = None self._checkpoint_dir_prefix = "dist_epoch" self._metadata_file = ".metadata" - _, self._rank = utils.get_world_size_and_rank() + _, self._rank = get_world_size_and_rank() self._process_group: Optional[dist.ProcessGroup] = process_group def _get_latest_intermediate_checkpoint(self) -> Optional[str]: From 373e0c056ce8beefb7d05b399b1825ba11a0ac1b Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Wed, 29 Jan 2025 08:23:32 -0500 Subject: [PATCH 16/29] Lint --- recipes/full_finetune_distributed.py | 4 +++- torchtune/training/_distributed.py | 12 ++++++------ torchtune/training/checkpointing/_checkpointer.py | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py index c8a6429033..7cc4c1342c 100644 --- a/recipes/full_finetune_distributed.py +++ b/recipes/full_finetune_distributed.py @@ -150,7 +150,9 @@ def __init__(self, cfg: DictConfig) -> None: self._checkpoint_client = CheckpointClient(cfg) self.fsdp_cpu_offload = cfg.get("fsdp_cpu_offload", False) self.distributed_backend = training.get_distributed_backend( - device_type, enable_cpu_offload=self.fsdp_cpu_offload + device_type, + offload_ops_to_cpu=self.fsdp_cpu_offload + or self._enable_async_checkpointing, ) # Optimizer in backward is not compatible with gradient accumulation or gradient clipping diff --git a/torchtune/training/_distributed.py b/torchtune/training/_distributed.py index 9acd7b8730..266c18ab42 100644 --- a/torchtune/training/_distributed.py +++ b/torchtune/training/_distributed.py @@ -95,26 +95,26 @@ def _broadcast_tensor(tensor: torch.Tensor, src: int = 0) -> torch.Tensor: return tensor -def get_distributed_backend(device_type: str, enable_cpu_offload: bool = False) -> str: +def get_distributed_backend(device_type: str, offload_ops_to_cpu: bool = False) -> str: """Gets the PyTorch Distributed backend based on device type. Args: device_type (str): Device type to get backend for. - enable_cpu_offload (bool): Flag to check if offload to CPU is enabled. If it is, we will add a GLOO - backend to handle CPU training. Default is False. + offload_ops_to_cpu (bool, optional): Flag to check if any operations should be offloaded to CPU. 
+ Examples of these kinds of operations are CPU offload for FSDP and asynchronous save for distributed + checkpointing. Defaults to False. Example: >>> get_distributed_backend("cuda") 'nccl' >>> get_distributed_backend("cpu") 'gloo' - >>> get_distributed_backend("cuda", enable_cpu_offload=True) + >>> get_distributed_backend("cuda", offload_ops_to_cpu=True) 'cuda:nccl,cpu:gloo' Returns: str: Distributed backend for use in ``torch.distributed.init_process_group``. """ - # Copied from https://github.com/pytorch/pytorch/blame/4f949f282dc66c3e4c6b41322167641a60a8593a/torch/distributed/distributed_c10d.py#L267 default_device_backend_map = { "cuda": "nccl", "cpu": "gloo", @@ -125,7 +125,7 @@ def get_distributed_backend(device_type: str, enable_cpu_offload: bool = False) backend = "nccl" if device_type in default_device_backend_map: backend = default_device_backend_map[device_type] - if enable_cpu_offload: + if offload_ops_to_cpu: backend = f"{device_type}:{backend},cpu:gloo" return backend diff --git a/torchtune/training/checkpointing/_checkpointer.py b/torchtune/training/checkpointing/_checkpointer.py index 13a71a120e..67eb5f5da8 100644 --- a/torchtune/training/checkpointing/_checkpointer.py +++ b/torchtune/training/checkpointing/_checkpointer.py @@ -43,7 +43,7 @@ SUFFIXES_TO_NOT_COPY, TORCH_INDEX_FNAME, ) -from torchtune.utils import get_logger, log_rank_zero, get_world_size_and_rank +from torchtune.utils import get_logger, get_world_size_and_rank, log_rank_zero logger = get_logger("DEBUG") From 4659938952a9805674203a9cd1b09c9e2a354471 Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Wed, 29 Jan 2025 08:24:07 -0500 Subject: [PATCH 17/29] Pass test --- tests/torchtune/training/test_distributed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/torchtune/training/test_distributed.py b/tests/torchtune/training/test_distributed.py index d4821348f7..ca47dcb828 100644 --- a/tests/torchtune/training/test_distributed.py +++ b/tests/torchtune/training/test_distributed.py @@ -91,7 +91,7 @@ def test_get_distributed_backend(self) -> None: assert training.get_distributed_backend("cuda") == "nccl" assert training.get_distributed_backend("cpu") == "gloo" assert ( - training.get_distributed_backend("cuda", enable_cpu_offload=True) + training.get_distributed_backend("cuda", offload_ops_to_cpu=True) == "cuda:nccl,cpu:gloo" ) From 693b8cb016596e9191c7417264b0a51b6e8d653c Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Wed, 29 Jan 2025 11:05:03 -0500 Subject: [PATCH 18/29] Updates to tutorial --- docs/source/tutorials/multinode.rst | 42 +++++++++++++++-------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/docs/source/tutorials/multinode.rst b/docs/source/tutorials/multinode.rst index 2c0c58186c..c162b203fa 100644 --- a/docs/source/tutorials/multinode.rst +++ b/docs/source/tutorials/multinode.rst @@ -22,37 +22,39 @@ through your new life, complete with a big backyard, new car, and of course - a * Be familiar with distributed training in torchtune * Already know basic SLURM commands +.. _advantages_multi_node_label: Advantages of multi-node training --------------------------------- More machines means more memory! This is cool for several reasons: -1. **Bigger models**: With more memory, you can train larger models such as `Llama3.1 405B `_, Deepseek-V3, and more. +1. **Bigger models**: With more memory, you can train larger models such as `Llama3.1 405B `_, `Deepseek-V3 `_, and more. 2. 
**Longer data**: More many tasks like writing code, it's helpful to have long context lengths; however longer context length means more memory needed for activations. -3. **Higher quality**: With more memory, you can do full parameter updates (not LoRA) and use optimizers like AdamW (not low-precision optimizers),both of which can potentially improve the quality of your training. +3. **Higher quality**: With more memory, you can do full parameter updates (not LoRA) and use optimizers like `AdamW `_ (not low-precision optimizers), both of which can potentially improve the quality of your training. 4. **Faster training**: With the ability to fit more data in memory, you can use higher batch sizes *and* turn off memory optimizations like :ref:`activation checkpointing` thereby decreasing the time it takes for training to complete. .. note:: - **Low inter-node bandwidth & FSDP** We utilize Fully Sharded Data Parallel to distribute models over multiple devices. In order to distribute training, FSDP runs an all-gather operation - for each forward pass and an all-gather plus a scatter-reduce operation for each backwards pass. These operations (usually) block training from continuing until completed and with a slow - inter-node connection, training speed may be reduced. + **Low inter-node bandwidth & FSDP** We utilize `Fully Sharded Data Parallel `_ to distribute models over multiple devices. In order to distribute training, FSDP runs an `all-gather `_ operation + for each forward pass and an all-gather plus a `scatter-reduce `_ operation for each backwards pass. These operations (usually) block training from continuing until completed and with a slow + inter-node connection, training speed may be reduced. For more on this, please refer to `this Github Issue `_. Training Llama3.3 70B on 2 nodes -------------------------------- -Let's get training! We'll be utilizing a common cluster setup called SLURM and assume you have a decent working knowledge of SLURM for this tutorial. -First, we need to install torchtune. Although pretty much as straightforward as the normal install instructions, +Let's get training! We'll be utilizing a common cluster workflow manager called `SLURM `_ and assume you have a decent working knowledge of SLURM for this tutorial. +First, we need to install torchtune. Although pretty much as straightforward as the :ref:`normal install instructions`, it's recommended that you install the package into a virtual environment that is accessible from all nodes in your cluster like a shared filesystem. -Next, we need to download the Llama3.3 70B model to the shared fs. (You'll need to make sure you have the correct credentials as noted before.) +Next, we need to download the `Llama3.3 70B `_ model to your shared filesystem. You'll need to make sure you have the correct credentials following the steps +outlined :ref:`here`. .. code-block:: bash $ tune download meta-llama/Llama-3.3-70B-Instruct --ignore-patterns "consolidated/*.pth" --output-dir SHARED_FS/Llama-3.3-70B-Instruct -Now that we have a downloaded model, let's check out the bash script. +Now that we have a downloaded model, let's check out the SLURM bash script. .. code-block:: bash @@ -63,22 +65,21 @@ Now that we have a downloaded model, let's check out the bash script. **There's a lot of information in this script but here are the high-level parts:** * We utilize SLURM specific commands like number of nodes, tasks, CPUs available, etc. 
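
  For reference, the SLURM-specific commands this bullet refers to are the ``#SBATCH`` directives at the top
  of the script. A rough sketch of what they can look like is below - the exact values are illustrative
  placeholders, not the ones shipped in torchtune's script:

  .. code-block:: bash

      #SBATCH --job-name=torchtune-multinode   # name shown in squeue
      #SBATCH --nodes=2                        # number of machines to reserve
      #SBATCH --ntasks-per-node=1              # one launcher process per node
      #SBATCH --cpus-per-task=96               # CPU cores available to that launcher
      #SBATCH --gpus-per-node=8                # GPUs on each node
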
-* We are using `torchrun` and the `full_finetune_distributed` recipe to train just like on single node -* Should consider several cluster-specific environment variables +* We are using `torchrun `_ and the `full_finetune_distributed `_ recipe to train just like on single node +* You should consider several cluster-specific environment variables to maximize GPU utilization .. note:: - We may need to explicitly set the network interface for distributed backends. You can read more about that [here] - but it's also helpful to know that you can find your network interface by running `ipconfig` from a specific node. - You'll see the output. + We may need to explicitly set the network interface for distributed backends. You can read more about `PyTorch distributed backends here `_ + but it's also helpful to know that you can find your network interface by running `ipconfig `_ from a specific node. -Once we update the shared filesystem in the bash script, we can launch using sbatch. +After we update the shared filesystem in the bash script, we can launch using `sbatch `_. .. code-block:: bash sbatch full_finetune_multinode.slurm -And the output of `squeue` should show our job running: +And the output of `squeue `_ should show our job running: .. code-block:: bash @@ -94,9 +95,10 @@ Future development We've covered the basics of how to launch a fine-tuning job with SLURM on two nodes with FSDP. There's still more things we're cooking up, including... -**2D parallelism**: Utilizing both FSDP *and* tensor parallelism will decrease memory requirements even further, allowing us to lean even harder -into the advantages listed . +**2D parallelism**: Utilizing both FSDP *and* tensor parallelism in what is commonly referred to as `2D parallelism `_ will decrease memory requirements even further, allowing us to lean even harder +into the advantages listed :ref:`above`. -**Longer context (ring attention, etc)**: +**Longer context (ring attention, etc)**: More memory and more machines means we can train on longer sequences and tag advantage of neat tricks like ring attention, where tokens are split across +GPUs. You can read more about our plans for torchtune in `this Github RFC `_. -**Want other optimizations?** Feel free to let us know by opening up a Github Issue on our repo or dropping us a line in Discord! +**Want other optimizations?** Feel free to let us know by `opening up a Github Issue `_ on our repo or `dropping us a line in Discord `_! From 3d8d73d192f283aacbe875a2dc2dc1f0e567722f Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Wed, 29 Jan 2025 11:19:40 -0500 Subject: [PATCH 19/29] Remove full_finetune_multinode from recipes registry --- docs/source/tutorials/multinode.rst | 6 +----- torchtune/_recipe_registry.py | 7 +------ 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/docs/source/tutorials/multinode.rst b/docs/source/tutorials/multinode.rst index c162b203fa..e4e176e65d 100644 --- a/docs/source/tutorials/multinode.rst +++ b/docs/source/tutorials/multinode.rst @@ -54,11 +54,7 @@ outlined :ref:`here`. $ tune download meta-llama/Llama-3.3-70B-Instruct --ignore-patterns "consolidated/*.pth" --output-dir SHARED_FS/Llama-3.3-70B-Instruct -Now that we have a downloaded model, let's check out the SLURM bash script. - -.. code-block:: bash - - $ tune cp full_finetune_multinode . +Now that we have a downloaded model, let's check out our example SLURM bash script. .. 
literalinclude:: ../../../recipes/full_finetune_multinode.slurm diff --git a/torchtune/_recipe_registry.py b/torchtune/_recipe_registry.py index 177eed45c4..b5692e9d7c 100644 --- a/torchtune/_recipe_registry.py +++ b/torchtune/_recipe_registry.py @@ -107,6 +107,7 @@ class Recipe: Config(name="llama3/70B_full", file_path="llama3/70B_full.yaml"), Config(name="llama3_1/70B_full", file_path="llama3_1/70B_full.yaml"), Config(name="llama3_3/70B_full", file_path="llama3_3/70B_full.yaml"), + Config(name="llama3_3/70B_full_multinode", file_path="llama3_3/70B_full_multinode.yaml"), Config(name="mistral/7B_full", file_path="mistral/7B_full.yaml"), Config(name="gemma/2B_full", file_path="gemma/2B_full.yaml"), Config(name="gemma/7B_full", file_path="gemma/7B_full.yaml"), @@ -561,12 +562,6 @@ class Recipe: ], supports_distributed=True, ), - Recipe( - name="full_finetune_multinode", - file_path="full_finetune_multinode.slurm", - configs=[], - supports_distributed=True, - ), ] From c0345a5f2b0761a0cf24b9bfbd0a3ed1be28ab6e Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Wed, 29 Jan 2025 11:21:30 -0500 Subject: [PATCH 20/29] Lint --- torchtune/_recipe_registry.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/torchtune/_recipe_registry.py b/torchtune/_recipe_registry.py index b5692e9d7c..5b5cb60d72 100644 --- a/torchtune/_recipe_registry.py +++ b/torchtune/_recipe_registry.py @@ -107,7 +107,10 @@ class Recipe: Config(name="llama3/70B_full", file_path="llama3/70B_full.yaml"), Config(name="llama3_1/70B_full", file_path="llama3_1/70B_full.yaml"), Config(name="llama3_3/70B_full", file_path="llama3_3/70B_full.yaml"), - Config(name="llama3_3/70B_full_multinode", file_path="llama3_3/70B_full_multinode.yaml"), + Config( + name="llama3_3/70B_full_multinode", + file_path="llama3_3/70B_full_multinode.yaml", + ), Config(name="mistral/7B_full", file_path="mistral/7B_full.yaml"), Config(name="gemma/2B_full", file_path="gemma/2B_full.yaml"), Config(name="gemma/7B_full", file_path="gemma/7B_full.yaml"), From a3aaeb46dae5ca06910e422e0eef9c2f7641ab91 Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Wed, 29 Jan 2025 11:26:47 -0500 Subject: [PATCH 21/29] Last link --- docs/source/tutorials/e2e_flow.rst | 2 ++ docs/source/tutorials/multinode.rst | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/source/tutorials/e2e_flow.rst b/docs/source/tutorials/e2e_flow.rst index 9f39de4b6b..8e3a098d3e 100644 --- a/docs/source/tutorials/e2e_flow.rst +++ b/docs/source/tutorials/e2e_flow.rst @@ -359,6 +359,8 @@ For Llama models, you can run generation directly in torchao on the quantized mo discussed in `this readme `_. This way you can compare your own results to those in the previously-linked table. +.. _use_model_in_wild: + Use your model in the wild -------------------------- diff --git a/docs/source/tutorials/multinode.rst b/docs/source/tutorials/multinode.rst index e4e176e65d..5ddd73cdca 100644 --- a/docs/source/tutorials/multinode.rst +++ b/docs/source/tutorials/multinode.rst @@ -83,7 +83,7 @@ And the output of `squeue `_ should show JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) 1 train torchtun slurm R 0:03 2 slurm-worker-[1-2] -Once training has completed, we can follow the instructions [here] in order to upload our beautiful new model to the Hugging Face Hub! +Once training has completed, we can follow the :ref:`instructions here` in order to upload our beautiful new model to the Hugging Face Hub! 
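
If it helps to see what that step looks like, here is a minimal sketch of pushing the final checkpoint
directory to the Hub with the ``huggingface_hub`` CLI. The repository name and local path below are
placeholders - the real path depends on the ``output_dir`` and checkpointer settings in your config:

.. code-block:: bash

    # one-time login with a write-enabled token
    huggingface-cli login

    # upload the directory containing the final checkpoint files
    huggingface-cli upload my-username/llama3_3-70B-finetuned SHARED_FS/Llama-3.3-70B-Instruct/finetuned
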
Future development ------------------ From b56b6bee9f97ddfa47e6b815ce77c554dc18d573 Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Thu, 30 Jan 2025 20:43:26 -0500 Subject: [PATCH 22/29] Evan updates --- docs/source/tutorials/multinode.rst | 13 ++++++------- recipes/configs/llama3_3/70B_full_multinode.yaml | 2 -- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/docs/source/tutorials/multinode.rst b/docs/source/tutorials/multinode.rst index 5ddd73cdca..e200d4c82d 100644 --- a/docs/source/tutorials/multinode.rst +++ b/docs/source/tutorials/multinode.rst @@ -6,8 +6,7 @@ Multi-node finetuning Congratulations! After years of being "GPU poor", you've worked hard, saved your hard earned Bitcoin and graduated to the so-called **"GPU middle class"**. In many ways, your worries of yesteryear are gone (memory efficient training, who??). -But, new problems are on the horizon for you because multi-node is a whole new beast. Come with me as I take you -through your new life, complete with a big backyard, new car, and of course - a nice rack of H100s. +But new problems are on the horizon for you because multi-node can be a whole new beast. .. grid:: 2 @@ -30,14 +29,14 @@ Advantages of multi-node training More machines means more memory! This is cool for several reasons: 1. **Bigger models**: With more memory, you can train larger models such as `Llama3.1 405B `_, `Deepseek-V3 `_, and more. -2. **Longer data**: More many tasks like writing code, it's helpful to have long context lengths; however longer context length means more memory needed for activations. +2. **Longer data**: For many fine-tuning tasks like writing code, it's helpful to have long context lengths; however longer context length means more memory needed for activations. 3. **Higher quality**: With more memory, you can do full parameter updates (not LoRA) and use optimizers like `AdamW `_ (not low-precision optimizers), both of which can potentially improve the quality of your training. 4. **Faster training**: With the ability to fit more data in memory, you can use higher batch sizes *and* turn off memory optimizations like :ref:`activation checkpointing` thereby decreasing the time it takes for training to complete. .. note:: - **Low inter-node bandwidth & FSDP** We utilize `Fully Sharded Data Parallel `_ to distribute models over multiple devices. In order to distribute training, FSDP runs an `all-gather `_ operation - for each forward pass and an all-gather plus a `scatter-reduce `_ operation for each backwards pass. These operations (usually) block training from continuing until completed and with a slow + **Low inter-node bandwidth & FSDP** We utilize PyTorch's **Fully Sharded Data Parallel** to distribute models over multiple devices. In order to distribute training, FSDP runs an `all-gather `_ operation + for each forward pass and an all-gather (usually) plus a `reduce-scatter `_ operation for each backwards pass. These operations (usually) block training from continuing until completed and with a slow inter-node connection, training speed may be reduced. For more on this, please refer to `this Github Issue `_. Training Llama3.3 70B on 2 nodes @@ -62,7 +61,7 @@ Now that we have a downloaded model, let's check out our example SLURM bash scri * We utilize SLURM specific commands like number of nodes, tasks, CPUs available, etc. 
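
  To make this first bullet a bit more concrete: the script asks the scheduler which nodes it was given and
  elects the first one as the rendezvous host for ``torchrun``. The sketch below is loosely adapted from the
  head-node logic in ``recipes/full_finetune_multinode.slurm`` (quoted later in this series); the port and the
  training entrypoint are placeholders:

  .. code-block:: bash

      # list the hostnames in this job's allocation and pick the first as head node
      nodes=( $( scontrol show hostnames "$SLURM_JOB_NODELIST" ) )
      head_node=${nodes[0]}

      # every node launches torchrun, all pointing at the same rendezvous endpoint
      srun torchrun --nnodes=2 --nproc_per_node=8 \
          --rdzv_backend=c10d --rdzv_endpoint="${head_node}:29500" \
          YOUR_RECIPE_SCRIPT.py --config YOUR_CONFIG.yaml
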
* We are using `torchrun `_ and the `full_finetune_distributed `_ recipe to train just like on single node -* You should consider several cluster-specific environment variables to maximize GPU utilization +* You can consider several cluster-specific environment variables (``NCCL_BUFFSIZE``, ``NCCL_DEBUG``, ``FI_PROVIDER``, etc.) in order to maximize GPU utilization, debug, and more. .. note:: @@ -83,7 +82,7 @@ And the output of `squeue `_ should show JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) 1 train torchtun slurm R 0:03 2 slurm-worker-[1-2] -Once training has completed, we can follow the :ref:`instructions here` in order to upload our beautiful new model to the Hugging Face Hub! +Once training has completed, which should take roughly seven minutes in total with the default config, we can follow the :ref:`instructions here` in order to upload our beautiful new model to the Hugging Face Hub! Future development ------------------ diff --git a/recipes/configs/llama3_3/70B_full_multinode.yaml b/recipes/configs/llama3_3/70B_full_multinode.yaml index 4572792661..d7a09422d1 100644 --- a/recipes/configs/llama3_3/70B_full_multinode.yaml +++ b/recipes/configs/llama3_3/70B_full_multinode.yaml @@ -46,8 +46,6 @@ epochs: 1 optimizer: _component_: torch.optim.AdamW lr: 2e-5 - # Note: highly recommended to use fused=True optimizer flag - # with CPU offload for faster optimizer step. fused: True loss: From 63eb2746764a79f5be83bfb927593c38dadf5626 Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Fri, 31 Jan 2025 10:29:45 -0500 Subject: [PATCH 23/29] Update comment --- recipes/full_finetune_multinode.slurm | 1 + 1 file changed, 1 insertion(+) diff --git a/recipes/full_finetune_multinode.slurm b/recipes/full_finetune_multinode.slurm index 6e83ba4f62..aa5c388fc8 100644 --- a/recipes/full_finetune_multinode.slurm +++ b/recipes/full_finetune_multinode.slurm @@ -15,6 +15,7 @@ # ---------- Set env variables ---------- # # Grab the IP for head node: +# You may need to set this to the fully qualified domain name of your head node nodes=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) ) nodes_array=($nodes) head_node=${nodes_array[0]} From 4d027b07494954c384280ab592241df44915fef9 Mon Sep 17 00:00:00 2001 From: joecummings Date: Fri, 31 Jan 2025 13:16:59 -0800 Subject: [PATCH 24/29] Move process initialization --- recipes/full_finetune_distributed.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py index 7cc4c1342c..3aa8c36fab 100644 --- a/recipes/full_finetune_distributed.py +++ b/recipes/full_finetune_distributed.py @@ -138,9 +138,6 @@ def __init__(self, cfg: DictConfig) -> None: ) self._log_peak_memory_stats = False - _, rank = utils.get_world_size_and_rank() - self._is_rank_zero = rank == 0 - # Training cfg self._resume_from_checkpoint = cfg.resume_from_checkpoint self._enable_async_checkpointing = cfg.get("enable_async_checkpointing", False) @@ -149,11 +146,16 @@ def __init__(self, cfg: DictConfig) -> None: self._clip_grad_norm = cfg.get("clip_grad_norm", None) self._checkpoint_client = CheckpointClient(cfg) self.fsdp_cpu_offload = cfg.get("fsdp_cpu_offload", False) + + # Set up the backend for distributed training (NCCL, GLOO, etc.) 
self.distributed_backend = training.get_distributed_backend( device_type, offload_ops_to_cpu=self.fsdp_cpu_offload or self._enable_async_checkpointing, ) + init_process_group(self.distributed_backend) + _, rank = utils.get_world_size_and_rank() + self._is_rank_zero = rank == 0 # Optimizer in backward is not compatible with gradient accumulation or gradient clipping if self._optimizer_in_bwd: @@ -247,9 +249,6 @@ def setup(self, cfg: DictConfig) -> None: Setup the recipe. This includes training state (if resume_from_checkpoint is True), model, tokenizer, loss, optimizer, lr scheduler, sampler, and dataloader. """ - # Set up the backend for distributed training (NCCL, GLOO, etc.) - init_process_group(self.distributed_backend) - if self.fsdp_cpu_offload: # Utilize all available CPU cores for intra-op parallelism. This provides ~2x # speed up when benchmarking fused AdamW on CPU From 34aa18b8f9fe84971d9129eb8dc95e9db0113b19 Mon Sep 17 00:00:00 2001 From: joecummings Date: Sat, 1 Feb 2025 08:25:25 -0800 Subject: [PATCH 25/29] Move init process group to above checkpoint instantiation --- docs/source/tutorials/multinode.rst | 2 +- recipes/full_finetune_distributed.py | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/source/tutorials/multinode.rst b/docs/source/tutorials/multinode.rst index e200d4c82d..6d393e3ab4 100644 --- a/docs/source/tutorials/multinode.rst +++ b/docs/source/tutorials/multinode.rst @@ -82,7 +82,7 @@ And the output of `squeue `_ should show JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) 1 train torchtun slurm R 0:03 2 slurm-worker-[1-2] -Once training has completed, which should take roughly seven minutes in total with the default config, we can follow the :ref:`instructions here` in order to upload our beautiful new model to the Hugging Face Hub! +Once training has completed, which should take roughly seven minutes in total (880 tok/s) with the default config, we can follow the :ref:`instructions here` in order to upload our beautiful new model to the Hugging Face Hub! Future development ------------------ diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py index 3aa8c36fab..8acf1f2e45 100644 --- a/recipes/full_finetune_distributed.py +++ b/recipes/full_finetune_distributed.py @@ -127,6 +127,16 @@ def __init__(self, cfg: DictConfig) -> None: "full fp16 training is not supported with this recipe. Please use bf16 or fp32 instead." ) + # Set up the backend for distributed training (NCCL, GLOO, etc.) + self.distributed_backend = training.get_distributed_backend( + device_type, + offload_ops_to_cpu=self.fsdp_cpu_offload + or self._enable_async_checkpointing, + ) + init_process_group(self.distributed_backend) + _, rank = utils.get_world_size_and_rank() + self._is_rank_zero = rank == 0 + # Logging attributes self._output_dir = cfg.output_dir self._log_every_n_steps = cfg.get("log_every_n_steps", 1) @@ -147,16 +157,6 @@ def __init__(self, cfg: DictConfig) -> None: self._checkpoint_client = CheckpointClient(cfg) self.fsdp_cpu_offload = cfg.get("fsdp_cpu_offload", False) - # Set up the backend for distributed training (NCCL, GLOO, etc.) 
- self.distributed_backend = training.get_distributed_backend( - device_type, - offload_ops_to_cpu=self.fsdp_cpu_offload - or self._enable_async_checkpointing, - ) - init_process_group(self.distributed_backend) - _, rank = utils.get_world_size_and_rank() - self._is_rank_zero = rank == 0 - # Optimizer in backward is not compatible with gradient accumulation or gradient clipping if self._optimizer_in_bwd: if self._clip_grad_norm is not None: From 30b73665dabe319ae15f5320d4308e7332d64082 Mon Sep 17 00:00:00 2001 From: joecummings Date: Sat, 1 Feb 2025 08:32:47 -0800 Subject: [PATCH 26/29] Update intro --- docs/source/tutorials/multinode.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/source/tutorials/multinode.rst b/docs/source/tutorials/multinode.rst index 6d393e3ab4..bada6c6708 100644 --- a/docs/source/tutorials/multinode.rst +++ b/docs/source/tutorials/multinode.rst @@ -4,9 +4,8 @@ Multi-node finetuning ===================== -Congratulations! After years of being "GPU poor", you've worked hard, saved your hard earned Bitcoin and graduated to the -so-called **"GPU middle class"**. In many ways, your worries of yesteryear are gone (memory efficient training, who??). -But new problems are on the horizon for you because multi-node can be a whole new beast. +Congratulations! After years of being `"GPU poor"`_, you've cobbled together more than a single node of GPUs and therefore graduated to the so-called **"GPU middle class"**. +In many ways, your worries of yesteryear are gone (memory efficient training, who??). But new problems are on the horizon because multi-node can be a whole new beast. .. grid:: 2 From c7fdc219270dcf4d7095b21db7c49aca976f4550 Mon Sep 17 00:00:00 2001 From: joecummings Date: Sat, 1 Feb 2025 08:42:25 -0800 Subject: [PATCH 27/29] Docs r dumb --- docs/source/tutorials/multinode.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/tutorials/multinode.rst b/docs/source/tutorials/multinode.rst index bada6c6708..242ce9293a 100644 --- a/docs/source/tutorials/multinode.rst +++ b/docs/source/tutorials/multinode.rst @@ -4,7 +4,7 @@ Multi-node finetuning ===================== -Congratulations! After years of being `"GPU poor"`_, you've cobbled together more than a single node of GPUs and therefore graduated to the so-called **"GPU middle class"**. +Congratulations! After years of being `"GPU poor" `_, you've cobbled together more than a single node of GPUs and therefore graduated to the so-called **"GPU middle class"**. In many ways, your worries of yesteryear are gone (memory efficient training, who??). But new problems are on the horizon because multi-node can be a whole new beast. .. grid:: 2 From 900d6431610efee562c2af4f0b2510da72823fc1 Mon Sep 17 00:00:00 2001 From: joecummings Date: Sat, 1 Feb 2025 10:02:32 -0800 Subject: [PATCH 28/29] Wow --- recipes/full_finetune_distributed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py index 8acf1f2e45..db4d1b59cc 100644 --- a/recipes/full_finetune_distributed.py +++ b/recipes/full_finetune_distributed.py @@ -128,6 +128,8 @@ def __init__(self, cfg: DictConfig) -> None: ) # Set up the backend for distributed training (NCCL, GLOO, etc.) 
+ self._enable_async_checkpointing = cfg.get("enable_async_checkpointing", False) + self.fsdp_cpu_offload = cfg.get("fsdp_cpu_offload", False) self.distributed_backend = training.get_distributed_backend( device_type, offload_ops_to_cpu=self.fsdp_cpu_offload @@ -150,12 +152,10 @@ def __init__(self, cfg: DictConfig) -> None: # Training cfg self._resume_from_checkpoint = cfg.resume_from_checkpoint - self._enable_async_checkpointing = cfg.get("enable_async_checkpointing", False) self._gradient_accumulation_steps = cfg.gradient_accumulation_steps self._optimizer_in_bwd = cfg.get("optimizer_in_bwd", False) self._clip_grad_norm = cfg.get("clip_grad_norm", None) self._checkpoint_client = CheckpointClient(cfg) - self.fsdp_cpu_offload = cfg.get("fsdp_cpu_offload", False) # Optimizer in backward is not compatible with gradient accumulation or gradient clipping if self._optimizer_in_bwd: From 9e230caefe23bab270937f139211b12dec104d30 Mon Sep 17 00:00:00 2001 From: joecummings Date: Mon, 3 Feb 2025 08:47:51 -0800 Subject: [PATCH 29/29] Rework intro --- docs/source/index.rst | 7 +++++++ docs/source/tutorials/multinode.rst | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 621457c083..13e1bbda56 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -85,6 +85,13 @@ torchtune tutorials. :link: tutorials/llama_kd_tutorial.html :tags: finetuning,llama3,kd +.. customcarditem:: + :header: Multi-node training w/ Llama3.3 70B + :card_description: Fine-tuning a large model on 2+ nodes + :image: _static/img/generic-pytorch-logo.png + :link: tutorials/multinode.html + :tags: multinode,llama3,slurm + .. customcardend:: diff --git a/docs/source/tutorials/multinode.rst b/docs/source/tutorials/multinode.rst index 242ce9293a..b4cc98d771 100644 --- a/docs/source/tutorials/multinode.rst +++ b/docs/source/tutorials/multinode.rst @@ -4,8 +4,8 @@ Multi-node finetuning ===================== -Congratulations! After years of being `"GPU poor" `_, you've cobbled together more than a single node of GPUs and therefore graduated to the so-called **"GPU middle class"**. -In many ways, your worries of yesteryear are gone (memory efficient training, who??). But new problems are on the horizon because multi-node can be a whole new beast. +Congratulations! You've finally escaped the struggles of being "GPU poor" and now have access to a multi-node setup. +You can bid farewell to the days of sweating over memory-efficient optimizations, but get ready for new challenges as you navigate the complexities of distributed computing. .. grid:: 2