From bbd81fd49b75373008638e9c9ed0109166bac427 Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Mon, 27 Jan 2025 13:38:54 -0500 Subject: [PATCH 01/29] Remove last references to from training --- recipes/lora_finetune_distributed_multi_dataset.py | 2 +- torchtune/training/checkpointing/_checkpoint_client.py | 2 +- torchtune/training/checkpointing/_checkpointer.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/recipes/lora_finetune_distributed_multi_dataset.py b/recipes/lora_finetune_distributed_multi_dataset.py index 7d0d442c6c..25367a4d7d 100644 --- a/recipes/lora_finetune_distributed_multi_dataset.py +++ b/recipes/lora_finetune_distributed_multi_dataset.py @@ -138,7 +138,7 @@ def __init__(self, cfg: DictConfig) -> None: "full fp16 training is not supported with this recipe. Please use bf16 or fp32 instead." ) - _, rank = training.get_world_size_and_rank() + _, rank = utils.get_world_size_and_rank() self._is_rank_zero = rank == 0 diff --git a/torchtune/training/checkpointing/_checkpoint_client.py b/torchtune/training/checkpointing/_checkpoint_client.py index a87d59c4d0..4b9e11d1c3 100644 --- a/torchtune/training/checkpointing/_checkpoint_client.py +++ b/torchtune/training/checkpointing/_checkpoint_client.py @@ -72,7 +72,7 @@ def __init__( self._optimizer_in_bwd = self._cfg.get("optimizer_in_bwd", False) self._device = utils.get_device(device=self._cfg.device) - _, self._rank = training.get_world_size_and_rank() + _, self._rank = utils.get_world_size_and_rank() self._is_rank_zero = self._rank == 0 def _get_checkpointer(self): diff --git a/torchtune/training/checkpointing/_checkpointer.py b/torchtune/training/checkpointing/_checkpointer.py index 7c8d2b0bed..8d28ab22e1 100644 --- a/torchtune/training/checkpointing/_checkpointer.py +++ b/torchtune/training/checkpointing/_checkpointer.py @@ -1193,7 +1193,7 @@ def __init__( self._checkpoint_future = None self._checkpoint_dir_prefix = "dist_epoch" self._metadata_file = ".metadata" - _, self._rank = training.get_world_size_and_rank() + _, self._rank = utils.get_world_size_and_rank() self._process_group: Optional[dist.ProcessGroup] = process_group def _get_latest_intermediate_checkpoint(self) -> Optional[str]: From c04ebaf753e72cf98f5f98b1322304f8a4af0980 Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Mon, 27 Jan 2025 14:23:56 -0500 Subject: [PATCH 02/29] Deprecate and use new function --- recipes/full_finetune_distributed.py | 30 ++++++++++---------- tests/torchtune/training/test_distributed.py | 10 ++----- torchtune/training/_distributed.py | 14 +++++++-- 3 files changed, 29 insertions(+), 25 deletions(-) diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py index 34ad48e938..9afd238711 100644 --- a/recipes/full_finetune_distributed.py +++ b/recipes/full_finetune_distributed.py @@ -118,7 +118,8 @@ class FullFinetuneRecipeDistributed(FTRecipeInterface): """ def __init__(self, cfg: DictConfig) -> None: - self._device = utils.get_device(device=cfg.device) + device_type = cfg.device + self._device = utils.get_device(device=device_type) self._dtype = training.get_dtype(cfg.dtype, device=self._device) if self._dtype == torch.float16: @@ -126,7 +127,7 @@ def __init__(self, cfg: DictConfig) -> None: "full fp16 training is not supported with this recipe. Please use bf16 or fp32 instead." 
) - # logging attributes + # Logging attributes self._output_dir = cfg.output_dir self._log_every_n_steps = cfg.get("log_every_n_steps", 1) self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False) @@ -147,6 +148,10 @@ def __init__(self, cfg: DictConfig) -> None: self._optimizer_in_bwd = cfg.get("optimizer_in_bwd", False) self._clip_grad_norm = cfg.get("clip_grad_norm", None) self._checkpoint_client = CheckpointClient(cfg) + self.fsdp_cpu_offload = cfg.get("fsdp_cpu_offload", False) + self.distributed_backend = get_distributed_backend( + device_type, enable_cpu_offload=self.fsdp_cpu_offload + ) # Optimizer in backward is not compatible with gradient accumulation or gradient clipping if self._optimizer_in_bwd: @@ -240,9 +245,16 @@ def setup(self, cfg: DictConfig) -> None: Setup the recipe. This includes training state (if resume_from_checkpoint is True), model, tokenizer, loss, optimizer, lr scheduler, sampler, and dataloader. """ + # Set up the backend for distributed training (NCCL, GLOO, etc.) + init_process_group(self.distributed_backend) + + if self.fsdp_cpu_offload: + # Utilize all available CPU cores for intra-op parallelism. This provides ~2x + # speed up when benchmarking fused AdamW on CPU + training.set_torch_num_threads() + if self._is_rank_zero: self._metric_logger = config.instantiate(cfg.metric_logger) - # log config with parameter override self._metric_logger.log_config(cfg) @@ -890,19 +902,7 @@ def recipe_main(cfg: DictConfig) -> None: - Parameters specified in config (see available configs through ``tune ls``) - Overwritten by arguments from the command-line """ - if not training.is_distributed(): - raise RuntimeError( - "Distributed finetune recipe should be run via a distributed launcher." - "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]" - ) - init_process_group("cuda:nccl,cpu:gloo") - if cfg.get("fsdp_cpu_offload", False): - # Utilize all available CPU cores for intra-op parallelism. 
This provides ~2x - # speed up when benchmarking fused AdamW on CPU - training.set_torch_num_threads() - config.log_config(recipe_name="FullFinetuneRecipeDistributed", cfg=cfg) - recipe = FullFinetuneRecipeDistributed(cfg=cfg) recipe.setup(cfg=cfg) recipe.train() diff --git a/tests/torchtune/training/test_distributed.py b/tests/torchtune/training/test_distributed.py index 3fe2dd340d..960339fc1b 100644 --- a/tests/torchtune/training/test_distributed.py +++ b/tests/torchtune/training/test_distributed.py @@ -14,7 +14,7 @@ import torch.nn as nn from packaging import version from tests.test_utils import gpu_test -from torch.distributed import launcher +from torch.distributed import init_process_group, launcher from torch.distributed._composable.fsdp import fully_shard from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( CheckpointWrapper, @@ -37,12 +37,6 @@ class TestDistributed: - def test_init_distributed(self) -> None: - """Integration test to confirm consistency across device initialization utilities.""" - distributed = training.init_distributed() - assert ( - not distributed - ), "Should return False as there are no distributed environment variables" @staticmethod def _test_worker_fn(init_pg_explicit: bool) -> None: @@ -52,7 +46,7 @@ def _test_worker_fn(init_pg_explicit: bool) -> None: if init_pg_explicit: torch.distributed.init_process_group(backend="gloo") if not torch.distributed.is_initialized(): - training.init_distributed(backend="gloo") + init_process_group(backend="gloo") if not torch.distributed.is_initialized(): raise AssertionError("Expected torch.distributed to be initialized") pg_backend = torch.distributed.get_backend() diff --git a/torchtune/training/_distributed.py b/torchtune/training/_distributed.py index ff959c5f23..b87ccdff20 100644 --- a/torchtune/training/_distributed.py +++ b/torchtune/training/_distributed.py @@ -41,8 +41,6 @@ _log: logging.Logger = get_logger() -_valid_distributed_single_node_nnodes = ["1:1", "1"] - torch_version = torch.__version__ _DISTRIBUTED_STATE_DICT_API_IS_AVAILABLE = ( "dev" not in torch_version and torch_version_ge("2.6.0") @@ -97,6 +95,18 @@ def _broadcast_tensor(tensor: torch.Tensor, src: int = 0) -> torch.Tensor: return tensor +def get_distributed_backend(device_type: str, enable_cpu_offload: bool = False): + backend = "nccl" + if device_type in dist.Backend.default_device_backend_map.keys(): + backend = dist.default_device_backend_map.get(device_type) + if enable_cpu_offload: + backend = f"{device_type}:{backend},cpu:gloo" + return backend + + +@deprecated( + msg="The functionality of `init_distributed` is covered by `torch.distributed.init_process_group`. " +) def init_distributed(**kwargs: Dict[str, Any]) -> bool: """Initialize process group required for ``torch.distributed``. 
From e02d39b534a2aa1307bb5961cdcff9b10b44ea87 Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Mon, 27 Jan 2025 14:32:02 -0500 Subject: [PATCH 03/29] Expose --- recipes/full_finetune_distributed.py | 6 +++--- torchtune/training/__init__.py | 2 ++ torchtune/training/_distributed.py | 20 +++++++++++++++++++- 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py index 9afd238711..93e5a208cb 100644 --- a/recipes/full_finetune_distributed.py +++ b/recipes/full_finetune_distributed.py @@ -132,7 +132,7 @@ def __init__(self, cfg: DictConfig) -> None: self._log_every_n_steps = cfg.get("log_every_n_steps", 1) self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False) - if self._log_peak_memory_stats and self._device.type != "cuda": + if self._log_peak_memory_stats and device_type != "cuda": log.info( "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False." ) @@ -149,7 +149,7 @@ def __init__(self, cfg: DictConfig) -> None: self._clip_grad_norm = cfg.get("clip_grad_norm", None) self._checkpoint_client = CheckpointClient(cfg) self.fsdp_cpu_offload = cfg.get("fsdp_cpu_offload", False) - self.distributed_backend = get_distributed_backend( + self.distributed_backend = training.get_distributed_backend( device_type, enable_cpu_offload=self.fsdp_cpu_offload ) @@ -174,7 +174,7 @@ def __init__(self, cfg: DictConfig) -> None: "enable_activation_offloading", False ) if self._enable_activation_offloading: - if self._device.type != "cuda": + if device_type != "cuda": raise RuntimeError( "enable_activation_offloading should only be True when training on CUDA" ) diff --git a/torchtune/training/__init__.py b/torchtune/training/__init__.py index d461d84dc4..06795d4bb8 100644 --- a/torchtune/training/__init__.py +++ b/torchtune/training/__init__.py @@ -11,6 +11,7 @@ from torchtune.training._compile import compile_loss, compile_model from torchtune.training._distributed import ( gather_cpu_state_dict, + get_distributed_backend, get_full_optimizer_state_dict, get_shard_conditions, get_world_size_and_rank, @@ -99,6 +100,7 @@ "TOTAL_EPOCHS_KEY", "get_quantizer_mode", "get_cosine_schedule_with_warmup", + "get_distributed_backend", "get_lr", "cleanup_before_training", "create_optim_in_bwd_wrapper", diff --git a/torchtune/training/_distributed.py b/torchtune/training/_distributed.py index b87ccdff20..8627d644a6 100644 --- a/torchtune/training/_distributed.py +++ b/torchtune/training/_distributed.py @@ -95,7 +95,25 @@ def _broadcast_tensor(tensor: torch.Tensor, src: int = 0) -> torch.Tensor: return tensor -def get_distributed_backend(device_type: str, enable_cpu_offload: bool = False): +def get_distributed_backend(device_type: str, enable_cpu_offload: bool = False) -> str: + """Gets the PyTorch Distributed backend based on device type. + + Args: + device_type (str): Device type to get backend for. + enable_cpu_offload (bool): Flag to check if offload to CPU is enabled. If it is, we will add a GLOO + backend to handle CPU training. + + Example: + >>> get_distributed_backend("cuda") + 'nccl' + >>> get_distributed_backend("cpu") + 'gloo' + >>> get_distributed_backend("cuda", enable_cpu_offload=True) + 'cuda:nccl,cpu:gloo' + + Returns: + str: Distributed backend for use in ``torch.distributed.init_process_group``. 
+ """ backend = "nccl" if device_type in dist.Backend.default_device_backend_map.keys(): backend = dist.default_device_backend_map.get(device_type) From c558f27dad08301eee181ef277bf0e21d287b3b6 Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Mon, 27 Jan 2025 14:32:52 -0500 Subject: [PATCH 04/29] Update API docs --- docs/source/api_ref_training.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/api_ref_training.rst b/docs/source/api_ref_training.rst index 9cba6fb9ea..747f312447 100644 --- a/docs/source/api_ref_training.rst +++ b/docs/source/api_ref_training.rst @@ -53,6 +53,7 @@ Utilities for enabling and working with distributed training. init_distributed is_distributed gather_cpu_state_dict + get_distributed_backend .. _ac_label: From 454536c6182bbf1ec6649f4ef941bac5e4f3031d Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Mon, 27 Jan 2025 14:37:40 -0500 Subject: [PATCH 05/29] Add tests --- recipes/full_finetune_distributed.py | 2 +- tests/torchtune/training/test_distributed.py | 8 ++++++++ torchtune/training/_distributed.py | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py index 93e5a208cb..c8a6429033 100644 --- a/recipes/full_finetune_distributed.py +++ b/recipes/full_finetune_distributed.py @@ -267,7 +267,7 @@ def setup(self, cfg: DictConfig) -> None: enable_activation_checkpointing=self._enable_activation_checkpointing, enable_activation_offloading=self._enable_activation_offloading, custom_sharded_layers=cfg.get("custom_sharded_layers", None), - fsdp_cpu_offload=cfg.get("fsdp_cpu_offload", False), + fsdp_cpu_offload=self.fsdp_cpu_offload, reshard_after_forward=cfg.get("fsdp_reshard_after_forward", True), model_state_dict=checkpoint_dict[training.MODEL_KEY], ac_mode=cfg.get("ac_mode", None), diff --git a/tests/torchtune/training/test_distributed.py b/tests/torchtune/training/test_distributed.py index 960339fc1b..c693e1918b 100644 --- a/tests/torchtune/training/test_distributed.py +++ b/tests/torchtune/training/test_distributed.py @@ -88,6 +88,14 @@ def test_validate_no_params_on_meta_device(self) -> None: with pytest.raises(RuntimeError, match="Unexpected param or buffer"): training.validate_no_params_on_meta_device(model) + def test_get_distributed_backend(self) -> None: + assert training.get_distributed_backend("cuda") == "nccl" + assert training.get_distributed_backend("cpu") == "gloo" + assert ( + training.get_distributed_backend("cuda", enable_cpu_offload=True) + == "cuda:nccl,cpu:gloo" + ) + N_LAYERS = 3 IN_DIM = 5 diff --git a/torchtune/training/_distributed.py b/torchtune/training/_distributed.py index 8627d644a6..c2179a4e80 100644 --- a/torchtune/training/_distributed.py +++ b/torchtune/training/_distributed.py @@ -101,7 +101,7 @@ def get_distributed_backend(device_type: str, enable_cpu_offload: bool = False) Args: device_type (str): Device type to get backend for. enable_cpu_offload (bool): Flag to check if offload to CPU is enabled. If it is, we will add a GLOO - backend to handle CPU training. + backend to handle CPU training. Default is False. 
Example: >>> get_distributed_backend("cuda") From 66b06e1983c862f81bb42aa663a17a4f7cbaf774 Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Mon, 27 Jan 2025 14:40:37 -0500 Subject: [PATCH 06/29] Lint --- tests/torchtune/training/test_distributed.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/torchtune/training/test_distributed.py b/tests/torchtune/training/test_distributed.py index c693e1918b..d4821348f7 100644 --- a/tests/torchtune/training/test_distributed.py +++ b/tests/torchtune/training/test_distributed.py @@ -37,7 +37,6 @@ class TestDistributed: - @staticmethod def _test_worker_fn(init_pg_explicit: bool) -> None: """ From 0d5aeb4496aca37b9f2a654fbad6c5c15a0993c1 Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Mon, 27 Jan 2025 15:03:45 -0500 Subject: [PATCH 07/29] Add multinode recipe and sbatch script --- .../configs/llama3_3/70B_full_multinode.yaml | 104 ++++++++++++++++++ recipes/full_finetune_multinode.slurm | 34 ++++++ 2 files changed, 138 insertions(+) create mode 100644 recipes/configs/llama3_3/70B_full_multinode.yaml create mode 100644 recipes/full_finetune_multinode.slurm diff --git a/recipes/configs/llama3_3/70B_full_multinode.yaml b/recipes/configs/llama3_3/70B_full_multinode.yaml new file mode 100644 index 0000000000..4572792661 --- /dev/null +++ b/recipes/configs/llama3_3/70B_full_multinode.yaml @@ -0,0 +1,104 @@ +# Config for multi-node full finetuning in full_finetune_distributed.py +# using a Llama3.3 70B Instruct model +# +# This config assumes that you've run the following command before launching: +# tune download meta-llama/Llama-3.3-70B-Instruct --ignore-patterns "original/consolidated*" --output-dir SHARED_CLUSTER_FS +# +# To launch on 2 nodes w/ 8 devices on a SLURM cluster, run the following command: +# sbatch full_finetune_multinode.slurm +# +# This config is only tested on 2 nodes w/ 8 H100 machines. + +output_dir: /tmp/torchtune/llama3_3_70B/full + +# Tokenizer +tokenizer: + _component_: torchtune.models.llama3.llama3_tokenizer + path: /tmp/Llama-3.3-70B-Instruct/original/tokenizer.model + max_seq_len: 1024 + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_dataset + packed: True # True increases speed +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.llama3_3.llama3_3_70b + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Llama-3.3-70B-Instruct/ + checkpoint_files: + filename_format: model-{}-of-{}.safetensors + max_filename: "00030" + recipe_checkpoint: null + output_dir: ${output_dir} + model_type: LLAMA3 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 4 +epochs: 1 + +optimizer: + _component_: torch.optim.AdamW + lr: 2e-5 + # Note: highly recommended to use fused=True optimizer flag + # with CPU offload for faster optimizer step. + fused: True + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 1 # Use to increase effective batch size + + +# Training env +device: cuda + +# Memory management +enable_activation_checkpointing: True # True reduces memory +enable_activation_offloading: False # True reduces memory +custom_sharded_layers: ['tok_embeddings', 'output'] # Layers to shard separately (useful for large vocab size models). Lower Memory, but lower speed. +fsdp_cpu_offload: False +clip_grad_norm: null +compile: True # torch.compile the model + loss, True increases speed + decreases memory +optimizer_in_bwd: False # True saves memory. 
Requires gradient_accumulation_steps=1 + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir}/logs +log_every_n_steps: 1 +log_peak_memory_stats: True + +# Profiler (disabled) +profiler: + _component_: torchtune.training.setup_torch_profiler + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 3 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/full_finetune_multinode.slurm b/recipes/full_finetune_multinode.slurm new file mode 100644 index 0000000000..a57ee35537 --- /dev/null +++ b/recipes/full_finetune_multinode.slurm @@ -0,0 +1,34 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# ---------- SBATCH commands ---------- # +#SBATCH --job-name=torchtune-multi-node +#SBATCH --ntasks=2 +#SBATCH --nodes=2 +#SBATCH --gpus-per-task=8 +#SBATCH --cpus-per-task=96 +#SBATCH --partition=train + +# ---------- Set env variables ---------- # +# Grab the IP for head node: +nodes=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) ) +nodes_array=($nodes) +head_node=${nodes_array[0]} +head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) +echo Node IP: $head_node_ip +export LOGLEVEL=INFO + +# You might need to explicitly set the network interface: +# export NCCL_SOCKET_IFNAME=... +# export GLOO_SOCKET_IFNAME=... + +export TORCH_DIST_INIT_BARRIER=1 + +# ---------- Launch training ---------- # +# Adjust sbatch --ntasks and sbatch --nodes above and --nnodes below to your specific node count +srun tune run --nnodes 2 --nproc_per_node 8 --rdzv_id 101 --rdzv_backend c10d --rdzv_endpoint "$head_node_ip:29500" \ + full_finetune_distributed --config ${CONFIG_FILE} From afc9c2e2a331279ee0f90de27e009e0af1596b08 Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Mon, 27 Jan 2025 15:12:49 -0500 Subject: [PATCH 08/29] Update launch commands --- recipes/full_finetune_multinode.slurm | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/recipes/full_finetune_multinode.slurm b/recipes/full_finetune_multinode.slurm index a57ee35537..b8f7fbedef 100644 --- a/recipes/full_finetune_multinode.slurm +++ b/recipes/full_finetune_multinode.slurm @@ -29,6 +29,16 @@ export LOGLEVEL=INFO export TORCH_DIST_INIT_BARRIER=1 # ---------- Launch training ---------- # +# You probably want to load in a virtual env w/ conda... 
+# module load conda +# conda activate torchtune +# ...or venv +# source torchtune/bin/activate + +SHARED_FS=/mnt/slurm # <-- Replace w/ your filesystem +CHECKPOINT_DIR="$SHARED_FS/Llama-3.3-70B-Instruct" +OUTPUT_DIR="$SHARED_FS/Llama3.3-70B-fft-output" + # Adjust sbatch --ntasks and sbatch --nodes above and --nnodes below to your specific node count srun tune run --nnodes 2 --nproc_per_node 8 --rdzv_id 101 --rdzv_backend c10d --rdzv_endpoint "$head_node_ip:29500" \ - full_finetune_distributed --config ${CONFIG_FILE} + full_finetune_distributed --config llama3_3/70B_full_multinode checkpoint_dir=$CHECKPOINT_DIR output_dir=$OUTPUT_DIR From c4748a52e51eba79180fdd41211f07b1c5613f2c Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Mon, 27 Jan 2025 15:17:14 -0500 Subject: [PATCH 09/29] Move env variables around --- recipes/full_finetune_multinode.slurm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/recipes/full_finetune_multinode.slurm b/recipes/full_finetune_multinode.slurm index b8f7fbedef..6e83ba4f62 100644 --- a/recipes/full_finetune_multinode.slurm +++ b/recipes/full_finetune_multinode.slurm @@ -20,13 +20,13 @@ nodes_array=($nodes) head_node=${nodes_array[0]} head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) echo Node IP: $head_node_ip -export LOGLEVEL=INFO -# You might need to explicitly set the network interface: +# You might need to explicitly set the network interface for distributed backends: # export NCCL_SOCKET_IFNAME=... # export GLOO_SOCKET_IFNAME=... export TORCH_DIST_INIT_BARRIER=1 +export LOGLEVEL=INFO # ---------- Launch training ---------- # # You probably want to load in a virtual env w/ conda... From 94440f99bbcd9d1c60359bef1d166121be5eecb4 Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Mon, 27 Jan 2025 16:06:42 -0500 Subject: [PATCH 10/29] Multi-node tutorial --- docs/source/tutorials/multinode.rst | 422 ++++++++++++++++++++++++++++ 1 file changed, 422 insertions(+) create mode 100644 docs/source/tutorials/multinode.rst diff --git a/docs/source/tutorials/multinode.rst b/docs/source/tutorials/multinode.rst new file mode 100644 index 0000000000..380788faaf --- /dev/null +++ b/docs/source/tutorials/multinode.rst @@ -0,0 +1,422 @@ +.. _multinode_tutorial: + +===================== +Multi-node finetuning +===================== + +Congratulations! After years of being "GPU poor", you've worked hard, saved your hard earned Bitcoin and +now have access to a proper multi-node cluster. You're part of the so-called "GPU middle class". In many ways, +your worries of yesteryear are gone. Memory efficient training? Not anymore! But in so many other ways, your problems +are just starting because multi-node is a whole new beast. Come with me as I take you through your new life, complete with +a big backyard, new car, and of course - a nice rack of H100s. + +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` You will learn: + + * How to set up the torchtune package on a SLURM cluster + * How to fine-tune a Llama3.3 70B model w/ full parameter updates (not LoRA) + * What common errors to lookout for + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + + * Be familiar with distributed training in torchtune + * Already know basic SLURM commands + + +Advantages of multi-node training +--------------------------------- + +It's likely that if you're reading this tutorial, you don't need a refresher on the advantages of having +MORE compute, but let's go over it again so you can appreciate how lucky you are. 
Let's consider how big +a 70B model is in memory. + +70 * 2 = 140 GB. So it definitely can't fit in a single GPU. Maybe a few GPUs? But then you have to consider the optimizer. +Bring down the optimizer, then you have to checkpoint or offload the activations. Now it fits, but training is slow and/or +just an approximation of "true" training. + +Multi-node allows you to fit larger models in memory and utilize bigger batch sizes, potentially reducing the overall training time. + +> Aside on FSDP on multi-node. Need all gather, might not be faster, etc. + +Training Llama3.3 70B on 2 nodes +-------------------------------- + +First, we need to install torchtune on your cluster. Although pretty much as straightforward as the normal install instructions, +it's recommended that you install into a virtual environment that is accessible from nodes in your cluster - something like a shared filesystem. +*You should know best how to go about this as it is your cluster.* + +Next, using the same idea as above, we need to download the Llama3.3 70B model to the shared fs. (You'll need to make sure you have the correct +credentials as noted before.) You'll also need to ensure you have internet access from your cluster (not a given) + +.. code-block:: bash + tune download meta-llama/Llama- + +Now that we have a downloaded model, we can launch training. + +.. code-block:: bash + tune cp full_finetune_multinode . + +And let's open it up to see what's inside: + + +SHOW THE file + +we just need to point to our checkpoint and output dir and lets train! This uses the full_finetune_distributed file under the hood + +> You may need to set your interface which you can find with ipconfig + +Once we've trained, we can follow the instructions [here] in order to upload our beautiful new model to the Hugging Face Hub. + +Future development +------------------ + +2D parallelism + +Longer context (ring attention, etc) + +What else do you want? + +BLAH BLHAH BALSHD 很好 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.. code-block:: text + + [INST] <> + You are a helpful, respectful, and honest assistant. + <> + + Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant + +Llama3 Instruct `overhauled `_ +the template from Llama2 to better support multiturn conversations. The same text +in the Llama3 Instruct format would look like this: + +.. code-block:: text + + <|begin_of_text|><|start_header_id|>system<|end_header_id|> + + You are a helpful, respectful, and honest assistant.<|eot_id|><|start_header_id|>user<|end_header_id|> + + Hi! I am a human.<|eot_id|><|start_header_id|>assistant<|end_header_id|> + + Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant<|eot_id|> + +The tags are entirely different, and they are actually encoded differently than in +Llama2. Let's walk through tokenizing an example with the Llama2 template and the +Llama3 template to understand how. + +.. note:: + The Llama3 Base model uses a `different prompt template + `_ than Llama3 Instruct + because it has not yet been instruct tuned and the extra special tokens are untrained. If you + are running inference on the Llama3 Base model without fine-tuning we recommend the base + template for optimal performance. Generally, for instruct and chat data, we recommend using + Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using + Llama3 Instruct. + +.. 
_prompt_template_vs_special_tokens: + +Tokenizing prompt templates & special tokens +-------------------------------------------- + +Let's say I have a sample of a single user-assistant turn accompanied with a system +prompt: + +.. code-block:: python + + sample = [ + { + "role": "system", + "content": "You are a helpful, respectful, and honest assistant.", + }, + { + "role": "user", + "content": "Who are the most influential hip-hop artists of all time?", + }, + { + "role": "assistant", + "content": "Here is a list of some of the most influential hip-hop " + "artists of all time: 2Pac, Rakim, N.W.A., Run-D.M.C., and Nas.", + }, + ] + +Now, let's format this with the :class:`~torchtune.models.llama2.Llama2ChatTemplate` class and +see how it gets tokenized. The Llama2ChatTemplate is an example of a **prompt template**, +which simply structures a prompt with flavor text to indicate a certain task. + +.. code-block:: python + + from torchtune.data import Llama2ChatTemplate, Message + + messages = [Message.from_dict(msg) for msg in sample] + formatted_messages = Llama2ChatTemplate.format(messages) + print(formatted_messages) + # [ + # Message( + # role='user', + # content='[INST] <>\nYou are a helpful, respectful, and honest assistant.\n<>\n\nWho are the most influential hip-hop artists of all time? [/INST] ', + # ..., + # ), + # Message( + # role='assistant', + # content='Here is a list of some of the most influential hip-hop artists of all time: 2Pac, Rakim, N.W.A., Run-D.M.C., and Nas.', + # ..., + # ), + # ] + +There are also special tokens used by Llama2, which are not in the prompt template. +If you look at our :class:`~torchtune.models.llama2.Llama2ChatTemplate` class, you'll notice that +we don't include the :code:`` and :code:`` tokens. These are the beginning-of-sequence +(BOS) and end-of-sequence (EOS) tokens that are represented differently in the tokenizer +than the rest of the prompt template. Let's tokenize this example with the +:func:`~torchtune.models.llama2.llama2_tokenizer` used by Llama2 to see +why. + +.. code-block:: python + + from torchtune.models.llama2 import llama2_tokenizer + + tokenizer = llama2_tokenizer("/tmp/Llama-2-7b-hf/tokenizer.model") + user_message = formatted_messages[0].text_content + tokens = tokenizer.encode(user_message, add_bos=True, add_eos=True) + print(tokens) + # [1, 518, 25580, 29962, 3532, 14816, 29903, 6778, ..., 2] + +We've added the BOS and EOS tokens when encoding our example text. This shows up +as IDs 1 and 2. We can verify that these are our BOS and EOS tokens. + +.. code-block:: python + + print(tokenizer._spm_model.spm_model.piece_to_id("")) + # 1 + print(tokenizer._spm_model.spm_model.piece_to_id("")) + # 2 + +The BOS and EOS tokens are what we call special tokens, because they have their own +reserved token IDs. This means that they will index to their own individual vectors in +the model's learnt embedding table. The rest of the prompt template tags, :code:`[INST]` +and :code:`<>` are tokenized as normal text and not their own IDs. + +.. code-block:: python + + print(tokenizer.decode(518)) + # '[' + print(tokenizer.decode(25580)) + # 'INST' + print(tokenizer.decode(29962)) + # ']' + print(tokenizer.decode([3532, 14816, 29903, 6778])) + # '<>' + +It's important to note that you should not place the special reserved tokens in your +input prompts manually, as it will be treated as normal text and not as a special +token. + +.. 
code-block:: python + + print(tokenizer.encode("", add_bos=False, add_eos=False)) + # [529, 29879, 29958] + +Now let's take a look at Llama3's formatting to see how it's tokenized differently +than Llama2. + +.. code-block:: python + + from torchtune.models.llama3 import llama3_tokenizer + + tokenizer = llama3_tokenizer("/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model") + messages = [Message.from_dict(msg) for msg in sample] + tokens, mask = tokenizer.tokenize_messages(messages) + print(tokenizer.decode(tokens)) + # '<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful, respectful, + # and honest assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho + # are the most influential hip-hop artists of all time?<|eot_id|><|start_header_id|> + # assistant<|end_header_id|>\n\nHere is a list of some of the most influential hip-hop + # artists of all time: 2Pac, Rakim, N.W.A., Run-D.M.C., and Nas.<|eot_id|>' + +.. note:: + We used the ``tokenize_messages`` API for Llama3, which is different than + encode. It simply manages adding all the special tokens in the correct + places after encoding the individual messages. + +We can see that the tokenizer handled all the formatting without us specifying a prompt +template. It turns out that all of the additional tags are special tokens, and we don't require +a separate prompt template. We can verify this by checking if the tags get encoded +as their own token IDs. + +.. code-block:: python + + print(tokenizer.special_tokens["<|begin_of_text|>"]) + # 128000 + print(tokenizer.special_tokens["<|eot_id|>"]) + # 128009 + +The best part is - all these special tokens are handled purely by the tokenizer. +That means you won't have to worry about messing up any required prompt templates! + + +When should I use a prompt template? +------------------------------------ + +Whether or not to use a prompt template is governed by what your desired inference +behavior is. You should use a prompt template if you are running inference on the +base model and it was pre-trained with a prompt template, or you want to prime a +fine-tuned model to expect a certain prompt structure on inference for a specific task. + +It is not strictly necessary to fine-tune with a prompt template, but generally +specific tasks will require specific templates. For example, the :class:`~torchtune.data.SummarizeTemplate` +provides a lightweight structure to prime your fine-tuned model for prompts asking to summarize text. +This would wrap around the user message, with the assistant message untouched. + +.. code-block:: python + + f"Summarize this dialogue:\n{dialogue}\n---\nSummary:\n" + +You can fine-tune Llama2 with this template even though the model was originally pre-trained +with the :class:`~torchtune.models.llama2.Llama2ChatTemplate`, as long as this is what the model +sees during inference. The model should be robust enough to adapt to a new template. + + +Fine-tuning on a custom chat dataset +------------------------------------ + +Let's test our understanding by trying to fine-tune the Llama3-8B instruct model with a custom +chat dataset. We'll walk through how to set up our data so that it can be tokenized +correctly and fed into our model. + +Let's say we have a local dataset saved as a JSON file that contains conversations +with an AI model. How can we get something like this into a format +Llama3 understands and tokenizes correctly? + +.. 
code-block:: python + + # data/my_data.json + [ + { + "dialogue": [ + { + "from": "human", + "value": "What is your name?" + }, + { + "from": "gpt", + "value": "I am an AI assistant, I don't have a name." + }, + { + "from": "human", + "value": "Pretend you have a name." + }, + { + "from": "gpt", + "value": "My name is Mark Zuckerberg." + } + ] + }, + ] + +Let's first take a look at the :ref:`dataset_builders` and see which fits our use case. Since we +have conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any +custom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset +builder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify +``conversation_column`` and ``conversation_style``. Our data follows the ``"sharegpt"`` format, so +we can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should +look like so: + +.. code-block:: python + + from torchtune.datasets import chat_dataset + from torchtune.models.llama3 import llama3_tokenizer + + tokenizer = llama3_tokenizer("/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model") + ds = chat_dataset( + tokenizer=tokenizer, + source="json", + data_files="data/my_data.json", + split="train", + conversation_column="dialogue", + conversation_style="sharegpt", + ) + +.. code-block:: yaml + + # In config + tokenizer: + _component_: torchtune.models.llama3.llama3_tokenizer + path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model + + dataset: + _component_: torchtune.datasets.chat_dataset + source: json + data_files: data/my_data.json + split: train + conversation_column: dialogue + conversation_style: sharegpt + +.. note:: + You can pass in any keyword argument for `load_dataset `_ into all our + Dataset classes and they will honor them. This is useful for common parameters + such as specifying the data split with :code:`split` or configuration with + :code:`name` + +If you needed to add a prompt template, you would simply pass it into the tokenizer. +Since we're fine-tuning Llama3, the tokenizer will handle all formatting for +us and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`, +use a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format +all messages according to their `recommendations `_. + +Now we're ready to start fine-tuning! We'll use the built-in LoRA single device recipe. +Use the :ref:`tune cp ` command to get a copy of the :code:`8B_lora_single_device.yaml` +config and update it with your dataset configuration. + +Launch the fine-tune! + +.. code-block:: bash + + $ tune run lora_finetune_single_device --config custom_8B_lora_single_device.yaml epochs=15 From deffecaa398ff67abd1e35768cf24c7f4daa8931 Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Tue, 28 Jan 2025 13:48:07 -0500 Subject: [PATCH 11/29] Updates --- docs/source/tutorials/multinode.rst | 48 +++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/docs/source/tutorials/multinode.rst b/docs/source/tutorials/multinode.rst index 380788faaf..512ff5537a 100644 --- a/docs/source/tutorials/multinode.rst +++ b/docs/source/tutorials/multinode.rst @@ -6,7 +6,7 @@ Multi-node finetuning Congratulations! After years of being "GPU poor", you've worked hard, saved your hard earned Bitcoin and now have access to a proper multi-node cluster. 
You're part of the so-called "GPU middle class". In many ways, -your worries of yesteryear are gone. Memory efficient training? Not anymore! But in so many other ways, your problems +your worries of yesteryear are gone: memory efficient training? Who cares! But in many other ways, your problems are just starting because multi-node is a whole new beast. Come with me as I take you through your new life, complete with a big backyard, new car, and of course - a nice rack of H100s. @@ -28,41 +28,63 @@ Advantages of multi-node training --------------------------------- It's likely that if you're reading this tutorial, you don't need a refresher on the advantages of having -MORE compute, but let's go over it again so you can appreciate how lucky you are. Let's consider how big -a 70B model is in memory. +MORE compute, but let's go over it again so you can appreciate how lucky you are. Let's consider a simplified calculation +on how much memory is required to train a 70B parameter model in bfloat16. -70 * 2 = 140 GB. So it definitely can't fit in a single GPU. Maybe a few GPUs? But then you have to consider the optimizer. -Bring down the optimizer, then you have to checkpoint or offload the activations. Now it fits, but training is slow and/or -just an approximation of "true" training. +.. code-block:: text + Weights: 140 GB + + Optim state (AdamW): 280 GB + + Activations (bsz=8,seq_len=2048): XX + ------------------------------------------ + 280 GB + +Right now the average GPU has 80GB of VRAM so definitely can't fit on a single GPU and even multiple GPUs won't be up to the task. +We have a ton of memory optimizations in torchtune that allow you to fit larger models in less resource. -Multi-node allows you to fit larger models in memory and utilize bigger batch sizes, potentially reducing the overall training time. +Why might you want to use multi-node then? +* Larger models (like Llama 405B, Deepseek, etc) +* Potentially faster training via larger batch sizes, no activation checkpointing +* Potentially more accurate training with full parameter updates and non-approximate optimizers, etc -> Aside on FSDP on multi-node. Need all gather, might not be faster, etc. +.. note:: + **Low inter-node bandwidth & FSDP** + We utilize to distribute models over multiple devices. In order to distribute training, FSDP runs an all-gather operation for each forward pass and an all-gather plus a scatter-reduce + operation for each backwards pass. These operations (usually) block training from continuing until completed and with a slow inter-node connection, training speed may be reduced. Training Llama3.3 70B on 2 nodes -------------------------------- +With that background out of the way, let's get training! We'll be utilizing a common cluster setup called SLURM and we assume you have a decent working knowledge for this tutorial. First, we need to install torchtune on your cluster. Although pretty much as straightforward as the normal install instructions, it's recommended that you install into a virtual environment that is accessible from nodes in your cluster - something like a shared filesystem. -*You should know best how to go about this as it is your cluster.* Next, using the same idea as above, we need to download the Llama3.3 70B model to the shared fs. (You'll need to make sure you have the correct -credentials as noted before.) You'll also need to ensure you have internet access from your cluster (not a given) +credentials as noted before.) .. 
code-block:: bash tune download meta-llama/Llama- -Now that we have a downloaded model, we can launch training. +Now that we have a downloaded model, we can launch training. Although you can *technically* launch the multinode bash script from the tune CLI, +it's recommended that you copy the file to your machine. .. code-block:: bash tune cp full_finetune_multinode . And let's open it up to see what's inside: +.. only:: builder_html or PyTorchdoc + + Copy the recipe directly into your own script or notebook to modify and edit for yourself. + +.. literalinclude:: ../../../recipes/full_finetune_multinode.slurm + :pyobject: recipe -SHOW THE file +What are the high level parts? +* Uses `full_finetune_distributed` to launch training +* Can specify number of nodes, tasks, CPUs available, etc +* Should consider several cluster-specific environment variables -we just need to point to our checkpoint and output dir and lets train! This uses the full_finetune_distributed file under the hood +We just need to point to our checkpoint and output dir and get training! > You may need to set your interface which you can find with ipconfig From f4417218b183f0ef7bc4263d2e47928e2917e1ac Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Tue, 28 Jan 2025 13:54:12 -0500 Subject: [PATCH 12/29] Update code block --- docs/source/tutorials/multinode.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/source/tutorials/multinode.rst b/docs/source/tutorials/multinode.rst index 512ff5537a..085a80a35a 100644 --- a/docs/source/tutorials/multinode.rst +++ b/docs/source/tutorials/multinode.rst @@ -32,9 +32,10 @@ MORE compute, but let's go over it again so you can appreciate how lucky you are on how much memory is required to train a 70B parameter model in bfloat16. .. code-block:: text - Weights: 140 GB - + Optim state (AdamW): 280 GB - + Activations (bsz=8,seq_len=2048): XX + + Weights 140 GB + + Optim state (AdamW) 280 GB + + Activations (bsz=8,seq_len=2048) XX ------------------------------------------ 280 GB From 9ba9e240508774903f1072cf1e163b8aa8854c33 Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Tue, 28 Jan 2025 13:57:24 -0500 Subject: [PATCH 13/29] asdf --- docs/source/tutorials/multinode.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/source/tutorials/multinode.rst b/docs/source/tutorials/multinode.rst index 085a80a35a..643beb0c4e 100644 --- a/docs/source/tutorials/multinode.rst +++ b/docs/source/tutorials/multinode.rst @@ -48,6 +48,7 @@ Why might you want to use multi-node then? * Potentially more accurate training with full parameter updates and non-approximate optimizers, etc .. note:: + **Low inter-node bandwidth & FSDP** We utilize to distribute models over multiple devices. In order to distribute training, FSDP runs an all-gather operation for each forward pass and an all-gather plus a scatter-reduce operation for each backwards pass. These operations (usually) block training from continuing until completed and with a slow inter-node connection, training speed may be reduced. @@ -63,12 +64,14 @@ Next, using the same idea as above, we need to download the Llama3.3 70B model t credentials as noted before.) .. code-block:: bash + tune download meta-llama/Llama- Now that we have a downloaded model, we can launch training. Although you can *technically* launch the multinode bash script from the tune CLI, it's recommended that you copy the file to your machine. .. code-block:: bash + tune cp full_finetune_multinode . 
And let's open it up to see what's inside: From b36325a5e2ea9e93ef0c51d5c1713d62b8394578 Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Tue, 28 Jan 2025 14:10:46 -0500 Subject: [PATCH 14/29] Fix linting errors --- docs/source/index.rst | 1 + docs/source/tutorials/multinode.rst | 351 +--------------------------- 2 files changed, 3 insertions(+), 349 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index d62ad77b63..621457c083 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -149,6 +149,7 @@ torchtune tutorials. tutorials/e2e_flow tutorials/llama_kd_tutorial tutorials/memory_optimizations + tutorials/multinode .. toctree:: :glob: diff --git a/docs/source/tutorials/multinode.rst b/docs/source/tutorials/multinode.rst index 643beb0c4e..06dc2f52cc 100644 --- a/docs/source/tutorials/multinode.rst +++ b/docs/source/tutorials/multinode.rst @@ -65,23 +65,18 @@ credentials as noted before.) .. code-block:: bash - tune download meta-llama/Llama- + $ tune download meta-llama/Llama-3.3-70B-Instruct --ignore-patterns "consolidated/*.pth" --output-dir SHARED_FS/Llama-3.3-70B-Instruct Now that we have a downloaded model, we can launch training. Although you can *technically* launch the multinode bash script from the tune CLI, it's recommended that you copy the file to your machine. .. code-block:: bash - tune cp full_finetune_multinode . + $ tune cp full_finetune_multinode . And let's open it up to see what's inside: -.. only:: builder_html or PyTorchdoc - - Copy the recipe directly into your own script or notebook to modify and edit for yourself. - .. literalinclude:: ../../../recipes/full_finetune_multinode.slurm - :pyobject: recipe What are the high level parts? * Uses `full_finetune_distributed` to launch training @@ -104,345 +99,3 @@ Longer context (ring attention, etc) What else do you want? BLAH BLHAH BALSHD 很好 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. code-block:: text - - [INST] <> - You are a helpful, respectful, and honest assistant. - <> - - Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant - -Llama3 Instruct `overhauled `_ -the template from Llama2 to better support multiturn conversations. The same text -in the Llama3 Instruct format would look like this: - -.. code-block:: text - - <|begin_of_text|><|start_header_id|>system<|end_header_id|> - - You are a helpful, respectful, and honest assistant.<|eot_id|><|start_header_id|>user<|end_header_id|> - - Hi! I am a human.<|eot_id|><|start_header_id|>assistant<|end_header_id|> - - Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant<|eot_id|> - -The tags are entirely different, and they are actually encoded differently than in -Llama2. Let's walk through tokenizing an example with the Llama2 template and the -Llama3 template to understand how. - -.. note:: - The Llama3 Base model uses a `different prompt template - `_ than Llama3 Instruct - because it has not yet been instruct tuned and the extra special tokens are untrained. If you - are running inference on the Llama3 Base model without fine-tuning we recommend the base - template for optimal performance. Generally, for instruct and chat data, we recommend using - Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using - Llama3 Instruct. - -.. 
_prompt_template_vs_special_tokens: - -Tokenizing prompt templates & special tokens --------------------------------------------- - -Let's say I have a sample of a single user-assistant turn accompanied with a system -prompt: - -.. code-block:: python - - sample = [ - { - "role": "system", - "content": "You are a helpful, respectful, and honest assistant.", - }, - { - "role": "user", - "content": "Who are the most influential hip-hop artists of all time?", - }, - { - "role": "assistant", - "content": "Here is a list of some of the most influential hip-hop " - "artists of all time: 2Pac, Rakim, N.W.A., Run-D.M.C., and Nas.", - }, - ] - -Now, let's format this with the :class:`~torchtune.models.llama2.Llama2ChatTemplate` class and -see how it gets tokenized. The Llama2ChatTemplate is an example of a **prompt template**, -which simply structures a prompt with flavor text to indicate a certain task. - -.. code-block:: python - - from torchtune.data import Llama2ChatTemplate, Message - - messages = [Message.from_dict(msg) for msg in sample] - formatted_messages = Llama2ChatTemplate.format(messages) - print(formatted_messages) - # [ - # Message( - # role='user', - # content='[INST] <>\nYou are a helpful, respectful, and honest assistant.\n<>\n\nWho are the most influential hip-hop artists of all time? [/INST] ', - # ..., - # ), - # Message( - # role='assistant', - # content='Here is a list of some of the most influential hip-hop artists of all time: 2Pac, Rakim, N.W.A., Run-D.M.C., and Nas.', - # ..., - # ), - # ] - -There are also special tokens used by Llama2, which are not in the prompt template. -If you look at our :class:`~torchtune.models.llama2.Llama2ChatTemplate` class, you'll notice that -we don't include the :code:`` and :code:`` tokens. These are the beginning-of-sequence -(BOS) and end-of-sequence (EOS) tokens that are represented differently in the tokenizer -than the rest of the prompt template. Let's tokenize this example with the -:func:`~torchtune.models.llama2.llama2_tokenizer` used by Llama2 to see -why. - -.. code-block:: python - - from torchtune.models.llama2 import llama2_tokenizer - - tokenizer = llama2_tokenizer("/tmp/Llama-2-7b-hf/tokenizer.model") - user_message = formatted_messages[0].text_content - tokens = tokenizer.encode(user_message, add_bos=True, add_eos=True) - print(tokens) - # [1, 518, 25580, 29962, 3532, 14816, 29903, 6778, ..., 2] - -We've added the BOS and EOS tokens when encoding our example text. This shows up -as IDs 1 and 2. We can verify that these are our BOS and EOS tokens. - -.. code-block:: python - - print(tokenizer._spm_model.spm_model.piece_to_id("")) - # 1 - print(tokenizer._spm_model.spm_model.piece_to_id("")) - # 2 - -The BOS and EOS tokens are what we call special tokens, because they have their own -reserved token IDs. This means that they will index to their own individual vectors in -the model's learnt embedding table. The rest of the prompt template tags, :code:`[INST]` -and :code:`<>` are tokenized as normal text and not their own IDs. - -.. code-block:: python - - print(tokenizer.decode(518)) - # '[' - print(tokenizer.decode(25580)) - # 'INST' - print(tokenizer.decode(29962)) - # ']' - print(tokenizer.decode([3532, 14816, 29903, 6778])) - # '<>' - -It's important to note that you should not place the special reserved tokens in your -input prompts manually, as it will be treated as normal text and not as a special -token. - -.. 
code-block:: python - - print(tokenizer.encode("", add_bos=False, add_eos=False)) - # [529, 29879, 29958] - -Now let's take a look at Llama3's formatting to see how it's tokenized differently -than Llama2. - -.. code-block:: python - - from torchtune.models.llama3 import llama3_tokenizer - - tokenizer = llama3_tokenizer("/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model") - messages = [Message.from_dict(msg) for msg in sample] - tokens, mask = tokenizer.tokenize_messages(messages) - print(tokenizer.decode(tokens)) - # '<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful, respectful, - # and honest assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho - # are the most influential hip-hop artists of all time?<|eot_id|><|start_header_id|> - # assistant<|end_header_id|>\n\nHere is a list of some of the most influential hip-hop - # artists of all time: 2Pac, Rakim, N.W.A., Run-D.M.C., and Nas.<|eot_id|>' - -.. note:: - We used the ``tokenize_messages`` API for Llama3, which is different than - encode. It simply manages adding all the special tokens in the correct - places after encoding the individual messages. - -We can see that the tokenizer handled all the formatting without us specifying a prompt -template. It turns out that all of the additional tags are special tokens, and we don't require -a separate prompt template. We can verify this by checking if the tags get encoded -as their own token IDs. - -.. code-block:: python - - print(tokenizer.special_tokens["<|begin_of_text|>"]) - # 128000 - print(tokenizer.special_tokens["<|eot_id|>"]) - # 128009 - -The best part is - all these special tokens are handled purely by the tokenizer. -That means you won't have to worry about messing up any required prompt templates! - - -When should I use a prompt template? ------------------------------------- - -Whether or not to use a prompt template is governed by what your desired inference -behavior is. You should use a prompt template if you are running inference on the -base model and it was pre-trained with a prompt template, or you want to prime a -fine-tuned model to expect a certain prompt structure on inference for a specific task. - -It is not strictly necessary to fine-tune with a prompt template, but generally -specific tasks will require specific templates. For example, the :class:`~torchtune.data.SummarizeTemplate` -provides a lightweight structure to prime your fine-tuned model for prompts asking to summarize text. -This would wrap around the user message, with the assistant message untouched. - -.. code-block:: python - - f"Summarize this dialogue:\n{dialogue}\n---\nSummary:\n" - -You can fine-tune Llama2 with this template even though the model was originally pre-trained -with the :class:`~torchtune.models.llama2.Llama2ChatTemplate`, as long as this is what the model -sees during inference. The model should be robust enough to adapt to a new template. - - -Fine-tuning on a custom chat dataset ------------------------------------- - -Let's test our understanding by trying to fine-tune the Llama3-8B instruct model with a custom -chat dataset. We'll walk through how to set up our data so that it can be tokenized -correctly and fed into our model. - -Let's say we have a local dataset saved as a JSON file that contains conversations -with an AI model. How can we get something like this into a format -Llama3 understands and tokenizes correctly? - -.. 
code-block:: python - - # data/my_data.json - [ - { - "dialogue": [ - { - "from": "human", - "value": "What is your name?" - }, - { - "from": "gpt", - "value": "I am an AI assistant, I don't have a name." - }, - { - "from": "human", - "value": "Pretend you have a name." - }, - { - "from": "gpt", - "value": "My name is Mark Zuckerberg." - } - ] - }, - ] - -Let's first take a look at the :ref:`dataset_builders` and see which fits our use case. Since we -have conversational data, :func:`~torchtune.datasets.chat_dataset` seems to be a good fit. For any -custom local dataset we always need to specify ``source``, ``data_files``, and ``split`` for any dataset -builder in torchtune. For :func:`~torchtune.datasets.chat_dataset`, we additionally need to specify -``conversation_column`` and ``conversation_style``. Our data follows the ``"sharegpt"`` format, so -we can specify that here. Altogether, our :func:`~torchtune.datasets.chat_dataset` call should -look like so: - -.. code-block:: python - - from torchtune.datasets import chat_dataset - from torchtune.models.llama3 import llama3_tokenizer - - tokenizer = llama3_tokenizer("/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model") - ds = chat_dataset( - tokenizer=tokenizer, - source="json", - data_files="data/my_data.json", - split="train", - conversation_column="dialogue", - conversation_style="sharegpt", - ) - -.. code-block:: yaml - - # In config - tokenizer: - _component_: torchtune.models.llama3.llama3_tokenizer - path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model - - dataset: - _component_: torchtune.datasets.chat_dataset - source: json - data_files: data/my_data.json - split: train - conversation_column: dialogue - conversation_style: sharegpt - -.. note:: - You can pass in any keyword argument for `load_dataset `_ into all our - Dataset classes and they will honor them. This is useful for common parameters - such as specifying the data split with :code:`split` or configuration with - :code:`name` - -If you needed to add a prompt template, you would simply pass it into the tokenizer. -Since we're fine-tuning Llama3, the tokenizer will handle all formatting for -us and prompt templates are optional. Other models such as Mistral's :class:`~torchtune.models.mistral._tokenizer.MistralTokenizer`, -use a chat template by default (:class:`~torchtune.models.mistral.MistralChatTemplate`) to format -all messages according to their `recommendations `_. - -Now we're ready to start fine-tuning! We'll use the built-in LoRA single device recipe. -Use the :ref:`tune cp ` command to get a copy of the :code:`8B_lora_single_device.yaml` -config and update it with your dataset configuration. - -Launch the fine-tune! - -.. code-block:: bash - - $ tune run lora_finetune_single_device --config custom_8B_lora_single_device.yaml epochs=15 From fc9afbd308098cae1f4376719715ce05252c980d Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Tue, 28 Jan 2025 19:07:14 -0500 Subject: [PATCH 15/29] Updates --- docs/source/tutorials/multinode.rst | 93 ++++++++++--------- torchtune/_recipe_registry.py | 6 ++ torchtune/training/_distributed.py | 12 ++- .../training/checkpointing/_checkpointer.py | 4 +- 4 files changed, 65 insertions(+), 50 deletions(-) diff --git a/docs/source/tutorials/multinode.rst b/docs/source/tutorials/multinode.rst index 06dc2f52cc..2c0c58186c 100644 --- a/docs/source/tutorials/multinode.rst +++ b/docs/source/tutorials/multinode.rst @@ -4,19 +4,18 @@ Multi-node finetuning ===================== -Congratulations! 
After years of being "GPU poor", you've worked hard, saved your hard earned Bitcoin and -now have access to a proper multi-node cluster. You're part of the so-called "GPU middle class". In many ways, -your worries of yesteryear are gone: memory efficient training? Who cares! But in many other ways, your problems -are just starting because multi-node is a whole new beast. Come with me as I take you through your new life, complete with -a big backyard, new car, and of course - a nice rack of H100s. +Congratulations! After years of being "GPU poor", you've worked hard, saved your hard earned Bitcoin and graduated to the +so-called **"GPU middle class"**. In many ways, your worries of yesteryear are gone (memory efficient training, who??). +But, new problems are on the horizon for you because multi-node is a whole new beast. Come with me as I take you +through your new life, complete with a big backyard, new car, and of course - a nice rack of H100s. .. grid:: 2 .. grid-item-card:: :octicon:`mortar-board;1em;` You will learn: + * Why multi-node training is useful * How to set up the torchtune package on a SLURM cluster * How to fine-tune a Llama3.3 70B model w/ full parameter updates (not LoRA) - * What common errors to lookout for .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites @@ -27,75 +26,77 @@ a big backyard, new car, and of course - a nice rack of H100s. Advantages of multi-node training --------------------------------- -It's likely that if you're reading this tutorial, you don't need a refresher on the advantages of having -MORE compute, but let's go over it again so you can appreciate how lucky you are. Let's consider a simplified calculation -on how much memory is required to train a 70B parameter model in bfloat16. +More machines means more memory! This is cool for several reasons: -.. code-block:: text - - Weights 140 GB - + Optim state (AdamW) 280 GB - + Activations (bsz=8,seq_len=2048) XX - ------------------------------------------ - 280 GB - -Right now the average GPU has 80GB of VRAM so definitely can't fit on a single GPU and even multiple GPUs won't be up to the task. -We have a ton of memory optimizations in torchtune that allow you to fit larger models in less resource. - -Why might you want to use multi-node then? -* Larger models (like Llama 405B, Deepseek, etc) -* Potentially faster training via larger batch sizes, no activation checkpointing -* Potentially more accurate training with full parameter updates and non-approximate optimizers, etc +1. **Bigger models**: With more memory, you can train larger models such as `Llama3.1 405B `_, Deepseek-V3, and more. +2. **Longer data**: More many tasks like writing code, it's helpful to have long context lengths; however longer context length means more memory needed for activations. +3. **Higher quality**: With more memory, you can do full parameter updates (not LoRA) and use optimizers like AdamW (not low-precision optimizers),both of which can potentially improve the quality of your training. +4. **Faster training**: With the ability to fit more data in memory, you can use higher batch sizes *and* turn off memory optimizations like :ref:`activation checkpointing` thereby decreasing the time it takes for training to complete. .. note:: - **Low inter-node bandwidth & FSDP** - We utilize to distribute models over multiple devices. In order to distribute training, FSDP runs an all-gather operation for each forward pass and an all-gather plus a scatter-reduce - operation for each backwards pass. 
These operations (usually) block training from continuing until completed and with a slow inter-node connection, training speed may be reduced. + **Low inter-node bandwidth & FSDP** We utilize Fully Sharded Data Parallel to distribute models over multiple devices. In order to distribute training, FSDP runs an all-gather operation + for each forward pass and an all-gather plus a scatter-reduce operation for each backwards pass. These operations (usually) block training from continuing until completed and with a slow + inter-node connection, training speed may be reduced. Training Llama3.3 70B on 2 nodes -------------------------------- -With that background out of the way, let's get training! We'll be utilizing a common cluster setup called SLURM and we assume you have a decent working knowledge for this tutorial. -First, we need to install torchtune on your cluster. Although pretty much as straightforward as the normal install instructions, -it's recommended that you install into a virtual environment that is accessible from nodes in your cluster - something like a shared filesystem. +Let's get training! We'll be utilizing a common cluster setup called SLURM and assume you have a decent working knowledge of SLURM for this tutorial. +First, we need to install torchtune. Although pretty much as straightforward as the normal install instructions, +it's recommended that you install the package into a virtual environment that is accessible from all nodes in your cluster like a shared filesystem. -Next, using the same idea as above, we need to download the Llama3.3 70B model to the shared fs. (You'll need to make sure you have the correct -credentials as noted before.) +Next, we need to download the Llama3.3 70B model to the shared fs. (You'll need to make sure you have the correct credentials as noted before.) .. code-block:: bash $ tune download meta-llama/Llama-3.3-70B-Instruct --ignore-patterns "consolidated/*.pth" --output-dir SHARED_FS/Llama-3.3-70B-Instruct -Now that we have a downloaded model, we can launch training. Although you can *technically* launch the multinode bash script from the tune CLI, -it's recommended that you copy the file to your machine. +Now that we have a downloaded model, let's check out the bash script. .. code-block:: bash $ tune cp full_finetune_multinode . -And let's open it up to see what's inside: - .. literalinclude:: ../../../recipes/full_finetune_multinode.slurm -What are the high level parts? -* Uses `full_finetune_distributed` to launch training -* Can specify number of nodes, tasks, CPUs available, etc +**There's a lot of information in this script but here are the high-level parts:** + +* We utilize SLURM specific commands like number of nodes, tasks, CPUs available, etc. +* We are using `torchrun` and the `full_finetune_distributed` recipe to train just like on single node * Should consider several cluster-specific environment variables -We just need to point to our checkpoint and output dir and get training! +.. note:: + + We may need to explicitly set the network interface for distributed backends. You can read more about that [here] + but it's also helpful to know that you can find your network interface by running `ipconfig` from a specific node. + You'll see the output. + +Once we update the shared filesystem in the bash script, we can launch using sbatch. + +.. code-block:: bash + + sbatch full_finetune_multinode.slurm + +And the output of `squeue` should show our job running: + +.. 
code-block:: bash -> You may need to set your interface which you can find with ipconfig + $ squeue + JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) + 1 train torchtun slurm R 0:03 2 slurm-worker-[1-2] -Once we've trained, we can follow the instructions [here] in order to upload our beautiful new model to the Hugging Face Hub. +Once training has completed, we can follow the instructions [here] in order to upload our beautiful new model to the Hugging Face Hub! Future development ------------------ -2D parallelism +We've covered the basics of how to launch a fine-tuning job with SLURM on two nodes with FSDP. There's still more things we're cooking up, +including... -Longer context (ring attention, etc) +**2D parallelism**: Utilizing both FSDP *and* tensor parallelism will decrease memory requirements even further, allowing us to lean even harder +into the advantages listed . -What else do you want? +**Longer context (ring attention, etc)**: -BLAH BLHAH BALSHD 很好 +**Want other optimizations?** Feel free to let us know by opening up a Github Issue on our repo or dropping us a line in Discord! diff --git a/torchtune/_recipe_registry.py b/torchtune/_recipe_registry.py index 1c41519712..177eed45c4 100644 --- a/torchtune/_recipe_registry.py +++ b/torchtune/_recipe_registry.py @@ -561,6 +561,12 @@ class Recipe: ], supports_distributed=True, ), + Recipe( + name="full_finetune_multinode", + file_path="full_finetune_multinode.slurm", + configs=[], + supports_distributed=True, + ), ] diff --git a/torchtune/training/_distributed.py b/torchtune/training/_distributed.py index c2179a4e80..9acd7b8730 100644 --- a/torchtune/training/_distributed.py +++ b/torchtune/training/_distributed.py @@ -114,9 +114,17 @@ def get_distributed_backend(device_type: str, enable_cpu_offload: bool = False) Returns: str: Distributed backend for use in ``torch.distributed.init_process_group``. 
""" + # Copied from https://github.com/pytorch/pytorch/blame/4f949f282dc66c3e4c6b41322167641a60a8593a/torch/distributed/distributed_c10d.py#L267 + default_device_backend_map = { + "cuda": "nccl", + "cpu": "gloo", + "xpu": "xccl", + } + # TODO: Uncomment the following line once PyTorch 2.6 is released + # default_device_backend_map = dist.Backend.default_device_backend_map backend = "nccl" - if device_type in dist.Backend.default_device_backend_map.keys(): - backend = dist.default_device_backend_map.get(device_type) + if device_type in default_device_backend_map: + backend = default_device_backend_map[device_type] if enable_cpu_offload: backend = f"{device_type}:{backend},cpu:gloo" return backend diff --git a/torchtune/training/checkpointing/_checkpointer.py b/torchtune/training/checkpointing/_checkpointer.py index 8d28ab22e1..13a71a120e 100644 --- a/torchtune/training/checkpointing/_checkpointer.py +++ b/torchtune/training/checkpointing/_checkpointer.py @@ -43,7 +43,7 @@ SUFFIXES_TO_NOT_COPY, TORCH_INDEX_FNAME, ) -from torchtune.utils._logging import get_logger, log_rank_zero +from torchtune.utils import get_logger, log_rank_zero, get_world_size_and_rank logger = get_logger("DEBUG") @@ -1193,7 +1193,7 @@ def __init__( self._checkpoint_future = None self._checkpoint_dir_prefix = "dist_epoch" self._metadata_file = ".metadata" - _, self._rank = utils.get_world_size_and_rank() + _, self._rank = get_world_size_and_rank() self._process_group: Optional[dist.ProcessGroup] = process_group def _get_latest_intermediate_checkpoint(self) -> Optional[str]: From 373e0c056ce8beefb7d05b399b1825ba11a0ac1b Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Wed, 29 Jan 2025 08:23:32 -0500 Subject: [PATCH 16/29] Lint --- recipes/full_finetune_distributed.py | 4 +++- torchtune/training/_distributed.py | 12 ++++++------ torchtune/training/checkpointing/_checkpointer.py | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py index c8a6429033..7cc4c1342c 100644 --- a/recipes/full_finetune_distributed.py +++ b/recipes/full_finetune_distributed.py @@ -150,7 +150,9 @@ def __init__(self, cfg: DictConfig) -> None: self._checkpoint_client = CheckpointClient(cfg) self.fsdp_cpu_offload = cfg.get("fsdp_cpu_offload", False) self.distributed_backend = training.get_distributed_backend( - device_type, enable_cpu_offload=self.fsdp_cpu_offload + device_type, + offload_ops_to_cpu=self.fsdp_cpu_offload + or self._enable_async_checkpointing, ) # Optimizer in backward is not compatible with gradient accumulation or gradient clipping diff --git a/torchtune/training/_distributed.py b/torchtune/training/_distributed.py index 9acd7b8730..266c18ab42 100644 --- a/torchtune/training/_distributed.py +++ b/torchtune/training/_distributed.py @@ -95,26 +95,26 @@ def _broadcast_tensor(tensor: torch.Tensor, src: int = 0) -> torch.Tensor: return tensor -def get_distributed_backend(device_type: str, enable_cpu_offload: bool = False) -> str: +def get_distributed_backend(device_type: str, offload_ops_to_cpu: bool = False) -> str: """Gets the PyTorch Distributed backend based on device type. Args: device_type (str): Device type to get backend for. - enable_cpu_offload (bool): Flag to check if offload to CPU is enabled. If it is, we will add a GLOO - backend to handle CPU training. Default is False. + offload_ops_to_cpu (bool, optional): Flag to check if any operations should be offloaded to CPU. 
+ Examples of these kinds of operations are CPU offload for FSDP and asynchronous save for distributed + checkpointing. Defaults to False. Example: >>> get_distributed_backend("cuda") 'nccl' >>> get_distributed_backend("cpu") 'gloo' - >>> get_distributed_backend("cuda", enable_cpu_offload=True) + >>> get_distributed_backend("cuda", offload_ops_to_cpu=True) 'cuda:nccl,cpu:gloo' Returns: str: Distributed backend for use in ``torch.distributed.init_process_group``. """ - # Copied from https://github.com/pytorch/pytorch/blame/4f949f282dc66c3e4c6b41322167641a60a8593a/torch/distributed/distributed_c10d.py#L267 default_device_backend_map = { "cuda": "nccl", "cpu": "gloo", @@ -125,7 +125,7 @@ def get_distributed_backend(device_type: str, enable_cpu_offload: bool = False) backend = "nccl" if device_type in default_device_backend_map: backend = default_device_backend_map[device_type] - if enable_cpu_offload: + if offload_ops_to_cpu: backend = f"{device_type}:{backend},cpu:gloo" return backend diff --git a/torchtune/training/checkpointing/_checkpointer.py b/torchtune/training/checkpointing/_checkpointer.py index 13a71a120e..67eb5f5da8 100644 --- a/torchtune/training/checkpointing/_checkpointer.py +++ b/torchtune/training/checkpointing/_checkpointer.py @@ -43,7 +43,7 @@ SUFFIXES_TO_NOT_COPY, TORCH_INDEX_FNAME, ) -from torchtune.utils import get_logger, log_rank_zero, get_world_size_and_rank +from torchtune.utils import get_logger, get_world_size_and_rank, log_rank_zero logger = get_logger("DEBUG") From 4659938952a9805674203a9cd1b09c9e2a354471 Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Wed, 29 Jan 2025 08:24:07 -0500 Subject: [PATCH 17/29] Pass test --- tests/torchtune/training/test_distributed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/torchtune/training/test_distributed.py b/tests/torchtune/training/test_distributed.py index d4821348f7..ca47dcb828 100644 --- a/tests/torchtune/training/test_distributed.py +++ b/tests/torchtune/training/test_distributed.py @@ -91,7 +91,7 @@ def test_get_distributed_backend(self) -> None: assert training.get_distributed_backend("cuda") == "nccl" assert training.get_distributed_backend("cpu") == "gloo" assert ( - training.get_distributed_backend("cuda", enable_cpu_offload=True) + training.get_distributed_backend("cuda", offload_ops_to_cpu=True) == "cuda:nccl,cpu:gloo" ) From 693b8cb016596e9191c7417264b0a51b6e8d653c Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Wed, 29 Jan 2025 11:05:03 -0500 Subject: [PATCH 18/29] Updates to tutorial --- docs/source/tutorials/multinode.rst | 42 +++++++++++++++-------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/docs/source/tutorials/multinode.rst b/docs/source/tutorials/multinode.rst index 2c0c58186c..c162b203fa 100644 --- a/docs/source/tutorials/multinode.rst +++ b/docs/source/tutorials/multinode.rst @@ -22,37 +22,39 @@ through your new life, complete with a big backyard, new car, and of course - a * Be familiar with distributed training in torchtune * Already know basic SLURM commands +.. _advantages_multi_node_label: Advantages of multi-node training --------------------------------- More machines means more memory! This is cool for several reasons: -1. **Bigger models**: With more memory, you can train larger models such as `Llama3.1 405B `_, Deepseek-V3, and more. +1. **Bigger models**: With more memory, you can train larger models such as `Llama3.1 405B `_, `Deepseek-V3 `_, and more. 2. 
**Longer data**: More many tasks like writing code, it's helpful to have long context lengths; however longer context length means more memory needed for activations. -3. **Higher quality**: With more memory, you can do full parameter updates (not LoRA) and use optimizers like AdamW (not low-precision optimizers),both of which can potentially improve the quality of your training. +3. **Higher quality**: With more memory, you can do full parameter updates (not LoRA) and use optimizers like `AdamW `_ (not low-precision optimizers), both of which can potentially improve the quality of your training. 4. **Faster training**: With the ability to fit more data in memory, you can use higher batch sizes *and* turn off memory optimizations like :ref:`activation checkpointing` thereby decreasing the time it takes for training to complete. .. note:: - **Low inter-node bandwidth & FSDP** We utilize Fully Sharded Data Parallel to distribute models over multiple devices. In order to distribute training, FSDP runs an all-gather operation - for each forward pass and an all-gather plus a scatter-reduce operation for each backwards pass. These operations (usually) block training from continuing until completed and with a slow - inter-node connection, training speed may be reduced. + **Low inter-node bandwidth & FSDP** We utilize `Fully Sharded Data Parallel `_ to distribute models over multiple devices. In order to distribute training, FSDP runs an `all-gather `_ operation + for each forward pass and an all-gather plus a `scatter-reduce `_ operation for each backwards pass. These operations (usually) block training from continuing until completed and with a slow + inter-node connection, training speed may be reduced. For more on this, please refer to `this Github Issue `_. Training Llama3.3 70B on 2 nodes -------------------------------- -Let's get training! We'll be utilizing a common cluster setup called SLURM and assume you have a decent working knowledge of SLURM for this tutorial. -First, we need to install torchtune. Although pretty much as straightforward as the normal install instructions, +Let's get training! We'll be utilizing a common cluster workflow manager called `SLURM `_ and assume you have a decent working knowledge of SLURM for this tutorial. +First, we need to install torchtune. Although pretty much as straightforward as the :ref:`normal install instructions`, it's recommended that you install the package into a virtual environment that is accessible from all nodes in your cluster like a shared filesystem. -Next, we need to download the Llama3.3 70B model to the shared fs. (You'll need to make sure you have the correct credentials as noted before.) +Next, we need to download the `Llama3.3 70B `_ model to your shared filesystem. You'll need to make sure you have the correct credentials following the steps +outlined :ref:`here`. .. code-block:: bash $ tune download meta-llama/Llama-3.3-70B-Instruct --ignore-patterns "consolidated/*.pth" --output-dir SHARED_FS/Llama-3.3-70B-Instruct -Now that we have a downloaded model, let's check out the bash script. +Now that we have a downloaded model, let's check out the SLURM bash script. .. code-block:: bash @@ -63,22 +65,21 @@ Now that we have a downloaded model, let's check out the bash script. **There's a lot of information in this script but here are the high-level parts:** * We utilize SLURM specific commands like number of nodes, tasks, CPUs available, etc. 
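
  For reference, the SLURM-specific commands this bullet refers to are the ``#SBATCH`` directives at the top
  of the script. A rough sketch of what they can look like is below - the exact values are illustrative
  placeholders, not the ones shipped in torchtune's script:

  .. code-block:: bash

      #SBATCH --job-name=torchtune-multinode   # name shown in squeue
      #SBATCH --nodes=2                        # number of machines to reserve
      #SBATCH --ntasks-per-node=1              # one launcher process per node
      #SBATCH --cpus-per-task=96               # CPU cores available to that launcher
      #SBATCH --gpus-per-node=8                # GPUs on each node
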
-* We are using `torchrun` and the `full_finetune_distributed` recipe to train just like on single node -* Should consider several cluster-specific environment variables +* We are using `torchrun `_ and the `full_finetune_distributed `_ recipe to train just like on single node +* You should consider several cluster-specific environment variables to maximize GPU utilization .. note:: - We may need to explicitly set the network interface for distributed backends. You can read more about that [here] - but it's also helpful to know that you can find your network interface by running `ipconfig` from a specific node. - You'll see the output. + We may need to explicitly set the network interface for distributed backends. You can read more about `PyTorch distributed backends here `_ + but it's also helpful to know that you can find your network interface by running `ipconfig `_ from a specific node. -Once we update the shared filesystem in the bash script, we can launch using sbatch. +After we update the shared filesystem in the bash script, we can launch using `sbatch `_. .. code-block:: bash sbatch full_finetune_multinode.slurm -And the output of `squeue` should show our job running: +And the output of `squeue `_ should show our job running: .. code-block:: bash @@ -94,9 +95,10 @@ Future development We've covered the basics of how to launch a fine-tuning job with SLURM on two nodes with FSDP. There's still more things we're cooking up, including... -**2D parallelism**: Utilizing both FSDP *and* tensor parallelism will decrease memory requirements even further, allowing us to lean even harder -into the advantages listed . +**2D parallelism**: Utilizing both FSDP *and* tensor parallelism in what is commonly referred to as `2D parallelism `_ will decrease memory requirements even further, allowing us to lean even harder +into the advantages listed :ref:`above`. -**Longer context (ring attention, etc)**: +**Longer context (ring attention, etc)**: More memory and more machines means we can train on longer sequences and tag advantage of neat tricks like ring attention, where tokens are split across +GPUs. You can read more about our plans for torchtune in `this Github RFC `_. -**Want other optimizations?** Feel free to let us know by opening up a Github Issue on our repo or dropping us a line in Discord! +**Want other optimizations?** Feel free to let us know by `opening up a Github Issue `_ on our repo or `dropping us a line in Discord `_! From 3d8d73d192f283aacbe875a2dc2dc1f0e567722f Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Wed, 29 Jan 2025 11:19:40 -0500 Subject: [PATCH 19/29] Remove full_finetune_multinode from recipes registry --- docs/source/tutorials/multinode.rst | 6 +----- torchtune/_recipe_registry.py | 7 +------ 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/docs/source/tutorials/multinode.rst b/docs/source/tutorials/multinode.rst index c162b203fa..e4e176e65d 100644 --- a/docs/source/tutorials/multinode.rst +++ b/docs/source/tutorials/multinode.rst @@ -54,11 +54,7 @@ outlined :ref:`here`. $ tune download meta-llama/Llama-3.3-70B-Instruct --ignore-patterns "consolidated/*.pth" --output-dir SHARED_FS/Llama-3.3-70B-Instruct -Now that we have a downloaded model, let's check out the SLURM bash script. - -.. code-block:: bash - - $ tune cp full_finetune_multinode . +Now that we have a downloaded model, let's check out our example SLURM bash script. .. 
literalinclude:: ../../../recipes/full_finetune_multinode.slurm diff --git a/torchtune/_recipe_registry.py b/torchtune/_recipe_registry.py index 177eed45c4..b5692e9d7c 100644 --- a/torchtune/_recipe_registry.py +++ b/torchtune/_recipe_registry.py @@ -107,6 +107,7 @@ class Recipe: Config(name="llama3/70B_full", file_path="llama3/70B_full.yaml"), Config(name="llama3_1/70B_full", file_path="llama3_1/70B_full.yaml"), Config(name="llama3_3/70B_full", file_path="llama3_3/70B_full.yaml"), + Config(name="llama3_3/70B_full_multinode", file_path="llama3_3/70B_full_multinode.yaml"), Config(name="mistral/7B_full", file_path="mistral/7B_full.yaml"), Config(name="gemma/2B_full", file_path="gemma/2B_full.yaml"), Config(name="gemma/7B_full", file_path="gemma/7B_full.yaml"), @@ -561,12 +562,6 @@ class Recipe: ], supports_distributed=True, ), - Recipe( - name="full_finetune_multinode", - file_path="full_finetune_multinode.slurm", - configs=[], - supports_distributed=True, - ), ] From c0345a5f2b0761a0cf24b9bfbd0a3ed1be28ab6e Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Wed, 29 Jan 2025 11:21:30 -0500 Subject: [PATCH 20/29] Lint --- torchtune/_recipe_registry.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/torchtune/_recipe_registry.py b/torchtune/_recipe_registry.py index b5692e9d7c..5b5cb60d72 100644 --- a/torchtune/_recipe_registry.py +++ b/torchtune/_recipe_registry.py @@ -107,7 +107,10 @@ class Recipe: Config(name="llama3/70B_full", file_path="llama3/70B_full.yaml"), Config(name="llama3_1/70B_full", file_path="llama3_1/70B_full.yaml"), Config(name="llama3_3/70B_full", file_path="llama3_3/70B_full.yaml"), - Config(name="llama3_3/70B_full_multinode", file_path="llama3_3/70B_full_multinode.yaml"), + Config( + name="llama3_3/70B_full_multinode", + file_path="llama3_3/70B_full_multinode.yaml", + ), Config(name="mistral/7B_full", file_path="mistral/7B_full.yaml"), Config(name="gemma/2B_full", file_path="gemma/2B_full.yaml"), Config(name="gemma/7B_full", file_path="gemma/7B_full.yaml"), From a3aaeb46dae5ca06910e422e0eef9c2f7641ab91 Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Wed, 29 Jan 2025 11:26:47 -0500 Subject: [PATCH 21/29] Last link --- docs/source/tutorials/e2e_flow.rst | 2 ++ docs/source/tutorials/multinode.rst | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/source/tutorials/e2e_flow.rst b/docs/source/tutorials/e2e_flow.rst index 9f39de4b6b..8e3a098d3e 100644 --- a/docs/source/tutorials/e2e_flow.rst +++ b/docs/source/tutorials/e2e_flow.rst @@ -359,6 +359,8 @@ For Llama models, you can run generation directly in torchao on the quantized mo discussed in `this readme `_. This way you can compare your own results to those in the previously-linked table. +.. _use_model_in_wild: + Use your model in the wild -------------------------- diff --git a/docs/source/tutorials/multinode.rst b/docs/source/tutorials/multinode.rst index e4e176e65d..5ddd73cdca 100644 --- a/docs/source/tutorials/multinode.rst +++ b/docs/source/tutorials/multinode.rst @@ -83,7 +83,7 @@ And the output of `squeue `_ should show JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) 1 train torchtun slurm R 0:03 2 slurm-worker-[1-2] -Once training has completed, we can follow the instructions [here] in order to upload our beautiful new model to the Hugging Face Hub! +Once training has completed, we can follow the :ref:`instructions here` in order to upload our beautiful new model to the Hugging Face Hub! 
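
If it helps to see what that step looks like, here is a minimal sketch of pushing the final checkpoint
directory to the Hub with the ``huggingface_hub`` CLI. The repository name and local path below are
placeholders - the real path depends on the ``output_dir`` and checkpointer settings in your config:

.. code-block:: bash

    # one-time login with a write-enabled token
    huggingface-cli login

    # upload the directory containing the final checkpoint files
    huggingface-cli upload my-username/llama3_3-70B-finetuned SHARED_FS/Llama-3.3-70B-Instruct/finetuned
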
Future development ------------------ From b56b6bee9f97ddfa47e6b815ce77c554dc18d573 Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Thu, 30 Jan 2025 20:43:26 -0500 Subject: [PATCH 22/29] Evan updates --- docs/source/tutorials/multinode.rst | 13 ++++++------- recipes/configs/llama3_3/70B_full_multinode.yaml | 2 -- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/docs/source/tutorials/multinode.rst b/docs/source/tutorials/multinode.rst index 5ddd73cdca..e200d4c82d 100644 --- a/docs/source/tutorials/multinode.rst +++ b/docs/source/tutorials/multinode.rst @@ -6,8 +6,7 @@ Multi-node finetuning Congratulations! After years of being "GPU poor", you've worked hard, saved your hard earned Bitcoin and graduated to the so-called **"GPU middle class"**. In many ways, your worries of yesteryear are gone (memory efficient training, who??). -But, new problems are on the horizon for you because multi-node is a whole new beast. Come with me as I take you -through your new life, complete with a big backyard, new car, and of course - a nice rack of H100s. +But new problems are on the horizon for you because multi-node can be a whole new beast. .. grid:: 2 @@ -30,14 +29,14 @@ Advantages of multi-node training More machines means more memory! This is cool for several reasons: 1. **Bigger models**: With more memory, you can train larger models such as `Llama3.1 405B `_, `Deepseek-V3 `_, and more. -2. **Longer data**: More many tasks like writing code, it's helpful to have long context lengths; however longer context length means more memory needed for activations. +2. **Longer data**: For many fine-tuning tasks like writing code, it's helpful to have long context lengths; however longer context length means more memory needed for activations. 3. **Higher quality**: With more memory, you can do full parameter updates (not LoRA) and use optimizers like `AdamW `_ (not low-precision optimizers), both of which can potentially improve the quality of your training. 4. **Faster training**: With the ability to fit more data in memory, you can use higher batch sizes *and* turn off memory optimizations like :ref:`activation checkpointing` thereby decreasing the time it takes for training to complete. .. note:: - **Low inter-node bandwidth & FSDP** We utilize `Fully Sharded Data Parallel `_ to distribute models over multiple devices. In order to distribute training, FSDP runs an `all-gather `_ operation - for each forward pass and an all-gather plus a `scatter-reduce `_ operation for each backwards pass. These operations (usually) block training from continuing until completed and with a slow + **Low inter-node bandwidth & FSDP** We utilize PyTorch's **Fully Sharded Data Parallel** to distribute models over multiple devices. In order to distribute training, FSDP runs an `all-gather `_ operation + for each forward pass and an all-gather (usually) plus a `reduce-scatter `_ operation for each backwards pass. These operations (usually) block training from continuing until completed and with a slow inter-node connection, training speed may be reduced. For more on this, please refer to `this Github Issue `_. Training Llama3.3 70B on 2 nodes @@ -62,7 +61,7 @@ Now that we have a downloaded model, let's check out our example SLURM bash scri * We utilize SLURM specific commands like number of nodes, tasks, CPUs available, etc. 
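
  To make this first bullet a bit more concrete: the script asks the scheduler which nodes it was given and
  elects the first one as the rendezvous host for ``torchrun``. The sketch below is loosely adapted from the
  head-node logic in ``recipes/full_finetune_multinode.slurm`` (quoted later in this series); the port and the
  training entrypoint are placeholders:

  .. code-block:: bash

      # list the hostnames in this job's allocation and pick the first as head node
      nodes=( $( scontrol show hostnames "$SLURM_JOB_NODELIST" ) )
      head_node=${nodes[0]}

      # every node launches torchrun, all pointing at the same rendezvous endpoint
      srun torchrun --nnodes=2 --nproc_per_node=8 \
          --rdzv_backend=c10d --rdzv_endpoint="${head_node}:29500" \
          YOUR_RECIPE_SCRIPT.py --config YOUR_CONFIG.yaml
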
* We are using `torchrun `_ and the `full_finetune_distributed `_ recipe to train just like on single node -* You should consider several cluster-specific environment variables to maximize GPU utilization +* You can consider several cluster-specific environment variables (``NCCL_BUFFSIZE``, ``NCCL_DEBUG``, ``FI_PROVIDER``, etc.) in order to maximize GPU utilization, debug, and more. .. note:: @@ -83,7 +82,7 @@ And the output of `squeue `_ should show JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) 1 train torchtun slurm R 0:03 2 slurm-worker-[1-2] -Once training has completed, we can follow the :ref:`instructions here` in order to upload our beautiful new model to the Hugging Face Hub! +Once training has completed, which should take roughly seven minutes in total with the default config, we can follow the :ref:`instructions here` in order to upload our beautiful new model to the Hugging Face Hub! Future development ------------------ diff --git a/recipes/configs/llama3_3/70B_full_multinode.yaml b/recipes/configs/llama3_3/70B_full_multinode.yaml index 4572792661..d7a09422d1 100644 --- a/recipes/configs/llama3_3/70B_full_multinode.yaml +++ b/recipes/configs/llama3_3/70B_full_multinode.yaml @@ -46,8 +46,6 @@ epochs: 1 optimizer: _component_: torch.optim.AdamW lr: 2e-5 - # Note: highly recommended to use fused=True optimizer flag - # with CPU offload for faster optimizer step. fused: True loss: From 63eb2746764a79f5be83bfb927593c38dadf5626 Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Fri, 31 Jan 2025 10:29:45 -0500 Subject: [PATCH 23/29] Update comment --- recipes/full_finetune_multinode.slurm | 1 + 1 file changed, 1 insertion(+) diff --git a/recipes/full_finetune_multinode.slurm b/recipes/full_finetune_multinode.slurm index 6e83ba4f62..aa5c388fc8 100644 --- a/recipes/full_finetune_multinode.slurm +++ b/recipes/full_finetune_multinode.slurm @@ -15,6 +15,7 @@ # ---------- Set env variables ---------- # # Grab the IP for head node: +# You may need to set this to the fully qualified domain name of your head node nodes=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) ) nodes_array=($nodes) head_node=${nodes_array[0]} From 4d027b07494954c384280ab592241df44915fef9 Mon Sep 17 00:00:00 2001 From: joecummings Date: Fri, 31 Jan 2025 13:16:59 -0800 Subject: [PATCH 24/29] Move process initialization --- recipes/full_finetune_distributed.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py index 7cc4c1342c..3aa8c36fab 100644 --- a/recipes/full_finetune_distributed.py +++ b/recipes/full_finetune_distributed.py @@ -138,9 +138,6 @@ def __init__(self, cfg: DictConfig) -> None: ) self._log_peak_memory_stats = False - _, rank = utils.get_world_size_and_rank() - self._is_rank_zero = rank == 0 - # Training cfg self._resume_from_checkpoint = cfg.resume_from_checkpoint self._enable_async_checkpointing = cfg.get("enable_async_checkpointing", False) @@ -149,11 +146,16 @@ def __init__(self, cfg: DictConfig) -> None: self._clip_grad_norm = cfg.get("clip_grad_norm", None) self._checkpoint_client = CheckpointClient(cfg) self.fsdp_cpu_offload = cfg.get("fsdp_cpu_offload", False) + + # Set up the backend for distributed training (NCCL, GLOO, etc.) 
self.distributed_backend = training.get_distributed_backend( device_type, offload_ops_to_cpu=self.fsdp_cpu_offload or self._enable_async_checkpointing, ) + init_process_group(self.distributed_backend) + _, rank = utils.get_world_size_and_rank() + self._is_rank_zero = rank == 0 # Optimizer in backward is not compatible with gradient accumulation or gradient clipping if self._optimizer_in_bwd: @@ -247,9 +249,6 @@ def setup(self, cfg: DictConfig) -> None: Setup the recipe. This includes training state (if resume_from_checkpoint is True), model, tokenizer, loss, optimizer, lr scheduler, sampler, and dataloader. """ - # Set up the backend for distributed training (NCCL, GLOO, etc.) - init_process_group(self.distributed_backend) - if self.fsdp_cpu_offload: # Utilize all available CPU cores for intra-op parallelism. This provides ~2x # speed up when benchmarking fused AdamW on CPU From 34aa18b8f9fe84971d9129eb8dc95e9db0113b19 Mon Sep 17 00:00:00 2001 From: joecummings Date: Sat, 1 Feb 2025 08:25:25 -0800 Subject: [PATCH 25/29] Move init process group to above checkpoint instantiation --- docs/source/tutorials/multinode.rst | 2 +- recipes/full_finetune_distributed.py | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/source/tutorials/multinode.rst b/docs/source/tutorials/multinode.rst index e200d4c82d..6d393e3ab4 100644 --- a/docs/source/tutorials/multinode.rst +++ b/docs/source/tutorials/multinode.rst @@ -82,7 +82,7 @@ And the output of `squeue `_ should show JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) 1 train torchtun slurm R 0:03 2 slurm-worker-[1-2] -Once training has completed, which should take roughly seven minutes in total with the default config, we can follow the :ref:`instructions here` in order to upload our beautiful new model to the Hugging Face Hub! +Once training has completed, which should take roughly seven minutes in total (880 tok/s) with the default config, we can follow the :ref:`instructions here` in order to upload our beautiful new model to the Hugging Face Hub! Future development ------------------ diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py index 3aa8c36fab..8acf1f2e45 100644 --- a/recipes/full_finetune_distributed.py +++ b/recipes/full_finetune_distributed.py @@ -127,6 +127,16 @@ def __init__(self, cfg: DictConfig) -> None: "full fp16 training is not supported with this recipe. Please use bf16 or fp32 instead." ) + # Set up the backend for distributed training (NCCL, GLOO, etc.) + self.distributed_backend = training.get_distributed_backend( + device_type, + offload_ops_to_cpu=self.fsdp_cpu_offload + or self._enable_async_checkpointing, + ) + init_process_group(self.distributed_backend) + _, rank = utils.get_world_size_and_rank() + self._is_rank_zero = rank == 0 + # Logging attributes self._output_dir = cfg.output_dir self._log_every_n_steps = cfg.get("log_every_n_steps", 1) @@ -147,16 +157,6 @@ def __init__(self, cfg: DictConfig) -> None: self._checkpoint_client = CheckpointClient(cfg) self.fsdp_cpu_offload = cfg.get("fsdp_cpu_offload", False) - # Set up the backend for distributed training (NCCL, GLOO, etc.) 
- self.distributed_backend = training.get_distributed_backend( - device_type, - offload_ops_to_cpu=self.fsdp_cpu_offload - or self._enable_async_checkpointing, - ) - init_process_group(self.distributed_backend) - _, rank = utils.get_world_size_and_rank() - self._is_rank_zero = rank == 0 - # Optimizer in backward is not compatible with gradient accumulation or gradient clipping if self._optimizer_in_bwd: if self._clip_grad_norm is not None: From 30b73665dabe319ae15f5320d4308e7332d64082 Mon Sep 17 00:00:00 2001 From: joecummings Date: Sat, 1 Feb 2025 08:32:47 -0800 Subject: [PATCH 26/29] Update intro --- docs/source/tutorials/multinode.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/source/tutorials/multinode.rst b/docs/source/tutorials/multinode.rst index 6d393e3ab4..bada6c6708 100644 --- a/docs/source/tutorials/multinode.rst +++ b/docs/source/tutorials/multinode.rst @@ -4,9 +4,8 @@ Multi-node finetuning ===================== -Congratulations! After years of being "GPU poor", you've worked hard, saved your hard earned Bitcoin and graduated to the -so-called **"GPU middle class"**. In many ways, your worries of yesteryear are gone (memory efficient training, who??). -But new problems are on the horizon for you because multi-node can be a whole new beast. +Congratulations! After years of being `"GPU poor"`_, you've cobbled together more than a single node of GPUs and therefore graduated to the so-called **"GPU middle class"**. +In many ways, your worries of yesteryear are gone (memory efficient training, who??). But new problems are on the horizon because multi-node can be a whole new beast. .. grid:: 2 From c7fdc219270dcf4d7095b21db7c49aca976f4550 Mon Sep 17 00:00:00 2001 From: joecummings Date: Sat, 1 Feb 2025 08:42:25 -0800 Subject: [PATCH 27/29] Docs r dumb --- docs/source/tutorials/multinode.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/tutorials/multinode.rst b/docs/source/tutorials/multinode.rst index bada6c6708..242ce9293a 100644 --- a/docs/source/tutorials/multinode.rst +++ b/docs/source/tutorials/multinode.rst @@ -4,7 +4,7 @@ Multi-node finetuning ===================== -Congratulations! After years of being `"GPU poor"`_, you've cobbled together more than a single node of GPUs and therefore graduated to the so-called **"GPU middle class"**. +Congratulations! After years of being `"GPU poor" `_, you've cobbled together more than a single node of GPUs and therefore graduated to the so-called **"GPU middle class"**. In many ways, your worries of yesteryear are gone (memory efficient training, who??). But new problems are on the horizon because multi-node can be a whole new beast. .. grid:: 2 From 900d6431610efee562c2af4f0b2510da72823fc1 Mon Sep 17 00:00:00 2001 From: joecummings Date: Sat, 1 Feb 2025 10:02:32 -0800 Subject: [PATCH 28/29] Wow --- recipes/full_finetune_distributed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py index 8acf1f2e45..db4d1b59cc 100644 --- a/recipes/full_finetune_distributed.py +++ b/recipes/full_finetune_distributed.py @@ -128,6 +128,8 @@ def __init__(self, cfg: DictConfig) -> None: ) # Set up the backend for distributed training (NCCL, GLOO, etc.) 
+ self._enable_async_checkpointing = cfg.get("enable_async_checkpointing", False) + self.fsdp_cpu_offload = cfg.get("fsdp_cpu_offload", False) self.distributed_backend = training.get_distributed_backend( device_type, offload_ops_to_cpu=self.fsdp_cpu_offload @@ -150,12 +152,10 @@ def __init__(self, cfg: DictConfig) -> None: # Training cfg self._resume_from_checkpoint = cfg.resume_from_checkpoint - self._enable_async_checkpointing = cfg.get("enable_async_checkpointing", False) self._gradient_accumulation_steps = cfg.gradient_accumulation_steps self._optimizer_in_bwd = cfg.get("optimizer_in_bwd", False) self._clip_grad_norm = cfg.get("clip_grad_norm", None) self._checkpoint_client = CheckpointClient(cfg) - self.fsdp_cpu_offload = cfg.get("fsdp_cpu_offload", False) # Optimizer in backward is not compatible with gradient accumulation or gradient clipping if self._optimizer_in_bwd: From 9e230caefe23bab270937f139211b12dec104d30 Mon Sep 17 00:00:00 2001 From: joecummings Date: Mon, 3 Feb 2025 08:47:51 -0800 Subject: [PATCH 29/29] Rework intro --- docs/source/index.rst | 7 +++++++ docs/source/tutorials/multinode.rst | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 621457c083..13e1bbda56 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -85,6 +85,13 @@ torchtune tutorials. :link: tutorials/llama_kd_tutorial.html :tags: finetuning,llama3,kd +.. customcarditem:: + :header: Multi-node training w/ Llama3.3 70B + :card_description: Fine-tuning a large model on 2+ nodes + :image: _static/img/generic-pytorch-logo.png + :link: tutorials/multinode.html + :tags: multinode,llama3,slurm + .. customcardend:: diff --git a/docs/source/tutorials/multinode.rst b/docs/source/tutorials/multinode.rst index 242ce9293a..b4cc98d771 100644 --- a/docs/source/tutorials/multinode.rst +++ b/docs/source/tutorials/multinode.rst @@ -4,8 +4,8 @@ Multi-node finetuning ===================== -Congratulations! After years of being `"GPU poor" `_, you've cobbled together more than a single node of GPUs and therefore graduated to the so-called **"GPU middle class"**. -In many ways, your worries of yesteryear are gone (memory efficient training, who??). But new problems are on the horizon because multi-node can be a whole new beast. +Congratulations! You've finally escaped the struggles of being "GPU poor" and now have access to a multi-node setup. +You can bid farewell to the days of sweating over memory-efficient optimizations, but get ready for new challenges as you navigate the complexities of distributed computing. .. grid:: 2