# Config for multi-device LoRA finetuning in lora_finetune_distributed.py
# using a Qwen2.5 72B model
#
# This config assumes that you've run the following command before launching
# this run:
# tune download Qwen/Qwen2.5-72B-Instruct --output-dir /tmp/Qwen2_5-72B-Instruct
#
# To launch on 8 devices, run the following command from root:
# tune run --nnodes 1 --nproc_per_node 8 lora_finetune_distributed --config qwen2_5/72B_lora
#
# You can add specific overrides through the command line. For example
# to override the checkpointer directory while launching training
# you can run:
# tune run --nnodes 1 --nproc_per_node 8 lora_finetune_distributed --config qwen2_5/72B_lora checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
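#
# Other fields can be overridden the same way; for example, a hypothetical run with a
# larger LoRA rank (keeping alpha = 2*rank) could use:
# tune run --nnodes 1 --nproc_per_node 8 lora_finetune_distributed --config qwen2_5/72B_lora model.lora_rank=16 model.lora_alpha=32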

# Model Arguments
model:
  _component_: torchtune.models.qwen2_5.lora_qwen2_5_72b_instruct
  lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
  apply_lora_to_mlp: True
  apply_lora_to_output: False
  lora_rank: 8  # higher increases accuracy and memory
  lora_alpha: 16  # usually alpha=2*rank
  lora_dropout: 0.0
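  # With the standard LoRA parameterization, the adapter update is scaled by
  # alpha / rank, so rank 8 with alpha 16 corresponds to a scaling factor of 16 / 8 = 2.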

tokenizer:
  _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer
  path: /tmp/Qwen2_5-72B-Instruct/vocab.json
  merges_file: /tmp/Qwen2_5-72B-Instruct/merges.txt
  max_seq_len: null

checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: /tmp/Qwen2_5-72B-Instruct
  checkpoint_files: [
    model-00001-of-00037.safetensors,
    model-00002-of-00037.safetensors,
    model-00003-of-00037.safetensors,
    model-00004-of-00037.safetensors,
    model-00005-of-00037.safetensors,
    model-00006-of-00037.safetensors,
    model-00007-of-00037.safetensors,
    model-00008-of-00037.safetensors,
    model-00009-of-00037.safetensors,
    model-00010-of-00037.safetensors,
    model-00011-of-00037.safetensors,
    model-00012-of-00037.safetensors,
    model-00013-of-00037.safetensors,
    model-00014-of-00037.safetensors,
    model-00015-of-00037.safetensors,
    model-00016-of-00037.safetensors,
    model-00017-of-00037.safetensors,
    model-00018-of-00037.safetensors,
    model-00019-of-00037.safetensors,
    model-00020-of-00037.safetensors,
    model-00021-of-00037.safetensors,
    model-00022-of-00037.safetensors,
    model-00023-of-00037.safetensors,
    model-00024-of-00037.safetensors,
    model-00025-of-00037.safetensors,
    model-00026-of-00037.safetensors,
    model-00027-of-00037.safetensors,
    model-00028-of-00037.safetensors,
    model-00029-of-00037.safetensors,
    model-00030-of-00037.safetensors,
    model-00031-of-00037.safetensors,
    model-00032-of-00037.safetensors,
    model-00033-of-00037.safetensors,
    model-00034-of-00037.safetensors,
    model-00035-of-00037.safetensors,
    model-00036-of-00037.safetensors,
    model-00037-of-00037.safetensors,
  ]
  recipe_checkpoint: null
  output_dir: /tmp/Qwen2_5-72B-Instruct-lora-finetune
  model_type: QWEN2
resume_from_checkpoint: False

# Dataset and Sampler
dataset:
  _component_: torchtune.datasets.alpaca_dataset
  packed: False  # True increases speed
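  # Note: enabling packing typically requires a finite tokenizer.max_seq_len, e.g.
  # (hypothetical values) dataset.packed=True tokenizer.max_seq_len=4096 on the command line.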
seed: null
shuffle: True
batch_size: 2

# Optimizer and Scheduler
optimizer:
  _component_: torch.optim.AdamW
  fused: True
  weight_decay: 0.01
  lr: 3e-4

lr_scheduler:
  _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup
  num_warmup_steps: 100

loss:
  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
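  # CEWithChunkedOutputLoss computes the cross-entropy over chunks of the output
  # logits, which is intended to reduce peak memory for large vocabularies.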

# Training
epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 1 # Use to increase virtual batch size
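# With batch_size=2, gradient_accumulation_steps=1, and 8 devices, the effective
# global batch size is 2 * 1 * 8 = 16.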
compile: False # pytorch compile, set to true for better perf/memory

# Logging
output_dir: /tmp/Qwen2_5-72B-Instruct-lora-finetune
metric_logger:
  _component_: torchtune.training.metric_logging.DiskLogger
  log_dir: ${output_dir}
log_every_n_steps: 1
log_peak_memory_stats: False
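# To log to Weights & Biases instead of local disk, the metric_logger above could be
# swapped out, e.g. (hypothetical project name):
# metric_logger:
#   _component_: torchtune.training.metric_logging.WandBLogger
#   project: qwen2_5-72b-lora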

# Environment
device: cuda
dtype: bf16
enable_activation_checkpointing: True  # True reduces memory
enable_activation_offloading: False  # True reduces memory
# custom_sharded_layers: ['tok_embeddings']  # Layers to shard separately (useful for large vocab size models). Lower memory, but lower speed.

# Showcase the usage of the PyTorch profiler
# Set enabled to False as it's only needed for debugging training
profiler:
  _component_: torchtune.training.setup_torch_profiler
  enabled: False

  # Output directory of trace artifacts
  output_dir: ${output_dir}/profiling_outputs

  # `torch.profiler.ProfilerActivity` types to trace
  cpu: True
  cuda: True

  # Trace options passed to `torch.profiler.profile`
  profile_memory: False
  with_stack: False
  record_shapes: True
  with_flops: False

  # `torch.profiler.schedule` options:
  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
  wait_steps: 5
  warmup_steps: 5
  active_steps: 2
  num_cycles: 1
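  # With these values, each profiling cycle waits 5 steps, warms up for 5, then records
  # 2 active steps, i.e. roughly torch.profiler.schedule(wait=5, warmup=5, active=2, repeat=1).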