$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
name: translation_finetune
version: 0.0.72
type: command
is_deterministic: true
display_name: Translation Finetune
description: Component to finetune Hugging Face pretrained models for the translation task. The component supports optimizations such as LoRA, DeepSpeed and ONNX Runtime for performance enhancement. See [docs](https://aka.ms/azureml/components/translation_finetune) to learn more.
environment: azureml://registries/azureml/environments/acft-hf-nlp-gpu/versions/87
code: ../../../src/finetune
distribution:
type: pytorch
inputs:
# Lora parameters
# LoRA reduces the number of trainable parameters by learning pairs of rank-decomposition matrices while freezing the original weights. This vastly reduces the storage requirement for large language models adapted to specific tasks and enables efficient task-switching during deployment, all without introducing inference latency. LoRA also outperforms several other adaptation methods, including adapter, prefix-tuning, and fine-tuning. Currently, LoRA is supported for the gpt2, bert, roberta, deberta, distilbert, t5, bart, mbart and camembert model families. A commented usage sketch follows the LoRA inputs below.
apply_lora:
type: string
enum:
- "true"
- "false"
default: "false"
optional: true
description: If "true" enables lora.
merge_lora_weights:
type: string
enum:
- "true"
- "false"
default: "true"
optional: true
description: If "true", the lora weights are merged with the base Hugging Face model weights before saving.
lora_alpha:
type: integer
default: 128
optional: true
description: Alpha (scaling) parameter for LoRA.
lora_r:
type: integer
default: 8
optional: true
description: LoRA rank (dimension of the low-rank update matrices).
lora_dropout:
type: number
default: 0.0
optional: true
description: lora dropout value
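# A hypothetical sketch of how the LoRA inputs above might be set when this component is consumed in a pipeline job. The node name, the pipeline wiring and the component URI are assumptions for illustration only; the input names and values follow this spec.
#
#   jobs:
#     finetune_node:
#       type: command
#       component: azureml://registries/azureml/components/translation_finetune/versions/0.0.72
#       inputs:
#         apply_lora: "true"
#         merge_lora_weights: "true"
#         lora_alpha: 128
#         lora_r: 8
#         lora_dropout: 0.05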
# Training parameters
num_train_epochs:
type: integer
min: 1
default: 1
optional: true
description: Number of epochs to run for finetuning.
max_steps:
type: integer
default: -1
optional: true
description: If set to a positive number, the total number of training steps to perform. Overrides 'num_train_epochs'. When using a finite iterable dataset, training may stop before reaching the set number of steps once all data is exhausted.
per_device_train_batch_size:
type: integer
min: 1
default: 1
optional: true
description: Per-GPU batch size used for training. The effective training batch size is _per_device_train_batch_size_ * _num_gpus_ * _num_nodes_ (see the worked example after the batch-size inputs below).
per_device_eval_batch_size:
type: integer
min: 1
default: 1
optional: true
description: Per-GPU batch size used for validation. The default value is 1. The effective validation batch size is _per_device_eval_batch_size_ * _num_gpus_ * _num_nodes_.
auto_find_batch_size:
type: string
enum:
- "true"
- "false"
default: "false"
optional: true
description: If set to "true" and if the provided 'per_device_train_batch_size' goes into Out Of Memory (OOM) auto_find_batch_size will find the correct batch size by iteratively reducing batch size by a factor of 2 till the OOM is fixed
optim:
type: string
default: adamw_hf
optional: true
enum:
- adamw_hf
- adamw_torch
# - adamw_apex_fused
- adafactor
description: Optimizer to be used while training
learning_rate:
type: number
default: 0.00002
optional: true
description: Start learning rate used for training.
warmup_steps:
type: integer
default: 0
optional: true
description: Number of steps for the learning rate scheduler warmup phase.
weight_decay:
type: number
default: 0.0
optional: true
description: Weight decay to apply (if not zero) to all layers except bias and LayerNorm weights in the AdamW optimizer.
adam_beta1:
type: number
default: 0.9
optional: true
description: beta1 hyperparameter for the AdamW optimizer
adam_beta2:
type: number
default: 0.999
optional: true
description: beta2 hyperparameter for the AdamW optimizer
adam_epsilon:
type: number
default: 1e-8
optional: true
description: epsilon hyperparameter for the AdamW optimizer
gradient_accumulation_steps:
type: integer
default: 1
optional: true
description: Number of update steps to accumulate the gradients for before performing a backward/update pass.
eval_accumulation_steps:
type: integer
default: -1
optional: true
description: Number of prediction steps to accumulate before moving the tensors to the CPU; passed as None if set to -1.
lr_scheduler_type:
type: string
default: linear
optional: true
enum:
- linear
- cosine
- cosine_with_restarts
- polynomial
- constant
- constant_with_warmup
description: learning rate scheduler to use.
precision:
type: string
enum:
- "32"
- "16"
default: "32"
optional: true
description: Apply mixed precision training. This can reduce memory footprint by performing operations in half-precision.
seed:
type: integer
default: 42
optional: true
description: Random seed that will be set at the beginning of training
enable_full_determinism:
type: string
enum:
- "true"
- "false"
default: "false"
optional: true
description: Ensure reproducible behavior during distributed training. Check this link https://pytorch.org/docs/stable/notes/randomness.html for more details.
dataloader_num_workers:
type: integer
default: 0
optional: true
description: Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process.
ignore_mismatched_sizes:
type: string
enum:
- "true"
- "false"
default: "true"
optional: true
description: If set to "false", an error is raised when some of the weights from the checkpoint do not have the same size as the weights of the model.
max_grad_norm:
type: number
default: 1.0
optional: true
description: Maximum gradient norm (for gradient clipping)
evaluation_strategy:
type: string
default: epoch
optional: true
enum:
- epoch
- steps
description: The evaluation strategy to adopt during training. If set to "steps", either `evaluation_steps_interval` or `eval_steps` must be specified to determine the steps at which evaluation is computed; otherwise, evaluation happens at the end of each epoch. A commented example follows the eval_steps input below.
evaluation_steps_interval:
type: number
default: 0.0
optional: true
description: Evaluation interval expressed as a fraction of an epoch's steps. Overrides eval_steps if not 0.
eval_steps:
type: integer
default: 500
optional: true
description: Number of update steps between two evals if evaluation_strategy='steps'
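# A hypothetical combination of the evaluation inputs above, shown as pipeline-job input values (values are illustrative, not defaults): evaluate every 200 update steps.
#
#   inputs:
#     evaluation_strategy: steps
#     eval_steps: 200
#     evaluation_steps_interval: 0.0   # left at 0.0 so eval_steps is not overridden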
logging_strategy:
type: string
default: steps
optional: true
enum:
- epoch
- steps
description: The logging strategy to adopt during training. If set to "steps", `logging_steps` decides the frequency of logging; otherwise, logging happens at the end of each epoch.
logging_steps:
type: integer
default: 10
optional: true
description: Number of update steps between two logs if logging_strategy='steps'
metric_for_best_model:
type: string
default: loss
optional: true
enum:
- loss
- bleu
description: metric to use to compare two different model checkpoints
resume_from_checkpoint:
type: string
default: "false"
optional: true
enum:
- "true"
- "false"
description: If set to "true", resumes the training from last saved checkpoint. Along with loading the saved weights, saved optimizer, scheduler and random states will be loaded if exist. The default value is "false"
save_total_limit:
type: integer
default: -1
optional: true
description: If a positive value is passed, it limits the total number of checkpoints saved. A value of -1 saves all checkpoints; otherwise, when the number of checkpoints exceeds _save_total_limit_, the older checkpoints are deleted.
# Early Stopping Parameters
apply_early_stopping:
type: string
default: "false"
optional: true
enum:
- "true"
- "false"
description: If set to "true", early stopping is enabled.
early_stopping_patience:
type: integer
default: 1
optional: true
description: Stop training when the metric specified through _metric_for_best_model_ worsens for _early_stopping_patience_ evaluation calls. This value is only valid if _apply_early_stopping_ is set to true.
early_stopping_threshold:
type: number
default: 0.0
optional: true
description: Denotes how much the specified metric must improve to satisfy early stopping conditions. This value is only valid if _apply_early_stopping_ is set to true.
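# A hypothetical early-stopping configuration using the inputs above (values are illustrative, not defaults): stop when the tracked metric fails to improve by at least 0.001 for 3 consecutive evaluation calls.
#
#   inputs:
#     metric_for_best_model: loss
#     apply_early_stopping: "true"
#     early_stopping_patience: 3
#     early_stopping_threshold: 0.001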
# Deepspeed Parameters
# Deepspeed config is a JSON file that can be used to configure optimizer, scheduler, batch size and other training related parameters. A default deepspeed config is used when _apply_deepspeed_ is set to `true`. Alternatively, you can pass your custom deepspeed config. Please follow the [deepspeed docs](https://www.deepspeed.ai/docs/config-json/) to create the custom config; a minimal commented sketch follows the deepspeed inputs below.
# Please note that to enable deepspeed, `apply_deepspeed` must be set to "true"; passing only the `deepspeed` input will not suffice.
apply_deepspeed:
type: string
enum:
- "true"
- "false"
default: "false"
optional: true
description: If set to "true", enables deepspeed for training.
deepspeed:
type: uri_file
optional: true
description: Deepspeed config to be used for finetuning
mode: rw_mount
deepspeed_stage:
type: string
optional: true
default: "2"
enum:
- "2"
- "3"
description: This parameter configures which DEFAULT deepspeed config is used - stage2 or stage3. The default choice is stage2. Note that this parameter is ONLY applicable when the user does not pass any config via the deepspeed input.
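# A minimal sketch of a custom DeepSpeed config that could be passed through the `deepspeed` uri_file input. The keys follow the DeepSpeed JSON schema, but the "auto" values and this particular selection of keys are assumptions; see https://www.deepspeed.ai/docs/config-json/ for the full schema. Remember that `apply_deepspeed` must still be set to "true".
#
#   {
#     "zero_optimization": { "stage": 2 },
#     "train_micro_batch_size_per_gpu": "auto",
#     "gradient_accumulation_steps": "auto",
#     "fp16": { "enabled": "auto" }
#   }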
# ORT Parameters
# ONNX Runtime is a cross-platform machine-learning model accelerator, with a flexible interface to integrate hardware-specific libraries.
apply_ort:
type: string
enum:
- "true"
- "false"
default: "false"
optional: true
description: If set to "true", uses ONNX Runtime training.
# Data and Model inputs
preprocess_output:
type: uri_folder
optional: false
description: Output folder of the preprocess component containing encoded train, validation and test data. The tokenizer is also saved as part of the preprocess output.
mode: rw_mount
model_selector_output:
type: uri_folder
optional: false
description: output folder of model import component containing model artifacts and a metadata file.
mode: rw_mount
outputs:
pytorch_model_folder:
type: uri_folder
description: Output folder containing the _best_ model as defined by _metric_for_best_model_. Along with the best model, the output folder contains checkpoints saved after every evaluation, as defined by _evaluation_strategy_. Each checkpoint contains the model weight(s), config, tokenizer, optimizer, scheduler and random number states.
mode: rw_mount
command: >-
python finetune.py $[[--apply_lora '${{inputs.apply_lora}}']] $[[--merge_lora_weights '${{inputs.merge_lora_weights}}']] $[[--lora_alpha '${{inputs.lora_alpha}}']] $[[--lora_r '${{inputs.lora_r}}']] $[[--lora_dropout '${{inputs.lora_dropout}}']] $[[--num_train_epochs '${{inputs.num_train_epochs}}']] $[[--max_steps '${{inputs.max_steps}}']] $[[--per_device_train_batch_size '${{inputs.per_device_train_batch_size}}']] $[[--per_device_eval_batch_size '${{inputs.per_device_eval_batch_size}}']] $[[--auto_find_batch_size '${{inputs.auto_find_batch_size}}']] $[[--optim '${{inputs.optim}}']] $[[--learning_rate '${{inputs.learning_rate}}']] $[[--warmup_steps '${{inputs.warmup_steps}}']] $[[--weight_decay '${{inputs.weight_decay}}']] $[[--adam_beta1 '${{inputs.adam_beta1}}']] $[[--adam_beta2 '${{inputs.adam_beta2}}']] $[[--adam_epsilon '${{inputs.adam_epsilon}}']] $[[--gradient_accumulation_steps '${{inputs.gradient_accumulation_steps}}']] $[[--eval_accumulation_steps '${{inputs.eval_accumulation_steps}}']] $[[--lr_scheduler_type '${{inputs.lr_scheduler_type}}']] $[[--precision '${{inputs.precision}}']] $[[--seed '${{inputs.seed}}']] $[[--enable_full_determinism '${{inputs.enable_full_determinism}}']] $[[--dataloader_num_workers '${{inputs.dataloader_num_workers}}']] $[[--ignore_mismatched_sizes '${{inputs.ignore_mismatched_sizes}}']] $[[--max_grad_norm '${{inputs.max_grad_norm}}']] $[[--evaluation_strategy '${{inputs.evaluation_strategy}}']] $[[--evaluation_steps_interval '${{inputs.evaluation_steps_interval}}']] $[[--eval_steps '${{inputs.eval_steps}}']] $[[--logging_strategy '${{inputs.logging_strategy}}']] $[[--logging_steps '${{inputs.logging_steps}}']] $[[--metric_for_best_model '${{inputs.metric_for_best_model}}']] $[[--resume_from_checkpoint '${{inputs.resume_from_checkpoint}}']] $[[--save_total_limit '${{inputs.save_total_limit}}']] $[[--apply_early_stopping '${{inputs.apply_early_stopping}}']] $[[--early_stopping_patience '${{inputs.early_stopping_patience}}']] $[[--early_stopping_threshold '${{inputs.early_stopping_threshold}}']] $[[--apply_ort '${{inputs.apply_ort}}']] $[[--apply_deepspeed '${{inputs.apply_deepspeed}}']] $[[--deepspeed '${{inputs.deepspeed}}']] $[[--deepspeed_stage '${{inputs.deepspeed_stage}}']] --model_selector_output '${{inputs.model_selector_output}}' --preprocess_output '${{inputs.preprocess_output}}' --pytorch_model_folder '${{outputs.pytorch_model_folder}}'
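# Hypothetical Azure ML CLI v2 usage around this file (the registry name is a placeholder, and the exact registration workflow is an assumption):
#
#   az ml component create --file spec.yaml --registry-name <registry-name>
#   az ml component show --name translation_finetune --version 0.0.72 --registry-name <registry-name>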