99 changes: 99 additions & 0 deletions examples/llama3/conf/train/1b_lora.yaml
@@ -0,0 +1,99 @@
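# LoRA fine-tuning recipe for a 1B Llama 3-family model (16 layers, hidden size 2048); composed into a run via conf/train_nemo_llama.yaml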
system:
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  context_parallel_size: 2
  disable_bias_linear: True
  use_flash_attn: True
  sequence_parallel: False
  use_distributed_optimizer: True
  precision:
    bf16: True
    attention_softmax_in_fp32: True
    accumulate_allreduce_grads_in_fp32: False
    num_layers_at_start_in_bf16: 0
    num_layers_at_end_in_bf16: 0
  logging:
    log_interval: 1
    tensorboard_log_interval: 1
    wandb_project: ${experiment.exp_name}
    wandb_exp_name: ${experiment.exp_name}
  checkpoint:
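    # 'xxx' is a placeholder for the pretrained base-model checkpoint directory to load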
    load: xxx
    load_adapter: False
    ckpt_format: nemo_zarr
    auto_detect_ckpt_format: False
    save_interval: 1
    finetune: True


model:
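  # LoRA adapter settings: rank 16, alpha 32, dropout 0.1, applied to linear_qkv and linear_proj (the attention input and output projections)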
  peft_type: lora
  lora_target_modules: ["linear_proj", "linear_qkv"]
  lora_dim: 16
  lora_alpha: 32
  lora_dropout: 0.1
  lora_dropout_position: pre
  lora_in_init_method: kaiming
  lora_out_init_method: zero

  no_gradient_accumulation_fusion: True
  use_mcore_models: True
  transformer_impl: transformer_engine
  num_layers: 16
  hidden_size: 2048
  ffn_hidden_size: 8192
  num_attention_heads: 32
  seq_length: 8192
  group_query_attention: True
  num_query_groups: 8
  max_position_embeddings: 8192

  norm_epsilon: 1e-5
  use_rotary_position_embeddings: True
  no_position_embedding: True
  swiglu: True
  normalization: RMSNorm
  position_embedding_type: rope
  use_rope_scaling: True
  rope_scaling_factor: 32.0
  rotary_base: 500000
  untie_embeddings_and_output_weights: False
  init_method_std: 0.02
  attention_dropout: 0.0
  hidden_dropout: 0.0
  clip_grad: 0.3

  train_iters: 10000
  eval_iters: 100
  eval_interval: 1000
  micro_batch_size: 1
  global_batch_size: 2

  no_load_optim: True
  no_load_rng: True
  optimizer:
    weight_decay: 1e-4
    adam_beta1: 0.9
    adam_beta2: 0.999
    adam_eps: 1e-08
    main_grads_dtype: bf16
    main_params_dtype: fp16
    use_distributed_optimizer: True
    use_precision_aware_optimizer: True
    lr_scheduler:
      lr: 0.0004
      min_lr: 0
      lr_decay_style: cosine
  seed: 1234


data:
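  # data_path and tokenizer_path are placeholders: set them to your preprocessed dataset prefix and Llama 3 tokenizer location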
  data_path: xxx
  dataloader_type: external
  split: 1
  num_workers: 1
  tokenizer:
    tokenizer_type: Llama3TokenizerFS
    tokenizer_path: xxx
    vocab_size: 128256
    make_vocab_size_divisible_by: 64
93 changes: 93 additions & 0 deletions examples/llama3/conf/train/70b_lora.yaml
@@ -0,0 +1,93 @@
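# 70B variant of the 1b_lora.yaml recipe: same LoRA settings, Llama 3 70B model dimensions (80 layers, hidden size 8192)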
system:
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  context_parallel_size: 1
  disable_bias_linear: True
  use_flash_attn: True
  sequence_parallel: True
  use_distributed_optimizer: True
  precision:
    bf16: True
    attention_softmax_in_fp32: True
    accumulate_allreduce_grads_in_fp32: False
    num_layers_at_start_in_bf16: 0
    num_layers_at_end_in_bf16: 0
  logging:
    log_interval: 1
    tensorboard_log_interval: 1
    wandb_project: ${experiment.exp_name}
    wandb_exp_name: ${experiment.exp_name}
  checkpoint:
    load: xxx
    load_adapter: False
    ckpt_format: nemo_zarr
    auto_detect_ckpt_format: False
    save_interval: 20
    finetune: True

model:
  peft_type: lora
  lora_target_modules: ["linear_proj", "linear_qkv"]
  lora_dim: 16
  lora_alpha: 32
  lora_dropout: 0.1
  lora_dropout_position: pre
  lora_in_init_method: kaiming
  lora_out_init_method: zero

  no_gradient_accumulation_fusion: True
  use_mcore_models: True
  transformer_impl: transformer_engine
  num_layers: 80
  hidden_size: 8192
  ffn_hidden_size: 28672
  num_attention_heads: 64
  seq_length: 8192
  group_query_attention: True
  num_query_groups: 8
  max_position_embeddings: 8192

  norm_epsilon: 1e-5
  use_rotary_position_embeddings: True
  no_position_embedding: True
  swiglu: True
  normalization: RMSNorm
  position_embedding_type: rope
  rotary_base: 500000
  untie_embeddings_and_output_weights: True
  init_method_std: 0.02
  attention_dropout: 0.0
  hidden_dropout: 0.0
  clip_grad: 0.3

  train_samples: 10000
  eval_iters: 100
  eval_interval: 1000
  micro_batch_size: 1
  global_batch_size: 2

  optimizer:
    weight_decay: 1e-4
    adam_beta1: 0.9
    adam_beta2: 0.999
    adam_eps: 1e-08
    main_grads_dtype: bf16
    main_params_dtype: fp16
    use_distributed_optimizer: True
    use_precision_aware_optimizer: True
    lr_scheduler:
      lr: 0.0004
      min_lr: 0
      lr_decay_style: cosine
  seed: 1234

data:
  data_path: xxx
  dataloader_type: external
  split: 1
  tokenizer:
    tokenizer_type: Llama3TokenizerFS
    tokenizer_path: xxx
    vocab_size: 128256
    make_vocab_size_divisible_by: 64
29 changes: 29 additions & 0 deletions examples/llama3/conf/train_nemo_llama.yaml
@@ -0,0 +1,29 @@
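# Entry config: Hydra composes the train/1b_lora.yaml recipe (see 'defaults' below) with the experiment, launcher, and environment settings defined here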
defaults:
  - train: 1b_lora
  - _self_

experiment:
  exp_name: llama3
  exp_dir: ./outputs_llama3_1b_lora
  task:
    type: train
    backend: megatron
    entrypoint: ./flagscale/train/train_nemo_llama.py
  runner:
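    # 1 node x 2 GPUs: world size 2 matches tensor(1) x pipeline(1) x context(2) parallelism in train/1b_lora.yaml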
    backend: torchrun
    nnodes: 1
    nproc_per_node: 2
    hostfile: null
  envs:
    CUDA_VISIBLE_DEVICES: 0,1
    CUDA_DEVICE_MAX_CONNECTIONS: 1
    NVTE_APPLY_QK_LAYER_SCALING: 0
    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
    MKL_SERVICE_FORCE_INTEL: 1
    CUBLAS_WORKSPACE_CONFIG: :4096:8
    NCCL_ALGO: Ring
action: run

hydra:
  run:
    dir: ${experiment.exp_dir}/hydra