99 changes: 99 additions & 0 deletions examples/llama3/conf/train/1b_lora.yaml
@@ -0,0 +1,99 @@
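# LoRA fine-tuning recipe for a 1B Llama 3-family model (16 layers, hidden size 2048); composed into a run via conf/train_nemo_llama.yaml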
system:
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  context_parallel_size: 2
  disable_bias_linear: True
  use_flash_attn: True
  sequence_parallel: False
  use_distributed_optimizer: True
  precision:
    bf16: True
    attention_softmax_in_fp32: True
    accumulate_allreduce_grads_in_fp32: False
    num_layers_at_start_in_bf16: 0
    num_layers_at_end_in_bf16: 0
  logging:
    log_interval: 1
    tensorboard_log_interval: 1
    wandb_project: ${experiment.exp_name}
    wandb_exp_name: ${experiment.exp_name}
  checkpoint:
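    # 'xxx' is a placeholder for the pretrained base-model checkpoint directory to load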
    load: xxx
    load_adapter: False
    ckpt_format: nemo_zarr
    auto_detect_ckpt_format: False
    save_interval: 1
    finetune: True


model:
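  # LoRA adapter settings: rank 16, alpha 32, dropout 0.1, applied to linear_qkv and linear_proj (the attention input and output projections)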
  peft_type: lora
  lora_target_modules: ["linear_proj", "linear_qkv"]
  lora_dim: 16
  lora_alpha: 32
  lora_dropout: 0.1
  lora_dropout_position: pre
  lora_in_init_method: kaiming
  lora_out_init_method: zero

  no_gradient_accumulation_fusion: True
  use_mcore_models: True
  transformer_impl: transformer_engine
  num_layers: 16
  hidden_size: 2048
  ffn_hidden_size: 8192
  num_attention_heads: 32
  seq_length: 8192
  group_query_attention: True
  num_query_groups: 8
  max_position_embeddings: 8192

  norm_epsilon: 1e-5
  use_rotary_position_embeddings: True
  no_position_embedding: True
  swiglu: True
  normalization: RMSNorm
  position_embedding_type: rope
  use_rope_scaling: True
  rope_scaling_factor: 32.0
  rotary_base: 500000
  untie_embeddings_and_output_weights: False
  init_method_std: 0.02
  attention_dropout: 0.0
  hidden_dropout: 0.0
  clip_grad: 0.3

  train_iters: 10000
  eval_iters: 100
  eval_interval: 1000
  micro_batch_size: 1
  global_batch_size: 2

  no_load_optim: True
  no_load_rng: True
  optimizer:
    weight_decay: 1e-4
    adam_beta1: 0.9
    adam_beta2: 0.999
    adam_eps: 1e-08
    main_grads_dtype: bf16
    main_params_dtype: fp16
    use_distributed_optimizer: True
    use_precision_aware_optimizer: True
    lr_scheduler:
      lr: 0.0004
      min_lr: 0
      lr_decay_style: cosine
  seed: 1234


data:
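  # data_path and tokenizer_path are placeholders: set them to your preprocessed dataset prefix and Llama 3 tokenizer location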
  data_path: xxx
  dataloader_type: external
  split: 1
  num_workers: 1
  tokenizer:
    tokenizer_type: Llama3TokenizerFS
    tokenizer_path: xxx
    vocab_size: 128256
    make_vocab_size_divisible_by: 64
93 changes: 93 additions & 0 deletions examples/llama3/conf/train/70b_lora.yaml
@@ -0,0 +1,93 @@
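# 70B variant of the 1b_lora.yaml recipe: same LoRA settings, Llama 3 70B model dimensions (80 layers, hidden size 8192)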
system:
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  context_parallel_size: 1
  disable_bias_linear: True
  use_flash_attn: True
  sequence_parallel: True
  use_distributed_optimizer: True
  precision:
    bf16: True
    attention_softmax_in_fp32: True
    accumulate_allreduce_grads_in_fp32: False
    num_layers_at_start_in_bf16: 0
    num_layers_at_end_in_bf16: 0
  logging:
    log_interval: 1
    tensorboard_log_interval: 1
    wandb_project: ${experiment.exp_name}
    wandb_exp_name: ${experiment.exp_name}
  checkpoint:
    load: xxx
    load_adapter: False
    ckpt_format: nemo_zarr
    auto_detect_ckpt_format: False
    save_interval: 20
    finetune: True

model:
  peft_type: lora
  lora_target_modules: ["linear_proj", "linear_qkv"]
  lora_dim: 16
  lora_alpha: 32
  lora_dropout: 0.1
  lora_dropout_position: pre
  lora_in_init_method: kaiming
  lora_out_init_method: zero

  no_gradient_accumulation_fusion: True
  use_mcore_models: True
  transformer_impl: transformer_engine
  num_layers: 80
  hidden_size: 8192
  ffn_hidden_size: 28672
  num_attention_heads: 64
  seq_length: 8192
  group_query_attention: True
  num_query_groups: 8
  max_position_embeddings: 8192

  norm_epsilon: 1e-5
  use_rotary_position_embeddings: True
  no_position_embedding: True
  swiglu: True
  normalization: RMSNorm
  position_embedding_type: rope
  rotary_base: 500000
  untie_embeddings_and_output_weights: True
  init_method_std: 0.02
  attention_dropout: 0.0
  hidden_dropout: 0.0
  clip_grad: 0.3

  train_samples: 10000
  eval_iters: 100
  eval_interval: 1000
  micro_batch_size: 1
  global_batch_size: 2

  optimizer:
    weight_decay: 1e-4
    adam_beta1: 0.9
    adam_beta2: 0.999
    adam_eps: 1e-08
    main_grads_dtype: bf16
    main_params_dtype: fp16
    use_distributed_optimizer: True
    use_precision_aware_optimizer: True
    lr_scheduler:
      lr: 0.0004
      min_lr: 0
      lr_decay_style: cosine
  seed: 1234

data:
  data_path: xxx
  dataloader_type: external
  split: 1
  tokenizer:
    tokenizer_type: Llama3TokenizerFS
    tokenizer_path: xxx
    vocab_size: 128256
    make_vocab_size_divisible_by: 64
29 changes: 29 additions & 0 deletions examples/llama3/conf/train_nemo_llama.yaml
@@ -0,0 +1,29 @@
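# Entry config: Hydra composes the train/1b_lora.yaml recipe (see 'defaults' below) with the experiment, launcher, and environment settings defined here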
defaults:
  - train: 1b_lora
  - _self_

experiment:
  exp_name: llama3
  exp_dir: ./outputs_llama3_1b_lora
  task:
    type: train
    backend: megatron
    entrypoint: ./flagscale/train/train_nemo_llama.py
  runner:
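    # 1 node x 2 GPUs: world size 2 matches tensor(1) x pipeline(1) x context(2) parallelism in train/1b_lora.yaml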
    backend: torchrun
    nnodes: 1
    nproc_per_node: 2
    hostfile: null
  envs:
    CUDA_VISIBLE_DEVICES: 0,1
    CUDA_DEVICE_MAX_CONNECTIONS: 1
    NVTE_APPLY_QK_LAYER_SCALING: 0
    NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
    MKL_SERVICE_FORCE_INTEL: 1
    CUBLAS_WORKSPACE_CONFIG: :4096:8
    NCCL_ALGO: Ring
action: run

hydra:
  run:
    dir: ${experiment.exp_dir}/hydra