2 changes: 1 addition & 1 deletion 3rdparty/Automodel-workspace/Automodel
Submodule Automodel updated 36 files
+61 −0 examples/benchmarking/configs/deepseek_v3_te_deepep.yaml
+61 −0 examples/benchmarking/configs/deepseek_v3_torch.yaml
+62 −0 examples/benchmarking/configs/kimi_k2_te_deepep.yaml
+60 −0 examples/benchmarking/configs/moonlight_16b_te_deepep.yaml
+60 −0 examples/benchmarking/configs/moonlight_16b_torch.yaml
+355 −0 examples/benchmarking/next_token_prediction.py
+8 −6 nemo_automodel/components/_peft/lora.py
+0 −9 nemo_automodel/components/_transformers/utils.py
+19 −36 nemo_automodel/components/checkpoint/checkpointing.py
+0 −63 nemo_automodel/components/distributed/parallel_styles.py
+1 −35 nemo_automodel/components/distributed/parallelizer.py
+9 −4 nemo_automodel/components/models/deepseek_v3/layers.py
+13 −2 nemo_automodel/components/models/deepseek_v3/model.py
+19 −59 nemo_automodel/components/training/rng.py
+753 −0 nemo_automodel/components/utils/flops_utils.py
+17 −18 nemo_automodel/recipes/base_recipe.py
+2 −2 nemo_automodel/recipes/llm/kd.py
+13 −24 nemo_automodel/recipes/llm/train_ft.py
+13 −22 nemo_automodel/recipes/vlm/finetune.py
+1 −1 pyproject.toml
+0 −2 tests/functional_tests/checkpoint/test_dcp.py
+0 −2 tests/functional_tests/checkpoint/test_dcp_vlm.py
+0 −2 tests/functional_tests/checkpoint/test_hf_consolidated_llm.py
+2 −0 tests/functional_tests/checkpoint/test_hf_consolidated_llm_scalar_param.py
+0 −2 tests/functional_tests/checkpoint/test_hf_consolidated_vlm.py
+0 −2 tests/functional_tests/checkpoint/test_hf_sharded.py
+0 −2 tests/functional_tests/checkpoint/test_hf_sharded_vlm.py
+2 −2 tests/functional_tests/checkpoint/test_peft.py
+0 −2 tests/functional_tests/checkpoint/test_peft_vlm.py
+3 −4 tests/functional_tests/training/test_megatron_dataset_checkpoint.py
+7 −4 tests/unit_tests/_transformers/test_auto_model.py
+2 −2 tests/unit_tests/models/deepseek_v3/test_layers.py
+1 −1 tests/unit_tests/recipes/test_finetune_vlm_helpers.py
+3 −3 tests/unit_tests/training/test_rng.py
+169 −0 tests/unit_tests/utils/test_flops_utils.py
+5 −5 uv.lock
248 changes: 248 additions & 0 deletions examples/configs/grpo_math_moonlight_te_deepep.yaml
@@ -0,0 +1,248 @@
# GRPO Algorithm Configuration
grpo:
  num_prompts_per_step: 32
  num_generations_per_prompt: 16
  max_rollout_turns: 1 # for multi-turn rollouts. Math environments just have 1 turn (answering the question)
  max_num_epochs: 1
  max_num_steps: 1000000
  normalize_rewards: true
  use_leave_one_out_baseline: true
  val_period: 10
  val_at_start: false
  overlong_filtering: false
  max_val_samples: 256
  val_batch_size: 256
  seed: 42

loss_fn:
  reference_policy_kl_penalty: 0.01
  ratio_clip_min: 0.2
  ratio_clip_max: 0.2
  ratio_clip_c: null
  # (default off) loss formulation improvements (docs/guides/grpo.md#loss)
  use_on_policy_kl_approximation: false
  use_importance_sampling_correction: false
  sequence_level_importance_ratios: false
  token_level_loss: true
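  # note: with the defaults above, the importance ratio is clipped to roughly [1 - 0.2, 1 + 0.2]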

checkpointing:
  enabled: true
  checkpoint_dir: "results/grpo"
  metric_name: "val_reward"
  higher_is_better: true
  keep_top_k: 3
  save_period: 10
  checkpoint_must_save_by: null

policy:
  model_name: "moonshotai/Moonlight-16B-A3B"
  tokenizer:
    name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default
  train_global_batch_size: 256
  train_micro_batch_size: 4
  generation_batch_size: 32 # Only used when generating using HF backend
  logprob_batch_size: 4
  max_total_sequence_length: 2048
  precision: "bfloat16"
  logprob_chunk_size: null

  dtensor_cfg:
    _v2: true
    enabled: true
    cpu_offload: False
    sequence_parallel: false
    activation_checkpointing: false
    tensor_parallel_size: 1
    context_parallel_size: 1
    pipeline_parallel_size: 1
    expert_parallel_size: 8
    custom_parallel_plan: null

Comment on lines +49 to +60
⚠️ Potential issue

EP=8 with a 1‑GPU cluster will not launch.

dtensor_cfg.expert_parallel_size: 8 with cluster.gpus_per_node: 1, num_nodes: 1 makes world_size (1) incompatible with dp*tp*cp*pp*ep. Set ep=1 for a single-GPU example, or bump cluster resources accordingly.

-    expert_parallel_size: 8
+    expert_parallel_size: 1

Suggested change
  dtensor_cfg:
    _v2: true
    enabled: true
    cpu_offload: False
    sequence_parallel: false
    activation_checkpointing: false
    tensor_parallel_size: 1
    context_parallel_size: 1
    pipeline_parallel_size: 1
    expert_parallel_size: 1
    custom_parallel_plan: null
🤖 Prompt for AI Agents
In examples/configs/grpo_math_moonlight_te_deepep.yaml around lines 49 to 60,
dtensor_cfg.expert_parallel_size is set to 8 while the example cluster has
cluster.gpus_per_node: 1 and num_nodes: 1, making the computed world_size
incompatible with dp*tp*cp*pp*ep; change expert_parallel_size to 1 for this
single‑GPU example (or alternatively update the example cluster resources by
increasing gpus_per_node and/or num_nodes so that the total world_size is
divisible by the product of data_parallel * tensor_parallel * context_parallel *
pipeline_parallel * expert_parallel).
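
For intuition, here is a minimal Python sketch of the constraint the review points at (the helper name is illustrative, not NeMo-RL's actual validation code): the world size implied by the cluster section must be divisible by the product of the model-parallel degrees from dtensor_cfg, so ep=8 cannot fit on a single GPU.

def check_parallel_layout(gpus_per_node: int, num_nodes: int,
                          tp: int, cp: int, pp: int, ep: int) -> int:
    # World size implied by the cluster section of the config.
    world_size = gpus_per_node * num_nodes
    # Product of the non-data-parallel degrees from dtensor_cfg.
    model_parallel = tp * cp * pp * ep
    if world_size % model_parallel != 0:
        raise ValueError(
            f"world_size={world_size} is not divisible by tp*cp*pp*ep={model_parallel}"
        )
    return world_size // model_parallel  # implied data-parallel size

# With this example config (cluster: 1 GPU x 1 node, expert_parallel_size: 8):
#   check_parallel_layout(1, 1, tp=1, cp=1, pp=1, ep=8)  -> raises ValueError
# With expert_parallel_size: 1, the layout is valid and the data-parallel size is 1.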

    # v2 only attributes
    parallelize_fn: nemo_automodel.components.models.deepseek_v3.parallelizer.parallelize_model
    model:
      _target_: nemo_automodel.components.models.deepseek_v3.model.DeepseekV3ForCausalLM.from_config
      num_layers: null # Optionally override the number of hidden layers (null = use model default)
      backend:
        _target_: nemo_automodel.components.moe.utils.BackendConfig
        attn: te
        linear: te
        rms_norm: te
        enable_deepep: true
        fake_balanced_gate: true
        enable_hf_state_dict_adapter: false

  megatron_cfg:
    enabled: false
    empty_unused_memory_level: 0
    activation_checkpointing: false
    converter_type: "Qwen2ForCausalLM"
    tensor_model_parallel_size: 1
    expert_tensor_parallel_size: 1
    expert_model_parallel_size: 1
    pipeline_model_parallel_size: 1
    num_layers_in_first_pipeline_stage: null
    num_layers_in_last_pipeline_stage: null
    context_parallel_size: 1
    pipeline_dtype: ${policy.precision}
    sequence_parallel: false
    freeze_moe_router: true
    moe_router_dtype: "fp64"
    moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
    moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
    moe_permute_fusion: false
    # gives ~20% training perf speedup with sequence packing
    apply_rope_fusion: True
    defer_fp32_logits: null

    optimizer:
      optimizer: "adam"
      lr: 5.0e-6
      min_lr: 5.0e-7
      weight_decay: 0.01
      bf16: true
      fp16: false
      params_dtype: "float32"

      # adam
      adam_beta1: 0.9
      adam_beta2: 0.999
      adam_eps: 1e-8

      # sgd
      sgd_momentum: 0.9

      # distributed optimizer
      use_distributed_optimizer: true
      use_precision_aware_optimizer: true

      clip_grad: ${policy.max_grad_norm}

    scheduler:
      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
      weight_decay_incr_style: "constant"
      lr_decay_style: "constant"
      lr_decay_iters: 1000
      lr_warmup_iters: 13
      lr_warmup_init: 5.0e-7

    distributed_data_parallel_config:
      grad_reduce_in_fp32: false
      overlap_grad_reduce: true
      overlap_param_gather: true
      average_in_collective: true
      use_custom_fsdp: false
      data_parallel_sharding_strategy: "optim_grads_params"

    env_vars: null

  # See docs/design-docs/sequence-packing-and-dynamic-batching.md
  # for more details on dynamic batching and sequence packing.
  dynamic_batching:
    enabled: False
    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
    sequence_length_round: 64
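    # with the values above, both budgets resolve to 2048 * 4 = 8192 tokens per micro-batch
    # (the same arithmetic applies to sequence_packing below)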

  sequence_packing:
    enabled: True
    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
    logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}}
    algorithm: "modified_first_fit_decreasing"
    sequence_length_round: 64

  # makes the training sequence length divisible by the tensor parallel size
  # this is useful for sequence parallel training
  make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size}
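  # with tensor_parallel_size: 1 above, this resolves to 1, i.e. no extra padding is added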
  max_grad_norm: 1.0

  optimizer:
    name: "torch.optim.AdamW"
    kwargs:
      lr: 5.0e-6
      weight_decay: 0.01
      betas: [0.9, 0.999]
      eps: 1e-8
      # when using Dtensor, we need to set foreach
      # and fused to False
      foreach: False
      fused: False

  scheduler:
    - name: "torch.optim.lr_scheduler.LinearLR"
      kwargs:
        start_factor: 0.1
        end_factor: 1.0
        total_iters: 50
    - name: "torch.optim.lr_scheduler.ConstantLR"
      kwargs:
        factor: 1.0
        total_iters: 10000000000
    - milestones: [50]
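  # the milestone at step 50 hands off from the LinearLR warmup to the ConstantLR schedule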

  generation:
    backend: "vllm"
    max_new_tokens: ${policy.max_total_sequence_length}
    temperature: 1.0
    top_p: 1.0
    top_k: null
    stop_token_ids: null
    stop_strings: null
    vllm_cfg:
      async_engine: false
      precision: ${policy.precision}
      tensor_parallel_size: 1
      pipeline_parallel_size: 1
      enable_expert_parallel: false
      gpu_memory_utilization: 0.6
      max_model_len: ${policy.max_total_sequence_length}
      # When enforce_eager is False, you can optionally set ++policy.generation.vllm_kwargs.compilation_config.use_inductor=False for better accuracy;
      # with that flag, vLLM uses its custom CUDA kernels instead of the Triton kernels generated by torch.compile.
      # For more details, see the convergence issue https://github.com/NVIDIA-NeMo/RL/issues/998
      enforce_eager: False
      use_deep_gemm: False
      num_last_layers_in_bf16: 0
      num_first_layers_in_bf16: 0
      vllm_kwargs: {}
    colocated:
      # true: generation shares training GPUs
      # false: uses dedicated generation resources
      enabled: true
      # only relevant when enabled is false
      resources:
        gpus_per_node: null # Decides num gpus to be dedicated to generation when there is one node in the cluster, i.e. cluster.num_nodes == 1
        num_nodes: null # Decides number of nodes to be dedicated to generation

data:
  max_input_seq_length: ${policy.max_total_sequence_length} # upper bound, real truncation occurs at vllm.max_model_len
  prompt_file: "examples/prompts/cot.txt"
  system_prompt_file: null
  dataset_name: "OpenMathInstruct-2"
  shuffle: true

env:
  math:
    num_workers: 8

logger:
  log_dir: "logs" # Base directory for all logs
  num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal
  wandb_enabled: false
  tensorboard_enabled: false
  mlflow_enabled: false # Disable MLflow logging
  monitor_gpus: true # If true, will monitor GPU usage and log to wandb and/or tensorboard
  wandb:
    project: "grpo-dev"
    name: "grpo-dev-logger"
  tensorboard: {}
  mlflow:
    experiment_name: "grpo-dev"
    run_name: "grpo-dev-logger"
  gpu_monitoring:
    collection_interval: 10 # How often to collect GPU usage metrics (in seconds)
    flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds)

cluster:
  gpus_per_node: 1
  num_nodes: 1