
Commit 2ea47c1

Remove unused base model parallel plan from custom parallel configuration
Signed-off-by: ruit <[email protected]>
1 parent 0e9e482 commit 2ea47c1

File tree

3 files changed (+568, -21 lines)

examples/configs/recipes/llm/grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5-4n8g-fsdp2tp8.yaml

Lines changed: 5 additions & 1 deletion
@@ -6,7 +6,9 @@ checkpointing:
   checkpoint_dir: results/grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5
   metric_name: val:reward
 policy:
-  model_name: /lustre/fsw/portfolios/coreai/users/joyang/models/llama-3_3-nemotron-49b-instruct-128k-v1_2-hf
+  # This model name is unusable because the model has not been updated on Hugging Face yet.
+  # ISSUE: https://github.com/NVIDIA-NeMo/RL/issues/1571
+  model_name: nvidia/Llama-3_3-Nemotron-Super-49B-v1_5
   max_total_sequence_length: 32768
   train_global_batch_size: 64
   train_micro_batch_size: 1
@@ -40,6 +42,8 @@ policy:
     vllm_cfg:
       tensor_parallel_size: 4
 data:
+  # Training with HelpSteer3 will lead to high logprob error.
+  # ISSUE: https://github.com/NVIDIA-NeMo/RL/issues/1570
   prompt_file: null
   dataset_name: HelpSteer3
   split: preference
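
A quick pre-flight check, sketched below, can confirm whether the repo ID referenced in policy.model_name is already resolvable on the Hub before a run is launched. This is not part of the commit; it assumes the huggingface_hub package is installed and uses its model_info call.

from huggingface_hub import model_info

REPO_ID = "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5"  # policy.model_name from the recipe above

try:
    model_info(REPO_ID)  # raises if the repo does not exist or is not accessible
    print(f"{REPO_ID} resolves on the Hugging Face Hub")
except Exception as err:  # e.g. a repository-not-found error while the upload is pending
    print(f"{REPO_ID} is not usable yet: {err}")
    print("Track https://github.com/NVIDIA-NeMo/RL/issues/1571 or point policy.model_name at a local checkpoint.")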

examples/custom_parallel/llama_nemotron_super_49b_custom_plan.py

Lines changed: 0 additions & 20 deletions
@@ -12,13 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import cast

 from torch.distributed.tensor.parallel import (
     ColwiseParallel,
     ParallelStyle,
     RowwiseParallel,
-    SequenceParallel,
 )
 from torch.distributed.tensor.placement_types import Replicate, Shard

@@ -37,24 +35,6 @@ def get_custom_parallel_plan():
         "lm_head": ColwiseParallel(output_layouts=Shard(-1), use_local_output=False),
     }

-    base_model_sp_plan = {
-        "model.embed_tokens": RowwiseParallel(
-            input_layouts=Replicate(), output_layouts=Shard(1)
-        ),
-        "model.norm": SequenceParallel(),
-        "model.layers.*.input_layernorm": SequenceParallel(),
-        "model.layers.*.self_attn.o_proj": RowwiseParallel(output_layouts=Shard(1)),
-        "model.layers.*.post_attention_layernorm": SequenceParallel(),
-        "model.layers.*.mlp.down_proj": RowwiseParallel(output_layouts=Shard(1)),
-        "lm_head": ColwiseParallel(
-            input_layouts=Shard(1), output_layouts=Shard(-1), use_local_output=False
-        ),
-    }
-
-    if False:
-        # Enable sequence parallelism only if TP size > 1
-        base_model_tp_plan.update(cast(dict[str, ParallelStyle], base_model_sp_plan))
-
     return base_model_tp_plan

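If the sequence-parallel entries are ever needed again, they could be reinstated behind a real tensor-parallel-size check rather than the dead "if False:" branch removed here. The sketch below is a hypothetical helper (the tp_size argument is an assumption, not part of this repo) built only from the lines deleted above.

from typing import cast

from torch.distributed.tensor.parallel import (
    ColwiseParallel,
    ParallelStyle,
    RowwiseParallel,
    SequenceParallel,
)
from torch.distributed.tensor.placement_types import Replicate, Shard


def with_sequence_parallel(
    base_model_tp_plan: dict[str, ParallelStyle], tp_size: int
) -> dict[str, ParallelStyle]:
    """Overlay sequence-parallel styles on a tensor-parallel plan when tp_size > 1."""
    if tp_size <= 1:
        # Sequence parallelism only makes sense alongside tensor parallelism.
        return base_model_tp_plan

    base_model_sp_plan = {
        "model.embed_tokens": RowwiseParallel(
            input_layouts=Replicate(), output_layouts=Shard(1)
        ),
        "model.norm": SequenceParallel(),
        "model.layers.*.input_layernorm": SequenceParallel(),
        "model.layers.*.self_attn.o_proj": RowwiseParallel(output_layouts=Shard(1)),
        "model.layers.*.post_attention_layernorm": SequenceParallel(),
        "model.layers.*.mlp.down_proj": RowwiseParallel(output_layouts=Shard(1)),
        "lm_head": ColwiseParallel(
            input_layouts=Shard(1), output_layouts=Shard(-1), use_local_output=False
        ),
    }

    # Return a new plan rather than mutating the caller's dict.
    plan = dict(base_model_tp_plan)
    plan.update(cast(dict[str, ParallelStyle], base_model_sp_plan))
    return plan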
