
Commit 2ea47c1

Remove unused base model parallel plan from custom parallel configuration
Signed-off-by: ruit <[email protected]>
1 parent 0e9e482 commit 2ea47c1

File tree

3 files changed (+568, -21 lines)

examples/configs/recipes/llm/grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5-4n8g-fsdp2tp8.yaml

Lines changed: 5 additions & 1 deletion
@@ -6,7 +6,9 @@ checkpointing:
   checkpoint_dir: results/grpo-helpsteer3-llama-3.3-nemotron-super-49b-v1.5
   metric_name: val:reward
 policy:
-  model_name: /lustre/fsw/portfolios/coreai/users/joyang/models/llama-3_3-nemotron-49b-instruct-128k-v1_2-hf
+  # This model name is unusable because the model has not been updated on Hugging Face yet.
+  # ISSUE: https://github.com/NVIDIA-NeMo/RL/issues/1571
+  model_name: nvidia/Llama-3_3-Nemotron-Super-49B-v1_5
   max_total_sequence_length: 32768
   train_global_batch_size: 64
   train_micro_batch_size: 1
@@ -40,6 +42,8 @@ policy:
     vllm_cfg:
       tensor_parallel_size: 4
 data:
+  # Training with HelpSteer3 will lead to high logprob error.
+  # ISSUE: https://github.com/NVIDIA-NeMo/RL/issues/1570
   prompt_file: null
   dataset_name: HelpSteer3
   split: preference
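
A quick pre-flight check, sketched below, can confirm whether the repo ID referenced in policy.model_name is already resolvable on the Hub before a run is launched. This is not part of the commit; it assumes the huggingface_hub package is installed and uses its model_info call.

from huggingface_hub import model_info

REPO_ID = "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5"  # policy.model_name from the recipe above

try:
    model_info(REPO_ID)  # raises if the repo does not exist or is not accessible
    print(f"{REPO_ID} resolves on the Hugging Face Hub")
except Exception as err:  # e.g. a repository-not-found error while the upload is pending
    print(f"{REPO_ID} is not usable yet: {err}")
    print("Track https://github.com/NVIDIA-NeMo/RL/issues/1571 or point policy.model_name at a local checkpoint.")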

examples/custom_parallel/llama_nemotron_super_49b_custom_plan.py

Lines changed: 0 additions & 20 deletions
@@ -12,13 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import cast

 from torch.distributed.tensor.parallel import (
     ColwiseParallel,
     ParallelStyle,
     RowwiseParallel,
-    SequenceParallel,
 )
 from torch.distributed.tensor.placement_types import Replicate, Shard

@@ -37,24 +35,6 @@ def get_custom_parallel_plan():
         "lm_head": ColwiseParallel(output_layouts=Shard(-1), use_local_output=False),
     }

-    base_model_sp_plan = {
-        "model.embed_tokens": RowwiseParallel(
-            input_layouts=Replicate(), output_layouts=Shard(1)
-        ),
-        "model.norm": SequenceParallel(),
-        "model.layers.*.input_layernorm": SequenceParallel(),
-        "model.layers.*.self_attn.o_proj": RowwiseParallel(output_layouts=Shard(1)),
-        "model.layers.*.post_attention_layernorm": SequenceParallel(),
-        "model.layers.*.mlp.down_proj": RowwiseParallel(output_layouts=Shard(1)),
-        "lm_head": ColwiseParallel(
-            input_layouts=Shard(1), output_layouts=Shard(-1), use_local_output=False
-        ),
-    }
-
-    if False:
-        # Enable sequence parallelism only if TP size > 1
-        base_model_tp_plan.update(cast(dict[str, ParallelStyle], base_model_sp_plan))
-
     return base_model_tp_plan

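If the sequence-parallel entries are ever needed again, they could be reinstated behind a real tensor-parallel-size check rather than the dead "if False:" branch removed here. The sketch below is a hypothetical helper (the tp_size argument is an assumption, not part of this repo) built only from the lines deleted above.

from typing import cast

from torch.distributed.tensor.parallel import (
    ColwiseParallel,
    ParallelStyle,
    RowwiseParallel,
    SequenceParallel,
)
from torch.distributed.tensor.placement_types import Replicate, Shard


def with_sequence_parallel(
    base_model_tp_plan: dict[str, ParallelStyle], tp_size: int
) -> dict[str, ParallelStyle]:
    """Overlay sequence-parallel styles on a tensor-parallel plan when tp_size > 1."""
    if tp_size <= 1:
        # Sequence parallelism only makes sense alongside tensor parallelism.
        return base_model_tp_plan

    base_model_sp_plan = {
        "model.embed_tokens": RowwiseParallel(
            input_layouts=Replicate(), output_layouts=Shard(1)
        ),
        "model.norm": SequenceParallel(),
        "model.layers.*.input_layernorm": SequenceParallel(),
        "model.layers.*.self_attn.o_proj": RowwiseParallel(output_layouts=Shard(1)),
        "model.layers.*.post_attention_layernorm": SequenceParallel(),
        "model.layers.*.mlp.down_proj": RowwiseParallel(output_layouts=Shard(1)),
        "lm_head": ColwiseParallel(
            input_layouts=Shard(1), output_layouts=Shard(-1), use_local_output=False
        ),
    }

    # Return a new plan rather than mutating the caller's dict.
    plan = dict(base_model_tp_plan)
    plan.update(cast(dict[str, ParallelStyle], base_model_sp_plan))
    return plan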
