hardware/MUSA_S5000/FlagScale/diff.yaml (15 changes: 8 additions & 7 deletions)
@@ -1,11 +1,12 @@
backends_commit: {}
backends_commit:
Megatron-LM: 015542d9607d50a7e2f39d8b625a3f999ad66d93
backends_version:
FlagScale: v0.11.0
commit: 66a37a7391987705624d7be1221c147f65236d71
contact: [email protected]
FlagScale: 0.8.0
Megatron-LM: 0.14.0
commit: c632b27986130d42a451970a93c1b3c905600626
contact: [email protected]
device_type: MUSA_S5000
models:
- Qwen2.5VL-32B
- Qwen3-8B
- Qwen3-0.6B
task:
- inference
- train
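The resulting metadata pins the backend to a specific Megatron-LM commit and records the FlagScale release, device type, supported models, and tasks. A quick way to inspect it is sketched below; this is a minimal sketch assuming PyYAML is installed and that backends_commit / backends_version are nested mappings, as the layout above suggests.

```python
# Minimal sketch: load and inspect the MUSA_S5000 metadata file changed above.
# Assumes PyYAML is available and that backends_commit / backends_version are
# nested mappings, as the indentation in this diff suggests.
import yaml

with open("hardware/MUSA_S5000/FlagScale/diff.yaml") as f:
    meta = yaml.safe_load(f)

print(meta.get("device_type"))       # expected: MUSA_S5000
print(meta.get("backends_commit"))   # pinned Megatron-LM commit
print(meta.get("backends_version"))  # FlagScale v0.11.0
print(meta.get("models"), meta.get("task"))
```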
@@ -0,0 +1,64 @@
diff --git a/examples/qwen3/conf/train.yaml b/examples/qwen3/conf/train.yaml
index fb0ec63b..b8f7e5a7 100644
--- a/examples/qwen3/conf/train.yaml
+++ b/examples/qwen3/conf/train.yaml
@@ -1,7 +1,7 @@
defaults:
- _self_
# - train: 30b_a3b
- - train: 32b
+ - train: 0_6b

experiment:
# exp_name: Qwen3-30b-a3b-Train
@@ -16,17 +16,43 @@ experiment:
backend: megatron
entrypoint: flagscale/train/train_gpt.py
runner:
+ # backend: torchrun
+ # nnodes: 1
+ # nproc_per_node: 2
+ # rdzv_backend: static
+ # no_shared_fs: false
+
per_node_task: false
no_shared_fs: false
rdzv_backend: static
- hostfile: null
- cmds:
- before_start: ulimit -n 1048576 && source /root/miniconda3/bin/activate flagscale-train
+ ssh_port: 62216
+ hostfile: hostfile
+
+ # # per_node_task: false
+ # hostfile: null
+ # cmds:
+ # before_start: ulimit -n 1048576 && source /root/miniconda3/bin/activate flagscale-train
+
+ # cmds:
+ # before_start: ulimit -n 1048576 && source /root/miniconda3/bin/activate flagscale-train
envs:
- LOGLEVEL: "INFO"
- CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7"
+ LOGLEVEL: INFO
+ MUSA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
CUDA_DEVICE_MAX_CONNECTIONS: 1
-
+ MUSA_KERNEL_TIMEOUT: 3200000
+ ACCELERATOR_BACKEND: musa
+ MCCL_PROTOS: 2
+ MCCL_CHECK_POINTERS: 0
+ OMP_NUM_THREADS: 4
+ MCCL_ALGOS: 1
+ MCCL_BUFFSIZE: 20971520
+ MUSA_BLOCK_SCHEDULE_MODE: 1
+ MCCL_IB_GID_INDEX: 3
+ MCCL_NET_SHARED_BUFFERS: 0
+ MCCL_CROSS_NIC: 1
+ MCCL_SOCKET_IFNAME: bond0
+ MUSA_ENABLE_SQMMA: 1
+ TE_FL_PREFER: flagos
action: run

hydra:
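FlagScale's runner applies the `envs:` block above when it launches the training entrypoint. For a manual debug session outside FlagScale, the same MUSA/MCCL environment can be reproduced programmatically; the values below simply mirror the patch, and this sketch is illustrative rather than part of the change.

```python
# Illustrative only: mirror the envs: block above for a manual debug session.
# Values are copied from the patch; FlagScale's runner normally sets these
# before launching flagscale/train/train_gpt.py via torchrun.
import os

musa_env = {
    "LOGLEVEL": "INFO",
    "MUSA_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7",
    "CUDA_DEVICE_MAX_CONNECTIONS": "1",
    "MUSA_KERNEL_TIMEOUT": "3200000",
    "ACCELERATOR_BACKEND": "musa",
    "MCCL_PROTOS": "2",
    "MCCL_CHECK_POINTERS": "0",
    "OMP_NUM_THREADS": "4",
    "MCCL_ALGOS": "1",
    "MCCL_BUFFSIZE": "20971520",
    "MUSA_BLOCK_SCHEDULE_MODE": "1",
    "MCCL_IB_GID_INDEX": "3",
    "MCCL_NET_SHARED_BUFFERS": "0",
    "MCCL_CROSS_NIC": "1",
    "MCCL_SOCKET_IFNAME": "bond0",
    "MUSA_ENABLE_SQMMA": "1",
    "TE_FL_PREFER": "flagos",
}

os.environ.update(musa_env)
```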

@@ -0,0 +1,81 @@
diff --git a/examples/qwen3/conf/train/0_6b.yaml b/examples/qwen3/conf/train/0_6b.yaml
index f5e42acf..061ebc67 100644
--- a/examples/qwen3/conf/train/0_6b.yaml
+++ b/examples/qwen3/conf/train/0_6b.yaml
@@ -8,7 +8,7 @@ system:
reset_position_ids: True
reset_attention_mask: True
qk_layernorm: true
- sequence_parallel: true
+ sequence_parallel: False
use_distributed_optimizer: true
overlap_grad_reduce: true
overlap_param_gather: true
@@ -20,8 +20,8 @@ system:
logging:
log_interval: 1
tensorboard_log_interval: 1
- wandb_project: ${experiment.exp_name}
- wandb_exp_name: ${experiment.exp_name}
+ # wandb_project: ${experiment.exp_name}
+ # wandb_exp_name: ${experiment.exp_name}
log_timers_to_tensorboard: true
log_validation_ppl_to_tensorboard: true
log_throughput: true
@@ -35,6 +35,9 @@ system:

model:
transformer_impl: transformer_engine
+ use_transformer_engine_fl: true
+ # enable_flag_gems: true
+ # flag_gems_unused: ['mm','bmm','addmm','_attn']
num_layers: 28
hidden_size: 1024
ffn_hidden_size: 3072
@@ -45,10 +48,12 @@ model:
max_position_embeddings: 40960
norm_epsilon: 1e-6
use_rotary_position_embeddings: true
+ no_gradient_accumulation_fusion: True
+
rotary_base: 1000000
swiglu: true
normalization: RMSNorm
- init_method_std: 0.02
+ init_method_std: 6e-3
attention_dropout: 0.0
hidden_dropout: 0.0
clip_grad: 1.0
@@ -58,12 +63,12 @@ model:
no_rope_fusion: true

# training
- seed: ${experiment.seed}
+ seed: 42
# finetune: false
- micro_batch_size: 1
+ micro_batch_size: 4
global_batch_size: 2048
eval_iters: 0
- train_samples: 244141056 #1T #29297664 #120B tokens
+ train_samples: 29297664 #1T #29297664 #120B tokens

optimizer:
weight_decay: 0.1
@@ -77,11 +82,12 @@ model:


data:
- data_path: /path/to/dataset
+ data_path: /data/train-data/
split: 1
no_mmap_bin_files: true
tokenizer:
tokenizer_type: QwenTokenizerFS
- tokenizer_path: Qwen3-0.6B
- vocab_size: 151936
+ tokenizer_path: /data/train-data/qwentokenizer/
+ vocab_size: 151851
make_vocab_size_divisible_by: 64
+
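The batch and sample numbers above are internally consistent with the 120B-token budget noted in the comment. A quick check, assuming the 4096-token sequence length implied by the "#1T" / "#120B tokens" comments and a data-parallel size of 8 (one node, eight devices, TP = PP = 1 for the 0.6B model), both of which are assumptions rather than values shown in this diff:

```python
# Quick consistency check for the 0_6b.yaml numbers above.
# Assumptions (not in the diff): seq_length = 4096, which makes the
# "#120B tokens" / "#1T" comments line up, and data_parallel_size = 8
# (one node, eight devices, TP = PP = 1 for the 0.6B model).
seq_length = 4096
train_samples = 29_297_664
micro_batch_size = 4
global_batch_size = 2_048
data_parallel_size = 8

total_tokens = train_samples * seq_length
grad_accum_steps = global_batch_size // (micro_batch_size * data_parallel_size)

print(f"total tokens ≈ {total_tokens / 1e9:.1f}B")          # ≈ 120.0B
print(f"gradient accumulation steps = {grad_accum_steps}")  # 64
```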

@@ -0,0 +1,41 @@
diff --git a/examples/qwen3_vl/conf/train.yaml b/examples/qwen3_vl/conf/train.yaml
index f717db7a..9c5eac71 100644
--- a/examples/qwen3_vl/conf/train.yaml
+++ b/examples/qwen3_vl/conf/train.yaml
@@ -4,7 +4,8 @@ defaults:

experiment:
exp_name: train_qwen3_vl_8b
- exp_dir: ./${experiment.exp_name}
+ exp_dir: ./outputs_qwen3_vl
+ ckpt_format: torch
task:
type: train
backend: megatron
@@ -19,12 +20,20 @@ experiment:
envs:
# NCCL_DEBUG: INFO
# NCCL_DEBUG_SUBSYSTEM: ALL
- CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
- # CUDA_VISIBLE_DEVICES: 0,1,2,3
- # CUDA_VISIBLE_DEVICES: 4,5,6,7
+ LOGLEVEL: INFO
+ MUSA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
CUDA_DEVICE_MAX_CONNECTIONS: 1
- NVTE_APPLY_QK_LAYER_SCALING: 0
- NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+ MUSA_KERNEL_TIMEOUT: 3200000
+ ACCELERATOR_BACKEND: musa
+ MCCL_PROTOS: 2
+ MCCL_CHECK_POINTERS: 0
+ OMP_NUM_THREADS: 4
+ MCCL_ALGOS: 1
+ MCCL_BUFFSIZE: 20971520
+ MUSA_BLOCK_SCHEDULE_MODE: 1
+ MCCL_IB_GID_INDEX: 3
+ MCCL_NET_SHARED_BUFFERS: 0
+

action: run


@@ -0,0 +1,76 @@
diff --git a/examples/qwen3_vl/conf/train/8b.yaml b/examples/qwen3_vl/conf/train/8b.yaml
index c13cbfa4..2c9d432e 100644
--- a/examples/qwen3_vl/conf/train/8b.yaml
+++ b/examples/qwen3_vl/conf/train/8b.yaml
@@ -4,13 +4,13 @@ system:
vision_ration: 0.1
num_workers: 1
calculate_per_token_loss: true
- tensor_model_parallel_size: 2
- pipeline_model_parallel_size: 1
+ tensor_model_parallel_size: 1
+ pipeline_model_parallel_size: 2
context_parallel_size: 1
# decoder_first_pipeline_num_layers: 12
use_flash_attn: True
use_distributed_optimizer: True
- sequence_parallel: True
+ sequence_parallel: False
tp_comm_overlap: False
overlap_grad_reduce: False # if has text-only must be false
overlap_param_gather: False # if has text-only must be false
@@ -27,13 +27,13 @@ system:
log_interval: 1
tensorboard_log_interval: 1
log_throughput: True
- wandb_project: ${experiment.exp_name}
- wandb_exp_name: ${experiment.exp_name}
+ # wandb_project: ${experiment.exp_name}
+ # wandb_exp_name: ${experiment.exp_name}
log_params_norm: True
log_num_zeros_in_grad: True
checkpoint:
save_interval: 1000
- pretrained_checkpoint: ${pretrained_checkpoint ???}
+ # pretrained_checkpoint: ${pretrained_checkpoint ???}
dataloader_save: ${experiment.exp_dir}/checkpoints/dataloader
# use_dist_ckpt: False
ckpt_format: torch
@@ -70,13 +70,15 @@ model:
group_query_attention: True
no_masked_softmax_fusion: True
untie_embeddings_and_output_weights: True
+ use_cpu_initialization: False

# position embedding
position_embedding_type: mrope
rotary_percent: 1.0
rotary_base: 5000000
rotary_seq_len_interpolation_factor: 1
- no_rope_fusion: False
+ no_rope_fusion: True
+ no_gradient_accumulation_fusion: True
mrope_section: [24, 20, 20]
eod_mask_loss: False

@@ -100,13 +102,14 @@ model:

data:
no_use_system_prompt: True
- data_path: ${data_path ???}
- vision_root: ${vision_root ???}
+ # mock_data: True
+ data_path: /home/dist/haoran.huang/FlagScale/llava-datasets/LLaVA-Pretrain/blip_laion_cc_sbu_558k/wds-1
+ vision_root: /home/dist/haoran.huang/FlagScale/llava-datasets/LLaVA-Pretrain
dataloader_type: external
split: 100,0,0
tokenizer:
- tokenizer_type: Qwen2VLTokenizer
- tokenizer_path: ${tokenizer_path ???}
- # vocab_size: 151936 #
+ # tokenizer_type: MultimodalTokenizer
+ tokenizer_model: /home/dist/haoran.huang/FlagScale/qwen3_vl_tokenizer
+ vocab_size: 151963 #
extra_vocab_size: 293 # Qwen3-VL specific. total vocab size = 151643 + extra_vocab_size
make_vocab_size_divisible_by: 64
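This patch swaps tensor parallelism for pipeline parallelism (TP 2 → 1, PP 1 → 2) and turns sequence parallelism off; Megatron's sequence parallelism splits activations across the tensor-parallel group, so with TP = 1 there is nothing left to split. Assuming a single node with eight MUSA devices (not stated in the diff), the resulting data-parallel size works out as sketched below.

```python
# Resulting layout for qwen3_vl/conf/train/8b.yaml after this patch.
# Assumes a single node with 8 MUSA devices, which is not stated in the diff.
world_size = 8
tensor_model_parallel_size = 1    # was 2
pipeline_model_parallel_size = 2  # was 1
context_parallel_size = 1

data_parallel_size = world_size // (
    tensor_model_parallel_size * pipeline_model_parallel_size * context_parallel_size
)
print(f"data parallel size = {data_parallel_size}")  # 4
```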

@@ -0,0 +1,32 @@
diff --git a/flagscale/train/models/qwen2_5_vl/vision_transformer_block.py b/flagscale/train/models/qwen2_5_vl/vision_transformer_block.py
index 3ba04864..d439f4f3 100644
--- a/flagscale/train/models/qwen2_5_vl/vision_transformer_block.py
+++ b/flagscale/train/models/qwen2_5_vl/vision_transformer_block.py
@@ -16,7 +16,7 @@ from megatron.core.fp8_utils import get_fp8_context
from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
from megatron.core.inference.contexts import BaseInferenceContext
from megatron.core.packed_seq_params import PackedSeqParams
-from megatron.core.process_groups_config import ProcessGroupCollection
+# from megatron.core.process_groups_config import ProcessGroupCollection
from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.transformer_block import TransformerBlock, TransformerBlockSubmodules
from megatron.core.transformer.transformer_config import TransformerConfig
@@ -52,7 +52,7 @@ class VisionTransformerBlock(TransformerBlock):
post_layer_norm: bool = True,
pre_process: bool = True,
post_process: bool = True,
- pg_collection: ProcessGroupCollection = None,
+ # pg_collection: ProcessGroupCollection = None,
):
super().__init__(
config=config,
@@ -60,7 +60,7 @@ class VisionTransformerBlock(TransformerBlock):
post_layer_norm=post_layer_norm,
pre_process=pre_process,
post_process=post_process,
- pg_collection=pg_collection,
+ # pg_collection=pg_collection,
)

def _checkpointed_forward(

@@ -0,0 +1,45 @@
diff --git a/flagscale/train/models/qwen3_vl/language_model.py b/flagscale/train/models/qwen3_vl/language_model.py
index edf4a51a..11090524 100644
--- a/flagscale/train/models/qwen3_vl/language_model.py
+++ b/flagscale/train/models/qwen3_vl/language_model.py
@@ -22,7 +22,7 @@ from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.utils import WrappedTensor, deprecate_inference_params
from megatron.core.models.gpt.gpt_model import GPTModel
-from megatron.core.process_groups_config import ProcessGroupCollection
+# from megatron.core.process_groups_config import ProcessGroupCollection

from .language_transformer_block import LanguageTransformerBlock

@@ -179,10 +179,10 @@ class Qwen3VLLanguageModule(GPTModel):
scatter_embedding_sequence_parallel: bool = True,
seq_len_interpolation_factor: Optional[float] = None,
mtp_block_spec: Optional[ModuleSpec] = None,
- pg_collection: Optional[ProcessGroupCollection] = None,
+ # pg_collection: Optional[ProcessGroupCollection] = None,
vp_stage: Optional[int] = None,
) -> None:
- super(GPTModel, self).__init__(config=config, pg_collection=pg_collection)
+ super(GPTModel, self).__init__(config=config)#, pg_collection=pg_collection)

if has_config_logger_enabled(config):
log_config_to_disk(config, locals(), prefix=type(self).__name__)
@@ -248,7 +248,7 @@ class Qwen3VLLanguageModule(GPTModel):
spec=transformer_layer_spec,
pre_process=self.pre_process,
post_process=self.post_process,
- pg_collection=self.pg_collection,
+ # pg_collection=self.pg_collection,
vp_stage=vp_stage,
)

@@ -287,7 +287,7 @@ class Qwen3VLLanguageModule(GPTModel):
and self.share_embeddings_and_output_weights,
embedding_activation_buffer=self.embedding_activation_buffer,
grad_output_buffer=self.grad_output_buffer,
- tp_group=self.pg_collection.tp,
+ # tp_group=self.pg_collection.tp,
)

if self.pre_process or self.post_process:
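The Python patches in this PR make the Qwen-VL model code run against the pinned, older Megatron-LM by commenting out `ProcessGroupCollection` / `pg_collection` wherever it is imported or passed. Purely as an illustration of the same compatibility concern, and not what the patch itself does, a feature-detecting import could look like this:

```python
# Illustrative sketch only: feature-detect whether the pinned Megatron-LM
# exposes process_groups_config and forward pg_collection only when it does.
# The actual patches above simply comment the argument out.
try:
    from megatron.core.process_groups_config import ProcessGroupCollection
    HAVE_PG_COLLECTION = True
except ImportError:  # older Megatron-LM, e.g. the commit pinned in diff.yaml
    ProcessGroupCollection = None
    HAVE_PG_COLLECTION = False


def maybe_pg_collection_kwargs(pg_collection=None):
    """Return kwargs that include pg_collection only when the backend supports it."""
    if HAVE_PG_COLLECTION and pg_collection is not None:
        return {"pg_collection": pg_collection}
    return {}
```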

@@ -0,0 +1,31 @@
diff --git a/flagscale/train/models/qwen3_vl/language_transformer_block.py b/flagscale/train/models/qwen3_vl/language_transformer_block.py
old mode 100755
new mode 100644
index 0a86705c..7b6e61da
--- a/flagscale/train/models/qwen3_vl/language_transformer_block.py
+++ b/flagscale/train/models/qwen3_vl/language_transformer_block.py
@@ -13,7 +13,7 @@ from megatron.core.transformer.transformer_block import TransformerBlock
from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
from megatron.core import parallel_state, tensor_parallel
from megatron.core.enums import Fp8Recipe
-from megatron.core.fp4_utils import get_fp4_context
+# from megatron.core.fp4_utils import get_fp4_context
from megatron.core.fp8_utils import get_fp8_context
from megatron.core.inference.contexts import BaseInferenceContext
from megatron.core.packed_seq_params import PackedSeqParams
@@ -286,10 +286,10 @@ class LanguageTransformerBlock(TransformerBlock):
outer_quantization_context = (
get_fp8_context(self.config) if use_outer_quantization_context else nullcontext()
)
- elif self.config.fp4:
- use_outer_quantization_context = False
- use_inner_quantization_context = True
- outer_quantization_context = nullcontext()
+ # elif self.config.fp4:
+ # use_outer_quantization_context = False
+ # use_inner_quantization_context = True
+ # outer_quantization_context = nullcontext()
else:
# No quantization
use_outer_quantization_context = False
