Transformer Engine for Baichuan2 and Qwen (#52)

* Transformer Engine for Baichuan-2 7B
* Transformer Engine for Baichuan-2 7B
* Transformer Engine for qwen

---------

Co-authored-by: 同润 <[email protected]>
alibaba · Oct 19, 2023 · 796d17b
1 parent 5abf7dd
commit 796d17b
Show file tree

Hide file tree

Showing 18 changed files with 1,306 additions and 393 deletions.
diff --git a/examples/baichuan2/run_evaluate_huggingface_baichuan.sh b/examples/baichuan2/run_evaluate_huggingface_baichuan.sh
@@ -1,9 +1,9 @@
 #!/bin/bash
-# sh run_evaluate_huggingface_baichuan.sh dsw /root/Megatron-LM-23.04/ /workspace/PAI-Megatron-Patch/ 13B 1 2048 80 0 fp16 /mnt/baichuan-datasets/alpaca_data.json /mnt/baichuan-ckpts/baichuan-13b-base
+# sh run_evaluate_huggingface_baichuan.sh dsw /workspace/Pai-Megatron-Patch/ 13B 1 2048 80 0 fp16 /mnt/baichuan-datasets/alpaca_data.json /mnt/baichuan-ckpts/baichuan-13b-base
 set -e
 ENV=$1
-MEGATRON_PATH=$2
-MEGATRON_PATCH_PATH=$3
+MEGATRON_PATCH_PATH=$2
+MEGATRON_PATH=${MEGATRON_PATCH_PATH}/Megatron-LM-main
 export PYTHONPATH=${MEGATRON_PATH}:${MEGATRON_PATCH_PATH}:$PYTHONPATH
 export CUDA_DEVICE_MAX_CONNECTIONS=1
 if [ $ENV = dsw ]; then
@@ -24,14 +24,14 @@ fi
 
 DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
 
-MODEL_SIZE=$4
-BATCH_SIZE=$5
-SEQ_LEN=$6
-PAD_LEN=$7
-EXTRA_VOCAB_SIZE=$8
-PR=$9
-DATASET_PATH=${10}
-PRETRAIN_CHECKPOINT_PATH=${11}
+MODEL_SIZE=$3
+BATCH_SIZE=$4
+SEQ_LEN=$5
+PAD_LEN=$6
+EXTRA_VOCAB_SIZE=$7
+PR=$8
+DATASET_PATH=$9
+PRETRAIN_CHECKPOINT_PATH=${10}
 
 if [ $MODEL_SIZE = 7B ]; then
 

diff --git a/examples/baichuan2/run_evaluate_megatron_baichuan.sh b/examples/baichuan2/run_evaluate_megatron_baichuan.sh
@@ -1,11 +1,12 @@
 #!/bin/bash
-# sh run_evaluate_megatron_baichuan.sh dsw /workspace/Megatron-LM/ /workspace/github/Pai-Megatron-Patch/ 7B 1 2048 2048 0 bf16 2 1 sel true true true false /mnt/baichuan2-datasets/alpaca_zh.json /mnt/baichuan2-ckpts/Baichuan2-7B-Base-to-mg-tp2-pp1
-# sh run_evaluate_megatron_baichuan.sh dsw /workspace/Megatron-LM/ /workspace/github/Pai-Megatron-Patch/ 13B 1 2048 2048 0 bf16 2 1 sel true false true false /mnt/baichuan2-datasets/alpaca_zh.json /mnt/baichuan2-ckpts/Baichuan2-13B-Base-to-mg-tp2-pp1
+# sh run_evaluate_megatron_baichuan.sh dsw /workspace/Pai-Megatron-Patch/ 7B 1 2048 2048 0 bf16 2 1 sel true true true false /mnt/baichuan2-datasets/alpaca_zh.json /mnt/baichuan2-ckpts/Baichuan2-7B-Base-to-mg-tp2-pp1
+# sh run_evaluate_megatron_baichuan.sh dsw /workspace/Pai-Megatron-Patch/ 13B 1 2048 2048 0 bf16 2 1 sel true false true false /mnt/baichuan2-datasets/alpaca_zh.json /mnt/baichuan2-ckpts/Baichuan2-13B-Base-to-mg-tp2-pp1
+# sh run_evaluate_megatron_baichuan.sh dsw /workspace/Pai-Megatron-Patch/ 7B 1 2048 2048 0 bf16 2 1 sel true false true true /mnt/baichuan2-datasets/alpaca_zh.json /mnt/baichuan2-ckpts/Baichuan2-7B-Base-to-te-tp2-pp1
 
 set -e
 ENV=$1
-MEGATRON_PATH=$2
-MEGATRON_PATCH_PATH=$3
+MEGATRON_PATCH_PATH=$2
+MEGATRON_PATH=${MEGATRON_PATCH_PATH}/Megatron-LM-main
 export PYTHONPATH=${MEGATRON_PATH}:${MEGATRON_PATCH_PATH}:$PYTHONPATH
 export CUDA_DEVICE_MAX_CONNECTIONS=1
 if [ $ENV = dsw ]; then
@@ -26,21 +27,21 @@ fi
 
 DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
 
-MODEL_SIZE=$4
-BATCH_SIZE=$5
-SEQ_LEN=$6
-PAD_LEN=$7
-EXTRA_VOCAB_SIZE=$8
-PR=$9
-TP=${10}
-PP=${11}
-AC=${12}
-DO=${13}
-FL=${14}
-SP=${15}
-TE=${16}
-DATASET_PATH=${17}
-PRETRAIN_CHECKPOINT_PATH=${18}
+MODEL_SIZE=$3
+BATCH_SIZE=$4
+SEQ_LEN=$5
+PAD_LEN=$6
+EXTRA_VOCAB_SIZE=$7
+PR=$8
+TP=$9
+PP=${10}
+AC=${11}
+DO=${12}
+FL=${13}
+SP=${14}
+TE=${15}
+DATASET_PATH=${16}
+PRETRAIN_CHECKPOINT_PATH=${17}
 
 if [ $MODEL_SIZE = 7B ]; then
 

diff --git a/examples/baichuan2/run_finetune_megatron_baichuan_wgbs.sh b/examples/baichuan2/run_finetune_megatron_baichuan_wgbs.sh
diff --git a/examples/baichuan2/run_pretrain_megatron_baichuan.sh b/examples/baichuan2/run_pretrain_megatron_baichuan.sh
@@ -1,11 +1,11 @@
 #!/bin/bash
-#sh run_pretrain_megatron_baichuan.sh dsw /workspace/Megatron-LM/ /workspace/github/Pai-Megatron-Patch/ 7B 1 8 1e-5 1e-6 2048 2048 0 bf16 2 1 sel true true true false 100000 /mnt/baichuan2-datasets/alpaca_zh.json /mnt/baichuan2-ckpts/Baichuan2-7B-Base-to-mg-tp2-pp1 100000000 10000 /mnt/output_baichuan2
-#sh run_pretrain_megatron_baichuan.sh dsw /workspace/Megatron-LM/ /workspace/github/Pai-Megatron-Patch/ 13B 1 8 1e-5 1e-6 2048 2048 0 bf16 2 1 sel true false true false 100000 /mnt/baichuan2-datasets/alpaca_zh.json /mnt/baichuan2-ckpts/Baichuan2-13B-Base-to-mg-tp2-pp1 100000000 10000 /mnt/output_baichuan2
+#sh run_pretrain_megatron_baichuan.sh dsw /workspace/Pai-Megatron-Patch/ 7B 1 8 1e-5 1e-6 2048 2048 0 bf16 2 1 sel true true true false 100000 /mnt/baichuan2-datasets/alpaca_zh.json /mnt/baichuan2-ckpts/Baichuan2-7B-Base-to-mg-tp2-pp1 100000000 10000 /mnt/output_baichuan2
+#sh run_pretrain_megatron_baichuan.sh dsw /workspace/Pai-Megatron-Patch/ 13B 1 8 1e-5 1e-6 2048 2048 0 bf16 2 1 sel true false true false 100000 /mnt/baichuan2-datasets/alpaca_zh.json /mnt/baichuan2-ckpts/Baichuan2-13B-Base-to-mg-tp2-pp1 100000000 10000 /mnt/output_baichuan2
 
 set -e
 ENV=$1
-MEGATRON_PATH=$2
-MEGATRON_PATCH_PATH=$3
+MEGATRON_PATCH_PATH=$2
+MEGATRON_PATH=${MEGATRON_PATCH_PATH}/Megatron-LM-main
 export PYTHONPATH=${MEGATRON_PATH}:${MEGATRON_PATCH_PATH}:$PYTHONPATH
 export CUDA_DEVICE_MAX_CONNECTIONS=1
 if [ $ENV = dsw ]; then
@@ -26,28 +26,28 @@ fi
 
 DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
 
-MODEL_SIZE=$4
-BATCH_SIZE=$5
-GLOBAL_BATCH_SIZE=$6
-LR=$7
-MIN_LR=$8
-SEQ_LEN=$9
-PAD_LEN=${10}
-EXTRA_VOCAB_SIZE=${11}
-PR=${12}
-TP=${13}
-PP=${14}
-AC=${15}
-DO=${16}
-FL=${17}
-SP=${18}
-TE=${19}
-SAVE_INTERVAL=${20}
-DATASET_PATH=${21}
-PRETRAIN_CHECKPOINT_PATH=${22}
-TRAIN_TOKENS=${23}
-WARMUP_TOKENS=${24}
-OUTPUT_BASEPATH=${25}
+MODEL_SIZE=$3
+BATCH_SIZE=$4
+GLOBAL_BATCH_SIZE=$5
+LR=$6
+MIN_LR=$7
+SEQ_LEN=$8
+PAD_LEN=$9
+EXTRA_VOCAB_SIZE=${10}
+PR=${11}
+TP=${12}
+PP=${13}
+AC=${14}
+DO=${15}
+FL=${16}
+SP=${17}
+TE=${18}
+SAVE_INTERVAL=${19}
+DATASET_PATH=${20}
+PRETRAIN_CHECKPOINT_PATH=${21}
+TRAIN_TOKENS=${22}
+WARMUP_TOKENS=${23}
+OUTPUT_BASEPATH=${24}
 
 
 if [ $MODEL_SIZE = 7B ]; then