From 8d4398cafe3614b69b1a5ced4ccb1160061466dc Mon Sep 17 00:00:00 2001 From: Xiaoyu Xu Date: Mon, 1 May 2023 00:30:57 +0800 Subject: [PATCH 1/4] Update args_train.sh --- onebench/libai/args_train.sh | 54 ++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/onebench/libai/args_train.sh b/onebench/libai/args_train.sh index bb383d2..41c2f68 100644 --- a/onebench/libai/args_train.sh +++ b/onebench/libai/args_train.sh @@ -97,34 +97,34 @@ python3 -m oneflow.distributed.launch \ --nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT \ tools/train_net.py \ --resume \ ---config-file $CONFIG \ -model.cfg.hidden_dropout_prob=$hidden_dropout_prob \ -model.cfg.attention_probs_dropout_prob=$attention_probs_dropout_prob \ -model.cfg.bias_dropout_fusion=$bias_dropout_fusion \ -model.cfg.hidden_layers=$NUM_LAYER \ -model.cfg.hidden_size=$HIDDEN_SIZE \ -model.cfg.num_attention_heads=$NUM_ATT_HEADS \ -model.cfg.intermediate_size=$INTERMEDIATE_SIZE \ -model.cfg.ffn_hidden_size=$INTERMEDIATE_SIZE \ +--config-file $CONFIG \ +model.cfg.hidden_dropout_prob=$hidden_dropout_prob \ +model.cfg.attention_probs_dropout_prob=$attention_probs_dropout_prob \ +model.cfg.bias_dropout_fusion=$bias_dropout_fusion \ +model.cfg.hidden_layers=$NUM_LAYER \ +model.cfg.hidden_size=$HIDDEN_SIZE \ +model.cfg.num_attention_heads=$NUM_ATT_HEADS \ +model.cfg.intermediate_size=$INTERMEDIATE_SIZE \ +model.cfg.ffn_hidden_size=$INTERMEDIATE_SIZE \ model.cfg.head_size=$HEAD_SIZE \ -graph.enabled=$GRAPH_ENABLED \ -train.dist.pipeline_num_layers=$NUM_LAYER \ -train.train_micro_batch_size=$MICRO_BATCH_SIZE \ -train.global_batch_size=$GLOBAL_BATCH_SIZE \ -train.dist.tensor_parallel_size=$MP \ -train.dist.pipeline_parallel_size=$PP \ -train.amp.enabled=$USE_FP16 \ -train.activation_checkpoint.enabled=$ACTIVATION_CHECKPOINT \ -train.num_accumulation_steps=$ACC \ -train.evaluation.enabled=$EVALUATION_ENABLED \ -train.evaluation.eval_iter=$EVAL_ITER \ -train.train_iter=$TRAIN_ITERS \ -train.train_epoch=$TRAIN_EPOCH \ -train.log_period=$LOG_PERIOD \ -train.zero_optimization.enabled=$ZERO_ENABLE \ -train.zero_optimization.stage=$ZERO_STAGE \ -train.load_weight=$LOAD_WEIGHT \ -train.checkpoint.period=$save_checkpoint_period \ +graph.enabled=$GRAPH_ENABLED \ +train.dist.pipeline_num_layers=$NUM_LAYER \ +train.train_micro_batch_size=$MICRO_BATCH_SIZE \ +train.global_batch_size=$GLOBAL_BATCH_SIZE \ +train.dist.tensor_parallel_size=$MP \ +train.dist.pipeline_parallel_size=$PP \ +train.amp.enabled=$USE_FP16 \ +train.activation_checkpoint.enabled=$ACTIVATION_CHECKPOINT \ +train.num_accumulation_steps=$ACC \ +train.evaluation.enabled=$EVALUATION_ENABLED \ +train.evaluation.eval_iter=$EVAL_ITER \ +train.train_iter=$TRAIN_ITERS \ +train.train_epoch=$TRAIN_EPOCH \ +train.log_period=$LOG_PERIOD \ +train.zero_optimization.enabled=$ZERO_ENABLE \ +train.zero_optimization.stage=$ZERO_STAGE \ +train.load_weight=$LOAD_WEIGHT \ +train.checkpoint.period=$save_checkpoint_period \ train.output_dir=$LOG_FILENAME 2>&1 | tee ${LOG_FILENAME}/output.log From 6d0c58264d9dd17d3a2697a910312eec7b127e34 Mon Sep 17 00:00:00 2001 From: Xiaoyu Xu Date: Mon, 1 May 2023 00:39:23 +0800 Subject: [PATCH 2/4] Update args_train.sh --- onebench/libai/args_train.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/onebench/libai/args_train.sh b/onebench/libai/args_train.sh index 41c2f68..00c6093 100644 --- a/onebench/libai/args_train.sh +++ b/onebench/libai/args_train.sh @@ -96,7 +96,6 @@ echo LOG_FILENAME=$LOG_FILENAME python3 -m oneflow.distributed.launch \ --nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT \ tools/train_net.py \ ---resume \ --config-file $CONFIG \ model.cfg.hidden_dropout_prob=$hidden_dropout_prob \ model.cfg.attention_probs_dropout_prob=$attention_probs_dropout_prob \ From f08490d8ca244af4e94cb5c36defcb298f97fb51 Mon Sep 17 00:00:00 2001 From: Xiaoyu Xu Date: Tue, 2 May 2023 17:18:46 +0800 Subject: [PATCH 3/4] rm sed --- onebench/libai/args_train.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/onebench/libai/args_train.sh b/onebench/libai/args_train.sh index 00c6093..a85621c 100644 --- a/onebench/libai/args_train.sh +++ b/onebench/libai/args_train.sh @@ -30,8 +30,8 @@ UNSET_DROPOUT=${23:-false} ONEFLOW_COMMIT=$(python3 -c 'import oneflow; print(oneflow.__git_commit__)') -sed -i '/import time/a\import os' ./libai/engine/trainer.py -sed -i '/for self.iter in range(start_iter, max_iter):/a\ if self.iter == 99: \ +#sed -i '/import time/a\import os' ./libai/engine/trainer.py +#sed -i '/for self.iter in range(start_iter, max_iter):/a\ if self.iter == 99: \ cmd = "nvidia-smi --query-gpu=timestamp,name,driver_version,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv" \ os.system(cmd)' ./libai/engine/trainer.py @@ -74,7 +74,7 @@ LOG_FILENAME=${TRAN_MODEL}_${RUN_TYPE}_nl${NUM_LAYER}_nah${NUM_ATT_HEADS}_hs${HI if [[ $UNSET_DROPOUT = "true" ]]; then #sed -i 's/persistent_workers=True/#persistent_workers=True/g' ./libai/data/build.py - sed -i 's/shuffle=True/shuffle=False/g' ./libai/data/build.py + #sed -i 's/shuffle=True/shuffle=False/g' ./libai/data/build.py hidden_dropout_prob=0.0 attention_probs_dropout_prob=0.0 bias_dropout_fusion=false @@ -83,7 +83,7 @@ fi if [[ $SAVE_MODEL = "false" ]]; then #sed -i 's/hooks.PeriodicCheckpointer/#&/' ./libai/engine/default.py - sed -i '/if self.cfg.train.evaluation.enabled:/i\ ret.pop()' ./libai/engine/default.py + #sed -i '/if self.cfg.train.evaluation.enabled:/i\ ret.pop()' ./libai/engine/default.py LOG_FOLDER=$LOG_FOLDER/${ONEFLOW_COMMIT} fi From b2d6dd8864c7c77c3b88f8a90e621ba11238590d Mon Sep 17 00:00:00 2001 From: Xiaoyu Xu Date: Tue, 2 May 2023 17:27:26 +0800 Subject: [PATCH 4/4] Update args_train.sh --- onebench/libai/args_train.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onebench/libai/args_train.sh b/onebench/libai/args_train.sh index a85621c..8624bd4 100644 --- a/onebench/libai/args_train.sh +++ b/onebench/libai/args_train.sh @@ -32,8 +32,8 @@ ONEFLOW_COMMIT=$(python3 -c 'import oneflow; print(oneflow.__git_commit__)') #sed -i '/import time/a\import os' ./libai/engine/trainer.py #sed -i '/for self.iter in range(start_iter, max_iter):/a\ if self.iter == 99: \ - cmd = "nvidia-smi --query-gpu=timestamp,name,driver_version,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv" \ - os.system(cmd)' ./libai/engine/trainer.py +# cmd = "nvidia-smi --query-gpu=timestamp,name,driver_version,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv" \ +# os.system(cmd)' ./libai/engine/trainer.py GPU_NAME="$(nvidia-smi -i 0 --query-gpu=gpu_name --format=csv,noheader)" GPU_NAME="${GPU_NAME// /_}"