
Commit 0731c6e

Merge pull request #298 from Tencent/develop
Add SamplesPerSec as a performance indicator
2 parents eab1bf3 + 82a6071 commit 0731c6e

File tree: 5 files changed (+13 -10 lines)

- CHANGE_LOG.md
- doc/optimization_options.md
- examples/README.md
- examples/pretrain_bert_demo.py → examples/pretrain_demo.py
- examples/run_transformers.sh

CHANGE_LOG.md (+1 -3)

```diff
@@ -1,9 +1,7 @@
 ## v0.4.5 Dec. 2021
+Refactor the files in example and add chunk size searching.
 Evaluate on 8 nodes of SuperPod. Fix bugs in multi-GPU mem tracer.
 
-## v0.4.5 Dec. 2021
-Refactor the files in example and add chunk size searching.
-
 
 ### v0.4.4 Dec. 2021
 The system is successfully evaluated on a multi-node system.
```

doc/optimization_options.md (+1 -1)

```diff
@@ -55,7 +55,7 @@ PatirckStar is famous for dynamic partition model data. With help of this flag y
 The is a computing efficient irrelevant option used for distributed training. It allocates memory for remote chunks but release it immediately. In this way, we can make sure the model parameter is randomly initialized the same as a serial version. Solve the problem with random seed. It is used in combination with the `--res_check` option to check the correctness of distributed training.
 
 7. Adjusting the quota of CPU and GPU memory of memory tracer.
-We provide ways to adjust the CPU and GPU memory usage quota for the memory tracer. We did not expose this optimization as parameters passed through the command line. As shown in the pretrain_bert_demo.py, there is a JSON config for the memory tracer setting. You can adjust the four ratio suffix values.
+We provide ways to adjust the CPU and GPU memory usage quota for the memory tracer. We did not expose this optimization as parameters passed through the command line. As shown in the pretrain_demo.py, there is a JSON config for the memory tracer setting. You can adjust the four ratio suffix values.
 
 `warmup_gpu_chunk_mem_ratio`: the max gpu memory of a GPU can be used for chunks during the warmup iteration.
 
```
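
The four "ratio suffix" knobs mentioned in this file end up in the config dict that `pretrain_demo.py` passes to `initialize_engine`. Below is a minimal, hedged sketch of what that memory-tracer section can look like: only `warmup_gpu_chunk_mem_ratio` is named in the documentation text above; the `mem_tracer` section name and the other three `_ratio` keys are assumptions used for illustration, so check `pretrain_demo.py` for the exact spelling.

```python
# Hedged sketch of the memory-tracer part of the PatrickStar config
# (see doc/optimization_options.md). Only warmup_gpu_chunk_mem_ratio is
# documented above; every other key name here is an assumption.
config = {
    "mem_tracer": {  # assumed section name
        # Max share of a GPU's memory that chunks may occupy during the
        # warmup iteration (documented above).
        "warmup_gpu_chunk_mem_ratio": 0.1,
        # Assumed names: overall share of GPU / CPU memory the tracer may
        # plan with, and how much of the spare (margin) memory to use.
        "overall_gpu_mem_ratio": 0.8,
        "overall_cpu_mem_ratio": 0.8,
        "margin_use_ratio": 0.8,
    },
}
```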

examples/README.md (+2 -2)

````diff
@@ -19,15 +19,15 @@ python huggingface_bert.py
 
 ### Use PatrickStar to train large model
 
-`run_transformers.sh` and `pretrain_bert_demo.py` is an example to train large PTMs with PatrickStar. You could run different size of model by adding config to`run_transformers.sh`.
+`run_transformers.sh` and `pretrain_demo.py` is an example to train large PTMs with PatrickStar. You could run different size of model by adding config to`run_transformers.sh`.
 
 The following command will run a model with 4B params:
 
 ```bash
 env MODEL_NAME=GPT2_4B RES_CHECK=0 DIST_PLAN="patrickstar" bash run_transformers.sh
 ```
 
-For the available `MODEL_NAME`, please check `pretrain_bert_demo.py`.
+For the available `MODEL_NAME`, please check `pretrain_demo.py`.
 
 Check the accuracy of PatrickStar with Bert:
 
````

examples/pretrain_bert_demo.py renamed to examples/pretrain_demo.py (+8 -3)

```diff
@@ -38,7 +38,7 @@
 from data_loader import get_bert_data_loader
 from patrickstar.profiler import profiler
 from patrickstar.runtime import initialize_engine
-from patrickstar.utils import see_memory_usage
+from patrickstar.utils import see_memory_usage, get_world_size
 from patrickstar.utils.logging import log_dist, logger
 from patrickstar.utils.model_size_calculator import get_ps_model_size
 from model_builder import build_transformer_model
@@ -180,10 +180,13 @@ def test_transformer_model_helper(
             f"After step {n}. using {dist_plan}, gradient checkpoint: {is_ckp}, fp16 {is_fp16}",
             force=True,
         )
+        world_size = get_world_size()
         if dist_plan == "patrickstar":
             print(
                 f'{"[WARM UP] " if n == 0 else ""}'
-                f"Step {n} elaspe {step_elapse} s, {total_macs / 1e12 / step_elapse} Tflops"
+                f"Step {n} elaspe {step_elapse} s, "
+                f"{total_macs / 1e12 / step_elapse} Tflops Per GPU "
+                f"{args.batch_size * world_size/step_elapse} SamplesPerSec"
             )
             if n == num_steps - 1:
                 global_timer.my_timer.print()
@@ -193,7 +196,9 @@ def test_transformer_model_helper(
                 global_timer.data_move_cnter.reset()
         else:
             print(
-                f"Step {n} elaspe {step_elapse} s, {total_macs / 1e12 / step_elapse} Tflops"
+                f"Step {n} elaspe {step_elapse} s, "
+                f"{total_macs / 1e12 / step_elapse} Tflops Per GPU "
+                f"{args.batch_size * world_size/step_elapse} SamplesPerSec"
             )
 
         log_dist(f"End Step {n} with {dist_plan}.\n")
```
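
The metric this commit adds is plain global throughput: every rank processes `args.batch_size` samples per step, so a step that takes `step_elapse` seconds handles `batch_size * world_size` samples across all GPUs. Below is a self-contained sketch of the same arithmetic with made-up numbers for illustration; in the script the values come from its timers, its MAC counter, and `get_world_size()`.

```python
# Standalone illustration of the two per-step numbers printed above.
# All values below are made up; pretrain_demo.py takes them from its
# timers, its MAC counter, and get_world_size().

batch_size = 8        # per-GPU batch size (args.batch_size)
world_size = 4        # number of ranks reported by get_world_size()
step_elapse = 2.5     # wall-clock seconds for this training step
total_macs = 5.0e13   # operations counted for the step on one GPU

tflops_per_gpu = total_macs / 1e12 / step_elapse         # -> 20.0
samples_per_sec = batch_size * world_size / step_elapse  # -> 12.8

print(f"{tflops_per_gpu} Tflops Per GPU {samples_per_sec} SamplesPerSec")
```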

examples/run_transformers.sh (+1 -1)

```diff
@@ -202,7 +202,7 @@ done
 else
 env OMP_NUM_THREADS=${TNUM} timeout -s SIGKILL 30m python -m torch.distributed.launch --nproc_per_node=${GPU_NUM} \
 --nnodes=${NNODES} --node_rank=${NODE_RANK} --master_addr=${MASTER_ADDR} --master_port=${MASTER_PORT} \
-pretrain_bert_demo.py \
+pretrain_demo.py \
 --default_chunk_size=${CHUNK_SIZE} \
 ${cmd_opts} \
 2>&1 | tee ${LOG_DIR}/${LOG_FILE}
```
