From 85199fbb396a4d8ccb908cf122c3d0a4c76fea7a Mon Sep 17 00:00:00 2001 From: jiaruifang Date: Wed, 8 Dec 2021 14:28:19 +0800 Subject: [PATCH 01/19] init chunk size search --- examples/pretrain_bert_demo.py | 11 ++++++++++- patrickstar/core/client.py | 9 ++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/examples/pretrain_bert_demo.py b/examples/pretrain_bert_demo.py index e5e3fcbc5..4c7e883aa 100644 --- a/examples/pretrain_bert_demo.py +++ b/examples/pretrain_bert_demo.py @@ -48,6 +48,8 @@ from patrickstar.utils.model_size_calculator import get_ps_model_size, estimate_bert_mac import optimizations.global_opt_flags as global_opt_flags +USE_CHUNK_SIZE_SEARCH = True + def _add_patrick_star_args(parser): group = parser.add_argument_group(title="patrickstar") @@ -239,6 +241,8 @@ def test_bert_model_helper( num_head=12, num_steps=5, ): + if USE_CHUNK_SIZE_SEARCH: + logger.setLevel(logging.INFO) logger.info( f'test a bert {"fp16" if is_fp16 else "fp32"} model ' f'{"with checkpoint" if is_ckp else ""}' @@ -405,6 +409,12 @@ def visit_and_register_hooks(module): f"After model init. using {dist_plan}, gradient checkpoint: {is_ckp}, fp16 {is_fp16}", force=True, ) + if USE_CHUNK_SIZE_SEARCH: + logger.setLevel(logging.WARNING) + model.client.mem_tracer.close_tracer() + if args.with_mem_profiler: + profiler.end() + return [] data_loader = get_bert_data_loader( batch_size=batch_size, @@ -489,7 +499,6 @@ def visit_and_register_hooks(module): f"{dist_plan}_{args.model_name}_bs_{batch_size}_" f"ckp_{is_ckp}_offload_{args.with_activation_offload}_profile.pkl" ) - logging.info("*" * 20) return loss_res diff --git a/patrickstar/core/client.py b/patrickstar/core/client.py index 3015be784..2e77236f7 100644 --- a/patrickstar/core/client.py +++ b/patrickstar/core/client.py @@ -914,6 +914,8 @@ def display_chunk_info(self): logger.info("Print chunk list info.") overall_size = 0 + overall_chunk_num = 0 + overall_utilization_ratio = 0.0 for ( type, type_chunk_list, @@ -935,7 +937,7 @@ def display_chunk_info(self): chunk_id ): assert info.chunk_id == chunk_id, f"{info.chunk_id} vs {chunk_id}" - logger.info( + logger.debug( f"** tensor: chunk_id {chunk_id}, start {info.start_offset}, " f"end {info.start_offset + info.numel}, size {info.numel}, " f"tensor_id {info.tensor_id}, state {info.state()}, name {info.tensor_name}" @@ -945,6 +947,11 @@ def display_chunk_info(self): f"chunk used {last_used_pos/1024/1024} M elem, " f"{last_used_pos/chunk.capacity * 100} %" ) + overall_utilization_ratio += last_used_pos / chunk.capacity overall_size += chunk.get_chunk_space() + overall_chunk_num += 1 logger.info(f"OVERALL CHUNK SIZE {overall_size / 1024 / 1024 / 1024} GB") + logger.info( + f"OVERALL UTILIZATION {overall_utilization_ratio / overall_chunk_num} %" + ) From 09de2588905dae5b371e7083b2d6b7cc5bd55a04 Mon Sep 17 00:00:00 2001 From: jiaruifang Date: Wed, 8 Dec 2021 15:07:36 +0800 Subject: [PATCH 02/19] add get_overall_chunk_size --- examples/pretrain_bert_demo.py | 5 ++--- patrickstar/core/client.py | 27 +++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/examples/pretrain_bert_demo.py b/examples/pretrain_bert_demo.py index 4c7e883aa..84da53dee 100644 --- a/examples/pretrain_bert_demo.py +++ b/examples/pretrain_bert_demo.py @@ -241,8 +241,6 @@ def test_bert_model_helper( num_head=12, num_steps=5, ): - if USE_CHUNK_SIZE_SEARCH: - logger.setLevel(logging.INFO) logger.info( f'test a bert {"fp16" if is_fp16 else "fp32"} model ' f'{"with checkpoint" if is_ckp else ""}' @@ 
-410,8 +408,9 @@ def visit_and_register_hooks(module):
         force=True,
     )
     if USE_CHUNK_SIZE_SEARCH:
-        logger.setLevel(logging.WARNING)
         model.client.mem_tracer.close_tracer()
+        overall_chunk_size, utilization = model.client.get_overall_chunk_size()
+        print(f"overall_chunk_size {overall_chunk_size}, utilization {utilization}")
     if args.with_mem_profiler:
         profiler.end()
     return []
diff --git a/patrickstar/core/client.py b/patrickstar/core/client.py
index 2e77236f7..034be88e5 100644
--- a/patrickstar/core/client.py
+++ b/patrickstar/core/client.py
@@ -910,6 +910,33 @@ def release_grad(
     def reset(self):
         raise NotImplementedError
 
+    def get_overall_chunk_size(self):
+        """
+        return the overall size of all chunks and
+        the overall chunk utilization excluding fragments.
+        """
+        overall_size = 0
+        overall_chunk_num = 0
+        overall_utilization_ratio = 0.0
+        for (
+            type,
+            type_chunk_list,
+        ) in self.chunk_tensor_index.chunk_type_to_chunk_id_list_map.items():
+            logger.info(f"Chunk list {type}")
+            for chunk_id in type_chunk_list:
+                chunk = self.chunk_list[chunk_id]
+                comm_info = self.chunk_tensor_index.chunk_id_to_comm_info_map[chunk_id]
+                assert comm_info is not None
+                last_used_pos = 0
+                for info in self.chunk_tensor_index.generate_tensor_info_in_order(
+                    chunk_id
+                ):
+                    last_used_pos = max(last_used_pos, info.start_offset + info.numel)
+                overall_utilization_ratio += last_used_pos / chunk.capacity
+                overall_size += chunk.get_chunk_space()
+                overall_chunk_num += 1
+        return overall_size, overall_utilization_ratio
+
     def display_chunk_info(self):
         logger.info("Print chunk list info.")
 

From 5384b1dd739743cdf78f02443590917564853b34 Mon Sep 17 00:00:00 2001
From: jiaruifang
Date: Thu, 9 Dec 2021 14:55:21 +0800
Subject: [PATCH 03/19] split pretrain_bert_demo to multiple files and add
 chunk size searching mechanism.

---
 README.md                                     |   2 +-
 examples/README.md                            |   6 +-
 .../run_a100_benchmark_large_model.sh         |   2 +-
 .../run_a100_benchmark_small_model.sh         |   2 +-
 examples/benchmark/run_benchmark.sh           |   2 +-
 examples/best_chunk_size_search.py            | 202 +++++++
 examples/model_builder.py                     | 234 ++++++++
 examples/parse_args.py                        | 197 +++++++
 examples/pretrain_bert_demo.py                | 514 +-----------------
 examples/ps_config.py                         |  78 +++
 examples/{run_bert.sh => run_transformers.sh} |  31 +-
 patrickstar/core/client.py                    |   5 +-
 12 files changed, 776 insertions(+), 499 deletions(-)
 create mode 100644 examples/best_chunk_size_search.py
 create mode 100644 examples/model_builder.py
 create mode 100644 examples/parse_args.py
 create mode 100644 examples/ps_config.py
 rename examples/{run_bert.sh => run_transformers.sh} (85%)

diff --git a/README.md b/README.md
index 7fe2ade2a..982bff236 100644
--- a/README.md
+++ b/README.md
@@ -97,7 +97,7 @@ For some detail explanation of the above example, please check the guide [here](
 
 For more examples, please check [here](./examples).
 
-A quick-start benchmark script is [here](./examples/run_bert.sh). It is executed with random generated data, therefore you do not need to prepare the real data. It also demostrated all of the optimization techniques for patricksatr. For more optimization tricks runing the benchmark see [Optimization Options](./doc/optimization_options.md).
+A quick-start benchmark script is [here](./examples/run_transformers.sh). It runs with randomly generated data, so you do not need to prepare real data. It also demonstrates all of the optimization techniques of PatrickStar. For more optimization tricks for running the benchmark, see [Optimization Options](./doc/optimization_options.md).
 
### Limitations
 
diff --git a/examples/README.md b/examples/README.md
index 0da456092..f422d8928 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -19,12 +19,12 @@ python huggingface_bert.py
 
 ### Use PatrickStar to train large model
 
-`run_bert.sh` and `pretrain_bert_demo.py` is an example to train large PTMs with PatrickStar. You could run different size of model by adding config to`run_bert.sh`.
+`run_transformers.sh` and `pretrain_bert_demo.py` are an example of training large PTMs with PatrickStar. You can run models of different sizes by adding configs to `run_transformers.sh`.
 
 The following command will run a model with 4B params:
 
 ```bash
-env MODEL_NAME=GPT2_4B RES_CHECK=0 DIST_PLAN="patrickstar" bash run_bert.sh
+env MODEL_NAME=GPT2_4B RES_CHECK=0 DIST_PLAN="patrickstar" bash run_transformers.sh
 ```
 
 For the available `MODEL_NAME`, please check `pretrain_bert_demo.py`.
@@ -32,7 +32,7 @@ For the available `MODEL_NAME`, please check `pretrain_bert_demo.py`.
 Check the accuracy of PatrickStar with Bert:
 
 ```bash
-bash RES_CHECK=1 run_bert.sh
+env RES_CHECK=1 bash run_transformers.sh
 ```
 
 ### MoE support
diff --git a/examples/benchmark/run_a100_benchmark_large_model.sh b/examples/benchmark/run_a100_benchmark_large_model.sh
index 6870eb9cb..04d67970f 100644
--- a/examples/benchmark/run_a100_benchmark_large_model.sh
+++ b/examples/benchmark/run_a100_benchmark_large_model.sh
@@ -32,7 +32,7 @@ do
 echo "****************** Begin ***************************"
 echo "* benchmarking CS ${CS} BS ${BS} MODEL ${MODEL_NAME} "
 echo "* CPU_EBD ${CPU_EBD} SP ${SP} ACT_OFFLOAD ${ACT_OFFLOAD} MSC ${MSC} CACHE ${CACHE}"
-bash ../run_bert.sh
+bash ../run_transformers.sh
 echo "****************** Finished ***************************"
 echo ""
 echo ""
diff --git a/examples/benchmark/run_a100_benchmark_small_model.sh b/examples/benchmark/run_a100_benchmark_small_model.sh
index 01611d99a..a9f03342c 100644
--- a/examples/benchmark/run_a100_benchmark_small_model.sh
+++ b/examples/benchmark/run_a100_benchmark_small_model.sh
@@ -32,7 +32,7 @@ do
 echo "****************** Begin ***************************"
 echo "* benchmarking CS ${CS} BS ${BS} MODEL ${MODEL_NAME} "
 echo "* CPU_EBD ${CPU_EBD} SP ${SP} ACT_OFFLOAD ${ACT_OFFLOAD} MSC ${MSC} CACHE ${CACHE}"
-bash ../run_bert.sh
+bash ../run_transformers.sh
 echo "****************** Finished ***************************"
 echo ""
 echo ""
diff --git a/examples/benchmark/run_benchmark.sh b/examples/benchmark/run_benchmark.sh
index e98acc1a9..f205ff3c7 100644
--- a/examples/benchmark/run_benchmark.sh
+++ b/examples/benchmark/run_benchmark.sh
@@ -24,7 +24,7 @@ do
 echo "****************** Begin ***************************"
 echo "* benchmarking CS ${CS} BS ${BS} MODEL ${MODEL_NAME} "
 echo "* CPU_EBD ${CPU_EBD} SP ${SP} ACT_OFFLOAD ${ACT_OFFLOAD}"
-bash ../run_bert.sh
+bash ../run_transformers.sh
 echo "****************** Finished ***************************"
 echo ""
 echo ""
diff --git a/examples/best_chunk_size_search.py b/examples/best_chunk_size_search.py
new file mode 100644
index 000000000..c1a6f5e29
--- /dev/null
+++ b/examples/best_chunk_size_search.py
@@ -0,0 +1,202 @@
+# BSD 3-Clause License
+#
+# Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without modification,
+# are permitted provided that the following conditions are met:
+#
+#  * Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the psutil authors nor the names of its contributors +# may be used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import logging +import torch +from patrickstar.utils import see_memory_usage +from patrickstar.utils.logging import logger +from model_builder import build_transformer_model +from ps_config import get_patrickstar_config +from parse_args import parse_args +from patrickstar.core import PatrickStarClient +from patrickstar.core import PSPreProcessCtx + +from patrickstar.utils.distributed import get_local_world_size +from patrickstar.utils.memory import get_memory_info + + +def chunk_schema_valid_check(args, config, chunk_size, overall_chunk_size): + """ + check validation of a chunk schema, given the overall chunk size + args: + @args: cmd args + @config: client config + @chunk_size: the chunk size in numel + @overall_chunk_size: the overall chunk size used for param fp16 + returns: + bool: is the chunk schema valid + """ + mem_info = get_memory_info() + local_world_size = get_local_world_size() + overall_gpu_mem = torch.cuda.get_device_properties( + args.local_rank + ).total_memory * config.get("overall_gpu_mem_ratio", 0.8) + overall_cpu_mem = ( + mem_info.total * config.get("overall_cpu_mem_ratio", 0.8) / local_world_size + ) + warmup_used_gpu_mem = overall_gpu_mem * config.get( + "warmup_gpu_chunk_mem_ratio", 0.1 + ) + + logger.info( + f"warmup_used_gpu_mem {warmup_used_gpu_mem}, " + f"overall_cpu_mem {overall_cpu_mem}, " + f"overall_chunk_size {overall_chunk_size}" + ) + if warmup_used_gpu_mem < chunk_size * 2: + logger.info("chunk is unable to be fitted in GPU during warmup") + return False + + if warmup_used_gpu_mem + overall_cpu_mem < overall_chunk_size / 6 * 14: + logger.info("overall chunks is not able to fit in CPU + GPU") + return False + return True + + +def get_param_used_chunk_size(args, config, model_func): + """ + return overall chunk size of param fp16 and param fp32. + as well as the memory utilization of chunks. 
+ """ + client = PatrickStarClient( + rank=args.local_rank, + default_chunk_size=args.default_chunk_size, + config=config.get("client", None), + ) + + try: + with PSPreProcessCtx( + client=client, + dtype=torch.float, + release_after_init=args.release_after_init, + use_cpu_embedding=args.use_cpu_embedding, + ): + model = model_func() + except Exception: + return -1, -1 + + del model + + overall_chunk_size, util = client.get_overall_chunk_size() + if chunk_schema_valid_check( + args, + config["client"]["mem_tracer"], + args.default_chunk_size, + overall_chunk_size, + ): + return overall_chunk_size, util + else: + return -1, -1 + + +def evaluate_chunk_size( + args, + is_ckp: bool = False, + is_fp16: bool = False, + dist_plan: str = "torch", + num_steps=5, +): + """ + Evaluate the current training task defined by the args. + write the chunk memory usage to the file. + """ + logger.info( + f'test a bert {"fp16" if is_fp16 else "fp32"} model ' + f'{"with checkpoint" if is_ckp else ""}' + ) + + # Use single card to simulate multicard. Used when you are poor and + # no more GPU avaiable. + if args.use_fake_dist: + rank = 0 + else: + rank = args.local_rank + + # Avoid gpu0 use more memory. + # https://discuss.pytorch.org/t/extra-10gb-memory-on-gpu-0-in-ddp-tutorial/118113 + torch.cuda.set_device(rank) + torch.cuda.empty_cache() + + lr = 0.001 + betas = (0.9, 0.999) + eps = 1e-6 + weight_decay = 0 + + model_func, sequence_length = build_transformer_model(args) + config = get_patrickstar_config( + args, lr=lr, betas=betas, eps=eps, weight_decay=weight_decay + ) + + see_memory_usage( + f"before get_param_used_chunk_size for {args.default_chunk_size/1024/1024} MB", + True, + "MB", + ) + + overall_chunk_size, utils = get_param_used_chunk_size(args, config, model_func) + + see_memory_usage( + f"after get_param_used_chunk_size for {args.default_chunk_size/1024/1024} MB", + True, + "MB", + ) + + logger.info(f"{overall_chunk_size}, {utils}\n") + logger.info(f"writing to {args.slog_file}\n") + + with open(f"{args.slog_file}", "a+") as fh: + fh.write( + f"{args.default_chunk_size/1024/1024} {overall_chunk_size/1024/1024}, {utils}\n" + ) + + +if __name__ == "__main__": + args = parse_args() + use_ckp = args.use_ckp + use_fp16 = args.use_fp16 + dist_plan = args.dist_plan + res_check = args.res_check + + # You could set the logger level to INFO to view more runtime + # information. + logger.setLevel(logging.INFO) + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group( + backend="gloo" if args.use_fake_dist else "nccl" + ) + + torch.manual_seed(0) + loss_list = evaluate_chunk_size( + args=args, + is_ckp=use_ckp, + is_fp16=use_fp16, + dist_plan=dist_plan, + num_steps=5, + ) diff --git a/examples/model_builder.py b/examples/model_builder.py new file mode 100644 index 000000000..59c856b5a --- /dev/null +++ b/examples/model_builder.py @@ -0,0 +1,234 @@ +# BSD 3-Clause License +# +# Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# * Neither the name of the psutil authors nor the names of its contributors +# may be used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import transformers +from transformers import BertConfig +from packaging import version +import optimizations.global_opt_flags as global_opt_flags + + +def model_config(model_name): + """ + generate the model config according to the model name. + """ + if model_name == "Bert": + # 0.11B + HIDDEN_DIM = 768 + SEQ_LEN = 512 + NUM_LAYER = 6 + NUM_HEAD = 12 + elif model_name == "Bertlarge": + # 0.35B + HIDDEN_DIM = 1024 + SEQ_LEN = 512 + NUM_LAYER = 24 + NUM_HEAD = 16 + elif model_name == "GPT2small": + # 0.7B + HIDDEN_DIM = 1536 + SEQ_LEN = 128 + NUM_LAYER = 24 + NUM_HEAD = 16 + elif model_name == "GPT2_1B": + # 0.9B + HIDDEN_DIM = 2048 + SEQ_LEN = 1024 + NUM_LAYER = 20 + NUM_HEAD = 16 + elif model_name == "megatron_1.3B": + HIDDEN_DIM = 2048 + SEQ_LEN = 1024 + NUM_LAYER = 24 + NUM_HEAD = 32 + elif model_name == "GPT2_2B": + # zero-offload + HIDDEN_DIM = 2048 + SEQ_LEN = 1024 + NUM_LAYER = 40 + NUM_HEAD = 16 + elif model_name == "megatron_3.9B": + # Table 4 in Megatron Paper + HIDDEN_DIM = 2560 + SEQ_LEN = 1024 + NUM_LAYER = 24 + NUM_HEAD = 40 + elif model_name == "GPT2_4B": + HIDDEN_DIM = 2304 # 2048 + SEQ_LEN = 1024 + NUM_LAYER = 64 + NUM_HEAD = 16 + elif model_name == "GPT3_6B": + # 6.7B model + HIDDEN_DIM = 3072 + SEQ_LEN = 1024 + NUM_LAYER = 53 + NUM_HEAD = 16 + elif model_name == "GPT3_8B": + # 6.7B model + HIDDEN_DIM = 3072 + SEQ_LEN = 1024 + NUM_LAYER = 72 + NUM_HEAD = 16 + elif model_name == "GPT3_10B": + HIDDEN_DIM = 4096 + SEQ_LEN = 1024 + NUM_LAYER = 50 + NUM_HEAD = 16 + elif model_name == "GPT3_11B": + HIDDEN_DIM = 4096 + SEQ_LEN = 1024 + NUM_LAYER = 55 + NUM_HEAD = 16 + elif model_name == "GPT3_12B": + HIDDEN_DIM = 4096 + SEQ_LEN = 1024 + NUM_LAYER = 60 + NUM_HEAD = 16 + elif model_name == "GPT3_13B": + HIDDEN_DIM = 4096 + SEQ_LEN = 1024 + NUM_LAYER = 65 + NUM_HEAD = 16 + elif model_name == "GPT3_15B": + HIDDEN_DIM = 4096 + SEQ_LEN = 1024 + NUM_LAYER = 78 + NUM_HEAD = 16 + elif model_name == "GPT3_18B": + HIDDEN_DIM = 4096 + SEQ_LEN = 1024 + NUM_LAYER = 90 + NUM_HEAD = 16 + # The following configs comes from paper + # Efficient Large-Scale Language Model Training on GPU Clusters + # NV model is wider in hidden-size + elif model_name == "GPT_NV_18B": + HIDDEN_DIM = 6144 + SEQ_LEN = 1024 + NUM_LAYER = 40 + NUM_HEAD = 16 + elif model_name == "GPT_NV_39B": + HIDDEN_DIM = 8192 + SEQ_LEN = 1024 + NUM_LAYER = 48 + NUM_HEAD = 16 + elif model_name == "GPT_NV_76B": + HIDDEN_DIM = 10240 + SEQ_LEN = 1024 + NUM_LAYER = 60 + NUM_HEAD = 16 + # The following configs comes from Deep-Offload + # 
http://pasalabs.org/papers/2021/ATC21_zero-offload.pdf + elif model_name == "GPT_DS_20B": + HIDDEN_DIM = 8192 + SEQ_LEN = 1024 + NUM_LAYER = 25 + NUM_HEAD = 16 + elif model_name == "GPT_DS_40B": + HIDDEN_DIM = 8192 + SEQ_LEN = 1024 + NUM_LAYER = 50 + NUM_HEAD = 16 + elif model_name == "GPT_DS_50B": + HIDDEN_DIM = 8192 + SEQ_LEN = 1024 + NUM_LAYER = 62 + NUM_HEAD = 16 + elif model_name == "GPT_DS_60B": + HIDDEN_DIM = 8192 + SEQ_LEN = 1024 + NUM_LAYER = 75 + NUM_HEAD = 16 + elif model_name == "GPT_DS_70B": + HIDDEN_DIM = 9216 + SEQ_LEN = 1024 + NUM_LAYER = 69 + NUM_HEAD = 16 + else: + raise RuntimeError(f"The model name {model_name} is not valid!") + assert HIDDEN_DIM % NUM_HEAD == 0 + return (HIDDEN_DIM, SEQ_LEN, NUM_LAYER, NUM_HEAD) + + +def print_model_config(args, hidden_dim, sequence_len, num_layer, num_head): + if args.rank == 0: + config_dict = { + "hidden_dim": hidden_dim, + "sequence_len": sequence_len, + "num_layer": num_layer, + "num_head": num_head, + } + print("------------------ model config ------------------", flush=True) + str_list = [] + for key, value in config_dict.items(): + dots = "." * (32 - len(key)) + str_list.append(" {} {} {}".format(key, dots, value)) + for arg in sorted(str_list, key=lambda x: x.lower()): + print(arg, flush=True) + print("-------------- end of model config --------------", flush=True) + + +def build_transformer_model(args): + """ + Build a transformer-based model based on transformer bert. + return a function able to build the model. + """ + if args.with_tiling_linear or args.with_activation_offload: + if args.with_tiling_linear: + global_opt_flags.USE_TILE = True + else: + global_opt_flags.USE_TILE = False + if args.with_activation_offload: + global_opt_flags.USE_ACT_OFFLOAD = True + else: + global_opt_flags.USE_ACT_OFFLOAD = False + from optimizations.ps_tile_modeling_bert import BertForSequenceClassification + else: + from transformers import BertForSequenceClassification + + hidden_dim, sequence_length, num_layer, num_head = model_config(args.model_name) + + bert_config = BertConfig( + gradient_checkpointing=args.use_ckp, + hidden_size=hidden_dim, + intermediate_size=hidden_dim * 4, + num_attention_heads=num_head, + max_position_embeddings=sequence_length, + num_hidden_layers=num_layer, + ) + + def model_func(): + model = BertForSequenceClassification(bert_config) + if args.use_ckp and version.parse(transformers.__version__) >= version.parse( + "4.11.0" + ): + model.gradient_checkpointing_enable() + return model + + return model_func, sequence_length diff --git a/examples/parse_args.py b/examples/parse_args.py new file mode 100644 index 000000000..861c3c6b6 --- /dev/null +++ b/examples/parse_args.py @@ -0,0 +1,197 @@ +# BSD 3-Clause License +# +# Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the psutil authors nor the names of its contributors +# may be used to endorse or promote products derived from this software without +# specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import os + + +def _add_patrick_star_args(parser): + group = parser.add_argument_group(title="patrickstar") + group.add_argument( + "--use_fake_dist", + dest="use_fake_dist", + action="store_true", + help="Using one GPU to stimulate multiple card.", + ) + group.add_argument( + "--default_chunk_size", + type=int, + default=32 * 1024 * 1024, + help="Default Chunk Size in elements.", + ) + group.add_argument( + "--use_cpu_embedding", + dest="use_cpu_embedding", + action="store_true", + help="Using CPU to perform Embedding and do not assign " + "embedding params to chunks", + ) + group.add_argument( + "--release_after_init", + action="store_true", + help="Release the remote chunk after the whole initialization." + "This would use more CPU memory during initialization, " + "but may fix some errors relate to checkpoint loading or" + "weight intialization.", + ) + group.add_argument( + "--use_hybrid_adam", + action="store_true", + help="Use hybrid adam optimization. " + "By default ADAM is on CPU and run ADAM on GPU if possible.", + ) + # Some hyperparams to tune when you failed to run a model. 
+ group.add_argument( + "--with_static_partition", + action="store_true", + help="Use static partition for model data on CPU and GPU.", + ) + group.add_argument( + "--with_mem_profiler", + action="store_true", + help="Profiling memory usage.", + ) + group.add_argument( + "--init_loss_scale_power", + type=float, + default=10, + help="initial loss scale power", + ) + group.add_argument( + "--with_async_mem_monitor", + action="store_true", + help="Use async memory monitor.", + ) + group.add_argument( + "--with_mem_saving_comm", + action="store_true", + help="Use communication saving memory.", + ) + group.add_argument( + "--with_mem_cache", + action="store_true", + help="Use caching to allocate chunk payload.", + ) + group.add_argument( + "--with_async_move", + action="store_true", + help="Use asynchronize move.", + ) + group.add_argument( + "--slog_file", + type=str, + default="./slog_file/tmp.txt", + help="The file to record chunk size serach log.", + ) + return parser + + +def _add_general_opt_args(parser): + group = parser.add_argument_group(title="test_bert") + group.add_argument( + "--use_ckp", + dest="use_ckp", + action="store_true", + help="using gradient checkpointing for memory saveing.", + ) + group.add_argument( + "--with_activation_offload", + dest="with_activation_offload", + action="store_true", + help="Use activation offloading.", + ) + group.add_argument( + "--with_tiling_linear", + action="store_true", + help="Use linear tiling.", + ) + return parser + + +def _add_test_config_args(parser): + group = parser.add_argument_group(title="test_config") + group.add_argument( + "--batch_size", type=int, default=32, help="Batch size of input." + ) + group.add_argument( + "--local_rank", + type=int, + default=None, + help="local rank passed from distributed launcher.", + ) + + group.add_argument( + "--res_check", + dest="res_check", + action="store_true", + help="check results correctness of checkpointing.", + ) + group.add_argument( + "--use_fp16", + dest="use_fp16", + action="store_true", + help="using FP16 for training.", + ) + group.add_argument( + "--dist_plan", + type=str, + default="torch", + help="Distributed Plan [torch, patrickstar]", + ) + group.add_argument( + "--model_name", type=str, default="GPTsmall", help="The model name." + ) + group.add_argument("--with_lightseq", action="store_true", help="use lightseq") + return parser + + +def _print_args(args): + """Print arguments.""" + if args.rank == 0: + print("------------------- arguments -------------------", flush=True) + str_list = [] + for arg in vars(args): + dots = "." * (32 - len(arg)) + str_list.append(" {} {} {}".format(arg, dots, getattr(args, arg))) + for arg in sorted(str_list, key=lambda x: x.lower()): + print(arg, flush=True) + print("---------------- end of arguments ----------------", flush=True) + + +def parse_args(): + """Parse all arguments.""" + parser = argparse.ArgumentParser(description="PatrickStar Arguments") + parser = _add_patrick_star_args(parser) + parser = _add_test_config_args(parser) + parser = _add_general_opt_args(parser) + args = parser.parse_args() + args.rank = int(os.getenv("RANK", "0")) + args.world_size = int(os.getenv("WORLD_SIZE", "1")) + _print_args(args) + return args diff --git a/examples/pretrain_bert_demo.py b/examples/pretrain_bert_demo.py index 84da53dee..35a7f703e 100644 --- a/examples/pretrain_bert_demo.py +++ b/examples/pretrain_bert_demo.py @@ -28,16 +28,11 @@ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import argparse import logging -import os import time -from packaging import version import torch import numpy as np -import transformers -from transformers import BertConfig import patrickstar.utils.global_timer as global_timer from data_loader import get_bert_data_loader @@ -45,200 +40,17 @@ from patrickstar.runtime import initialize_engine from patrickstar.utils import see_memory_usage from patrickstar.utils.logging import logger -from patrickstar.utils.model_size_calculator import get_ps_model_size, estimate_bert_mac -import optimizations.global_opt_flags as global_opt_flags +from patrickstar.utils.model_size_calculator import get_ps_model_size +from model_builder import build_transformer_model +from parse_args import parse_args +from ps_config import get_patrickstar_config -USE_CHUNK_SIZE_SEARCH = True - -def _add_patrick_star_args(parser): - group = parser.add_argument_group(title="patrickstar") - group.add_argument( - "--use_fake_dist", - dest="use_fake_dist", - action="store_true", - help="Using one GPU to stimulate multiple card.", - ) - group.add_argument( - "--default_chunk_size", - type=int, - default=32 * 1024 * 1024, - help="Default Chunk Size in elements.", - ) - group.add_argument( - "--use_cpu_embedding", - dest="use_cpu_embedding", - action="store_true", - help="Using CPU to perform Embedding and do not assign " - "embedding params to chunks", - ) - group.add_argument( - "--release_after_init", - action="store_true", - help="Release the remote chunk after the whole initialization." - "This would use more CPU memory during initialization, " - "but may fix some errors relate to checkpoint loading or" - "weight intialization.", - ) - group.add_argument( - "--use_hybrid_adam", - action="store_true", - help="Use hybrid adam optimization. " - "By default ADAM is on CPU and run ADAM on GPU if possible.", - ) - # Some hyperparams to tune when you failed to run a model. - group.add_argument( - "--with_static_partition", - action="store_true", - help="Use static partition for model data on CPU and GPU.", - ) - group.add_argument( - "--with_mem_profiler", - action="store_true", - help="Profiling memory usage.", - ) - group.add_argument( - "--init_loss_scale_power", - type=float, - default=10, - help="initial loss scale power", - ) - group.add_argument( - "--with_async_mem_monitor", - action="store_true", - help="Use async memory monitor.", - ) - group.add_argument( - "--with_mem_saving_comm", - action="store_true", - help="Use communication saving memory.", - ) - group.add_argument( - "--with_mem_cache", - action="store_true", - help="Use caching to allocate chunk payload.", - ) - group.add_argument( - "--with_async_move", - action="store_true", - help="Use asynchronize move.", - ) - return parser - - -def _add_general_opt_args(parser): - group = parser.add_argument_group(title="test_bert") - group.add_argument( - "--use_ckp", - dest="use_ckp", - action="store_true", - help="using gradient checkpointing for memory saveing.", - ) - group.add_argument( - "--with_activation_offload", - dest="with_activation_offload", - action="store_true", - help="Use activation offloading.", - ) - group.add_argument( - "--with_tiling_linear", - action="store_true", - help="Use linear tiling.", - ) - return parser - - -def _add_test_config_args(parser): - group = parser.add_argument_group(title="test_config") - group.add_argument( - "--batch_size", type=int, default=32, help="Batch size of input." 
- ) - group.add_argument( - "--local_rank", - type=int, - default=None, - help="local rank passed from distributed launcher.", - ) - - group.add_argument( - "--res_check", - dest="res_check", - action="store_true", - help="check results correctness of checkpointing.", - ) - group.add_argument( - "--use_fp16", - dest="use_fp16", - action="store_true", - help="using FP16 for training.", - ) - group.add_argument( - "--dist_plan", - type=str, - default="torch", - help="Distributed Plan [torch, patrickstar]", - ) - group.add_argument( - "--model_name", type=str, default="GPTsmall", help="The model name." - ) - group.add_argument("--with_lightseq", action="store_true", help="use lightseq") - return parser - - -def _print_args(args): - """Print arguments.""" - if args.rank == 0: - print("------------------- arguments -------------------", flush=True) - str_list = [] - for arg in vars(args): - dots = "." * (32 - len(arg)) - str_list.append(" {} {} {}".format(arg, dots, getattr(args, arg))) - for arg in sorted(str_list, key=lambda x: x.lower()): - print(arg, flush=True) - print("---------------- end of arguments ----------------", flush=True) - - -def parse_args(): - """Parse all arguments.""" - parser = argparse.ArgumentParser(description="PatrickStar Arguments") - parser = _add_patrick_star_args(parser) - parser = _add_test_config_args(parser) - parser = _add_general_opt_args(parser) - args = parser.parse_args() - args.rank = int(os.getenv("RANK", "0")) - args.world_size = int(os.getenv("WORLD_SIZE", "1")) - _print_args(args) - return args - - -def print_model_config(hidden_dim, sequence_len, num_layer, num_head): - if args.rank == 0: - config_dict = { - "hidden_dim": hidden_dim, - "sequence_len": sequence_len, - "num_layer": num_layer, - "num_head": num_head, - } - print("------------------ model config ------------------", flush=True) - str_list = [] - for key, value in config_dict.items(): - dots = "." * (32 - len(key)) - str_list.append(" {} {} {}".format(key, dots, value)) - for arg in sorted(str_list, key=lambda x: x.lower()): - print(arg, flush=True) - print("-------------- end of model config --------------", flush=True) - - -def test_bert_model_helper( +def test_transformer_model_helper( args, is_ckp: bool = False, is_fp16: bool = False, dist_plan: str = "torch", - batch_size=32, - hidden_dim=768, - sequence_length=256, - num_layer=12, - num_head=12, num_steps=5, ): logger.info( @@ -246,139 +58,49 @@ def test_bert_model_helper( f'{"with checkpoint" if is_ckp else ""}' ) - # Use single card to imitate multicard. + # Use single card to simulate multicard. Used when you are poor and + # no more GPU avaiable. if args.use_fake_dist: rank = 0 else: rank = args.local_rank - if args.with_tiling_linear or args.with_activation_offload: - if args.with_tiling_linear: - global_opt_flags.USE_TILE = True - else: - global_opt_flags.USE_TILE = False - if args.with_activation_offload: - global_opt_flags.USE_ACT_OFFLOAD = True - else: - global_opt_flags.USE_ACT_OFFLOAD = False - from optimizations.ps_tile_modeling_bert import BertForSequenceClassification - else: - from transformers import BertForSequenceClassification - # Avoid gpu0 use more memory. 
# https://discuss.pytorch.org/t/extra-10gb-memory-on-gpu-0-in-ddp-tutorial/118113 torch.cuda.set_device(rank) torch.cuda.empty_cache() - device = torch.device(f"cuda:{rank}") - bert_config = BertConfig( - gradient_checkpointing=is_ckp, - hidden_size=hidden_dim, - intermediate_size=hidden_dim * 4, - num_attention_heads=num_head, - max_position_embeddings=sequence_length, - num_hidden_layers=num_layer, - ) + if args.with_mem_profiler: + print("start memory profiler") + profiler.start() lr = 0.001 betas = (0.9, 0.999) eps = 1e-6 weight_decay = 0 - if args.with_mem_profiler: - print("start memory profiler") - profiler.start() + model_func, sequence_length = build_transformer_model(args) if dist_plan == "patrickstar": if not is_fp16: logger.warning("PatrickStar will always use mixed precision training.") - - def model_func(): - model = BertForSequenceClassification(bert_config) - if is_ckp and version.parse(transformers.__version__) >= version.parse( - "4.11.0" - ): - model.gradient_checkpointing_enable() - return model - - config = { - # The same format as optimizer config of DeepSpeed - # https://www.deepspeed.ai/docs/config-json/#optimizer-parameters - "optimizer": { - "type": "Adam", - "params": { - "lr": lr, - "betas": betas, - "eps": eps, - "weight_decay": weight_decay, - "use_hybrid_adam": args.use_hybrid_adam, - }, - }, - "fp16": { - "enabled": True, - # Set "loss_scale" to 0 to use DynamicLossScaler. - "loss_scale": 0, - "initial_scale_power": args.init_loss_scale_power, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1, - }, - "default_chunk_size": args.default_chunk_size, - "release_after_init": args.release_after_init, - "use_fake_dist": args.use_fake_dist, - "use_cpu_embedding": args.use_cpu_embedding, - "client": { - "mem_tracer": { - "use_async_mem_monitor": args.with_async_mem_monitor, - "warmup_gpu_chunk_mem_ratio": 0.1, - "overall_gpu_mem_ratio": 0.8, - "overall_cpu_mem_ratio": 0.8, - "margin_use_ratio": 0.8, - "use_fake_dist": False, - "with_static_partition": args.with_static_partition, - }, - "opts": { - "with_mem_saving_comm": args.with_mem_saving_comm, - "with_mem_cache": args.with_mem_cache, - "with_async_move": args.with_async_move, - }, - }, - } - + config = get_patrickstar_config( + args, lr=lr, betas=betas, eps=eps, weight_decay=weight_decay + ) model, optimizer = initialize_engine( model_func=model_func, local_rank=rank, config=config ) else: - model = BertForSequenceClassification(bert_config) + model = model_func() if args.with_mem_profiler: from patrickstar.core.torch_profiler_hook import ( register_torch_profiler_hook, ) register_torch_profiler_hook(model) - if is_ckp and version.parse(transformers.__version__) >= version.parse( - "4.11.0" - ): - model.gradient_checkpointing_enable() + model.cuda(rank) model.train() - if args.with_lightseq: - from optimizations.ls_hf_transformer_encoder_layer import ( - inject_ls_enc_layer, - ) - - inject_ls_enc_layer(model, args, bert_config) - print("Using Lightseq Kernels, all submodules includes:") - - def visit_and_register_hooks(module): - is_child_node = True - for _, submodule in module.named_children(): - visit_and_register_hooks(submodule) - is_child_node = False - if is_child_node: - print(f"module name {module.__class__.__name__}") - - visit_and_register_hooks(model) optimizer = torch.optim.Adam( model.parameters(), lr=lr, betas=betas, eps=eps, weight_decay=weight_decay @@ -396,27 +118,17 @@ def visit_and_register_hooks(module): model_numel, model_num_param = get_ps_model_size(model) 
logger.info(f"Model size {model_numel / 1e9} B, total params: {model_num_param}") - total_macs, nvidia_total_macs = estimate_bert_mac( - bert_config, batch_size, sequence_length, model_numel - ) - logger.info(f"Total MACs: {total_macs} TFlops") - logger.info(f"NVIDIA total MACs: {nvidia_total_macs}") - logger.debug(f"Diff csig/nvidia {total_macs / nvidia_total_macs}") + total_macs = model_numel * args.batch_size * sequence_length * 2 * 4 + logger.info(f"Total MACs: {total_macs/1024/1024/1024/1024} TFlops") see_memory_usage( f"After model init. using {dist_plan}, gradient checkpoint: {is_ckp}, fp16 {is_fp16}", force=True, ) - if USE_CHUNK_SIZE_SEARCH: - model.client.mem_tracer.close_tracer() - overall_chunk_size, utilization = model.client.get_overall_chunk_size() - print(f"overall_chunk_size {overall_chunk_size}, utilization {utilization}") - if args.with_mem_profiler: - profiler.end() - return [] + # load data, here we generate random data for benchmarking. data_loader = get_bert_data_loader( - batch_size=batch_size, + batch_size=args.batch_size, total_samples=10000, sequence_length=sequence_length, device=device, @@ -426,7 +138,7 @@ def visit_and_register_hooks(module): loss_res = [] - print(f"MAC {total_macs / 1e9} GFlop, model param size: {model_numel / 1e9} B") + print(f"model param size: {model_numel / 1e9} B") for n, batch in enumerate(data_loader): if n == num_steps: @@ -495,7 +207,7 @@ def visit_and_register_hooks(module): profiler.end() if rank == 0: profiler.save( - f"{dist_plan}_{args.model_name}_bs_{batch_size}_" + f"{dist_plan}_{args.model_name}_bs_{args.batch_size}_" f"ckp_{is_ckp}_offload_{args.with_activation_offload}_profile.pkl" ) return loss_res @@ -519,171 +231,17 @@ def visit_and_register_hooks(module): world_size = torch.distributed.get_world_size() - MODEL_NAME = args.model_name if res_check: - MODEL_NAME = "Bert" - if MODEL_NAME == "Bert": - # 0.11B - HIDDEN_DIM = 768 - SEQ_LEN = 512 - NUM_LAYER = 6 - NUM_HEAD = 12 - elif MODEL_NAME == "Bertlarge": - # 0.35B - HIDDEN_DIM = 1024 - SEQ_LEN = 512 - NUM_LAYER = 24 - NUM_HEAD = 16 - elif MODEL_NAME == "GPT2small": - # 0.7B - HIDDEN_DIM = 1536 - SEQ_LEN = 128 - NUM_LAYER = 24 - NUM_HEAD = 16 - elif MODEL_NAME == "GPT2_1B": - # 0.9B - HIDDEN_DIM = 2048 - SEQ_LEN = 1024 - NUM_LAYER = 20 - NUM_HEAD = 16 - elif MODEL_NAME == "megatron_1.3B": - HIDDEN_DIM = 2048 - SEQ_LEN = 1024 - NUM_LAYER = 24 - NUM_HEAD = 32 - elif MODEL_NAME == "GPT2_2B": - # zero-offload - HIDDEN_DIM = 2048 - SEQ_LEN = 1024 - NUM_LAYER = 40 - NUM_HEAD = 16 - elif MODEL_NAME == "megatron_3.9B": - # Table 4 in Megatron Paper - HIDDEN_DIM = 2560 - SEQ_LEN = 1024 - NUM_LAYER = 24 - NUM_HEAD = 40 - elif MODEL_NAME == "GPT2_4B": - HIDDEN_DIM = 2304 # 2048 - SEQ_LEN = 1024 - NUM_LAYER = 64 - NUM_HEAD = 16 - elif MODEL_NAME == "GPT3_6B": - # 6.7B model - HIDDEN_DIM = 3072 - SEQ_LEN = 1024 - NUM_LAYER = 53 - NUM_HEAD = 16 - elif MODEL_NAME == "GPT3_8B": - # 6.7B model - HIDDEN_DIM = 3072 - SEQ_LEN = 1024 - NUM_LAYER = 72 - NUM_HEAD = 16 - elif MODEL_NAME == "GPT3_10B": - HIDDEN_DIM = 4096 - SEQ_LEN = 1024 - NUM_LAYER = 50 - NUM_HEAD = 16 - elif MODEL_NAME == "GPT3_11B": - HIDDEN_DIM = 4096 - SEQ_LEN = 1024 - NUM_LAYER = 55 - NUM_HEAD = 16 - elif MODEL_NAME == "GPT3_12B": - HIDDEN_DIM = 4096 - SEQ_LEN = 1024 - NUM_LAYER = 60 - NUM_HEAD = 16 - elif MODEL_NAME == "GPT3_13B": - HIDDEN_DIM = 4096 - SEQ_LEN = 1024 - NUM_LAYER = 65 - NUM_HEAD = 16 - elif MODEL_NAME == "GPT3_15B": - HIDDEN_DIM = 4096 - SEQ_LEN = 1024 - NUM_LAYER = 78 - NUM_HEAD = 16 - elif MODEL_NAME == 
"GPT3_18B": - HIDDEN_DIM = 4096 - SEQ_LEN = 1024 - NUM_LAYER = 90 - NUM_HEAD = 16 - # The following configs comes from paper - # Efficient Large-Scale Language Model Training on GPU Clusters - # NV model is wider in hidden-size - elif MODEL_NAME == "GPT_NV_18B": - HIDDEN_DIM = 6144 - SEQ_LEN = 1024 - NUM_LAYER = 40 - NUM_HEAD = 16 - elif MODEL_NAME == "GPT_NV_39B": - HIDDEN_DIM = 8192 - SEQ_LEN = 1024 - NUM_LAYER = 48 - NUM_HEAD = 16 - elif MODEL_NAME == "GPT_NV_76B": - HIDDEN_DIM = 10240 - SEQ_LEN = 1024 - NUM_LAYER = 60 - NUM_HEAD = 16 - # The following configs comes from Deep-Offload - # http://pasalabs.org/papers/2021/ATC21_zero-offload.pdf - elif MODEL_NAME == "GPT_DS_20B": - HIDDEN_DIM = 8192 - SEQ_LEN = 1024 - NUM_LAYER = 25 - NUM_HEAD = 16 - elif MODEL_NAME == "GPT_DS_40B": - HIDDEN_DIM = 8192 - SEQ_LEN = 1024 - NUM_LAYER = 50 - NUM_HEAD = 16 - elif MODEL_NAME == "GPT_DS_50B": - HIDDEN_DIM = 8192 - SEQ_LEN = 1024 - NUM_LAYER = 62 - NUM_HEAD = 16 - elif MODEL_NAME == "GPT_DS_60B": - HIDDEN_DIM = 8192 - SEQ_LEN = 1024 - NUM_LAYER = 75 - NUM_HEAD = 16 - elif MODEL_NAME == "GPT_DS_70B": - HIDDEN_DIM = 9216 - SEQ_LEN = 1024 - NUM_LAYER = 69 - NUM_HEAD = 16 - else: - raise RuntimeError(f"The model name {MODEL_NAME} is not valid!") - if res_check: - BATCH_SIZE = 2 - else: - BATCH_SIZE = args.batch_size - - assert HIDDEN_DIM % NUM_HEAD == 0 - logging.info(f"Benchmarking {MODEL_NAME}") - - print_model_config( - hidden_dim=HIDDEN_DIM, - sequence_len=SEQ_LEN, - num_layer=NUM_LAYER, - num_head=NUM_HEAD, - ) + args.model_name = "Bert" + args.batch_size = 2 if not res_check: torch.manual_seed(0) - loss_list = test_bert_model_helper( + loss_list = test_transformer_model_helper( args=args, is_ckp=use_ckp, is_fp16=use_fp16, dist_plan=dist_plan, - batch_size=BATCH_SIZE, - hidden_dim=HIDDEN_DIM, - sequence_length=SEQ_LEN, - num_layer=NUM_LAYER, - num_head=NUM_HEAD, num_steps=5, ) print("*" * 20 + " LOSS " + "*" * 20) @@ -697,16 +255,11 @@ def visit_and_register_hooks(module): NUM_STEPS = 5 torch.manual_seed(0) - torch_res_list = test_bert_model_helper( + torch_res_list = test_transformer_model_helper( args=args, is_ckp=use_ckp, is_fp16=False, dist_plan="torch", - hidden_dim=HIDDEN_DIM, - batch_size=BATCH_SIZE, - sequence_length=SEQ_LEN, - num_layer=NUM_LAYER, - num_head=NUM_HEAD, num_steps=NUM_STEPS, ) @@ -714,16 +267,11 @@ def visit_and_register_hooks(module): logging.info("-" * 50) torch.manual_seed(0) - autocast_res_list = test_bert_model_helper( + autocast_res_list = test_transformer_model_helper( args=args, is_ckp=use_ckp, is_fp16=True, dist_plan="torch", - hidden_dim=HIDDEN_DIM, - batch_size=BATCH_SIZE, - sequence_length=SEQ_LEN, - num_layer=NUM_LAYER, - num_head=NUM_HEAD, num_steps=NUM_STEPS, ) @@ -731,17 +279,11 @@ def visit_and_register_hooks(module): logging.info("-" * 50) torch.manual_seed(0) - ps_res_list = test_bert_model_helper( + ps_res_list = test_transformer_model_helper( args=args, is_ckp=use_ckp, is_fp16=use_fp16, dist_plan="patrickstar", - hidden_dim=HIDDEN_DIM, - batch_size=BATCH_SIZE, - sequence_length=SEQ_LEN, - num_layer=NUM_LAYER, - num_head=NUM_HEAD, - num_steps=NUM_STEPS, ) print("-" * 20 + " LOSS " + "-" * 20) diff --git a/examples/ps_config.py b/examples/ps_config.py new file mode 100644 index 000000000..f12a97340 --- /dev/null +++ b/examples/ps_config.py @@ -0,0 +1,78 @@ +# BSD 3-Clause License +# +# Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the psutil authors nor the names of its contributors +# may be used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +def get_patrickstar_config( + args, lr=0.001, betas=(0.9, 0.999), eps=1e-6, weight_decay=0 +): + config = { + # The same format as optimizer config of DeepSpeed + # https://www.deepspeed.ai/docs/config-json/#optimizer-parameters + "optimizer": { + "type": "Adam", + "params": { + "lr": lr, + "betas": betas, + "eps": eps, + "weight_decay": weight_decay, + "use_hybrid_adam": args.use_hybrid_adam, + }, + }, + "fp16": { + "enabled": True, + # Set "loss_scale" to 0 to use DynamicLossScaler. 
+ "loss_scale": 0, + "initial_scale_power": args.init_loss_scale_power, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1, + }, + "default_chunk_size": args.default_chunk_size, + "release_after_init": args.release_after_init, + "use_fake_dist": args.use_fake_dist, + "use_cpu_embedding": args.use_cpu_embedding, + "client": { + "mem_tracer": { + "use_async_mem_monitor": args.with_async_mem_monitor, + "warmup_gpu_chunk_mem_ratio": 0.1, + "overall_gpu_mem_ratio": 0.8, + "overall_cpu_mem_ratio": 0.8, + "margin_use_ratio": 0.8, + "use_fake_dist": False, + "with_static_partition": args.with_static_partition, + }, + "opts": { + "with_mem_saving_comm": args.with_mem_saving_comm, + "with_mem_cache": args.with_mem_cache, + "with_async_move": args.with_async_move, + }, + }, + } + + return config diff --git a/examples/run_bert.sh b/examples/run_transformers.sh similarity index 85% rename from examples/run_bert.sh rename to examples/run_transformers.sh index 53da4329a..fa76cc8c8 100644 --- a/examples/run_bert.sh +++ b/examples/run_transformers.sh @@ -35,8 +35,8 @@ export CACHE=${CACHE:-1} export ASYNC_MOVE=${ASYNC_MOVE:-0} # linear tiling comm export TILING=${TILING:-0} - export LOCAL_WORLD_SIZE=${LOCAL_WORLD_SIZE:-1} +export CS_SEARCH=${CS_SEARCH:-0} if [[ ${TILING} == 1 ]]; then TILING_FLAG="--with_tiling_linear" @@ -149,8 +149,8 @@ fi wc=`cat /proc/cpuinfo | grep "processor"| wc -l` let TNUM=wc/${GPU_NUM} echo "CPU core number " $wc "THREAD NUM " ${TNUM} -env OMP_NUM_THREADS=${TNUM} timeout -s SIGKILL 30m python -m torch.distributed.launch --nproc_per_node=${GPU_NUM} \ - pretrain_bert_demo.py \ + +cmd_opts=" --use_fp16 \ ${RES_CHECK_FLAG} \ ${NO_RETRY_FLAG} \ @@ -162,7 +162,6 @@ env OMP_NUM_THREADS=${TNUM} timeout -s SIGKILL 30m python -m torch.distributed.l ${CPU_EBD_FLAG} \ ${HYBRID_ADAM_FLAG} \ ${RELEASE_AFTER_INIT_FLAG} \ - --default_chunk_size=${CHUNK_SIZE} \ ${LIGHTSEQ_FLAG} \ ${ACT_OFFLOAD_FLAG} \ ${SP_FLAG} \ @@ -172,4 +171,28 @@ env OMP_NUM_THREADS=${TNUM} timeout -s SIGKILL 30m python -m torch.distributed.l ${CACHE_FLAG} \ ${ASYNC_MOVE_FLAG} \ ${TILING_FLAG} \ +" + +if [[ ${CS_SEARCH} == 1 ]]; then +for((i=64;i<=512;i+=32)); +do +let CUR_CHUNK_SIZE=${i}*1024*1024 +echo "searching ${CUR_CHUNK_SIZE}" +mkdir -p ./search_res +SLOG_FILE="./search_res/slog_file.${MODEL_NAME}_gpu_${GPU_NUM}_bs_${BS}_cpueb_${CPU_EBD}_lightseq_${LIGHTSEQ}_offload_${ACT_OFFLOAD}_SP_${SP}_AMM_${AMM}_MSC_${MSC}_CACHE_${CACHE}_TILING_${TILING}_${GIT_VER}" +rm -rf ${SLOG_FILE} + +python -m torch.distributed.launch --nproc_per_node=${GPU_NUM} \ + best_chunk_size_search.py \ + --default_chunk_size=${CUR_CHUNK_SIZE} \ + --slog_file=${SLOG_FILE} \ + ${cmd_opts} + +done +else +env OMP_NUM_THREADS=${TNUM} timeout -s SIGKILL 30m python -m torch.distributed.launch --nproc_per_node=${GPU_NUM} \ + pretrain_bert_demo.py \ + --default_chunk_size=${CHUNK_SIZE} \ + ${cmd_opts} 2>&1 | tee ${LOG_DIR}/${LOG_FILE} +fi diff --git a/patrickstar/core/client.py b/patrickstar/core/client.py index 034be88e5..2904bfa12 100644 --- a/patrickstar/core/client.py +++ b/patrickstar/core/client.py @@ -272,7 +272,7 @@ def append_tensor( chunk_id, param_list, access_type ): raise RuntimeError( - f"Can not append a tensor to chunk_tensor_index." + f"Can not append a tensor to chunk_tensor_index. " f"Overall size of param list is larger than the default chunk size {self.default_chunk_size}." 
             )
             return
@@ -935,7 +935,8 @@ def get_overall_chunk_size(self):
                 overall_utilization_ratio += last_used_pos / chunk.capacity
                 overall_size += chunk.get_chunk_space()
                 overall_chunk_num += 1
-        return overall_size, overall_utilization_ratio
+        overall_utilization_ratio /= overall_chunk_num
+        return overall_size, overall_utilization_ratio
 
     def display_chunk_info(self):
         logger.info("Print chunk list info.")

From b3c9757c4ea0633fcc28e374dcdcb61b79eda369 Mon Sep 17 00:00:00 2001
From: jiaruifang
Date: Thu, 9 Dec 2021 15:00:54 +0800
Subject: [PATCH 04/19] polish

---
 examples/run_transformers.sh | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/run_transformers.sh b/examples/run_transformers.sh
index fa76cc8c8..ef8a14aeb 100644
--- a/examples/run_transformers.sh
+++ b/examples/run_transformers.sh
@@ -174,20 +174,20 @@ cmd_opts="
 "
 
 if [[ ${CS_SEARCH} == 1 ]]; then
-for((i=64;i<=512;i+=32));
-do
-let CUR_CHUNK_SIZE=${i}*1024*1024
-echo "searching ${CUR_CHUNK_SIZE}"
 mkdir -p ./search_res
 SLOG_FILE="./search_res/slog_file.${MODEL_NAME}_gpu_${GPU_NUM}_bs_${BS}_cpueb_${CPU_EBD}_lightseq_${LIGHTSEQ}_offload_${ACT_OFFLOAD}_SP_${SP}_AMM_${AMM}_MSC_${MSC}_CACHE_${CACHE}_TILING_${TILING}_${GIT_VER}"
 rm -rf ${SLOG_FILE}
+
+for((i=64;i<=512;i+=32));
+do
+let CUR_CHUNK_SIZE=${i}*1024*1024
+echo "searching ${CUR_CHUNK_SIZE}"
 
 python -m torch.distributed.launch --nproc_per_node=${GPU_NUM} \
     best_chunk_size_search.py \
     --default_chunk_size=${CUR_CHUNK_SIZE} \
     --slog_file=${SLOG_FILE} \
     ${cmd_opts}
-
 done
 else
 env OMP_NUM_THREADS=${TNUM} timeout -s SIGKILL 30m python -m torch.distributed.launch --nproc_per_node=${GPU_NUM} \

From 1a64e55695a9f80effe53a7f88c515a681a5c8cb Mon Sep 17 00:00:00 2001
From: jiaruifang
Date: Thu, 9 Dec 2021 15:04:27 +0800
Subject: [PATCH 05/19] update example readme

---
 examples/README.md | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/examples/README.md b/examples/README.md
index f422d8928..49b075280 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -44,3 +44,13 @@ python -m torch.distributed.launch --nproc_per_node=4 huggingface_bert_moe.py
 ```
 
 Note that you need to install [FastMoE](https://github.com/laekov/fastmoe) before running this example.
+
+
+### Search the best chunk size
+
+Chunk size (CS) is an important hyperparameter for PatrickStar.
+Although you can pick a CS value empirically by running your training task several times, we provide a systematic way to find a CS with a smaller memory footprint. Use the following command to search the chunk size:
+
+```
+ env CS_SEARCH=1 bash run_transformers.sh
+```

From 322e891e9e00461d7aaad19cdc69d836803ef9fc Mon Sep 17 00:00:00 2001
From: jiaruifang
Date: Thu, 9 Dec 2021 15:09:33 +0800
Subject: [PATCH 06/19] except dummy chunk in chunk size calculating

---
 patrickstar/core/client.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/patrickstar/core/client.py b/patrickstar/core/client.py
index 2904bfa12..577a2cf09 100644
--- a/patrickstar/core/client.py
+++ b/patrickstar/core/client.py
@@ -914,6 +914,7 @@ def get_overall_chunk_size(self):
         """
         return the overall size of all chunks and
         the overall chunk utilization excluding fragments.
+        The dummy chunk is excluded when mem saving communication (MSC) is used.
""" overall_size = 0 overall_chunk_num = 0 @@ -922,9 +923,12 @@ def get_overall_chunk_size(self): type, type_chunk_list, ) in self.chunk_tensor_index.chunk_type_to_chunk_id_list_map.items(): + logger.info(f"Chunk list {type}") for chunk_id in type_chunk_list: chunk = self.chunk_list[chunk_id] + if self.opt_config["with_mem_saving_comm"] and chunk.is_dummy(): + continue comm_info = self.chunk_tensor_index.chunk_id_to_comm_info_map[chunk_id] assert comm_info is not None last_used_pos = 0 From 678e6ff8149d7457f3d54e48d677d61fd3d4eede Mon Sep 17 00:00:00 2001 From: jiaruifang Date: Thu, 9 Dec 2021 15:52:40 +0800 Subject: [PATCH 07/19] do not init param when chunk size searching --- examples/best_chunk_size_search.py | 12 +++++---- patrickstar/core/preprocess.py | 39 ++++++++++++++++-------------- 2 files changed, 28 insertions(+), 23 deletions(-) diff --git a/examples/best_chunk_size_search.py b/examples/best_chunk_size_search.py index c1a6f5e29..e791ac1c1 100644 --- a/examples/best_chunk_size_search.py +++ b/examples/best_chunk_size_search.py @@ -38,7 +38,7 @@ from patrickstar.core import PatrickStarClient from patrickstar.core import PSPreProcessCtx -from patrickstar.utils.distributed import get_local_world_size +from patrickstar.utils.distributed import get_local_world_size, get_rank from patrickstar.utils.memory import get_memory_info @@ -97,6 +97,7 @@ def get_param_used_chunk_size(args, config, model_func): dtype=torch.float, release_after_init=args.release_after_init, use_cpu_embedding=args.use_cpu_embedding, + not_init=True, ): model = model_func() except Exception: @@ -171,10 +172,11 @@ def evaluate_chunk_size( logger.info(f"{overall_chunk_size}, {utils}\n") logger.info(f"writing to {args.slog_file}\n") - with open(f"{args.slog_file}", "a+") as fh: - fh.write( - f"{args.default_chunk_size/1024/1024} {overall_chunk_size/1024/1024}, {utils}\n" - ) + if get_rank() == 0: + with open(f"{args.slog_file}", "a+") as fh: + fh.write( + f"{args.default_chunk_size/1024/1024} {overall_chunk_size/1024/1024}, {utils}\n" + ) if __name__ == "__main__": diff --git a/patrickstar/core/preprocess.py b/patrickstar/core/preprocess.py index 253ae8541..683e634a3 100644 --- a/patrickstar/core/preprocess.py +++ b/patrickstar/core/preprocess.py @@ -219,6 +219,7 @@ def __init__( release_after_init=False, use_cpu_embedding=False, dtype=None, + not_init=False, ): super().__init__(config=None, dtype=dtype) self.rank = get_rank() @@ -231,6 +232,7 @@ def __init__( self.use_cpu_embedding = use_cpu_embedding self.submodule_id = -1 + self.not_init = not_init def _pre_context_exec(self): Embedding.use_cpu = self.use_cpu_embedding @@ -297,24 +299,25 @@ def _origin_new(cls, *arg, **kwargs): param_fp32_chunk_id ), ): - if is_param_registered(param_fp32) and is_param_registered( - param_fp16 - ): - ps_data_fp16 = self.client.access_data( - param_fp16, torch.device("cpu:0") - ) - - ps_data_fp32 = self.client.access_data( - param_fp32, torch.device("cpu:0") - ) - - # Here the dtype of param_fp16 is actually fp32. - ps_data_fp16.copy_(param_fp16.data) - ps_data_fp32.copy_(param_fp16.data) - - self.client.release_data(param_fp16) - self.client.release_data(param_fp32) - param_fp16 = param_fp16.to(torch.half) + if not self.not_init: + if is_param_registered(param_fp32) and is_param_registered( + param_fp16 + ): + ps_data_fp16 = self.client.access_data( + param_fp16, torch.device("cpu:0") + ) + + ps_data_fp32 = self.client.access_data( + param_fp32, torch.device("cpu:0") + ) + + # Here the dtype of param_fp16 is actually fp32. 
+ ps_data_fp16.copy_(param_fp16.data) + ps_data_fp32.copy_(param_fp16.data) + + self.client.release_data(param_fp16) + self.client.release_data(param_fp32) + param_fp16 = param_fp16.to(torch.half) else: for param_fp16 in self.client.chunk_tensor_index.params_generator( param_fp16_chunk_id From 950655e2bf35349eee7df092183fcc63eeb2e788 Mon Sep 17 00:00:00 2001 From: jiaruifang Date: Thu, 9 Dec 2021 16:42:51 +0800 Subject: [PATCH 08/19] fix a bug --- examples/best_chunk_size_search.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/best_chunk_size_search.py b/examples/best_chunk_size_search.py index e791ac1c1..2d194376e 100644 --- a/examples/best_chunk_size_search.py +++ b/examples/best_chunk_size_search.py @@ -38,7 +38,7 @@ from patrickstar.core import PatrickStarClient from patrickstar.core import PSPreProcessCtx -from patrickstar.utils.distributed import get_local_world_size, get_rank +from patrickstar.utils.distributed import get_local_world_size, get_rank, get_world_size from patrickstar.utils.memory import get_memory_info @@ -74,7 +74,10 @@ def chunk_schema_valid_check(args, config, chunk_size, overall_chunk_size): logger.info("chunk is unable to be fitted in GPU during warmup") return False - if warmup_used_gpu_mem + overall_cpu_mem < overall_chunk_size / 6 * 14: + if ( + warmup_used_gpu_mem + overall_cpu_mem + < overall_chunk_size / get_world_size() / 6 * 14 + ): logger.info("overall chunks is not able to fit in CPU + GPU") return False return True From 2facdb55298a812e804934ae3c71364bb6baaf19 Mon Sep 17 00:00:00 2001 From: jiaruifang Date: Thu, 9 Dec 2021 17:30:58 +0800 Subject: [PATCH 09/19] polish code --- examples/best_chunk_size_search.py | 32 +++++++++++++++++++----------- examples/parse_args.py | 2 +- examples/run_transformers.sh | 2 +- 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/examples/best_chunk_size_search.py b/examples/best_chunk_size_search.py index 2d194376e..da565d2b5 100644 --- a/examples/best_chunk_size_search.py +++ b/examples/best_chunk_size_search.py @@ -65,21 +65,26 @@ def chunk_schema_valid_check(args, config, chunk_size, overall_chunk_size): "warmup_gpu_chunk_mem_ratio", 0.1 ) - logger.info( - f"warmup_used_gpu_mem {warmup_used_gpu_mem}, " - f"overall_cpu_mem {overall_cpu_mem}, " - f"overall_chunk_size {overall_chunk_size}" - ) if warmup_used_gpu_mem < chunk_size * 2: - logger.info("chunk is unable to be fitted in GPU during warmup") + logger.error( + "chunk is unable to be fitted in GPU during warmup!\n" + f"GPU Mem {warmup_used_gpu_mem/1024/1024} MB vs. Chunk {chunk_size * 2/1024/1024} MB" + ) return False - if ( - warmup_used_gpu_mem + overall_cpu_mem - < overall_chunk_size / get_world_size() / 6 * 14 - ): - logger.info("overall chunks is not able to fit in CPU + GPU") + cpu_gpu_mem = warmup_used_gpu_mem + overall_cpu_mem + need_mem = overall_chunk_size / get_world_size() / 6 * 14 + if cpu_gpu_mem < need_mem: + logger.error( + f"overall chunks is not able to fit in CPU + GPU " + f"{cpu_gpu_mem/1024/1024} MB vs. 
{need_mem/1024/1024} MB" + ) return False + print( + f"warmup_used_gpu_mem {warmup_used_gpu_mem / 1024/1024} MB\n" + f"overall_cpu_mem {overall_cpu_mem/ 1024/1024} MB\n" + f"need_mem {need_mem/ 1024/1024} MB\n" + ) return True @@ -104,6 +109,7 @@ def get_param_used_chunk_size(args, config, model_func): ): model = model_func() except Exception: + logger.error("PSPreProcessCtx failed") return -1, -1 del model @@ -115,8 +121,10 @@ def get_param_used_chunk_size(args, config, model_func): args.default_chunk_size, overall_chunk_size, ): + return overall_chunk_size, util else: + logger.error("chunk_schema_valid_check failed") return -1, -1 @@ -191,7 +199,7 @@ def evaluate_chunk_size( # You could set the logger level to INFO to view more runtime # information. - logger.setLevel(logging.INFO) + logger.setLevel(logging.WARNING) if not torch.distributed.is_initialized(): torch.distributed.init_process_group( backend="gloo" if args.use_fake_dist else "nccl" diff --git a/examples/parse_args.py b/examples/parse_args.py index 861c3c6b6..6f8c86513 100644 --- a/examples/parse_args.py +++ b/examples/parse_args.py @@ -193,5 +193,5 @@ def parse_args(): args = parser.parse_args() args.rank = int(os.getenv("RANK", "0")) args.world_size = int(os.getenv("WORLD_SIZE", "1")) - _print_args(args) + # _print_args(args) return args diff --git a/examples/run_transformers.sh b/examples/run_transformers.sh index ef8a14aeb..a0521465c 100644 --- a/examples/run_transformers.sh +++ b/examples/run_transformers.sh @@ -179,7 +179,7 @@ SLOG_FILE="./search_res/slog_file.${MODEL_NAME}_gpu_${GPU_NUM}_bs_${BS}_cpueb_${ rm -rf ${SLOG_FILE} -for((i=64;i<=512;i+=32)); +for((i=512;i>=64;i-=32)); do let CUR_CHUNK_SIZE=${i}*1024*1024 echo "searching ${CUR_CHUNK_SIZE}" From fd02f3be94108761e94de09755eca28593349428 Mon Sep 17 00:00:00 2001 From: jiaruifang Date: Fri, 10 Dec 2021 11:43:44 +0800 Subject: [PATCH 10/19] polish logger for chunk size evaluation --- ...hunk_size_search.py => eval_chunk_size.py} | 139 ++++++++---------- examples/parse_args.py | 3 +- examples/ps_config.py | 4 +- examples/run_transformers.sh | 20 +-- 4 files changed, 70 insertions(+), 96 deletions(-) rename examples/{best_chunk_size_search.py => eval_chunk_size.py} (62%) diff --git a/examples/best_chunk_size_search.py b/examples/eval_chunk_size.py similarity index 62% rename from examples/best_chunk_size_search.py rename to examples/eval_chunk_size.py index da565d2b5..40e80038a 100644 --- a/examples/best_chunk_size_search.py +++ b/examples/eval_chunk_size.py @@ -30,16 +30,32 @@ import logging import torch -from patrickstar.utils import see_memory_usage -from patrickstar.utils.logging import logger + +# from patrickstar.utils.logging import logger from model_builder import build_transformer_model from ps_config import get_patrickstar_config from parse_args import parse_args from patrickstar.core import PatrickStarClient from patrickstar.core import PSPreProcessCtx -from patrickstar.utils.distributed import get_local_world_size, get_rank, get_world_size -from patrickstar.utils.memory import get_memory_info +from patrickstar.utils.distributed import get_rank + +from rich.logging import RichHandler + +logger = logging.getLogger(__name__) +logger.addHandler(RichHandler()) + +MB_NUM = 1024 * 1024 +GB_NUM = 1024 * MB_NUM + +HARDWARE_SETTING_JSON = { + "per_cpu_mem": 16 * GB_NUM, + "per_gpu_mem": 8 * GB_NUM, + "global_gpu_num": 1, + "gloabl_cpu_num": 1, + "local_gpu_num": 1, + "local_cpu_num": 1, +} def chunk_schema_valid_check(args, config, chunk_size, overall_chunk_size): 
@@ -53,37 +69,50 @@ def chunk_schema_valid_check(args, config, chunk_size, overall_chunk_size): returns: bool: is the chunk schema valid """ - mem_info = get_memory_info() - local_world_size = get_local_world_size() - overall_gpu_mem = torch.cuda.get_device_properties( - args.local_rank - ).total_memory * config.get("overall_gpu_mem_ratio", 0.8) - overall_cpu_mem = ( - mem_info.total * config.get("overall_cpu_mem_ratio", 0.8) / local_world_size - ) - warmup_used_gpu_mem = overall_gpu_mem * config.get( - "warmup_gpu_chunk_mem_ratio", 0.1 + per_gpu_mem = HARDWARE_SETTING_JSON.get("per_gpu_mem") + per_cpu_mem = HARDWARE_SETTING_JSON.get("per_cpu_mem") + global_gpu_num = HARDWARE_SETTING_JSON.get("global_gpu_num") + global_cpu_num = HARDWARE_SETTING_JSON.get("gloabl_cpu_num") + ava_per_gpu_mem = ( + per_gpu_mem + * config.get("overall_gpu_mem_ratio", 0.8) + * config.get("warmup_gpu_chunk_mem_ratio", 0.1) ) - if warmup_used_gpu_mem < chunk_size * 2: + ava_per_cpu_mem = per_cpu_mem * config.get("overall_cpu_mem_ratio", 0.8) + + # GPU mem has to host at least two chunks. + if ava_per_gpu_mem < chunk_size * 2: logger.error( "chunk is unable to be fitted in GPU during warmup!\n" - f"GPU Mem {warmup_used_gpu_mem/1024/1024} MB vs. Chunk {chunk_size * 2/1024/1024} MB" + "GPU Mem %.2f MB vs. Two Chunks %.2f MB", + ava_per_gpu_mem / MB_NUM, + chunk_size * 2 / MB_NUM, ) return False - cpu_gpu_mem = warmup_used_gpu_mem + overall_cpu_mem - need_mem = overall_chunk_size / get_world_size() / 6 * 14 - if cpu_gpu_mem < need_mem: + # CPU + GPU memory must be able to hold about 14M bytes of chunks (M = param numel). + overall_cpu_gpu_mem = ( + ava_per_gpu_mem * global_gpu_num + ava_per_cpu_mem * global_cpu_num + ) + need_mem = overall_chunk_size / 6 * 14 + if overall_cpu_gpu_mem < need_mem: logger.error( - f"overall chunks is not able to fit in CPU + GPU " - f"{cpu_gpu_mem/1024/1024} MB vs. {need_mem/1024/1024} MB" + "Overall chunks can't fit in memory of CPU+GPU " "%.2f MB vs. %.2f MB", + overall_cpu_gpu_mem / MB_NUM, + need_mem / MB_NUM, ) return False - print( - f"warmup_used_gpu_mem {warmup_used_gpu_mem / 1024/1024} MB\n" - f"overall_cpu_mem {overall_cpu_mem/ 1024/1024} MB\n" - f"need_mem {need_mem/ 1024/1024} MB\n" + + logger.info( + "Evaluated chunk size %d Melem, " + "ava_per_gpu_mem %.2f MB, " + "ava_per_cpu_mem %.2f MB, " + "need_mem %.2f MB\n", + args.default_chunk_size / MB_NUM, + ava_per_gpu_mem / MB_NUM, + ava_per_cpu_mem / MB_NUM, + need_mem / MB_NUM, ) return True @@ -124,36 +153,18 @@ def get_param_used_chunk_size(args, config, model_func): return overall_chunk_size, util else: - logger.error("chunk_schema_valid_check failed") + logger.error("Chunk schema validation check failed!") return -1, -1 -def evaluate_chunk_size( - args, - is_ckp: bool = False, - is_fp16: bool = False, - dist_plan: str = "torch", - num_steps=5, -): +def evaluate_chunk_size(args): """ Evaluate the current training task defined by the args. write the chunk memory usage to the file. """ - logger.info( - f'test a bert {"fp16" if is_fp16 else "fp32"} model ' - f'{"with checkpoint" if is_ckp else ""}' - ) - - # Use single card to simulate multicard. Used when you are poor and - no more GPU avaiable. - if args.use_fake_dist: - rank = 0 - else: - rank = args.local_rank - # Avoid gpu0 use more memory.
# https://discuss.pytorch.org/t/extra-10gb-memory-on-gpu-0-in-ddp-tutorial/118113 - torch.cuda.set_device(rank) + torch.cuda.set_device(args.local_rank) torch.cuda.empty_cache() lr = 0.001 @@ -166,21 +177,11 @@ def evaluate_chunk_size( args, lr=lr, betas=betas, eps=eps, weight_decay=weight_decay ) - see_memory_usage( - f"before get_param_used_chunk_size for {args.default_chunk_size/1024/1024} MB", - True, - "MB", - ) - overall_chunk_size, utils = get_param_used_chunk_size(args, config, model_func) - see_memory_usage( - f"after get_param_used_chunk_size for {args.default_chunk_size/1024/1024} MB", - True, - "MB", + logger.info( + "chunk uses %.2f MB, utilization %.2f \n", overall_chunk_size / MB_NUM, utils ) - - logger.info(f"{overall_chunk_size}, {utils}\n") logger.info(f"writing to {args.slog_file}\n") if get_rank() == 0: @@ -192,24 +193,6 @@ def evaluate_chunk_size( if __name__ == "__main__": args = parse_args() - use_ckp = args.use_ckp - use_fp16 = args.use_fp16 - dist_plan = args.dist_plan - res_check = args.res_check - - # You could set the logger level to INFO to view more runtime - # information. - logger.setLevel(logging.WARNING) - if not torch.distributed.is_initialized(): - torch.distributed.init_process_group( - backend="gloo" if args.use_fake_dist else "nccl" - ) - + logger.setLevel(logging.INFO) torch.manual_seed(0) - loss_list = evaluate_chunk_size( - args=args, - is_ckp=use_ckp, - is_fp16=use_fp16, - dist_plan=dist_plan, - num_steps=5, - ) + evaluate_chunk_size(args=args) diff --git a/examples/parse_args.py b/examples/parse_args.py index 6f8c86513..7921588c7 100644 --- a/examples/parse_args.py +++ b/examples/parse_args.py @@ -145,7 +145,6 @@ def _add_test_config_args(parser): default=None, help="local rank passed from distributed launcher.", ) - group.add_argument( "--res_check", dest="res_check", @@ -193,5 +192,5 @@ def parse_args(): args = parser.parse_args() args.rank = int(os.getenv("RANK", "0")) args.world_size = int(os.getenv("WORLD_SIZE", "1")) - # _print_args(args) + _print_args(args) return args diff --git a/examples/ps_config.py b/examples/ps_config.py index f12a97340..0e5aeb366 100644 --- a/examples/ps_config.py +++ b/examples/ps_config.py @@ -61,8 +61,8 @@ def get_patrickstar_config( "mem_tracer": { "use_async_mem_monitor": args.with_async_mem_monitor, "warmup_gpu_chunk_mem_ratio": 0.1, - "overall_gpu_mem_ratio": 0.8, - "overall_cpu_mem_ratio": 0.8, + "overall_gpu_mem_ratio": 0.9, + "overall_cpu_mem_ratio": 0.9, "margin_use_ratio": 0.8, "use_fake_dist": False, "with_static_partition": args.with_static_partition, diff --git a/examples/run_transformers.sh b/examples/run_transformers.sh index a0521465c..37660ded4 100644 --- a/examples/run_transformers.sh +++ b/examples/run_transformers.sh @@ -102,14 +102,6 @@ else export RELEASE_AFTER_INIT_FLAG="" fi -export LIGHTSEQ=0 -if [[ ${LIGHTSEQ} == 1 ]]; then -export LIGHTSEQ_FLAG="--with_lightseq" -else -export LIGHTSEQ_FLAG="" -fi - - if [[ ${CKP} == 1 ]]; then export CKP_FLAG="--use_ckp" else @@ -175,16 +167,16 @@ cmd_opts=" if [[ ${CS_SEARCH} == 1 ]]; then mkdir -p ./search_res -SLOG_FILE="./search_res/slog_file.${MODEL_NAME}_gpu_${GPU_NUM}_bs_${BS}_cpueb_${CPU_EBD}_lightseq_${LIGHTSEQ}_offload_${ACT_OFFLOAD}_SP_${SP}_AMM_${AMM}_MSC_${MSC}_CACHE_${CACHE}_TILING_${TILING}_${GIT_VER}" +SLOG_FILE="./search_res/slog_file.${MODEL_NAME}_bs_${BS}_cpueb_${CPU_EBD}_lightseq_${LIGHTSEQ}_offload_${ACT_OFFLOAD}_SP_${SP}_AMM_${AMM}_MSC_${MSC}_CACHE_${CACHE}_TILING_${TILING}_${GIT_VER}" rm -rf ${SLOG_FILE} - 
-for((i=512;i>=64;i-=32)); +for((i=312;i>=64;i-=32)); do let CUR_CHUNK_SIZE=${i}*1024*1024 -echo "searching ${CUR_CHUNK_SIZE}" -python -m torch.distributed.launch --nproc_per_node=${GPU_NUM} \ - best_chunk_size_search.py \ +echo "searching CHUNK_SIZE ${i} M elem" + +python -m torch.distributed.launch --nproc_per_node=1 \ + eval_chunk_size.py \ --default_chunk_size=${CUR_CHUNK_SIZE} \ --slog_file=${SLOG_FILE} \ ${cmd_opts} From a7b258e4e3030a38aded00fe4fb14c198b66e87a Mon Sep 17 00:00:00 2001 From: jiaruifang Date: Fri, 10 Dec 2021 12:01:18 +0800 Subject: [PATCH 11/19] use rich logger --- examples/pretrain_bert_demo.py | 2 +- patrickstar/core/client.py | 12 ++++++------ patrickstar/core/memtracer/memtracer.py | 3 ++- patrickstar/ops/chunk_io_buff.py | 6 +++--- patrickstar/ops/fp16_cpu_adam.py | 2 +- patrickstar/utils/global_timer.py | 4 ++-- patrickstar/utils/logging.py | 3 ++- requirements.txt | 1 + 8 files changed, 18 insertions(+), 15 deletions(-) diff --git a/examples/pretrain_bert_demo.py b/examples/pretrain_bert_demo.py index 35a7f703e..aa492a63c 100644 --- a/examples/pretrain_bert_demo.py +++ b/examples/pretrain_bert_demo.py @@ -223,7 +223,7 @@ def test_transformer_model_helper( # You could set the logger level to INFO to view more runtime # information. - logger.setLevel(logging.WARNING) + logger.setLevel(logging.INFO) if not torch.distributed.is_initialized(): torch.distributed.init_process_group( backend="gloo" if args.use_fake_dist else "nccl" diff --git a/patrickstar/core/client.py b/patrickstar/core/client.py index 577a2cf09..520bf5efd 100644 --- a/patrickstar/core/client.py +++ b/patrickstar/core/client.py @@ -464,7 +464,7 @@ def _fetch_remote_chunks( "CLIENT_fetch_remote_chunks_allgather" ) - logger.info(f"rank {rank} allgather {chunk_id_list}") + logger.debug(f"rank {rank} allgather {chunk_id_list}") torch.distributed.all_gather( allgather_payload_buff, self.chunk_list[local_chunk_id].payload, @@ -924,7 +924,7 @@ def get_overall_chunk_size(self): type_chunk_list, ) in self.chunk_tensor_index.chunk_type_to_chunk_id_list_map.items(): - logger.info(f"Chunk list {type}") + logger.debug(f"Chunk list {type}") for chunk_id in type_chunk_list: chunk = self.chunk_list[chunk_id] if self.opt_config["with_mem_saving_comm"] and chunk.is_dummy(): @@ -943,7 +943,7 @@ def get_overall_chunk_size(self): return overall_size, overall_utilization_ratio def display_chunk_info(self): - logger.info("Print chunk list info.") + logger.debug("Print chunk list info.") overall_size = 0 overall_chunk_num = 0 @@ -952,13 +952,13 @@ def display_chunk_info(self): type, type_chunk_list, ) in self.chunk_tensor_index.chunk_type_to_chunk_id_list_map.items(): - logger.info(f"Chunk list {type}") + logger.debug(f"Chunk list {type}") for chunk_id in type_chunk_list: chunk = self.chunk_list[chunk_id] comm_info = self.chunk_tensor_index.chunk_id_to_comm_info_map[chunk_id] assert comm_info is not None - logger.info( + logger.debug( f"Chunk id {chunk.chunk_id}, state {chunk.get_state()}, " f"comm info {comm_info}, " f"capacity {chunk.capacity / 1024 / 1024} M elems, " @@ -975,7 +975,7 @@ def display_chunk_info(self): f"tensor_id {info.tensor_id}, state {info.state()}, name {info.tensor_name}" ) last_used_pos = max(last_used_pos, info.start_offset + info.numel) - logger.info( + logger.debug( f"chunk used {last_used_pos/1024/1024} M elem, " f"{last_used_pos/chunk.capacity * 100} %" ) diff --git a/patrickstar/core/memtracer/memtracer.py b/patrickstar/core/memtracer/memtracer.py index 2d67bacd5..03cf10ceb 100644 --- 
a/patrickstar/core/memtracer/memtracer.py +++ b/patrickstar/core/memtracer/memtracer.py @@ -208,7 +208,8 @@ def update_margin_mem(self): f"Margin Mem Size {margin_mem_size / 1e6} MB, " f"available chunk num for Optimizer States {self._margin_chunk_num_for_gpu_adam}" ) - logger.info(f"OVERALL GPU MEM {self._overall_gpu_mem}") + logger.info("--------------- GPU INFO AFTER BWD ----------------") + logger.debug(f"OVERALL GPU MEM {self._overall_gpu_mem/1024/1024} MB") def reset_memory_stats(self): """ diff --git a/patrickstar/ops/chunk_io_buff.py b/patrickstar/ops/chunk_io_buff.py index de8e37bfb..9218f0333 100644 --- a/patrickstar/ops/chunk_io_buff.py +++ b/patrickstar/ops/chunk_io_buff.py @@ -135,7 +135,7 @@ def reset(self): if self.cached_src_chunk_id is None: return global_rank = get_rank() - logger.info( + logger.debug( f"global_rank {global_rank} finally, write chunk {self.cached_target_chunk_id}" ) # It's possible that the chunk is empty (no payload), e.g. the process only possesses @@ -197,13 +197,13 @@ def __init__( gpu_device, chunk_size, torch.half, False ) else: - logger.info( + logger.debug( f"Allocate fp32 Chunk Buffer of size {chunk_size / 1e6} MB on CPU." ) self.gpu_payload = torch.empty( chunk_size, dtype=torch.half, device=gpu_device ) - logger.info( + logger.debug( f"Allocate fp32 Chunk Buffer of size {chunk_size / 1e6} MB on {gpu_device}." ) self.cached_chunk_id = None diff --git a/patrickstar/ops/fp16_cpu_adam.py b/patrickstar/ops/fp16_cpu_adam.py index 11610b374..111ce44f5 100644 --- a/patrickstar/ops/fp16_cpu_adam.py +++ b/patrickstar/ops/fp16_cpu_adam.py @@ -329,7 +329,7 @@ def fp16_chunk_adam_ops( Copy the chunk into a tmp buffer to speed up the memcpy between devices. """ local_rank = client.local_rank - logger.info( + logger.debug( f"local_rank {local_rank} margin_chunk_num_for_gpu_adam {margin_chunk_num_for_gpu_adam}, " f"param cnt {len(fp32_param_list)}" ) diff --git a/patrickstar/utils/global_timer.py b/patrickstar/utils/global_timer.py index 240484840..74fd47ef4 100644 --- a/patrickstar/utils/global_timer.py +++ b/patrickstar/utils/global_timer.py @@ -29,7 +29,7 @@ import time -from .logging import logger +# from .logging import logger from .singleton_meta import SingletonMeta @@ -72,7 +72,7 @@ def reset(self): def print(self): if not self.start_flag: return - logger.info("------------- PROFILE RESULTS ----------------") + print("------------- PROFILE RESULTS ----------------") dot_length = 20 for k in self.elapse_stat: dot_length = max(dot_length, len(k) + 2) diff --git a/patrickstar/utils/logging.py b/patrickstar/utils/logging.py index bdc8a23f2..80da21027 100644 --- a/patrickstar/utils/logging.py +++ b/patrickstar/utils/logging.py @@ -28,6 +28,7 @@ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import logging import sys +from rich.logging import RichHandler import torch.distributed as dist @@ -57,7 +58,7 @@ def create_logger(name=None, level=logging.WARNING): logger_.propagate = False ch = logging.StreamHandler(stream=sys.stdout) ch.setFormatter(formatter) - logger_.addHandler(ch) + logger_.addHandler(RichHandler()) return logger_ diff --git a/requirements.txt b/requirements.txt index 377edef87..169c74355 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,4 @@ torch>=1.5.0 pytest psutil ninja +rich From e309345c67ebb0ae10aa1ae6c10fee649fb65c97 Mon Sep 17 00:00:00 2001 From: jiaruifang Date: Fri, 10 Dec 2021 14:48:42 +0800 Subject: [PATCH 12/19] polish dist log --- examples/eval_chunk_size.py | 18 ++++++--------- examples/run_transformers.sh | 4 ++-- patrickstar/core/client.py | 10 ++++----- patrickstar/core/memtracer/memtracer.py | 29 +++++++++++++++++-------- patrickstar/core/preprocess.py | 8 +++---- patrickstar/runtime/engine.py | 6 ++--- patrickstar/utils/logging.py | 2 +- 7 files changed, 41 insertions(+), 36 deletions(-) diff --git a/examples/eval_chunk_size.py b/examples/eval_chunk_size.py index 40e80038a..9b74da0bb 100644 --- a/examples/eval_chunk_size.py +++ b/examples/eval_chunk_size.py @@ -31,26 +31,21 @@ import logging import torch -# from patrickstar.utils.logging import logger +from patrickstar.utils.logging import logger, log_dist from model_builder import build_transformer_model from ps_config import get_patrickstar_config from parse_args import parse_args from patrickstar.core import PatrickStarClient from patrickstar.core import PSPreProcessCtx - +import time from patrickstar.utils.distributed import get_rank -from rich.logging import RichHandler - -logger = logging.getLogger(__name__) -logger.addHandler(RichHandler()) - MB_NUM = 1024 * 1024 GB_NUM = 1024 * MB_NUM HARDWARE_SETTING_JSON = { - "per_cpu_mem": 16 * GB_NUM, - "per_gpu_mem": 8 * GB_NUM, + "per_cpu_mem": 240 * GB_NUM, + "per_gpu_mem": 32 * GB_NUM, "global_gpu_num": 1, "gloabl_cpu_num": 1, "local_gpu_num": 1, @@ -127,7 +122,7 @@ def get_param_used_chunk_size(args, config, model_func): default_chunk_size=args.default_chunk_size, config=config.get("client", None), ) - + start_time = time.time() try: with PSPreProcessCtx( client=client, @@ -140,7 +135,8 @@ def get_param_used_chunk_size(args, config, model_func): except Exception: logger.error("PSPreProcessCtx failed") return -1, -1 - + end_time = time.time() + log_dist(f"PSPreProcessCtx Model Constructing elapse {end_time - start_time}") del model overall_chunk_size, util = client.get_overall_chunk_size() diff --git a/examples/run_transformers.sh b/examples/run_transformers.sh index 37660ded4..03dce1ec3 100644 --- a/examples/run_transformers.sh +++ b/examples/run_transformers.sh @@ -115,7 +115,7 @@ LOG_DIR="./logs_${MODEL_NAME}" mkdir -p ${LOG_DIR} GIT_VER=`git rev-parse --short=5 HEAD` -LOG_FILE="log.${MODEL_NAME}_gpu_${GPU_NUM}_cs_${CS}_bs_${BS}_cpueb_${CPU_EBD}_lightseq_${LIGHTSEQ}_offload_${ACT_OFFLOAD}_SP_${SP}_AMM_${AMM}_MSC_${MSC}_CACHE_${CACHE}_TILING_${TILING}_${GIT_VER}" +LOG_FILE="log.${MODEL_NAME}_gpu_${GPU_NUM}_cs_${CS}_bs_${BS}_cpueb_${CPU_EBD}_offload_${ACT_OFFLOAD}_SP_${SP}_AMM_${AMM}_MSC_${MSC}_CACHE_${CACHE}_TILING_${TILING}_${GIT_VER}" is_run_flag=`python ./benchmark/is_run_this_file.py --path "${LOG_DIR}" --file "${LOG_FILE}"` echo is_run_flag $is_run_flag @@ -167,7 +167,7 @@ cmd_opts=" if [[ ${CS_SEARCH} == 1 ]]; then mkdir -p ./search_res 
-SLOG_FILE="./search_res/slog_file.${MODEL_NAME}_bs_${BS}_cpueb_${CPU_EBD}_lightseq_${LIGHTSEQ}_offload_${ACT_OFFLOAD}_SP_${SP}_AMM_${AMM}_MSC_${MSC}_CACHE_${CACHE}_TILING_${TILING}_${GIT_VER}" +SLOG_FILE="./search_res/slog_file.${MODEL_NAME}_bs_${BS}_cpueb_${CPU_EBD}_offload_${ACT_OFFLOAD}_SP_${SP}_AMM_${AMM}_MSC_${MSC}_CACHE_${CACHE}_TILING_${TILING}_${GIT_VER}" rm -rf ${SLOG_FILE} for((i=312;i>=64;i-=32)); diff --git a/patrickstar/core/client.py b/patrickstar/core/client.py index 520bf5efd..8360d76c1 100644 --- a/patrickstar/core/client.py +++ b/patrickstar/core/client.py @@ -31,7 +31,7 @@ import torch import patrickstar.utils.global_timer as global_timer -from patrickstar.utils import logger, get_world_size, get_rank +from patrickstar.utils import logger, get_world_size, get_rank, log_dist from .chunk_list import ChunkList, ChunkType from .chunk_tensor_index import ChunkTensorIndex from .const import AccessType, ChunkState, TensorState, TrainingStage @@ -91,7 +91,7 @@ def __init__(self, rank: int, default_chunk_size: int, config=None): self.opt_config["with_async_move"], ) if self.opt_config["with_mem_cache"]: - print("[CONFIG] USING MEM CACHE") + logger.debug("[CONFIG] USING MEM CACHE") self._time_profile = True if torch.distributed.is_initialized(): @@ -229,7 +229,7 @@ def append_dummy_chunk(self, data_type: torch.dtype, chunk_type: ChunkType): AccessType.DATA, ) - logger.info( + log_dist( f"Append a dummy chunk to the Chunk List {chunk_type} " f"comm info {comm_info}" ) @@ -983,7 +983,7 @@ def display_chunk_info(self): overall_size += chunk.get_chunk_space() overall_chunk_num += 1 - logger.info(f"OVERALL CHUNK SIZE {overall_size / 1024 / 1024 / 1024} GB") - logger.info( + log_dist(f"OVERALL CHUNK SIZE {overall_size / 1024 / 1024 / 1024} GB") + log_dist( f"OVERALL UTILIZATION {overall_utilization_ratio / overall_chunk_num} %" ) diff --git a/patrickstar/core/memtracer/memtracer.py b/patrickstar/core/memtracer/memtracer.py index 03cf10ceb..9f5e0c920 100644 --- a/patrickstar/core/memtracer/memtracer.py +++ b/patrickstar/core/memtracer/memtracer.py @@ -34,6 +34,7 @@ from patrickstar.core.const import TrainingStage from patrickstar.profiler import profiler from patrickstar.utils import ( + log_dist, get_memory_info, get_sys_memory_used, get_world_size, @@ -150,7 +151,7 @@ def __init__(self, local_rank: int = 0, config=None): mem_info.total * self._overall_cpu_mem_ratio / local_world_size ) - logger.info( + log_dist( f"Init Manager over all gpu mem {self._overall_gpu_mem / 1e6} MB, " f"cpu mem {self._overall_cpu_mem / 1e6} MB" ) @@ -175,14 +176,14 @@ def close_tracer(self): """ if self.use_async_mem_monitor: self.async_mem_monitor.finish() - print("**** Memory Tracer is closed! ****") + log_dist("**** Memory Tracer is closed! ****") def start_train(self, param_fp16_chunk_size, chunk_size): self._param_fp16_chunk_size = param_fp16_chunk_size self._default_chunk_size = chunk_size if self.use_async_mem_monitor: self.async_mem_monitor.start() - print("**** Memory Tracer is stared! ****") + log_dist("**** Memory Tracer is started! ****") def update_margin_mem(self): r"""Update the number of GPU free chunks for optimizer.""" @@ -193,6 +194,15 @@ def update_margin_mem(self): max_gpu_sys_used = 0 else: max_gpu_sys_used = max(self.gpu_sys_used_list) + + if len(self.cpu_sys_used_list) == 0: + logger.warning( + "No cpu info collected. Maybe there are no chunk based tensors."
+ ) + max_cpu_sys_used = 0 + else: + max_cpu_sys_used = max(self.cpu_sys_used_list) + margin_mem_size = ( self._overall_gpu_mem - max_gpu_sys_used - self._param_fp16_chunk_size ) @@ -201,14 +211,15 @@ def update_margin_mem(self): (margin_mem_size) / (self._default_chunk_size * 12) * self._margin_use_ratio ) - logger.info("--------------- GPU INFO AFTER BWD ----------------") - logger.info(f"Max GPU System Mem (non-chunk) Used {max_gpu_sys_used / 1e6} MB") - logger.info(f"Param FP16 Chunk Size {self._param_fp16_chunk_size / 1e6} MB") - logger.info( + log_dist("--------------- GPU INFO AFTER BWD ----------------") + log_dist(f"Max GPU System Mem (non-chunk) Used {max_gpu_sys_used / 1e6} MB") + log_dist(f"Max CPU System Mem (non-chunk) Used {max_cpu_sys_used / 1e6} MB") + log_dist(f"Param FP16 Chunk Size {self._param_fp16_chunk_size / 1e6} MB") + log_dist( f"Margin Mem Size {margin_mem_size / 1e6} MB, " f"available chunk num for Optimizer States {self._margin_chunk_num_for_gpu_adam}" ) - logger.info("--------------- GPU INFO AFTER BWD ----------------") + log_dist("--------------- GPU INFO AFTER BWD ----------------") logger.debug(f"OVERALL GPU MEM {self._overall_gpu_mem/1024/1024} MB") def reset_memory_stats(self): @@ -229,7 +240,7 @@ def reset_memory_stats(self): self.gpu_used_list = [] self.gpu_chunk_used_list = [] self.gpu_sys_used_list = [] - logger.info("Reset Memory Statistics") + log_dist("Reset Memory Statistics") def get_margin_chunk_num_for_gpu_adam(self): return self._margin_chunk_num_for_gpu_adam diff --git a/patrickstar/core/preprocess.py b/patrickstar/core/preprocess.py index 683e634a3..06885ed18 100644 --- a/patrickstar/core/preprocess.py +++ b/patrickstar/core/preprocess.py @@ -35,7 +35,7 @@ from patrickstar.core import register_param, is_param_registered, ParamType from patrickstar.manager import _runtime_config from patrickstar.ops import Embedding -from patrickstar.utils import logger, print_rank, get_rank, get_world_size +from patrickstar.utils import logger, log_dist, print_rank, get_rank, get_world_size from patrickstar.utils import see_memory_usage _orig_torch_empty = torch.empty @@ -251,7 +251,7 @@ def _post_context_exec(self): number of processes. 3. Add a dummy param at the start of CPU Embedding for huggingface. 
""" - logger.info("Post Model Init Context") + log_dist("Post Model Init Context") def _origin_new(cls, *arg, **kwargs): return object.__new__(cls) @@ -333,7 +333,7 @@ def _origin_new(cls, *arg, **kwargs): chunk_num += 1 world_size = get_world_size() - logger.info(f"param fp16 chunk num {chunk_num}") + log_dist(f"Param fp16 chunk num {chunk_num}") while chunk_num % world_size != 0: self.client.append_dummy_chunk(torch.half, ChunkType.PARAM_FP16) chunk_num += 1 @@ -363,8 +363,6 @@ def _post_init_method(self, module): self.client.torch_param_allreduce_list.append(param) return - print_rank(f"Converting Params in {module.__class__.__name__}", force=False) - if not _runtime_config.use_chunk: for name, param in module.named_parameters(recurse=False): name = f"{module.__class__.__name__}.{name}_{self.param_idx}" diff --git a/patrickstar/runtime/engine.py b/patrickstar/runtime/engine.py index 7d97689a9..d235daafc 100644 --- a/patrickstar/runtime/engine.py +++ b/patrickstar/runtime/engine.py @@ -32,7 +32,7 @@ from patrickstar.core import ChunkState, TensorState, TrainingStage, ParamType from patrickstar.fp16 import LossScaler, DynamicLossScaler from patrickstar.ops import FP16Adam -from patrickstar.utils import logger, global_timer +from patrickstar.utils import log_dist, global_timer from .checkpoint import state_dict, load_state_dict from patrickstar.profiler import profiler @@ -86,7 +86,7 @@ def __init__(self, model, client, config): ), "Must have `loss_scale` field set." loss_scale = loss_scale_config["loss_scale"] if loss_scale == 0: - logger.info("Use DynamicLossScaler") + log_dist("Use DynamicLossScaler") self.loss_scaler = DynamicLossScaler( init_scale=( 2 ** loss_scale_config.get("initial_scale_power", 16) @@ -129,7 +129,7 @@ def __init__(self, model, client, config): self.iteration_cnt_ = 0 # TODO(jiaruifang) pass in via config. 
self.warmup_times = 1 - logger.info("PatrickStarEngine initialized.") + log_dist("PatrickStarEngine initialized.") def _move_torch_parts_to_gpu(self, model): # TODO(zilinzhu) Currently we move all buffers to GPU as the buffer size is diff --git a/patrickstar/utils/logging.py b/patrickstar/utils/logging.py index 80da21027..1dea2e72c 100644 --- a/patrickstar/utils/logging.py +++ b/patrickstar/utils/logging.py @@ -65,7 +65,7 @@ def create_logger(name=None, level=logging.WARNING): logger = LoggerFactory.create_logger(name="PatrickStar", level=logging.WARNING) -def log_dist(message, ranks=None, level=logging.INFO): +def log_dist(message, ranks=[0], level=logging.INFO): """Log message when one of following condition meets + not dist.is_initialized() + dist.get_rank() in ranks if ranks is not None or ranks = [-1] From a870eee85ffd3df78b129a7f6f21ddbf971f03e9 Mon Sep 17 00:00:00 2001 From: jiaruifang Date: Fri, 10 Dec 2021 15:05:19 +0800 Subject: [PATCH 13/19] polish dist log in pretrained_bert --- examples/pretrain_bert_demo.py | 15 +++++---------- patrickstar/core/memtracer/memtracer.py | 4 ---- 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/examples/pretrain_bert_demo.py b/examples/pretrain_bert_demo.py index aa492a63c..ca18a933c 100644 --- a/examples/pretrain_bert_demo.py +++ b/examples/pretrain_bert_demo.py @@ -39,7 +39,7 @@ from patrickstar.profiler import profiler from patrickstar.runtime import initialize_engine from patrickstar.utils import see_memory_usage -from patrickstar.utils.logging import logger +from patrickstar.utils.logging import log_dist, logger from patrickstar.utils.model_size_calculator import get_ps_model_size from model_builder import build_transformer_model from parse_args import parse_args @@ -53,11 +53,6 @@ def test_transformer_model_helper( dist_plan: str = "torch", num_steps=5, ): - logger.info( - f'test a bert {"fp16" if is_fp16 else "fp32"} model ' - f'{"with checkpoint" if is_ckp else ""}' - ) - # Use single card to simulate multicard. Used when you are poor and # no more GPU avaiable. if args.use_fake_dist: @@ -117,9 +112,9 @@ def test_transformer_model_helper( model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[rank]) model_numel, model_num_param = get_ps_model_size(model) - logger.info(f"Model size {model_numel / 1e9} B, total params: {model_num_param}") + log_dist(f"Model size {model_numel / 1e9} B, total params: {model_num_param}") total_macs = model_numel * args.batch_size * sequence_length * 2 * 4 - logger.info(f"Total MACs: {total_macs/1024/1024/1024/1024} TFlops") + log_dist(f"Total MACs: {total_macs/1024/1024/1024/1024} TFlops") see_memory_usage( f"After model init. using {dist_plan}, gradient checkpoint: {is_ckp}, fp16 {is_fp16}", @@ -145,7 +140,7 @@ def test_transformer_model_helper( break # You may need to empty_cache for really large models. torch.cuda.empty_cache() - logger.info(f"Start Step {n} with {dist_plan}...") + log_dist(f"Start Step {n} with {dist_plan}...") step_start_time = time.time() # Only collect running time of the last iteration. 
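The `ranks=[0]` default introduced for `log_dist` above makes distributed logging print only on rank 0 unless callers ask for other ranks. A minimal sketch of the gating described by its docstring, assuming the documented conditions (this is an illustration, not the library's actual implementation):

```python
import logging

import torch.distributed as dist

def log_dist_sketch(message, ranks=(0,), level=logging.INFO):
    """Log `message` only on the listed ranks; log everywhere when ranks is
    None or contains -1, or when torch.distributed is not initialized."""
    logger = logging.getLogger("PatrickStar")
    if not dist.is_initialized():
        logger.log(level, message)
        return
    if ranks is None or -1 in ranks or dist.get_rank() in ranks:
        logger.log(level, message)
```

An immutable default such as `(0,)` also sidesteps the mutable-default-argument pitfall that a list default carries, although the list is never modified in this usage.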
@@ -201,7 +196,7 @@ def test_transformer_model_helper( f"Step {n} elaspe {step_elapse} s, {total_macs / 1e12 / step_elapse} Tflops" ) - logger.info(f"End Step {n} with {dist_plan}.\n") + log_dist(f"End Step {n} with {dist_plan}.\n") if args.with_mem_profiler: profiler.end() diff --git a/patrickstar/core/memtracer/memtracer.py b/patrickstar/core/memtracer/memtracer.py index 9f5e0c920..cee477362 100644 --- a/patrickstar/core/memtracer/memtracer.py +++ b/patrickstar/core/memtracer/memtracer.py @@ -126,10 +126,6 @@ def __init__(self, local_rank: int = 0, config=None): if self.use_async_mem_monitor: self.async_mem_monitor = AsyncMemoryMonitor() - print( - f"[Mem Tracer] Using Asyn Mem Monitor Flag : {self.use_async_mem_monitor}" - ) - mem_info = get_memory_info() local_world_size = get_local_world_size() if self.use_fake_dist: From e309345c67ebb0ae10aa1ae6c10fee649fb65c97 Mon Sep 17 00:00:00 2001 From: jiaruifang Date: Fri, 10 Dec 2021 14:48:42 +0800 Subject: [PATCH 14/19] polish log --- patrickstar/core/client.py | 2 +- patrickstar/runtime/__init__.py | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/patrickstar/core/client.py b/patrickstar/core/client.py index 8360d76c1..cfffd4e28 100644 --- a/patrickstar/core/client.py +++ b/patrickstar/core/client.py @@ -229,7 +229,7 @@ def append_dummy_chunk(self, data_type: torch.dtype, chunk_type: ChunkType): AccessType.DATA, ) - log_dist( + logger.debug( f"Append a dummy chunk to the Chunk List {chunk_type} " f"comm info {comm_info}" ) diff --git a/patrickstar/runtime/__init__.py b/patrickstar/runtime/__init__.py index 469fbdb29..030ed31bb 100644 --- a/patrickstar/runtime/__init__.py +++ b/patrickstar/runtime/__init__.py @@ -30,8 +30,9 @@ import torch from patrickstar.core import PSPreProcessCtx, PatrickStarClient from patrickstar.core.memtracer import RuntimeMemTracer -from patrickstar.utils import logger +from patrickstar.utils import logger, log_dist from .engine import PatrickStarEngine +import time DEFAULT_CHUNK_SIZE = 32 * 1024 * 1024 @@ -73,6 +74,8 @@ def initialize_engine(model_func, local_rank, config=None, client=None): config=config.get("client", None), ) + start_time = time.time() + log_dist("begin initializing the model parameters...") with PSPreProcessCtx( client=client, dtype=torch.float, @@ -80,6 +83,10 @@ def initialize_engine(model_func, local_rank, config=None, client=None): use_cpu_embedding=use_cpu_embedding, ): model = model_func() + end_time = time.time() + log_dist( + f"finished initializing the model parameters...
{end_time - start_time} s" + ) engine = PatrickStarEngine(model=model, client=client, config=config) client.start_mem_tracer() From 654ab579784b641695ddbf441d775131f8848f2d Mon Sep 17 00:00:00 2001 From: jiaruifang Date: Fri, 10 Dec 2021 15:14:54 +0800 Subject: [PATCH 15/19] polish --- examples/run_transformers.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/run_transformers.sh b/examples/run_transformers.sh index 03dce1ec3..1c8f15624 100644 --- a/examples/run_transformers.sh +++ b/examples/run_transformers.sh @@ -185,6 +185,6 @@ else env OMP_NUM_THREADS=${TNUM} timeout -s SIGKILL 30m python -m torch.distributed.launch --nproc_per_node=${GPU_NUM} \ pretrain_bert_demo.py \ --default_chunk_size=${CHUNK_SIZE} \ - ${cmd_opts} + ${cmd_opts} \ 2>&1 | tee ${LOG_DIR}/${LOG_FILE} fi From 862382656167b1f7d20393fef6e9296537b8d061 Mon Sep 17 00:00:00 2001 From: jiaruifang Date: Fri, 10 Dec 2021 15:59:09 +0800 Subject: [PATCH 16/19] polish --- examples/optimizations/ps_tile_modeling_bert.py | 4 ++-- patrickstar/core/client.py | 7 ++++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/optimizations/ps_tile_modeling_bert.py b/examples/optimizations/ps_tile_modeling_bert.py index 2cd9fa600..dbf22da99 100644 --- a/examples/optimizations/ps_tile_modeling_bert.py +++ b/examples/optimizations/ps_tile_modeling_bert.py @@ -584,7 +584,7 @@ def __init__(self, config, add_pooling_layer=True): self.pooler = BertPooler(config) if add_pooling_layer else None - self.init_weights() + # self.init_weights() def get_input_embeddings(self): return self.embeddings.word_embeddings @@ -771,7 +771,7 @@ def __init__(self, config): self.dropout = nn.Dropout(classifier_dropout) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.init_weights() + # self.init_weights() def forward( self, diff --git a/patrickstar/core/client.py b/patrickstar/core/client.py index cfffd4e28..2430f7272 100644 --- a/patrickstar/core/client.py +++ b/patrickstar/core/client.py @@ -948,6 +948,7 @@ def display_chunk_info(self): overall_size = 0 overall_chunk_num = 0 overall_utilization_ratio = 0.0 + max_utilization_ratio = 0 for ( type, type_chunk_list, @@ -955,6 +956,8 @@ def display_chunk_info(self): logger.debug(f"Chunk list {type}") for chunk_id in type_chunk_list: chunk = self.chunk_list[chunk_id] + if chunk.is_dummy(): + continue comm_info = self.chunk_tensor_index.chunk_id_to_comm_info_map[chunk_id] assert comm_info is not None @@ -979,7 +982,9 @@ def display_chunk_info(self): f"chunk used {last_used_pos/1024/1024} M elem, " f"{last_used_pos/chunk.capacity * 100} %" ) - overall_utilization_ratio += last_used_pos / chunk.capacity + cur_util = last_used_pos / chunk.capacity + max_utilization_ratio = max(cur_util, max_utilization_ratio) + overall_utilization_ratio += cur_util overall_size += chunk.get_chunk_space() overall_chunk_num += 1 From 825ac6add36ba72f88ab816cb0a54b3ff909cab8 Mon Sep 17 00:00:00 2001 From: jiaruifang Date: Fri, 10 Dec 2021 16:00:57 +0800 Subject: [PATCH 17/19] polish code. 
--- patrickstar/core/client.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/patrickstar/core/client.py b/patrickstar/core/client.py index 2430f7272..4625edfa7 100644 --- a/patrickstar/core/client.py +++ b/patrickstar/core/client.py @@ -990,5 +990,6 @@ def display_chunk_info(self): log_dist(f"OVERALL CHUNK SIZE {overall_size / 1024 / 1024 / 1024} GB") log_dist( - f"OVERALL UTILIZATION {overall_utilization_ratio / overall_chunk_num} %" + f"OVERALL UTILIZATION {overall_utilization_ratio / overall_chunk_num * 100} %" ) + log_dist(f"MAX UTILIZATION {max_utilization_ratio * 100} %") From a1eac307e3bfec7c2c8bb0bc54f1f1e35678c64c Mon Sep 17 00:00:00 2001 From: jiaruifang Date: Fri, 10 Dec 2021 16:01:30 +0800 Subject: [PATCH 18/19] polish code --- patrickstar/core/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/patrickstar/core/client.py b/patrickstar/core/client.py index 4625edfa7..0121fe269 100644 --- a/patrickstar/core/client.py +++ b/patrickstar/core/client.py @@ -956,7 +956,7 @@ def display_chunk_info(self): logger.debug(f"Chunk list {type}") for chunk_id in type_chunk_list: chunk = self.chunk_list[chunk_id] - if chunk.is_dummy(): + if self.opt_config["with_mem_saving_comm"] and chunk.is_dummy(): continue comm_info = self.chunk_tensor_index.chunk_id_to_comm_info_map[chunk_id] assert comm_info is not None From e9695caeabf324063da47038aec69e926f5abe5a Mon Sep 17 00:00:00 2001 From: jiaruifang Date: Mon, 13 Dec 2021 14:49:52 +0800 Subject: [PATCH 19/19] 0.4.4 -> 0.4.5 --- CHANGE_LOG.md | 4 ++++ setup.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGE_LOG.md b/CHANGE_LOG.md index 6182c1eca..f439d6223 100644 --- a/CHANGE_LOG.md +++ b/CHANGE_LOG.md @@ -1,3 +1,7 @@ +### v0.4.5 Dec. 2021 +Refactor the files in the examples directory and add chunk size searching. + + ### v0.4.4 Dec. 2021 The system is successfully evaluated on a multi-node system. The benchmark scripts are integrated with memory-centric tiling borrowed from DeepSpeed. diff --git a/setup.py b/setup.py index b999b98a1..0ed063765 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ def fetch_requirements(path): setup( name="patrickstar", - version="0.4.4", + version="0.4.5", description="PatrickStart library", long_description="PatrickStar: Parallel Training of Large Language Models via a Chunk-based Parameter Server", long_description_content_type="text/markdown",
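With the whole series applied, each candidate evaluated by the `CS_SEARCH` loop appends one line to the slog file in the format written by `eval_chunk_size.py`: chunk size in M elements, overall chunk footprint in MB, and the utilization ratio. A small post-processing sketch for that file — the selection rule (smallest footprint among successful runs) and the helper name are assumptions about how the output might be used, not something the patches prescribe:

```python
def pick_best_chunk_size(slog_path):
    """Parse slog lines like '256.0 5460.0, 0.93' (chunk size in M elements,
    overall chunk size in MB, utilization ratio) and return the entry with
    the smallest overall footprint; runs that failed the schema check are
    written with negative values (-1, -1) and are skipped."""
    best = None
    with open(slog_path) as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            sizes, _, util = line.rpartition(",")
            chunk_melem, overall_mb = (float(x) for x in sizes.split())
            if overall_mb < 0:  # failed run
                continue
            if best is None or overall_mb < best[1]:
                best = (chunk_melem, overall_mb, float(util))
    return best
```

The returned chunk size is in M elements, matching the `let CUR_CHUNK_SIZE=${i}*1024*1024` conversion in `run_transformers.sh`, so it can be multiplied by `1024 * 1024` and passed back through `--default_chunk_size`.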