From 2382bd4a6bfd0ec7199e1b7876cd8c457029e8e1 Mon Sep 17 00:00:00 2001
From: yang <7129+yang@users.noreply.github.com>
Date: Thu, 6 Jun 2024 18:27:01 -0700
Subject: [PATCH] Fix changed behavior of pipe_parallel (#1219)

* Fix changed behavior of pipe_parallel

* Update NeoXArgs docs automatically

* Update NeoXArgs docs automatically

* Update NeoXArgs docs automatically

---------

Co-authored-by: Yang Zhang
Co-authored-by: github-actions
Co-authored-by: Quentin Anthony
---
 configs/neox_arguments.md            |  2 +-
 megatron/neox_arguments/arguments.py | 17 ++++-------------
 2 files changed, 5 insertions(+), 14 deletions(-)

diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md
index f6c3ecde3..7a56e361e 100644
--- a/configs/neox_arguments.md
+++ b/configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments
 
 - **git_hash**: str
 
-  Default = 7aa0074
+  Default = 8451671
 
   current git hash of repository
 
diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py
index 98a444ea4..9cad02c43 100644
--- a/megatron/neox_arguments/arguments.py
+++ b/megatron/neox_arguments/arguments.py
@@ -180,7 +180,6 @@ def from_ymls(cls, paths_to_yml_files: List[str], overwrite_values: Dict = None)
         config_files = dict()
         # iterate of all to be loaded yaml files
         for conf_file_name in paths_to_yml_files:
-
             # load file
             with open(conf_file_name) as conf_file:
                 conf = yaml.load(conf_file, Loader=yaml.FullLoader)
@@ -477,7 +476,6 @@ def get_extra_deepspeed_args(self):
         return extra_ds_args
 
     def get_deepspeed_main_args(self):
-
         args_list = list()
 
         if self.autotuning_run is not None:
@@ -796,14 +794,11 @@ def calculate_batch_parameters(
 
         # either none of the three parameters are provided or just gradient_accumulation_step is provided
         else:
-            assert (
-                False
-            ), "Either train_batch_size or train_micro_batch_size_per_gpu needs to be provided"
+            assert False, "Either train_batch_size or train_micro_batch_size_per_gpu needs to be provided"
         return int(train_batch), int(micro_batch), int(grad_acc)
 
     @staticmethod
     def check_batch_parameters(dp_world_size, train_batch, micro_batch, grad_acc):
-
         assert (
             train_batch > 0
         ), f"Train batch size: {train_batch} has to be greater than 0"
@@ -1033,10 +1028,7 @@ def calculate_derived(self):
         # Update 'is pipe parallel' flag
         # if we set pipe_parallel_size to 0 or 1, GPT2ModelPipe.to_sequential() is called, and we run training with
         # the sequential model without the PipelineModule wrapper to avoid the overhead it incurs
-        self.update_value(
-            "is_pipe_parallel",
-            self.pipe_parallel_size > 1 and self.moe_num_experts == 1,
-        )
+        self.update_value("is_pipe_parallel", self.pipe_parallel_size >= 1)
         if self.moe_num_experts > 1:
             assert not (
                 self.is_pipe_parallel or self.pipe_parallel_size > 1
@@ -1106,8 +1098,8 @@ def calculate_derived(self):
         if "flash" in self.attention_config:
             _flash_version = packaging.version.Version(version("flash-attn"))
             if self.sliding_window_width is not None:
-                assert _flash_version >= packaging.version.Version(
-                    "2.3.0"
+                assert (
+                    _flash_version >= packaging.version.Version("2.3.0")
                 ), f"Flash-Attention version ({str(_flash_version)}) must be >= 2.3.0 to support sliding window attention."
             if self.pos_emb == "alibi":
                 if not _flash_version >= packaging.version.Version("2.4.0.post1"):
@@ -1234,7 +1226,6 @@ def validate_values(self):
 
         # Parameters sharing does not work with torch DDP.
         if (self.num_unique_layers is not None) and (self.num_layers is not None):
-
             if not (self.num_unique_layers <= self.num_layers):
                 error_message = (
                     self.__class__.__name__