
Commit

Fix changed behavior of pipe_parallel
yang committed May 21, 2024
1 parent 49cd41f commit 4fe3b3b
Showing 1 changed file with 4 additions and 13 deletions.
17 changes: 4 additions & 13 deletions megatron/neox_arguments/arguments.py
@@ -180,7 +180,6 @@ def from_ymls(cls, paths_to_yml_files: List[str], overwrite_values: Dict = None)
         config_files = dict()
         # iterate of all to be loaded yaml files
         for conf_file_name in paths_to_yml_files:
-
             # load file
             with open(conf_file_name) as conf_file:
                 conf = yaml.load(conf_file, Loader=yaml.FullLoader)
@@ -477,7 +476,6 @@ def get_extra_deepspeed_args(self):
         return extra_ds_args
 
     def get_deepspeed_main_args(self):
-
         args_list = list()
 
         if self.autotuning_run is not None:
@@ -796,14 +794,11 @@ def calculate_batch_parameters(
 
         # either none of the three parameters are provided or just gradient_accumulation_step is provided
         else:
-            assert (
-                False
-            ), "Either train_batch_size or train_micro_batch_size_per_gpu needs to be provided"
+            assert False, "Either train_batch_size or train_micro_batch_size_per_gpu needs to be provided"
         return int(train_batch), int(micro_batch), int(grad_acc)
 
     @staticmethod
     def check_batch_parameters(dp_world_size, train_batch, micro_batch, grad_acc):
-
         assert (
             train_batch > 0
         ), f"Train batch size: {train_batch} has to be greater than 0"
@@ -1033,10 +1028,7 @@ def calculate_derived(self):
         # Update 'is pipe parallel' flag
         # if we set pipe_parallel_size to 0 or 1, GPT2ModelPipe.to_sequential() is called, and we run training with
         # the sequential model without the PipelineModule wrapper to avoid the overhead it incurs
-        self.update_value(
-            "is_pipe_parallel",
-            self.pipe_parallel_size > 1 and self.moe_num_experts == 1,
-        )
+        self.update_value("is_pipe_parallel", self.pipe_parallel_size >= 1)
         if self.moe_num_experts > 1:
             assert not (
                 self.is_pipe_parallel or self.pipe_parallel_size > 1
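
This is the core of the fix: the removed expression also folded the MoE expert count into is_pipe_parallel, leaving the flag False when pipe_parallel_size == 1 or when MoE was enabled; the reinstated >= 1 check, which the commit title suggests is the earlier behavior, marks any non-zero pipeline size as pipe-parallel and leaves MoE to the separate assert that follows. A small standalone comparison over hypothetical settings (not part of the commit):

    # Compares the removed condition with the reinstated one for a few hypothetical settings.
    for pipe_parallel_size, moe_num_experts in [(0, 1), (1, 1), (2, 1), (2, 4)]:
        removed = pipe_parallel_size > 1 and moe_num_experts == 1  # condition taken out by this commit
        reinstated = pipe_parallel_size >= 1  # condition put in by this commit
        print(pipe_parallel_size, moe_num_experts, removed, reinstated)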
@@ -1106,8 +1098,8 @@ def calculate_derived(self):
         if "flash" in self.attention_config:
             _flash_version = packaging.version.Version(version("flash-attn"))
             if self.sliding_window_width is not None:
-                assert _flash_version >= packaging.version.Version(
-                    "2.3.0"
+                assert (
+                    _flash_version >= packaging.version.Version("2.3.0")
                 ), f"Flash-Attention version ({str(_flash_version)}) must be >= 2.3.0 to support sliding window attention."
             if self.pos_emb == "alibi":
                 if not _flash_version >= packaging.version.Version("2.4.0.post1"):
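
The requirement itself is unchanged in this hunk (only the assert's wrapping differs): flash-attn must be at least 2.3.0 for sliding-window attention, and the following lines require at least 2.4.0.post1 for ALiBi. A standalone sketch of the same version-gating pattern, assuming version in the diff context is importlib.metadata.version:

    # Standalone sketch of the version gate; assumes flash-attn is installed and that
    # `version` is importlib.metadata.version, as the diff context suggests.
    from importlib.metadata import version
    from packaging.version import Version

    _flash_version = Version(version("flash-attn"))
    assert _flash_version >= Version(
        "2.3.0"
    ), f"Flash-Attention {_flash_version} is too old for sliding window attention"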
@@ -1234,7 +1226,6 @@ def validate_values(self):
 
         # Parameters sharing does not work with torch DDP.
        if (self.num_unique_layers is not None) and (self.num_layers is not None):
-
            if not (self.num_unique_layers <= self.num_layers):
                error_message = (
                    self.__class__.__name__
