From 8d175ed0cbc31ca8a8c2d558f366ea4934595034 Mon Sep 17 00:00:00 2001
From: "hatef.4"
Date: Wed, 15 May 2024 14:20:34 -0400
Subject: [PATCH 1/4] misc changes to neox_args

---
 configs/neox_arguments.md | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md
index c8e1492ae..2b979f962 100644
--- a/configs/neox_arguments.md
+++ b/configs/neox_arguments.md
@@ -9,17 +9,6 @@ LR Scheduler Arguments

 - **lr_decay_style**: typing.Literal['constant', 'linear', 'cosine', 'exponential']

-    Default = linear
-
-    Learning rate decay function. Choose from 'constant', 'linear', 'cosine', 'exponential'.
-
-
-
-- **lr_decay_iters**: int
-
-    Default = None
-
-    Number of iterations to decay learning rate over, If None defaults to --train-iters

From 93b880672745e1071e9486cc547ed580e0f1d9e0 Mon Sep 17 00:00:00 2001
From: "hatef.4"
Date: Thu, 16 May 2024 17:05:04 -0400
Subject: [PATCH 2/4] allow rwkv pp

---
 megatron/neox_arguments/arguments.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py
index ff4f4bc21..98a444ea4 100644
--- a/megatron/neox_arguments/arguments.py
+++ b/megatron/neox_arguments/arguments.py
@@ -1070,8 +1070,8 @@ def calculate_derived(self):
             ), "Mamba does not yet have dropout implemented"
         if "rwkv" in self.attention_config:
             assert (
-                not self.is_pipe_parallel and self.model_parallel_size == 1
-            ), "RWKV not currently compatible with parallelism"
+                self.model_parallel_size == 1
+            ), "RWKV not currently compatible with model parallelism"
             if isinstance(self.zero_stage, int):
                 assert self.zero_stage <= 2, "Zero stage 3 not compatible with RWKV"
             assert (

From 1607f73188432271fe8b2560e06d557bda104bab Mon Sep 17 00:00:00 2001
From: github-actions
Date: Thu, 16 May 2024 21:05:52 +0000
Subject: [PATCH 3/4] Update NeoXArgs docs automatically

---
 configs/neox_arguments.md | 58 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 55 insertions(+), 3 deletions(-)

diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md
index 2b979f962..2c8e3c19f 100644
--- a/configs/neox_arguments.md
+++ b/configs/neox_arguments.md
@@ -9,6 +9,17 @@ LR Scheduler Arguments

 - **lr_decay_style**: typing.Literal['constant', 'linear', 'cosine', 'exponential']

+    Default = linear
+
+    Learning rate decay function. Choose from 'constant', 'linear', 'cosine', 'exponential'.
+
+
+
+- **lr_decay_iters**: int
+
+    Default = None
+
+    Number of iterations to decay learning rate over, If None defaults to --train-iters
@@ -100,7 +111,7 @@ Logging Arguments

 - **git_hash**: str

-    Default = 6fb840e
+    Default = 93b8806

     current git hash of repository
@@ -1190,7 +1201,7 @@ Text Generation arguments

-- **num_experts**: int
+- **moe_num_experts**: int

     Default = 1

@@ -1232,7 +1243,7 @@ Text Generation arguments

 - **moe_token_dropping**: bool

-    Default = True
+    Default = False

     Whether to drop tokens when exceeding capacity
@@ -1262,6 +1273,47 @@ Text Generation arguments

+- **moe_type**: str
+
+    Default = megablocks
+
+    Either `deepspeed` or `megablocks`
+
+
+
+- **moe_glu**: bool
+
+    Default = False
+
+    Use gated linear units in MoE
+
+
+
+- **moe_lbl_in_fp32**: bool
+
+    Default = False
+
+    Whether to compute the load balancing loss in fp32.
+
+
+
+- **moe_jitter_eps**: float
+
+    Default = None
+
+    Coefficient for MoE routing jitter. Jitter is not used if set to None
+
+
+
+- **enable_expert_tensor_parallelism**: bool
+
+    Default = False
+
+    Enable expert tensor parallelism
+
+
+
 ## NeoXArgsTokenizer

 Tokenizer Arguments

From 6860a0af69d84a8a9ffcd78326a2f0ce8e1807af Mon Sep 17 00:00:00 2001
From: github-actions
Date: Tue, 21 May 2024 22:34:30 +0000
Subject: [PATCH 4/4] Update NeoXArgs docs automatically

---
 configs/neox_arguments.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md
index dd10a0e09..48c03f15a 100644
--- a/configs/neox_arguments.md
+++ b/configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments

 - **git_hash**: str

-    Default = 8d175ed
+    Default = 0d5992f

     current git hash of repository
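
For reference, the effect of PATCH 2/4 can be summarized in a small standalone sketch: RWKV now rejects only model (tensor) parallelism and ZeRO stage 3, while pipeline parallelism is allowed. The function below is illustrative; its name and flat argument list are assumptions made for a self-contained example, and the real check lives inside `NeoXArgs.calculate_derived()` in `megatron/neox_arguments/arguments.py`.

```python
# Illustrative sketch of the relaxed RWKV compatibility check from PATCH 2/4.
# The function name and argument list are assumptions for this example; the
# actual logic is part of NeoXArgs.calculate_derived().


def check_rwkv_compatibility(attention_config, model_parallel_size, zero_stage):
    """Raise AssertionError for settings RWKV still cannot use."""
    if "rwkv" in attention_config:
        # Pipeline parallelism is no longer rejected; only model (tensor)
        # parallelism remains incompatible with RWKV.
        assert (
            model_parallel_size == 1
        ), "RWKV not currently compatible with model parallelism"
        # ZeRO stage 3 is still unsupported.
        if isinstance(zero_stage, int):
            assert zero_stage <= 2, "Zero stage 3 not compatible with RWKV"


# A pipeline-parallel RWKV run with model_parallel_size=1 now passes the check,
check_rwkv_compatibility(["rwkv"], model_parallel_size=1, zero_stage=1)

# while model parallelism > 1 still raises.
try:
    check_rwkv_compatibility(["rwkv"], model_parallel_size=2, zero_stage=1)
except AssertionError as err:
    print(err)
```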