From 17b061ff9c3f0668fc66b20001ad2a2df5f42c1c Mon Sep 17 00:00:00 2001
From: Yang Zhang
Date: Fri, 17 May 2024 04:14:09 +0000
Subject: [PATCH 1/4] Tolerate no fused kernels

---
 megatron/model/norms.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/megatron/model/norms.py b/megatron/model/norms.py
index 8b06b177c..dda44659f 100644
--- a/megatron/model/norms.py
+++ b/megatron/model/norms.py
@@ -14,7 +14,6 @@

 import torch
 from torch.nn import LayerNorm as LayerNorm
-from .fused_layer_norm import MixedFusedLayerNorm


 def get_norm(neox_args):
@@ -23,7 +22,11 @@ def get_norm(neox_args):
         eps = neox_args.rms_norm_epsilon
     elif neox_args.norm == "layernorm":
         eps = neox_args.layernorm_epsilon
-        norm = MixedFusedLayerNorm if neox_args.layernorm_fusion else LayerNorm
+        if neox_args.layernorm_fusion:
+            from .fused_layer_norm import MixedFusedLayerNorm
+            norm = MixedFusedLayerNorm
+        else:
+            norm = LayerNorm
     elif neox_args.norm == "scalenorm":
         eps = neox_args.scalenorm_epsilon
         norm = ScaleNorm

From fdc395f096e0fa64ee8f4e7119c290480ae7a42f Mon Sep 17 00:00:00 2001
From: Yang Zhang
Date: Fri, 17 May 2024 04:14:51 +0000
Subject: [PATCH 2/4] Fix requirements file syntax

---
 requirements/requirements.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index 501edf345..3ac92598a 100644
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -1,6 +1,6 @@
-git+https://github.com/EleutherAI/DeeperSpeed.git@02e2ebf7dee6aaab3d89094ed470a4609763c742#egg=deepspeed
+deepspeed@git+https://github.com/EleutherAI/DeeperSpeed.git@02e2ebf7dee6aaab3d89094ed470a4609763c742#egg=deepspeed
 ftfy>=6.0.1
-git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836
+lm_dataformat@git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836
 huggingface_hub>=0.11.0
 jinja2==3.1.4
 lm_eval>=0.4.0,<=0.4.1

From ae7e849671028e3c8e82a0feb6413dbb866d547c Mon Sep 17 00:00:00 2001
From: github-actions
Date: Fri, 17 May 2024 04:36:26 +0000
Subject: [PATCH 3/4] Update NeoXArgs docs automatically

---
 configs/neox_arguments.md | 47 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 44 insertions(+), 3 deletions(-)

diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md
index c8e1492ae..e03265bca 100644
--- a/configs/neox_arguments.md
+++ b/configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments

 - **git_hash**: str

-    Default = 6fb840e
+    Default = fdc395f

     current git hash of repository

@@ -1201,7 +1201,7 @@ Text Generation arguments



-- **num_experts**: int
+- **moe_num_experts**: int

     Default = 1

@@ -1243,7 +1243,7 @@ Text Generation arguments

 - **moe_token_dropping**: bool

-    Default = True
+    Default = False

     Whether to drop tokens when exceeding capacity

@@ -1273,6 +1273,47 @@ Text Generation arguments



+- **moe_type**: str
+
+    Default = megablocks
+
+    Either `deepspeed` or `megablocks`
+
+
+
+- **moe_glu**: bool
+
+    Default = False
+
+    Use gated linear units in MoE
+
+
+
+- **moe_lbl_in_fp32**: bool
+
+    Default = False
+
+    Whether to compute the load balancing loss in fp32.
+
+
+
+- **moe_jitter_eps**: float
+
+    Default = None
+
+    Coefficient for MoE routing jitter. Jitter is
+    not used if set to None
+
+
+
+- **enable_expert_tensor_parallelism**: bool
+
+    Default = False
+
+    Enable expert tensor parallelism
+
+
+
 ## NeoXArgsTokenizer

 Tokenizer Arguments

From 0b61f535287339a483033a60c8e73b1440648cb4 Mon Sep 17 00:00:00 2001
From: github-actions
Date: Tue, 21 May 2024 22:38:35 +0000
Subject: [PATCH 4/4] Update NeoXArgs docs automatically

---
 configs/neox_arguments.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md
index 1dbb4dd8a..c6d369524 100644
--- a/configs/neox_arguments.md
+++ b/configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments

 - **git_hash**: str

-    Default = b68ba6d
+    Default = 1b85a2f

     current git hash of repository
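
Note (appended for context, not part of the patch series): a minimal sketch of how the lazily imported norm from PATCH 1/4 can be exercised on a machine where the fused layer-norm kernels were never built. The SimpleNamespace stand-in for neox_args and the assumption that get_norm() returns a (norm class, epsilon) pair are illustrative, not details taken from the diffs above.

    # Illustrative only: run from a gpt-neox checkout with torch installed.
    from types import SimpleNamespace

    from megatron.model.norms import get_norm

    # Hypothetical stand-in for the real NeoXArgs object; only the fields
    # that get_norm() reads on the "layernorm" path are provided.
    neox_args = SimpleNamespace(
        norm="layernorm",
        layernorm_epsilon=1e-5,
        layernorm_fusion=False,  # plain torch.nn.LayerNorm; .fused_layer_norm is never imported
    )

    # Assuming get_norm() returns the (norm class, epsilon) pair, this path
    # now works even when the fused kernel extension is unavailable.
    norm_cls, eps = get_norm(neox_args)
    print(norm_cls, eps)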