diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py
index 34b08910d9..a6d42c9418 100644
--- a/megatron/core/transformer/transformer_config.py
+++ b/megatron/core/transformer/transformer_config.py
@@ -150,7 +150,6 @@ class TransformerConfig(ModelParallelConfig):
     # activation recomputation
     ####################
     recompute_granularity: str = None
-    recompute_granularity: str = None
     """Determines which type of activation recompute to use. Megatron-core supports 'selective'
     activation checkpointing where only the memory intensive part of attention is checkpointed.
     These memory intensive activations are also less compute intensive which makes activation
diff --git a/megatron/training/training.py b/megatron/training/training.py
index eaaf9bde24..1c1a214c97 100644
--- a/megatron/training/training.py
+++ b/megatron/training/training.py
@@ -161,7 +161,7 @@ def pretrain(train_valid_test_dataset_provider,
     1) initialize Megatron.
     2) setup model, optimizer and lr schedule using the model_provider.
     3) call train_val_test_data_provider to get train/val/test datasets.
-    4) train the modle using the forward_step_func.
+    4) train the model using the forward_step_func.
 
     Args:
         train_valid_test_dataset_provider: a function that takes the size of
@@ -211,9 +211,6 @@ def pretrain(train_valid_test_dataset_provider,
                                     time.time() - _TRAIN_START_TIME))
     print_datetime('after megatron is initialized')
 
-    args = get_args()
-    timers = get_timers()
-    one_logger = get_one_logger()
 
     if one_logger:
         one_logger.log_metrics({
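
For reference, a minimal sketch (not part of the patch above) of how the de-duplicated `recompute_granularity` field is typically exercised: it is set to `'selective'` on a `TransformerConfig` so that only the memory-intensive attention activations are checkpointed, as the docstring retained in the patch describes. The other constructor fields used here (`num_layers`, `hidden_size`, `num_attention_heads`) are assumptions for illustration and do not appear in this diff.

# Illustrative only -- not part of the patch above.
from megatron.core.transformer.transformer_config import TransformerConfig

# num_layers / hidden_size / num_attention_heads are assumed required
# fields; they are not shown in this diff.
config = TransformerConfig(
    num_layers=24,
    hidden_size=1024,
    num_attention_heads=16,
    # 'selective': checkpoint only the memory-intensive parts of
    # attention, per the docstring kept by the patch.
    recompute_granularity='selective',
)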