Merge pull request #160 from Modalities/generic-downstream-eval
- We confirmed that we can convert checkpoints of models trained with FSDP (see the sketch below)
- We merged the current main into our branch
- We confirmed that all tests in the codebase run successfully
rrutmann authored Jul 9, 2024
2 parents f810fcc + 26aedc5 commit f25c018
Showing 17 changed files with 582 additions and 547 deletions.
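The FSDP checkpoint conversion mentioned above amounts to consolidating sharded checkpoints into one ordinary state dict. The PR does not reproduce the conversion code on this page; the following is a minimal sketch of the general pattern using PyTorch's public FSDP API, not Modalities' actual entry point. The toy Linear module and the output filename are placeholders.

import torch
import torch.distributed as dist
from torch.distributed.fsdp import FullStateDictConfig, StateDictType
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

# Launch with torchrun so that the rank/world-size env vars are set.
dist.init_process_group(backend="nccl")
torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())

# Placeholder module standing in for the trained model.
model = FSDP(torch.nn.Linear(16, 16).cuda())

# Gather the sharded parameters into a single full state dict on rank 0.
save_policy = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
with FSDP.state_dict_type(model, StateDictType.FULL_STATE_DICT, save_policy):
    full_state_dict = model.state_dict()

if dist.get_rank() == 0:
    torch.save(full_state_dict, "consolidated_checkpoint.pt")

From the consolidated state dict, the weights can then be loaded into a non-FSDP module for downstream evaluation.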
@@ -1,27 +1,28 @@
 settings:
   experiment_id: ${modalities_env:experiment_id}
   config_file_path: ${modalities_env:config_file_path}
   referencing_keys:
     sample_key: input_ids
     target_key: target_ids
     prediction_key: logits
   training:
-    callback_interval_in_samples: 32768
-    global_num_training_samples: 2048
-    global_num_seen_samples: 0
+    global_num_seen_steps: 0
     activation_checkpointing_modules: []
     gradient_acc_steps: 1
-    local_train_micro_batch_size: 16
-    sequence_length: 4096
-    gradient_clipping:
-      mode: NONE
+    local_train_micro_batch_size: 1
+    sequence_length: 256
+    global_training_log_interval_in_steps: 8
+    global_checkpointing_interval_in_steps: 8
+    global_evaluation_interval_in_steps: 8
   cuda_env:
     local_rank: ${cuda_env:LOCAL_RANK}
     global_rank: ${cuda_env:RANK}
     world_size: ${cuda_env:WORLD_SIZE}
   paths:
     checkpointing_path: data/checkpoints
 
 
 collate_fn:
   component_key: collate_fn
   variant_key: gpt_2_llm_collator
@@ -31,11 +32,11 @@ collate_fn:
 
 train_dataset:
   component_key: dataset
-  variant_key: packed_mem_map_dataset_megatron
+  variant_key: packed_mem_map_dataset_continuous
   config:
-    raw_data_path: /raid/s3/opengptx/max_lue/modalities/data/sample_datasets/redpajama_v2/mem_map/redpyjama_v2_default_DE_num_docs_1024/redpyjama_v2_default_DE_num_docs_1024.pbin
+    raw_data_path: ./data/lorem_ipsum.pbin
     block_size: ${settings.training.sequence_length}
-    sample_key: ${settings.referencing_keys.sample_key}
+    sample_key: ${settings.referencing_keys.sample_key}
 
 train_dataloader:
   component_key: data_loader
@@ -68,14 +69,6 @@ train_dataloader:
     instance_key: collate_fn
     pass_type: BY_REFERENCE
 
-val_dataset:
-  component_key: dataset
-  variant_key: packed_mem_map_dataset_megatron
-  config:
-    raw_data_path: /raid/s3/opengptx/max_lue/modalities/data/sample_datasets/redpajama_v2/mem_map/redpyjama_v2_default_DE_num_docs_1024/redpyjama_v2_default_DE_num_docs_1024.pbin
-    block_size: ${settings.training.sequence_length}
-    sample_key: ${settings.referencing_keys.sample_key}
-
 val_dataloader:
   component_key: data_loader
   variant_key: default
@@ -85,23 +78,54 @@ val_dataloader:
     shuffle: false
     dataloader_tag: "val"
     dataset:
-      instance_key: val_dataset
+      instance_key: train_dataset
       pass_type: BY_REFERENCE
     batch_sampler:
       component_key: batch_sampler
       variant_key: default
       config:
-        batch_size: ${settings.training.local_train_micro_batch_size}
+        batch_size: 4
         drop_last: true
         sampler:
           component_key: sampler
           variant_key: distributed_sampler
           config:
            rank: ${settings.cuda_env.global_rank}
            num_replicas: ${settings.cuda_env.world_size}
-           shuffle: true
+           shuffle: false
            dataset:
-             instance_key: val_dataset
+             instance_key: train_dataset
              pass_type: BY_REFERENCE
     collate_fn:
       instance_key: collate_fn
       pass_type: BY_REFERENCE
 
+test_dataloader:
+  component_key: data_loader
+  variant_key: default
+  config:
+    num_workers: 2
+    pin_memory: true
+    shuffle: false
+    dataloader_tag: "test"
+    dataset:
+      instance_key: train_dataset
+      pass_type: BY_REFERENCE
+    batch_sampler:
+      component_key: batch_sampler
+      variant_key: default
+      config:
+        batch_size: 2
+        drop_last: true
+        sampler:
+          component_key: sampler
+          variant_key: distributed_sampler
+          config:
+           rank: ${settings.cuda_env.global_rank}
+           num_replicas: ${settings.cuda_env.world_size}
+           shuffle: false
+           dataset:
+             instance_key: train_dataset
+             pass_type: BY_REFERENCE
+    collate_fn:
+      instance_key: collate_fn
+      pass_type: BY_REFERENCE
@@ -110,41 +134,33 @@ val_dataloader:
 
 eval_dataloaders:
   - instance_key: val_dataloader
     pass_type: BY_REFERENCE
+  - instance_key: test_dataloader
+    pass_type: BY_REFERENCE
 
-checkpointing:
-  component_key: checkpointing
+checkpoint_saving:
+  component_key: checkpoint_saving
   variant_key: default
   config:
-    checkpointing_strategy:
-      component_key: checkpointing_strategy
+    checkpoint_saving_strategy:
+      component_key: checkpoint_saving_strategy
       variant_key: save_k_most_recent_checkpoints_strategy
       config:
        k: -1   # -1 to save all checkpoints
-    checkpointing_execution:
-      component_key: checkpointing_execution
-      variant_key: fsdp_to_disc_checkpointing
+    checkpoint_saving_execution:
+      component_key: checkpoint_saving_execution
+      variant_key: fsdp
      config:
        checkpoint_path: ${settings.paths.checkpointing_path}
        global_rank: ${settings.cuda_env.global_rank}
        experiment_id: ${settings.experiment_id}
-       mixed_precision_settings: BF_16
-       sharding_strategy: FULL_SHARD
-       block_names: [ MambaBlock ]
 
-model:
-  component_key: model
-  variant_key: mamba
-  # resolving class types via different enums sucks...
+loss_fn:
+  component_key: loss
+  variant_key: clm_cross_entropy_loss
   config:
-    d_model: 16
-    n_layer: 2
-    vocab_size: 50257
-    rms_norm: True
-    ssm_cfg: {}
-    residual_in_fp32: True
-    fused_add_norm: True
-    pad_vocab_size_multiple: 8
-    tie_embeddings: True
-    prediction_key: logits
+    target_key: ${settings.referencing_keys.target_key}
+    prediction_key: ${settings.referencing_keys.prediction_key}
 
 wrapped_model:
   component_key: model
@@ -158,6 +174,42 @@ wrapped_model:
     sharding_strategy: FULL_SHARD
     block_names: [ MambaBlock ]
 
+model:
+  component_key: model
+  variant_key: mamba
+  config:
+    d_model: 768
+    n_layer: 24
+    vocab_size: 50257
+    rms_norm: true
+    residual_in_fp32: true
+    fused_add_norm: true
+    pad_vocab_size_multiple: 8
+    tie_embeddings: true
+    prediction_key: ${settings.referencing_keys.prediction_key}
+    sample_key: ${settings.referencing_keys.sample_key}
+    seed: null
+    dtype: null
+    initializer_cfg: {}
+    num_last_tokens: 0
+    inference_params: {}
+    mixer_model_config:
+      norm_epsilon: 1e-5
+      device: null
+      mamba_block_config:
+        d_state: 16
+        d_conv: 4
+        expand: 2
+        dt_rank: auto
+        dt_min: 0.001
+        dt_max: 0.1
+        dt_init: random
+        dt_scale: 1.0
+        dt_init_floor: 1e-4
+        conv_bias: true
+        bias: false
+        use_fast_path: true
 
 scheduler:
   component_key: scheduler
   variant_key: onecycle_lr
@@ -168,41 +220,43 @@ scheduler:
     max_lr: 6e-4
     div_factor: 10
     final_div_factor: 1
-    total_steps: 64
+    total_steps: 16
     pct_start: 0.01
     anneal_strategy: cos
 
-loss_fn:
-  component_key: loss
-  variant_key: clm_cross_entropy_loss
-  config:
-    target_key: ${settings.referencing_keys.target_key}
-    prediction_key: ${settings.referencing_keys.prediction_key}
-
 optimizer:
   component_key: optimizer
   variant_key: adam_w
   config:
    lr: 0.0001
-   betas: [ 0.9, 0.95 ]
+   betas: [0.9, 0.95]
    eps: 1e-8
    weight_decay: 1e-1
    wrapped_model:
      instance_key: wrapped_model
      pass_type: BY_REFERENCE
 
+gradient_clipper:
+  component_key: gradient_clipper
+  variant_key: fsdp
+  config:
+    wrapped_model:
+      instance_key: wrapped_model
+      pass_type: BY_REFERENCE
+    norm_type: P2_NORM
+    max_norm: 1.0
+
 batch_progress_subscriber:
   component_key: progress_subscriber
   variant_key: rich
   config:
    local_rank: ${settings.cuda_env.local_rank}
-   world_size: ${settings.cuda_env.world_size}
-   global_num_seen_samples: ${settings.training.global_num_seen_samples}
+   global_num_seen_steps: ${settings.training.global_num_seen_steps}
    train_dataloader:
      instance_key: train_dataloader
      pass_type: BY_REFERENCE
    eval_dataloaders:
-     - instance_key: val_dataloader
+     instance_key: eval_dataloaders
      pass_type: BY_REFERENCE
 
 
@@ -211,7 +265,8 @@ evaluation_subscriber:
   variant_key: wandb
   config:
    local_rank: ${settings.cuda_env.local_rank}
-   project: modalities
+   project: modalities_lorem_ipsum
+   mode: ONLINE
    experiment_id: ${settings.experiment_id}
-   directory: "."
+   directory: "."
    config_file_path: ${settings.config_file_path}
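Throughout the config, ${...} expressions are OmegaConf-style interpolations: a value such as sequence_length is defined once under settings: and referenced elsewhere (e.g. as block_size), while prefixed forms like ${cuda_env:LOCAL_RANK} invoke registered resolvers. A minimal sketch of the mechanism, assuming OmegaConf semantics; the cuda_env resolver below is a stand-in, not the registration Modalities actually performs:

import os
from omegaconf import OmegaConf

# Stand-in resolver mirroring the ${cuda_env:...} syntax in the config above;
# Modalities registers its own resolvers (modalities_env, cuda_env) internally.
OmegaConf.register_new_resolver("cuda_env", lambda name: int(os.environ.get(name, 0)))

cfg = OmegaConf.create(
    """
settings:
  training:
    sequence_length: 256
  cuda_env:
    local_rank: ${cuda_env:LOCAL_RANK}
train_dataset:
  config:
    block_size: ${settings.training.sequence_length}
"""
)

print(cfg.train_dataset.config.block_size)  # 256, resolved from settings
print(cfg.settings.cuda_env.local_rank)     # value of LOCAL_RANK, default 0

This also explains the batch_size change above: replacing ${settings.training.local_train_micro_batch_size} with a literal 4 decouples the eval batch size from the training micro-batch size.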