Merge pull request #106 from Modalities/repeating_dataloader
Repeating dataloader
le1nux committed Apr 23, 2024
2 parents f903c6e + 40df8f0 commit 98893e9
Showing 22 changed files with 1,199 additions and 52 deletions.
4 changes: 3 additions & 1 deletion .gitignore
@@ -156,4 +156,6 @@ docs/source/generated
 docs/source/api
 pyenv*
 .devcontainer/
-noteboks/*
+noteboks/*
+
+tests/tmp/*
95 changes: 95 additions & 0 deletions config_files/text_generation/text_generation_overfitted_de.yaml
@@ -0,0 +1,95 @@
settings:
  referencing_keys:
    sample_key: input_ids
    prediction_key: logits
  model_path: /raid/s3/opengptx/max_lue/modalities/data/checkpoints/2024-04-22__08-55-14/eid_2024-04-22__08-55-14-model-num_steps_1152.bin
  device: 0
  context_length: 2048

text_inference_component:
  component_key: inference_component
  variant_key: text
  config:
    model:
      instance_key: checkpointed_model
      pass_type: BY_REFERENCE
    tokenizer:
      instance_key: tokenizer
      pass_type: BY_REFERENCE
    context_length: ${settings.context_length}
    eod_token: <eod>
    prompt_template: "{prompt_input}" # "<instruction> Du bist Moody, ein LLM welches Menschen helfen soll. user: {prompt_input}"
    temperature: 0.000001
    # chat: false

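With a temperature of 0.000001, the softmax over the logits becomes so sharp that generation is effectively greedy: the highest-scoring token is picked every time. A minimal sketch of temperature-scaled sampling to illustrate this (the function below is hypothetical and not the Modalities inference code):

import torch

def sample_next_token(logits: torch.Tensor, temperature: float) -> int:
    # Dividing by a temperature << 1 sharpens the distribution towards the argmax;
    # at ~1e-6 the softmax is essentially one-hot, i.e. greedy decoding.
    probs = torch.softmax(logits / temperature, dim=-1)
    return int(torch.multinomial(probs, num_samples=1).item())
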
checkpointed_model:
  component_key: model
  variant_key: checkpointed
  config:
    checkpoint_loading:
      component_key: checkpoint_loading
      variant_key: torch
      config:
        device: ${settings.device}
        precision: FP16
    model:
      instance_key: raw_model
      pass_type: BY_REFERENCE
    checkpoint_path: ${settings.model_path}

raw_model:
  component_key: model
  variant_key: gpt2
  config:
    sample_key: ${settings.referencing_keys.sample_key}
    poe_type: NOPE
    block_size: ${settings.context_length}
    prediction_key: ${settings.referencing_keys.prediction_key}
    vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
    n_layer: 12
    n_head_q: 12
    n_head_kv: 12
    ffn_hidden: 2048
    n_embd: 768
    dropout: 0.0
    bias: true # True: bias in Linears, like GPT-2. False: a bit better and faster
    attention_config:
      qkv_transforms:
        - type_hint: RotaryTransform
          config:
            n_embd: ${raw_model.config.n_embd}
            n_head: ${raw_model.config.n_head_q} # it has to be head_q here
            seq_length_dim: -2
    activation_type: gelu
    weight_init:
      mean: 0.0
      std: 0.02
    attention_norm:
      component_key: layer_norm
      variant_key: rms_norm
      config:
        ndim: ${raw_model.config.n_embd}
        bias: true
        epsilon: 1e-5
    ffn_norm:
      component_key: layer_norm
      variant_key: rms_norm
      config:
        ndim: ${raw_model.config.n_embd}
        bias: true
        epsilon: 1e-5
    lm_head_norm:
      component_key: layer_norm
      variant_key: rms_norm
      config:
        ndim: ${raw_model.config.n_embd}
        bias: true
        epsilon: 1e-5

tokenizer:
  component_key: tokenizer
  variant_key: pretrained_hf_tokenizer
  config:
    pretrained_model_name_or_path: /raid/s3/opengptx/max_lue/modalities/data/tokenizer/hf_gpt2
    padding: false
    max_length: ${settings.context_length}
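
Throughout these configs, ${...} expressions reference other keys (for example ${settings.context_length}), while instance_key/pass_type: BY_REFERENCE entries reuse an already-built component instead of constructing a new one. As a rough illustration of how the value references resolve, assuming OmegaConf-style interpolation (the actual Modalities resolver, and the custom ${cuda_env:...} resolvers used in the training config below, may differ in detail):

from omegaconf import OmegaConf

# Hypothetical, minimal stand-in for the tokenizer section above.
cfg = OmegaConf.create({
    "settings": {"context_length": 2048},
    "tokenizer": {"config": {"max_length": "${settings.context_length}"}},
})
resolved = OmegaConf.to_container(cfg, resolve=True)
assert resolved["tokenizer"]["config"]["max_length"] == 2048
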
254 changes: 254 additions & 0 deletions config_files/training/config_gpt2_small_overfitting_de.yaml
@@ -0,0 +1,254 @@
settings:
  experiment_id: ${modalities_env:experiment_id}
  referencing_keys:
    sample_key: input_ids
    target_key: target_ids
    prediction_key: logits
  training:
    global_training_log_interval_in_steps: 1
    global_checkpointing_interval_in_steps: 128
    global_evaluation_interval_in_steps: 64
    global_num_seen_steps: 0
    do_apply_activation_checkpointing: false
    gradient_acc_steps: 1
    local_train_micro_batch_size: 16
    sequence_length: 2048
    gradient_clipping:
      mode: p2_norm
      threshold: 1.0
  cuda_env:
    local_rank: ${cuda_env:LOCAL_RANK}
    global_rank: ${cuda_env:RANK}
    world_size: ${cuda_env:WORLD_SIZE}
  paths:
    checkpointing_path: data/checkpoints

collate_fn:
  component_key: collate_fn
  variant_key: gpt_2_llm_collator
  config:
    sample_key: ${settings.referencing_keys.sample_key}
    target_key: ${settings.referencing_keys.target_key}

train_dataset:
  component_key: dataset
  variant_key: packed_mem_map_dataset_continuous
  config:
    raw_data_path: /workspaces/modalities/data/sample_datasets/overfitting/hf_gpt2_2048/data_overfitting_de.pbin
    block_size: ${settings.training.sequence_length}
    sample_key: ${settings.referencing_keys.sample_key}

val_dataset:
  component_key: dataset
  variant_key: packed_mem_map_dataset_continuous
  config:
    raw_data_path: /workspaces/modalities/data/sample_datasets/overfitting/hf_gpt2_2048/data_overfitting_en.pbin
    block_size: ${settings.training.sequence_length}
    sample_key: ${settings.referencing_keys.sample_key}

train_dataloader:
  component_key: data_loader
  variant_key: repeating_data_loader
  config:
    reshuffle_after_epoch: false
    num_epochs: 100
    dataloader:
      component_key: data_loader
      variant_key: default
      config:
        num_workers: 2
        pin_memory: true
        shuffle: false
        dataloader_tag: train
        dataset:
          instance_key: train_dataset
          pass_type: BY_REFERENCE
        batch_sampler:
          component_key: batch_sampler
          variant_key: default
          config:
            batch_size: ${settings.training.local_train_micro_batch_size}
            drop_last: true
            sampler:
              component_key: sampler
              variant_key: distributed_sampler
              config:
                rank: ${settings.cuda_env.global_rank}
                num_replicas: ${settings.cuda_env.world_size}
                shuffle: true
                dataset:
                  instance_key: train_dataset
                  pass_type: BY_REFERENCE
        collate_fn:
          instance_key: collate_fn
          pass_type: BY_REFERENCE

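The repeating_data_loader variant added in this PR wraps the default dataloader and replays it for num_epochs passes, optionally reshuffling between passes. A minimal sketch of the idea under the assumption of a standard PyTorch DataLoader (class and attribute names are illustrative, not the Modalities implementation):

from torch.utils.data import DataLoader

class RepeatingDataLoader:
    # Iterate over a wrapped DataLoader for a fixed number of epochs.
    def __init__(self, dataloader: DataLoader, num_epochs: int, reshuffle_after_epoch: bool = False):
        self.dataloader = dataloader
        self.num_epochs = num_epochs
        self.reshuffle_after_epoch = reshuffle_after_epoch

    def __iter__(self):
        for epoch in range(self.num_epochs):
            if self.reshuffle_after_epoch:
                # In the config above, the shuffling sampler is the DistributedSampler
                # nested inside the batch sampler; set_epoch reseeds it for each pass.
                sampler = getattr(self.dataloader.batch_sampler, "sampler", None)
                if hasattr(sampler, "set_epoch"):
                    sampler.set_epoch(epoch)
            yield from self.dataloader

    def __len__(self) -> int:
        return self.num_epochs * len(self.dataloader)

With num_epochs: 100 and reshuffle_after_epoch: false, the same pass over the data is replayed in identical order, which suits the overfitting sanity check this config targets.
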
val_dataloader:
  component_key: data_loader
  variant_key: default
  config:
    num_workers: 2
    pin_memory: true
    shuffle: false
    dataloader_tag: val
    dataset:
      instance_key: val_dataset
      pass_type: BY_REFERENCE
    batch_sampler:
      component_key: batch_sampler
      variant_key: default
      config:
        batch_size: ${settings.training.local_train_micro_batch_size}
        drop_last: true
        sampler:
          component_key: sampler
          variant_key: distributed_sampler
          config:
            rank: ${settings.cuda_env.global_rank}
            num_replicas: ${settings.cuda_env.world_size}
            shuffle: false
            dataset:
              instance_key: val_dataset
              pass_type: BY_REFERENCE
    collate_fn:
      instance_key: collate_fn
      pass_type: BY_REFERENCE

eval_dataloaders:
  - instance_key: val_dataloader
    pass_type: BY_REFERENCE


checkpointing:
  component_key: checkpointing
  variant_key: default
  config:
    checkpointing_strategy:
      component_key: checkpointing_strategy
      variant_key: save_k_most_recent_checkpoints_strategy
      config:
        k: -1 # -1 to save all checkpoints
    checkpointing_execution:
      component_key: checkpointing_execution
      variant_key: fsdp_to_disc_checkpointing
      config:
        checkpoint_path: ${settings.paths.checkpointing_path}
        global_rank: ${settings.cuda_env.global_rank}
        experiment_id: ${settings.experiment_id}
        mixed_precision_settings: BF_16
        sharding_strategy: FULL_SHARD
        block_names: [GPT2Block]

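The save_k_most_recent_checkpoints_strategy with k: -1 keeps every checkpoint; a positive k would retain only the newest k and prune the rest. A rough sketch of that retention rule (illustrative only, not the Modalities implementation; the file pattern is an assumption):

from pathlib import Path

def prune_checkpoints(checkpoint_dir: Path, k: int) -> None:
    # k == -1 keeps everything; k == 0 keeps nothing; otherwise keep the k newest files.
    if k < 0:
        return
    checkpoints = sorted(checkpoint_dir.glob("*.bin"), key=lambda p: p.stat().st_mtime)
    to_delete = checkpoints if k == 0 else checkpoints[:-k]
    for old in to_delete:
        old.unlink()
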
wrapped_model:
  component_key: model
  variant_key: fsdp_wrapped
  config:
    model:
      instance_key: model
      pass_type: BY_REFERENCE
    sync_module_states: true
    mixed_precision_settings: BF_16
    sharding_strategy: FULL_SHARD
    block_names: [GPT2Block]

model:
  component_key: model
  variant_key: gpt2
  config:
    sample_key: ${settings.referencing_keys.sample_key}
    poe_type: NOPE
    block_size: ${settings.training.sequence_length}
    prediction_key: ${settings.referencing_keys.prediction_key}
    vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
    n_layer: 12
    n_head_q: 12
    n_head_kv: 12
    ffn_hidden: 2048
    n_embd: 768
    dropout: 0.0
    bias: true # True: bias in Linears, like GPT-2. False: a bit better and faster
    attention_config:
      qkv_transforms:
        - type_hint: RotaryTransform
          config:
            n_embd: ${model.config.n_embd}
            n_head: ${model.config.n_head_q} # it has to be head_q here
            seq_length_dim: -2
    activation_type: gelu
    weight_init:
      mean: 0.0
      std: 0.02
    attention_norm:
      component_key: layer_norm
      variant_key: rms_norm
      config:
        ndim: ${model.config.n_embd}
        bias: true
        epsilon: 1e-5
    ffn_norm:
      component_key: layer_norm
      variant_key: rms_norm
      config:
        ndim: ${model.config.n_embd}
        bias: true
        epsilon: 1e-5
    lm_head_norm:
      component_key: layer_norm
      variant_key: rms_norm
      config:
        ndim: ${model.config.n_embd}
        bias: true
        epsilon: 1e-5

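For orientation, a back-of-the-envelope parameter count for this model config (a rough estimate assuming a standard GPT-2-style block; biases, norms, and the LM head are ignored, and an untied LM head would add roughly another 39M):

n_layer, n_embd, ffn_hidden, vocab_size = 12, 768, 2048, 50304

attn = 4 * n_embd * n_embd        # q, k, v and output projections (n_head_kv == n_head_q, so no GQA savings)
ffn = 2 * n_embd * ffn_hidden     # up- and down-projection of the feed-forward block
embeddings = vocab_size * n_embd  # token embeddings only; poe_type NOPE means no learned positional table
total = n_layer * (attn + ffn) + embeddings
print(f"~{total / 1e6:.0f}M parameters")  # roughly 105M
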
loss_fn:
  component_key: loss
  variant_key: clm_cross_entropy_loss
  config:
    target_key: ${settings.referencing_keys.target_key}
    prediction_key: ${settings.referencing_keys.prediction_key}

optimizer:
  component_key: optimizer
  variant_key: adam_w
  config:
    lr: 0.0001
    betas: [0.9, 0.95]
    eps: 1e-8
    weight_decay: 1e-1
    wrapped_model:
      instance_key: wrapped_model
      pass_type: BY_REFERENCE

scheduler:
  component_key: scheduler
  variant_key: dummy_lr
  config:
    optimizer:
      instance_key: optimizer
      pass_type: BY_REFERENCE


batch_progress_subscriber:
  component_key: progress_subscriber
  variant_key: rich
  config:
    local_rank: ${settings.cuda_env.local_rank}
    world_size: ${settings.cuda_env.world_size}
    global_num_seen_steps: ${settings.training.global_num_seen_steps}
    train_dataloader:
      instance_key: train_dataloader
      pass_type: BY_REFERENCE
    eval_dataloaders:
      - instance_key: val_dataloader
        pass_type: BY_REFERENCE


evaluation_subscriber:
  component_key: results_subscriber
  variant_key: wandb
  config:
    local_rank: ${settings.cuda_env.local_rank}
    project: modalities
    mode: ONLINE
    experiment_id: ${settings.experiment_id}
    directory: "."