Skip to content

Commit

Permalink
Merge branch 'main' into pydantic-warnings-cli
Browse files Browse the repository at this point in the history
  • Loading branch information
mali-git committed Jun 11, 2024
2 parents c7d828c + b278b3b commit bc7d9f3
Show file tree
Hide file tree
Showing 122 changed files with 6,637 additions and 1,764 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -156,4 +156,6 @@ docs/source/generated
docs/source/api
pyenv*
.devcontainer/
noteboks/*
noteboks/*

tests/tmp/*
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
# Modalities

[![License](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![Coverage Status](https://coveralls.io/repos/github/Modalities/modalities/badge.svg)](https://coveralls.io/github/Modalities/modalities)



# Getting started
For training and evaluating a model, feel free to check out [this](https://github.com/Modalities/modalities/blob/main/examples/getting_started/getting_started_example.md) getting-started tutorial, in which we train a small, 60M-parameter GPT model on a tiny subset of the Redpajama V2 dataset.
Also, see our Wiki and API reference documentation: https://modalities.github.io/modalities/
Expand Down
18 changes: 18 additions & 0 deletions config_files/data_preparation/packed_cc_en_2048.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Data-preparation config: tokenize a raw JSONL corpus (CommonCrawl English)
# into the packed binary format (.pbin) plus its sample index (.idx).
settings:
  src_path: /workspaces/modalities/data/cc_en/raw/train.jsonl
  dst_path: /workspaces/modalities/data/cc_en/processed/train.pbin
  index_path: /workspaces/modalities/data/cc_en/processed/train.idx
  # jq filter selecting the text field from each JSONL record
  jq_pattern: .text
  # resolved at runtime from the node environment
  num_cpus: ${node_env:num_cpus}
  eod_token: <eod>
  processing_batch_size: 1000
  raw_samples_queue_size: 300
  processed_samples_queue_size: 300

# SentencePiece BPE tokenizer used to encode each sample
tokenizer:
  component_key: tokenizer
  variant_key: pretrained_sp_tokenizer
  config:
    tokenizer_model_file: /workspaces/modalities/data/tokenizer/sp_bpe_en/bpe_tokenizer.model
    padding: false
    max_length: 2048
93 changes: 93 additions & 0 deletions config_files/text_generation/text_generation_config_torch.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# Text-generation (inference) config: loads a torch checkpoint of a GPT-2-style
# model with absolute positional embeddings and decodes greedily (temperature 0).
settings:
  referencing_keys:
    sample_key: input_ids
    prediction_key: logits
  model_path: /raid/s3/opengptx/max_lue/modalities/data/checkpoints/2024-04-22__13-16-03/eid_2024-04-22__13-16-03-model-num_steps_1152.bin
  device: 0
  context_length: 2048

# Interactive text-generation entry point; model and tokenizer are injected
# by reference from the components defined below.
text_inference_component:
  component_key: inference_component
  variant_key: text
  config:
    device: ${settings.device}
    model:
      instance_key: checkpointed_model
      pass_type: BY_REFERENCE
    tokenizer:
      instance_key: tokenizer
      pass_type: BY_REFERENCE
    context_length: ${settings.context_length}
    eod_token: <eod>
    prompt_template: "{prompt_input}" # "<instruction> Du bist Moody, ein LLM welches Menschen helfen soll. user: {prompt_input}"
    temperature: 0
    # chat: false

# Wraps raw_model and loads the trained weights from settings.model_path.
checkpointed_model:
  component_key: model
  variant_key: checkpointed
  config:
    checkpoint_loading:
      component_key: checkpoint_loading
      variant_key: torch
      config:
        device: ${settings.device}
        precision: BF16
    model:
      instance_key: raw_model
      pass_type: BY_REFERENCE
    checkpoint_path: ${settings.model_path}

# Architecture definition; must match the checkpoint being loaded.
raw_model:
  component_key: model
  variant_key: gpt2
  config:
    sample_key: ${settings.referencing_keys.sample_key}
    poe_type: ABSOLUTE
    block_size: ${settings.context_length}
    prediction_key: ${settings.referencing_keys.prediction_key}
    vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
    n_layer: 12
    n_head_q: 12
    n_head_kv: 12
    ffn_hidden: 2048
    n_embd: 768
    dropout: 0.0
    bias: true # True: bias in Linears, like GPT-2. False: a bit better and faster
    attention_config:
      qkv_transforms:
        - type_hint: IdentityTransform
          config: {}
    activation_type: gelu
    weight_init:
      mean: 0.0
      std: 0.02
    attention_norm:
      component_key: layer_norm
      variant_key: rms_norm
      config:
        ndim: ${raw_model.config.n_embd}
        bias: true
        # decimal point keeps YAML 1.1 parsers (e.g. PyYAML) resolving this as
        # a float rather than the string "1e-5"
        epsilon: 1.0e-5
    ffn_norm:
      component_key: layer_norm
      variant_key: rms_norm
      config:
        ndim: ${raw_model.config.n_embd}
        bias: true
        epsilon: 1.0e-5
    lm_head_norm:
      component_key: layer_norm
      variant_key: rms_norm
      config:
        ndim: ${raw_model.config.n_embd}
        bias: true
        epsilon: 1.0e-5

# HF tokenizer matching the GPT-2 vocabulary of the checkpoint.
tokenizer:
  component_key: tokenizer
  variant_key: pretrained_hf_tokenizer
  config:
    pretrained_model_name_or_path: /raid/s3/opengptx/max_lue/modalities/data/tokenizer/hf_gpt2
    padding: false
    max_length: ${settings.context_length}
96 changes: 96 additions & 0 deletions config_files/text_generation/text_generation_overfitted_de.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# Text-generation (inference) config for the overfitted German model: loads a
# torch checkpoint of a GPT-2-style model with rotary position embeddings
# (poe_type NOPE + RotaryTransform) and decodes greedily (temperature 0).
settings:
  referencing_keys:
    sample_key: input_ids
    prediction_key: logits
  model_path: /raid/s3/opengptx/max_lue/modalities/data/checkpoints/2024-04-28__13-06-00/eid_2024-04-28__13-06-00-model-num_steps_256.bin
  device: 0
  context_length: 2048

# Interactive text-generation entry point; model and tokenizer are injected
# by reference from the components defined below.
text_inference_component:
  component_key: inference_component
  variant_key: text
  config:
    device: ${settings.device}
    model:
      instance_key: checkpointed_model
      pass_type: BY_REFERENCE
    tokenizer:
      instance_key: tokenizer
      pass_type: BY_REFERENCE
    context_length: ${settings.context_length}
    eod_token: <eod>
    prompt_template: "{prompt_input}" # "<instruction> Du bist Moody, ein LLM welches Menschen helfen soll. user: {prompt_input}"
    temperature: 0
    # chat: false

# Wraps raw_model and loads the trained weights from settings.model_path.
checkpointed_model:
  component_key: model
  variant_key: checkpointed
  config:
    checkpoint_loading:
      component_key: checkpoint_loading
      variant_key: torch
      config:
        device: ${settings.device}
        precision: FP16
    model:
      instance_key: raw_model
      pass_type: BY_REFERENCE
    checkpoint_path: ${settings.model_path}

# Architecture definition; must match the checkpoint being loaded.
raw_model:
  component_key: model
  variant_key: gpt2
  config:
    sample_key: ${settings.referencing_keys.sample_key}
    poe_type: NOPE
    block_size: ${settings.context_length}
    prediction_key: ${settings.referencing_keys.prediction_key}
    vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
    n_layer: 12
    n_head_q: 12
    n_head_kv: 12
    ffn_hidden: 2048
    n_embd: 768
    dropout: 0.0
    bias: true # True: bias in Linears, like GPT-2. False: a bit better and faster
    attention_config:
      qkv_transforms:
        - type_hint: RotaryTransform
          config:
            n_embd: ${raw_model.config.n_embd}
            n_head: ${raw_model.config.n_head_q} #it has to be head_q here
            seq_length_dim: -2
    activation_type: gelu
    weight_init:
      mean: 0.0
      std: 0.02
    attention_norm:
      component_key: layer_norm
      variant_key: rms_norm
      config:
        ndim: ${raw_model.config.n_embd}
        bias: true
        # decimal point keeps YAML 1.1 parsers (e.g. PyYAML) resolving this as
        # a float rather than the string "1e-5"
        epsilon: 1.0e-5
    ffn_norm:
      component_key: layer_norm
      variant_key: rms_norm
      config:
        ndim: ${raw_model.config.n_embd}
        bias: true
        epsilon: 1.0e-5
    lm_head_norm:
      component_key: layer_norm
      variant_key: rms_norm
      config:
        ndim: ${raw_model.config.n_embd}
        bias: true
        epsilon: 1.0e-5

# HF tokenizer matching the GPT-2 vocabulary of the checkpoint.
tokenizer:
  component_key: tokenizer
  variant_key: pretrained_hf_tokenizer
  config:
    pretrained_model_name_or_path: /raid/s3/opengptx/max_lue/modalities/data/tokenizer/hf_gpt2
    padding: false
    max_length: ${settings.context_length}
29 changes: 19 additions & 10 deletions config_files/training/config_example_coca.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
settings:
experiment_id: ${modalities_env:experiment_id}
config_file_path: ${modalities_env:config_file_path}
referencing_keys:
sample_key: input_ids
target_key: target_ids
Expand All @@ -13,9 +14,6 @@ settings:
gradient_acc_steps: 1
local_train_micro_batch_size: 3
sequence_length: 256
gradient_clipping:
mode: p2_norm
threshold: 1.0
cuda_env:
local_rank: ${cuda_env:LOCAL_RANK}
global_rank: ${cuda_env:RANK}
Expand Down Expand Up @@ -130,18 +128,18 @@ eval_dataloaders:
- instance_key: val_dataloader
pass_type: BY_REFERENCE

checkpointing:
component_key: checkpointing
checkpoint_saving:
component_key: checkpoint_saving
variant_key: default
config:
checkpointing_strategy:
component_key: checkpointing_strategy
checkpoint_saving_strategy:
component_key: checkpoint_saving_strategy
variant_key: save_k_most_recent_checkpoints_strategy
config:
k: -1 # -1 to save all checkpoints
checkpointing_execution:
component_key: checkpointing_execution
variant_key: fsdp_to_disc_checkpointing
checkpoint_saving_execution:
component_key: checkpoint_saving_execution
variant_key: fsdp
config:
checkpoint_path: ${settings.paths.checkpointing_path}
global_rank: ${settings.cuda_env.global_rank}
Expand Down Expand Up @@ -244,6 +242,16 @@ optimizer:
instance_key: wrapped_model
pass_type: BY_REFERENCE

gradient_clipper:
component_key: gradient_clipper
variant_key: fsdp_logging_only
config:
wrapped_model:
instance_key: wrapped_model
pass_type: BY_REFERENCE
norm_type: P2_NORM


batch_progress_subscriber:
component_key: progress_subscriber
variant_key: rich
Expand All @@ -267,3 +275,4 @@ evaluation_subscriber:
mode: OFFLINE
experiment_id: ${settings.experiment_id}
directory: "."
config_file_path: ${settings.config_file_path}
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@ settings:
gradient_acc_steps: 1
local_train_micro_batch_size: 1
sequence_length: 4096
gradient_clipping:
mode: none
cuda_env:
local_rank: ${cuda_env:LOCAL_RANK}
global_rank: ${cuda_env:RANK}
Expand Down Expand Up @@ -176,6 +174,14 @@ optimizer:
instance_key: wrapped_model
pass_type: BY_REFERENCE

gradient_clipper:
component_key: gradient_clipper
variant_key: fsdp_logging_only
config:
wrapped_model:
instance_key: wrapped_model
pass_type: BY_REFERENCE
norm_type: P2_NORM

batch_progress_subscriber:
component_key: progress_subscriber
Expand Down
Loading

0 comments on commit bc7d9f3

Please sign in to comment.