diff --git a/config_files/training/config_mem_map_mamba.yaml b/config_files/training/config_mamba_small.yaml similarity index 60% rename from config_files/training/config_mem_map_mamba.yaml rename to config_files/training/config_mamba_small.yaml index 67950382..c11b3507 100644 --- a/config_files/training/config_mem_map_mamba.yaml +++ b/config_files/training/config_mamba_small.yaml @@ -1,5 +1,6 @@ settings: experiment_id: ${modalities_env:experiment_id} + config_file_path: ${modalities_env:config_file_path} referencing_keys: sample_key: input_ids target_key: target_ids @@ -7,13 +8,14 @@ settings: training: callback_interval_in_samples: 32768 global_num_training_samples: 2048 - global_num_seen_samples: 0 + global_num_seen_steps: 0 activation_checkpointing_modules: [] gradient_acc_steps: 1 - local_train_micro_batch_size: 16 - sequence_length: 4096 - gradient_clipping: - mode: NONE + local_train_micro_batch_size: 1 + sequence_length: 256 + global_training_log_interval_in_steps: 8 + global_checkpointing_interval_in_steps: 8 + global_evaluation_interval_in_steps: 8 cuda_env: local_rank: ${cuda_env:LOCAL_RANK} global_rank: ${cuda_env:RANK} @@ -21,7 +23,6 @@ settings: paths: checkpointing_path: data/checkpoints - collate_fn: component_key: collate_fn variant_key: gpt_2_llm_collator @@ -31,11 +32,11 @@ collate_fn: train_dataset: component_key: dataset - variant_key: packed_mem_map_dataset_megatron + variant_key: packed_mem_map_dataset_continuous config: - raw_data_path: /raid/s3/opengptx/max_lue/modalities/data/sample_datasets/redpajama_v2/mem_map/redpyjama_v2_default_DE_num_docs_1024/redpyjama_v2_default_DE_num_docs_1024.pbin + raw_data_path: ./data/lorem_ipsum.pbin block_size: ${settings.training.sequence_length} - sample_key: ${settings.referencing_keys.sample_key} + sample_key: ${settings.referencing_keys.sample_key} train_dataloader: component_key: data_loader @@ -68,14 +69,6 @@ train_dataloader: instance_key: collate_fn pass_type: BY_REFERENCE -val_dataset: - component_key: dataset - variant_key: packed_mem_map_dataset_megatron - config: - raw_data_path: /raid/s3/opengptx/max_lue/modalities/data/sample_datasets/redpajama_v2/mem_map/redpyjama_v2_default_DE_num_docs_1024/redpyjama_v2_default_DE_num_docs_1024.pbin - block_size: ${settings.training.sequence_length} - sample_key: ${settings.referencing_keys.sample_key} - val_dataloader: component_key: data_loader variant_key: default @@ -85,13 +78,13 @@ val_dataloader: shuffle: false dataloader_tag: "val" dataset: - instance_key: val_dataset + instance_key: train_dataset pass_type: BY_REFERENCE batch_sampler: component_key: batch_sampler variant_key: default config: - batch_size: ${settings.training.local_train_micro_batch_size} + batch_size: 4 drop_last: true sampler: component_key: sampler @@ -99,9 +92,40 @@ val_dataloader: config: rank: ${settings.cuda_env.global_rank} num_replicas: ${settings.cuda_env.world_size} - shuffle: true + shuffle: false dataset: - instance_key: val_dataset + instance_key: train_dataset + pass_type: BY_REFERENCE + collate_fn: + instance_key: collate_fn + pass_type: BY_REFERENCE + +test_dataloader: + component_key: data_loader + variant_key: default + config: + num_workers: 2 + pin_memory: true + shuffle: false + dataloader_tag: "test" + dataset: + instance_key: train_dataset + pass_type: BY_REFERENCE + batch_sampler: + component_key: batch_sampler + variant_key: default + config: + batch_size: 2 + drop_last: true + sampler: + component_key: sampler + variant_key: distributed_sampler + config: + rank: 
${settings.cuda_env.global_rank} + num_replicas: ${settings.cuda_env.world_size} + shuffle: false + dataset: + instance_key: train_dataset pass_type: BY_REFERENCE collate_fn: instance_key: collate_fn @@ -110,41 +134,33 @@ val_dataloader: eval_dataloaders: - instance_key: val_dataloader pass_type: BY_REFERENCE + - instance_key: test_dataloader + pass_type: BY_REFERENCE -checkpointing: - component_key: checkpointing +checkpoint_saving: + component_key: checkpoint_saving variant_key: default config: - checkpointing_strategy: - component_key: checkpointing_strategy + checkpoint_saving_strategy: + component_key: checkpoint_saving_strategy variant_key: save_k_most_recent_checkpoints_strategy config: k: -1 # -1 to save all checkpoints - checkpointing_execution: - component_key: checkpointing_execution - variant_key: fsdp_to_disc_checkpointing + checkpoint_saving_execution: + component_key: checkpoint_saving_execution + variant_key: fsdp config: checkpoint_path: ${settings.paths.checkpointing_path} global_rank: ${settings.cuda_env.global_rank} experiment_id: ${settings.experiment_id} - mixed_precision_settings: BF_16 - sharding_strategy: FULL_SHARD - block_names: [ MambaBlock ] -model: - component_key: model - variant_key: mamba +# resolving class types via different enums sucks... +loss_fn: + component_key: loss + variant_key: clm_cross_entropy_loss config: - d_model: 16 - n_layer: 2 - vocab_size: 50257 - rms_norm: True - ssm_cfg: {} - residual_in_fp32: True - fused_add_norm: True - pad_vocab_size_multiple: 8 - tie_embeddings: True - prediction_key: logits + target_key: ${settings.referencing_keys.target_key} + prediction_key: ${settings.referencing_keys.prediction_key} wrapped_model: component_key: model @@ -158,6 +174,42 @@ wrapped_model: sharding_strategy: FULL_SHARD block_names: [ MambaBlock ] +model: + component_key: model + variant_key: mamba + config: + d_model: 768 + n_layer: 24 + vocab_size: 50257 + rms_norm: true + residual_in_fp32: true + fused_add_norm: true + pad_vocab_size_multiple: 8 + tie_embeddings: true + prediction_key: ${settings.referencing_keys.prediction_key} + sample_key: ${settings.referencing_keys.sample_key} + seed: null + dtype: null + initializer_cfg: {} + num_last_tokens: 0 + inference_params: {} + mixer_model_config: + norm_epsilon: 1e-5 + device: null + mamba_block_config: + d_state: 16 + d_conv: 4 + expand: 2 + dt_rank: auto + dt_min: 0.001 + dt_max: 0.1 + dt_init: random + dt_scale: 1.0 + dt_init_floor: 1e-4 + conv_bias: true + bias: false + use_fast_path: true + scheduler: component_key: scheduler variant_key: onecycle_lr @@ -168,41 +220,43 @@ scheduler: max_lr: 6e-4 div_factor: 10 final_div_factor: 1 - total_steps: 64 + total_steps: 16 pct_start: 0.01 anneal_strategy: cos -loss_fn: - component_key: loss - variant_key: clm_cross_entropy_loss - config: - target_key: ${settings.referencing_keys.target_key} - prediction_key: ${settings.referencing_keys.prediction_key} - optimizer: component_key: optimizer variant_key: adam_w config: lr: 0.0001 - betas: [ 0.9, 0.95 ] + betas: [0.9, 0.95] eps: 1e-8 weight_decay: 1e-1 wrapped_model: instance_key: wrapped_model pass_type: BY_REFERENCE +gradient_clipper: + component_key: gradient_clipper + variant_key: fsdp + config: + wrapped_model: + instance_key: wrapped_model + pass_type: BY_REFERENCE + norm_type: P2_NORM + max_norm: 1.0 + batch_progress_subscriber: component_key: progress_subscriber variant_key: rich config: local_rank: ${settings.cuda_env.local_rank} - world_size: ${settings.cuda_env.world_size} - 
global_num_seen_samples: ${settings.training.global_num_seen_samples} + global_num_seen_steps: ${settings.training.global_num_seen_steps} train_dataloader: instance_key: train_dataloader pass_type: BY_REFERENCE eval_dataloaders: - - instance_key: val_dataloader + instance_key: eval_dataloaders pass_type: BY_REFERENCE @@ -211,7 +265,8 @@ evaluation_subscriber: variant_key: wandb config: local_rank: ${settings.cuda_env.local_rank} - project: modalities + project: modalities_lorem_ipsum mode: ONLINE experiment_id: ${settings.experiment_id} - directory: "." \ No newline at end of file + directory: "." + config_file_path: ${settings.config_file_path} \ No newline at end of file diff --git a/config_files/training/config_mem_map_mamba_overfitting.yaml b/config_files/training/config_mem_map_mamba_overfitting.yaml deleted file mode 100644 index b1d7d317..00000000 --- a/config_files/training/config_mem_map_mamba_overfitting.yaml +++ /dev/null @@ -1,247 +0,0 @@ -settings: - experiment_id: ${modalities_env:experiment_id} - referencing_keys: - sample_key: input_ids - target_key: target_ids - prediction_key: logits - training: - global_training_log_interval_in_steps: 10 - global_checkpointing_interval_in_steps: 1000 - global_evaluation_interval_in_steps: 64 - global_num_seen_steps: 0 - activation_checkpointing_modules: [] - gradient_acc_steps: 1 - local_train_micro_batch_size: 4 - sequence_length: 2048 - gradient_clipping: - mode: p2_norm - threshold: 1.0 - cuda_env: - local_rank: ${cuda_env:LOCAL_RANK} - global_rank: ${cuda_env:RANK} - world_size: ${cuda_env:WORLD_SIZE} - paths: - checkpointing_path: data/checkpoints - -cuda_env: - local_rank: ${cuda_env:LOCAL_RANK} - global_rank: ${cuda_env:RANK} - world_size: ${cuda_env:WORLD_SIZE} - -collate_fn: - component_key: collate_fn - variant_key: gpt_2_llm_collator - config: - sample_key: ${settings.referencing_keys.sample_key} - target_key: ${settings.referencing_keys.target_key} - -train_dataset: - component_key: dataset - variant_key: packed_mem_map_dataset_continuous - config: - raw_data_path: /raid/s3/opengptx/user/richard-rutmann/projects/Modalities/modalities/data/data_overfitting/data_overfitting_en.pbin - block_size: ${settings.training.sequence_length} - sample_key: ${settings.referencing_keys.sample_key} - -val_dataset: - component_key: dataset - variant_key: packed_mem_map_dataset_continuous - config: - raw_data_path: /raid/s3/opengptx/user/richard-rutmann/projects/Modalities/modalities/data/data_overfitting/data_overfitting_en.pbin - block_size: ${settings.training.sequence_length} - sample_key: ${settings.referencing_keys.sample_key} - -train_dataloader: - component_key: data_loader - variant_key: repeating_data_loader - config: - reshuffle_after_epoch: false - num_epochs: 1 # 100 epochs - dataloader: - component_key: data_loader - variant_key: default - config: - num_workers: 2 - pin_memory: true - shuffle: false - dataloader_tag: train - dataset: - instance_key: train_dataset - pass_type: BY_REFERENCE - batch_sampler: - component_key: batch_sampler - variant_key: default - config: - batch_size: ${settings.training.local_train_micro_batch_size} - drop_last: true - sampler: - component_key: sampler - variant_key: distributed_sampler - config: - rank: ${settings.cuda_env.global_rank} - num_replicas: ${settings.cuda_env.world_size} - shuffle: true - dataset: - instance_key: train_dataset - pass_type: BY_REFERENCE - collate_fn: - instance_key: collate_fn - pass_type: BY_REFERENCE - -val_dataloader: - component_key: data_loader - variant_key: 
default - config: - num_workers: 2 - pin_memory: true - shuffle: false - dataloader_tag: val - dataset: - instance_key: val_dataset - pass_type: BY_REFERENCE - batch_sampler: - component_key: batch_sampler - variant_key: default - config: - batch_size: ${settings.training.local_train_micro_batch_size} - drop_last: true - sampler: - component_key: sampler - variant_key: distributed_sampler - config: - rank: ${settings.cuda_env.global_rank} - num_replicas: ${settings.cuda_env.world_size} - shuffle: false - dataset: - instance_key: val_dataset - pass_type: BY_REFERENCE - collate_fn: - instance_key: collate_fn - pass_type: BY_REFERENCE - -eval_dataloaders: - - instance_key: val_dataloader - pass_type: BY_REFERENCE - - -checkpointing: - component_key: checkpointing - variant_key: default - config: - checkpointing_strategy: - component_key: checkpointing_strategy - variant_key: save_k_most_recent_checkpoints_strategy - config: - k: -1 # -1 to save all checkpoints - checkpointing_execution: - component_key: checkpointing_execution - variant_key: fsdp_to_disc_checkpointing - config: - checkpoint_path: ${settings.paths.checkpointing_path} - global_rank: ${settings.cuda_env.global_rank} - experiment_id: ${settings.experiment_id} - mixed_precision_settings: BF_16 - sharding_strategy: FULL_SHARD - block_names: [ MambaBlock ] - -wrapped_model: - component_key: model - variant_key: fsdp_wrapped - config: - model: - instance_key: model - pass_type: BY_REFERENCE - sync_module_states: true - mixed_precision_settings: BF_16 - sharding_strategy: FULL_SHARD - block_names: [MambaBlock] - -model: - component_key: model - variant_key: mamba - config: - d_model: 768 - n_layer: 24 - vocab_size: 50257 - rms_norm: true - residual_in_fp32: true - fused_add_norm: true - pad_vocab_size_multiple: 8 - tie_embeddings: true - prediction_key: ${settings.referencing_keys.prediction_key} - sample_key: ${settings.referencing_keys.sample_key} - seed: null - dtype: null - initializer_cfg: {} - num_last_tokens: 0 - inference_params: {} - mixer_model_config: - norm_epsilon: 1e-5 - device: null - mamba_block_config: - d_state: 16 - d_conv: 4 - expand: 2 - dt_rank: auto - dt_min: 0.001 - dt_max: 0.1 - dt_init: random - dt_scale: 1.0 - dt_init_floor: 1e-4 - conv_bias: true - bias: false - use_fast_path: true - - -loss_fn: - component_key: loss - variant_key: clm_cross_entropy_loss - config: - target_key: ${settings.referencing_keys.target_key} - prediction_key: ${settings.referencing_keys.prediction_key} - -optimizer: - component_key: optimizer - variant_key: adam_w - config: - lr: 0.0001 - betas: [0.9, 0.95] - eps: 1e-8 - weight_decay: 1e-1 - wrapped_model: - instance_key: wrapped_model - pass_type: BY_REFERENCE - -scheduler: - component_key: scheduler - variant_key: dummy_lr - config: - optimizer: - instance_key: optimizer - pass_type: BY_REFERENCE - - -batch_progress_subscriber: - component_key: progress_subscriber - variant_key: rich - config: - local_rank: ${settings.cuda_env.local_rank} - world_size: ${settings.cuda_env.world_size} - global_num_seen_steps: ${settings.training.global_num_seen_steps} - train_dataloader: - instance_key: train_dataloader - pass_type: BY_REFERENCE - eval_dataloaders: - - instance_key: val_dataloader - pass_type: BY_REFERENCE - - -evaluation_subscriber: - component_key: results_subscriber - variant_key: wandb - config: - local_rank: ${settings.cuda_env.local_rank} - project: modalities - mode: ONLINE - experiment_id: ${settings.experiment_id} - directory: "." 
\ No newline at end of file diff --git a/config_files/training/config_mem_map_mamba_small_scale.yaml b/config_files/training/config_mem_map_mamba_small_scale.yaml deleted file mode 100644 index a056f39e..00000000 --- a/config_files/training/config_mem_map_mamba_small_scale.yaml +++ /dev/null @@ -1,241 +0,0 @@ -settings: - experiment_id: ${modalities_env:experiment_id} - referencing_keys: - sample_key: input_ids - target_key: target_ids - prediction_key: logits - training: - global_training_log_interval_in_steps: 10 - global_checkpointing_interval_in_steps: 1000 - global_evaluation_interval_in_steps: 64 - global_num_seen_steps: 0 - activation_checkpointing_modules: [] - gradient_acc_steps: 1 - local_train_micro_batch_size: 4 - sequence_length: 0 # TODO: Is sequence_length used in training? - gradient_clipping: - mode: p2_norm - threshold: 1.0 - cuda_env: - local_rank: ${cuda_env:LOCAL_RANK} - global_rank: ${cuda_env:RANK} - world_size: ${cuda_env:WORLD_SIZE} - paths: - checkpointing_path: data/checkpoints - -collate_fn: - component_key: collate_fn - variant_key: gpt_2_llm_collator - config: - sample_key: ${settings.referencing_keys.sample_key} - target_key: ${settings.referencing_keys.target_key} - -train_dataset: - component_key: dataset - variant_key: packed_mem_map_dataset_continuous - config: - raw_data_path: /raid/fhgiais/opengptx/michaelf/git_repos/modalities/data-temp/en/modalities/2048/train_2048.pbin - block_size: ${settings.training.sequence_length} - sample_key: ${settings.referencing_keys.sample_key} - -val_dataset: - component_key: dataset - variant_key: packed_mem_map_dataset_continuous - config: - raw_data_path: /raid/fhgiais/opengptx/michaelf/git_repos/modalities/data-temp/en/modalities/2048/valid_2048.pbin - block_size: ${settings.training.sequence_length} - sample_key: ${settings.referencing_keys.sample_key} - -train_dataloader: - component_key: data_loader - variant_key: repeating_data_loader - config: - reshuffle_after_epoch: false - num_epochs: 1 # 100 epochs - dataloader: - component_key: data_loader - variant_key: default - config: - num_workers: 2 - pin_memory: true - shuffle: false - dataloader_tag: train - dataset: - instance_key: train_dataset - pass_type: BY_REFERENCE - batch_sampler: - component_key: batch_sampler - variant_key: default - config: - batch_size: ${settings.training.local_train_micro_batch_size} - drop_last: true - sampler: - component_key: sampler - variant_key: distributed_sampler - config: - rank: ${settings.cuda_env.global_rank} - num_replicas: ${settings.cuda_env.world_size} - shuffle: true - dataset: - instance_key: train_dataset - pass_type: BY_REFERENCE - collate_fn: - instance_key: collate_fn - pass_type: BY_REFERENCE - -val_dataloader: - component_key: data_loader - variant_key: default - config: - num_workers: 2 - pin_memory: true - shuffle: false - dataloader_tag: val - dataset: - instance_key: val_dataset - pass_type: BY_REFERENCE - batch_sampler: - component_key: batch_sampler - variant_key: default - config: - batch_size: ${settings.training.local_train_micro_batch_size} - drop_last: true - sampler: - component_key: sampler - variant_key: distributed_sampler - config: - rank: ${settings.cuda_env.global_rank} - num_replicas: ${settings.cuda_env.world_size} - shuffle: false - dataset: - instance_key: val_dataset - pass_type: BY_REFERENCE - collate_fn: - instance_key: collate_fn - pass_type: BY_REFERENCE - -eval_dataloaders: - - instance_key: val_dataloader - pass_type: BY_REFERENCE - - -checkpointing: - component_key: checkpointing - 
variant_key: default - config: - checkpointing_strategy: - component_key: checkpointing_strategy - variant_key: save_k_most_recent_checkpoints_strategy - config: - k: 3 # -1 to save all checkpoints - checkpointing_execution: - component_key: checkpointing_execution - variant_key: fsdp_to_disc_checkpointing - config: - checkpoint_path: ${settings.paths.checkpointing_path} - global_rank: ${settings.cuda_env.global_rank} - experiment_id: ${settings.experiment_id} - mixed_precision_settings: BF_16 - sharding_strategy: FULL_SHARD - block_names: [ MambaBlock ] - -wrapped_model: - component_key: model - variant_key: fsdp_wrapped - config: - model: - instance_key: model - pass_type: BY_REFERENCE - sync_module_states: true - mixed_precision_settings: BF_16 - sharding_strategy: FULL_SHARD - block_names: [ MambaBlock ] - -model: - component_key: model - variant_key: mamba - config: - d_model: 768 - n_layer: 24 - vocab_size: 50257 - rms_norm: true - residual_in_fp32: true - fused_add_norm: true - pad_vocab_size_multiple: 8 - tie_embeddings: true - prediction_key: ${settings.referencing_keys.prediction_key} - sample_key: ${settings.referencing_keys.sample_key} - seed: null - dtype: null - initializer_cfg: {} - num_last_tokens: 0 - inference_params: {} - mixer_model_config: - norm_epsilon: 1e-5 - device: null - mamba_block_config: - d_state: 16 - d_conv: 4 - expand: 2 - dt_rank: auto - dt_min: 0.001 - dt_max: 0.1 - dt_init: random - dt_scale: 1.0 - dt_init_floor: 1e-4 - conv_bias: true - bias: false - use_fast_path: true - -loss_fn: - component_key: loss - variant_key: clm_cross_entropy_loss - config: - target_key: ${settings.referencing_keys.target_key} - prediction_key: ${settings.referencing_keys.prediction_key} - -optimizer: - component_key: optimizer - variant_key: adam_w - config: - lr: 0.0001 - betas: [ 0.9, 0.95 ] - eps: 1e-8 - weight_decay: 1e-1 - wrapped_model: - instance_key: wrapped_model - pass_type: BY_REFERENCE - -scheduler: - component_key: scheduler - variant_key: dummy_lr - config: - optimizer: - instance_key: optimizer - pass_type: BY_REFERENCE - - -batch_progress_subscriber: - component_key: progress_subscriber - variant_key: rich - config: - local_rank: ${settings.cuda_env.local_rank} - world_size: ${settings.cuda_env.world_size} - global_num_seen_steps: ${settings.training.global_num_seen_steps} - train_dataloader: - instance_key: train_dataloader - pass_type: BY_REFERENCE - eval_dataloaders: - - instance_key: val_dataloader - pass_type: BY_REFERENCE - - -evaluation_subscriber: - component_key: results_subscriber - variant_key: wandb - config: - local_rank: ${settings.cuda_env.local_rank} - project: modalities - mode: ONLINE - experiment_id: ${settings.experiment_id} - directory: "." 
\ No newline at end of file diff --git a/src/modalities/__main__.py b/src/modalities/__main__.py index 0d7f2065..fa4e7bb3 100644 --- a/src/modalities/__main__.py +++ b/src/modalities/__main__.py @@ -12,6 +12,7 @@ from modalities.activation_checkpointing import apply_activation_checkpointing_inplace from modalities.batch import EvaluationResultBatch +from modalities.checkpointing.checkpoint_conversion import CheckpointConversion from modalities.config.component_factory import ComponentFactory from modalities.config.config import ProcessGroupBackendType, load_app_config_dict from modalities.config.instantiation_models import ( @@ -28,6 +29,7 @@ from modalities.logging_broker.messages import BatchProgressUpdate, MessageTypes from modalities.logging_broker.publisher import MessagePublisher from modalities.logging_broker.subscriber import MessageSubscriberIF +from modalities.models.huggingface_adapters.hf_adapter import HFModelAdapter from modalities.registry.components import COMPONENTS from modalities.registry.registry import Registry from modalities.running_env.cuda_env import CudaEnv @@ -66,6 +68,34 @@ def entry_point_generate_text(config_file_path: FilePath): generate_text(config_file_path) +@main.command(name="convert_pytorch_to_hf_checkpoint") +@click.option( + "--config_file_path", + type=click_pathlib.Path(exists=True), + required=True, + help="Path to config of model checkpoint.", +) +@click.option( + "--output_hf_checkpoint_dir", + type=click_pathlib.Path(exists=False), + required=True, + help="Converted HF checkpoint will be written to this directory.", +) +@click.option( + "--prediction_key", + type=str, + required=True, + help="The key in the models output, where one can find the logits.", +) +def entry_point_convert_pytorch_to_hf_checkpoint( + config_file_path: Path, output_hf_checkpoint_dir: Path, prediction_key: str +) -> HFModelAdapter: + cp = CheckpointConversion(config_file_path, output_hf_checkpoint_dir) + hf_model = cp.convert_pytorch_to_hf_checkpoint(prediction_key=prediction_key) + print(f"Model was successfully converted and saved to {output_hf_checkpoint_dir}") + return hf_model + + @main.group(name="data") def data(): """ diff --git a/src/modalities/checkpointing/checkpoint_conversion.py b/src/modalities/checkpointing/checkpoint_conversion.py new file mode 100644 index 00000000..3197fb5f --- /dev/null +++ b/src/modalities/checkpointing/checkpoint_conversion.py @@ -0,0 +1,24 @@ +from pathlib import Path + +from modalities.config.config import load_app_config_dict +from modalities.models.huggingface_adapters.hf_adapter import HFModelAdapterConfig, HFModelAdapter + + +class CheckpointConversion: + + def __init__( + self, + config_file_path: Path, + output_hf_checkpoint_dir: Path, + ): + self.output_hf_checkpoint_dir = output_hf_checkpoint_dir + if not config_file_path.exists(): + raise ValueError(f"Could not find {config_file_path}.") + + self.config_dict = load_app_config_dict(config_file_path) + + def convert_pytorch_to_hf_checkpoint(self, prediction_key: str) -> HFModelAdapter: + config = HFModelAdapterConfig(config=self.config_dict) + hf_model = HFModelAdapter(config=config, prediction_key=prediction_key, load_checkpoint=True) + hf_model.save_pretrained(self.output_hf_checkpoint_dir, safe_serialization=False) + return hf_model diff --git a/src/modalities/exceptions.py b/src/modalities/exceptions.py index 07e344d5..e5a98e75 100644 --- a/src/modalities/exceptions.py +++ b/src/modalities/exceptions.py @@ -16,3 +16,7 @@ class RunningEnvError(Exception): class 
TimeRecorderStateError(Exception): pass + + +class ConfigError(Exception): + pass \ No newline at end of file diff --git a/src/modalities/models/huggingface/huggingface_models.py b/src/modalities/models/huggingface/huggingface_model.py similarity index 99% rename from src/modalities/models/huggingface/huggingface_models.py rename to src/modalities/models/huggingface/huggingface_model.py index b980e671..f6cec284 100644 --- a/src/modalities/models/huggingface/huggingface_models.py +++ b/src/modalities/models/huggingface/huggingface_model.py @@ -8,6 +8,7 @@ from modalities.config.lookup_enum import LookupEnum from modalities.models.model import NNModel + # Huggingface Model dependencies # # ModuleUtilsMixin diff --git a/tests/checkpointing/torch/__init__.py b/src/modalities/models/huggingface_adapters/__init__.py similarity index 100% rename from tests/checkpointing/torch/__init__.py rename to src/modalities/models/huggingface_adapters/__init__.py diff --git a/src/modalities/models/huggingface_adapters/hf_adapter.py b/src/modalities/models/huggingface_adapters/hf_adapter.py new file mode 100644 index 00000000..1ee31f9d --- /dev/null +++ b/src/modalities/models/huggingface_adapters/hf_adapter.py @@ -0,0 +1,93 @@ +import json +from dataclasses import dataclass +from pathlib import PosixPath +from typing import Any, Dict, Optional, Tuple, Union, List + +import torch +from transformers import PreTrainedModel, PretrainedConfig +from transformers.utils import ModelOutput + +from modalities.exceptions import ConfigError +from modalities.models.model import NNModel +from modalities.models.utils import get_model_from_config, ModelTypeEnum + + +class HFModelAdapterConfig(PretrainedConfig): + model_type = "modalities" + + def __init__(self, **kwargs): + super().__init__(**kwargs) + # self.config is added by the super class via kwargs + if self.config is None: + raise ConfigError("Config is not passed in HFModelAdapterConfig.") + # since the config will be saved to json and json can't handle posixpaths, we need to convert them to strings + self._convert_posixpath_to_str(data_to_be_formatted=self.config) + + def to_json_string(self, use_diff: bool = True) -> str: + json_dict = {"config": self.config.copy(), "model_type": self.model_type} + return json.dumps(json_dict) + + def _convert_posixpath_to_str( + self, data_to_be_formatted: Union[Dict[str, Any], List[Any], PosixPath, Any] + ) -> Union[Dict[str, Any], List[Any], PosixPath, Any]: + """ + Recursively iterate and convert PosixPath values to strings. 
+ """ + if isinstance(data_to_be_formatted, dict): + for key, value in data_to_be_formatted.items(): + data_to_be_formatted[key] = self._convert_posixpath_to_str(data_to_be_formatted=value) + elif isinstance(data_to_be_formatted, list): + for i in range(len(data_to_be_formatted)): + data_to_be_formatted[i] = self._convert_posixpath_to_str(data_to_be_formatted=data_to_be_formatted[i]) + elif isinstance(data_to_be_formatted, PosixPath): + return str(data_to_be_formatted) + return data_to_be_formatted + + +class HFModelAdapter(PreTrainedModel): + config_class = HFModelAdapterConfig + + def __init__(self, config: HFModelAdapterConfig, prediction_key: str, load_checkpoint: bool = False, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.prediction_key = prediction_key + if load_checkpoint: + self.model: NNModel = get_model_from_config(config.config, model_type=ModelTypeEnum.CHECKPOINTED_MODEL) + else: + self.model: NNModel = get_model_from_config(config.config, model_type=ModelTypeEnum.MODEL) + + def forward( + self, + input_ids: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + return_dict: Optional[bool] = False, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + ): + # These parameters are required by HuggingFace. We do not use them and hence don't implement them. + if output_attentions or output_hidden_states: + raise NotImplementedError + model_input = {"input_ids": input_ids, "attention_mask": attention_mask} + model_forward_output: Dict[str, torch.Tensor] = self.model.forward(model_input) + if return_dict: + return ModalitiesModelOutput(**model_forward_output) + else: + return model_forward_output[self.prediction_key] + + def prepare_inputs_for_generation( + self, input_ids: torch.LongTensor, attention_mask: torch.LongTensor = None, **kwargs + ) -> Dict[str, Any]: + """ + Implement in subclasses of :class:`~transformers.PreTrainedModel` for custom behavior to prepare inputs in the + generate method. 
+ """ + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + } + + +@dataclass +class ModalitiesModelOutput(ModelOutput): + logits: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None diff --git a/src/modalities/models/utils.py b/src/modalities/models/utils.py new file mode 100644 index 00000000..df6b04d2 --- /dev/null +++ b/src/modalities/models/utils.py @@ -0,0 +1,30 @@ +from typing import Dict + +from pydantic import BaseModel +from enum import Enum + +from modalities.config.component_factory import ComponentFactory +from modalities.config.pydanctic_if_types import PydanticPytorchModuleType +from modalities.registry.components import COMPONENTS +from modalities.registry.registry import Registry + +class ModelTypeEnum(Enum): + MODEL = "model" + CHECKPOINTED_MODEL = "checkpointed_model" + +def get_model_from_config(config: Dict, model_type: ModelTypeEnum): + registry = Registry(COMPONENTS) + component_factory = ComponentFactory(registry=registry) + + # create the pydantic config for the component factory dynamically based on model_type + if model_type.value == "model": + class PydanticConfig(BaseModel): + model: PydanticPytorchModuleType + elif model_type.value == "checkpointed_model": + class PydanticConfig(BaseModel): + checkpointed_model: PydanticPytorchModuleType + else: + raise NotImplementedError() + + components = component_factory.build_components(config_dict=config, components_model_type=PydanticConfig) + return getattr(components, model_type.value) diff --git a/src/modalities/registry/components.py b/src/modalities/registry/components.py index 62fafb50..fab7726d 100644 --- a/src/modalities/registry/components.py +++ b/src/modalities/registry/components.py @@ -62,7 +62,7 @@ from modalities.models.components.layer_norms import LayerNormConfig, RMSLayerNorm, RMSLayerNormConfig from modalities.models.gpt2.collator import GPT2LLMCollateFn from modalities.models.gpt2.gpt2_model import GPT2LLM, GPT2LLMConfig -from modalities.models.huggingface.huggingface_models import ( +from modalities.models.huggingface.huggingface_model import ( HuggingFacePretrainedModel, HuggingFacePretrainedModelConfig, ) diff --git a/tests/checkpointing/configs_for_testing/gpt2_config_test.yaml b/tests/checkpointing/configs_for_testing/gpt2_config_test.yaml new file mode 100644 index 00000000..52816e25 --- /dev/null +++ b/tests/checkpointing/configs_for_testing/gpt2_config_test.yaml @@ -0,0 +1,63 @@ +model: + component_key: model + variant_key: gpt2 + config: + sample_key: input_ids + poe_type: NOPE + block_size: 256 + prediction_key: logits + vocab_size: 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency + n_layer: 12 + n_head_q: 12 + n_head_kv: 12 + ffn_hidden: 2048 + n_embd: 768 + dropout: 0.0 + bias: true # True: bias in Linears, like GPT-2. 
False: a bit better and faster + attention_config: + qkv_transforms: + - type_hint: RotaryTransform + config: + n_embd: ${model.config.n_embd} + n_head: ${model.config.n_head_q} #it has to be head_q here + seq_length_dim: -2 + activation_type: gelu + weight_init: + mean: 0.0 + std: 0.02 + attention_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${model.config.n_embd} + bias: true + epsilon: 1e-5 + ffn_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${model.config.n_embd} + bias: true + epsilon: 1e-5 + lm_head_norm: + component_key: layer_norm + variant_key: rms_norm + config: + ndim: ${model.config.n_embd} + bias: true + epsilon: 1e-5 + +checkpointed_model: + component_key: model + variant_key: checkpointed + config: + checkpoint_loading: + component_key: checkpoint_loading + variant_key: torch + config: + device: 0 + precision: BF16 + model: + instance_key: model + pass_type: BY_REFERENCE + checkpoint_path: null \ No newline at end of file diff --git a/tests/checkpointing/configs_for_testing/mamba_config_test.yaml b/tests/checkpointing/configs_for_testing/mamba_config_test.yaml new file mode 100644 index 00000000..45574d1d --- /dev/null +++ b/tests/checkpointing/configs_for_testing/mamba_config_test.yaml @@ -0,0 +1,50 @@ +model: + component_key: model + variant_key: mamba + config: + d_model: 768 + n_layer: 24 + vocab_size: 50257 + rms_norm: true + residual_in_fp32: true + fused_add_norm: true + pad_vocab_size_multiple: 8 + tie_embeddings: true + prediction_key: logits + sample_key: input_ids + seed: null + dtype: null + initializer_cfg: {} + num_last_tokens: 0 + inference_params: {} + mixer_model_config: + norm_epsilon: 1e-5 + device: null + mamba_block_config: + d_state: 16 + d_conv: 4 + expand: 2 + dt_rank: auto + dt_min: 0.001 + dt_max: 0.1 + dt_init: random + dt_scale: 1.0 + dt_init_floor: 1e-4 + conv_bias: true + bias: false + use_fast_path: true + +checkpointed_model: + component_key: model + variant_key: checkpointed + config: + checkpoint_loading: + component_key: checkpoint_loading + variant_key: torch + config: + device: 0 + precision: BF16 + model: + instance_key: model + pass_type: BY_REFERENCE + checkpoint_path: null \ No newline at end of file diff --git a/tests/checkpointing/pytorch/__init__.py b/tests/checkpointing/pytorch/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/checkpointing/torch/test_torch_checkpoint_loading.py b/tests/checkpointing/pytorch/test_torch_checkpoint_loading.py similarity index 100% rename from tests/checkpointing/torch/test_torch_checkpoint_loading.py rename to tests/checkpointing/pytorch/test_torch_checkpoint_loading.py diff --git a/tests/checkpointing/test_checkpoint_conversion.py b/tests/checkpointing/test_checkpoint_conversion.py new file mode 100644 index 00000000..a0cbc97c --- /dev/null +++ b/tests/checkpointing/test_checkpoint_conversion.py @@ -0,0 +1,141 @@ +import os +from pathlib import Path, PosixPath + +import pytest +import torch +from transformers import AutoModelForCausalLM, AutoConfig + +from modalities.checkpointing.checkpoint_conversion import CheckpointConversion +from modalities.config.component_factory import ComponentFactory +from modalities.config.config import load_app_config_dict +from modalities.models.huggingface_adapters.hf_adapter import HFModelAdapter, HFModelAdapterConfig +from modalities.models.model import NNModel +from modalities.models.utils import get_model_from_config, ModelTypeEnum +from modalities.registry.components import 
COMPONENTS +from modalities.registry.registry import Registry +from tests.conftest import _ROOT_DIR + + +@pytest.fixture() +def set_env(): + os.environ["LOCAL_RANK"] = "0" + os.environ["RANK"] = "0" + os.environ["WORLD_SIZE"] = "1" + + +@pytest.fixture +def device() -> str: + return "cuda:0" + + +@pytest.fixture() +def component_factory() -> ComponentFactory: + registry = Registry(COMPONENTS) + component_factory = ComponentFactory(registry=registry) + return component_factory + + +@pytest.fixture(params=["gpt2_config_test.yaml", "mamba_config_test.yaml"]) +def config_file_name(request) -> str: + return request.param + + +@pytest.fixture() +def config_file_path(config_file_name: str) -> Path: + config_file_path = _ROOT_DIR / Path("tests/checkpointing/configs_for_testing/" + config_file_name) + return config_file_path + + +@pytest.fixture() +def config_dict(config_file_path: Path) -> dict: + return load_app_config_dict(config_file_path=config_file_path) + + +@pytest.fixture() +def initialized_model(set_env, config_dict: dict) -> NNModel: + return get_model_from_config(config=config_dict, model_type=ModelTypeEnum.MODEL) + + +@pytest.fixture() +def checkpoint_conversion(tmp_path: Path, initialized_model: NNModel, config_file_path: Path) -> CheckpointConversion: + model_file_path = tmp_path / "pytorch_model.bin" + torch.save(initialized_model.state_dict(), model_file_path) + + output_hf_checkpoint_dir = tmp_path / "converted_hf_checkpoint" + checkpoint_conversion = CheckpointConversion( + config_file_path=config_file_path, + output_hf_checkpoint_dir=output_hf_checkpoint_dir, + ) + + # Adding the checkpoint path in tmp folder to the config dict + checkpoint_conversion.config_dict["checkpointed_model"]["config"]["checkpoint_path"] = model_file_path + return checkpoint_conversion + + +@pytest.fixture() +def pytorch_model(checkpoint_conversion: CheckpointConversion) -> NNModel: + return get_model_from_config(config=checkpoint_conversion.config_dict, model_type=ModelTypeEnum.CHECKPOINTED_MODEL) + + +@pytest.fixture() +def hf_model(checkpoint_conversion: CheckpointConversion, prediction_key: str) -> NNModel: + return checkpoint_conversion.convert_pytorch_to_hf_checkpoint(prediction_key=prediction_key) + +@pytest.fixture() +def prediction_key() -> str: + return "logits" + +@pytest.fixture() +def hf_model_from_checkpoint( + checkpoint_conversion: CheckpointConversion, pytorch_model: NNModel, device: str, prediction_key: str +) -> NNModel: + AutoConfig.register(model_type="modalities", config=HFModelAdapterConfig) + AutoModelForCausalLM.register(config_class=HFModelAdapterConfig, model_class=HFModelAdapter) + hf_model_from_checkpoint = AutoModelForCausalLM.from_pretrained( + pretrained_model_name_or_path=checkpoint_conversion.output_hf_checkpoint_dir, + torch_dtype=pytorch_model.lm_head.weight.dtype, + prediction_key=prediction_key + ) + hf_model_from_checkpoint = hf_model_from_checkpoint.to(device) + return hf_model_from_checkpoint + + +@pytest.fixture() +def test_tensor(device: str, size: int = 10) -> torch.Tensor: + test_tensor = torch.randint(size, size=(5, size)) + test_tensor = test_tensor.to(device) + return test_tensor + + +def test_models_before_and_after_conversion_produce_same_output( + device: str, + pytorch_model: NNModel, + hf_model: NNModel, + hf_model_from_checkpoint: NNModel, + test_tensor: torch.Tensor, +): + pytorch_model = put_model_to_eval_mode(model=pytorch_model, device=device) + hf_model = put_model_to_eval_mode(model=hf_model, device=device) + + output_pytorch_model = 
pytorch_model.forward(inputs={"input_ids": test_tensor})["logits"] + output_hf_model = hf_model.forward(input_ids=test_tensor, return_dict=False) + output_hf_model_from_checkpoint = hf_model_from_checkpoint.forward(input_ids=test_tensor, return_dict=False) + + assert (output_hf_model == output_pytorch_model).all() + assert (output_hf_model == output_hf_model_from_checkpoint).all() + + +def put_model_to_eval_mode(model: NNModel, device: str) -> NNModel: + model.eval() + model = model.to(device) + return model + + +def test_models_before_and_after_conversion_are_equal( + pytorch_model: NNModel, + hf_model: NNModel, + hf_model_from_checkpoint: NNModel, +): + for p1, p2, p3 in zip(hf_model.parameters(), pytorch_model.parameters(), hf_model_from_checkpoint.parameters()): + assert torch.equal(p1, p2) + assert torch.equal(p1, p3) diff --git a/tests/models/test_hf_adapter.py b/tests/models/test_hf_adapter.py new file mode 100644 index 00000000..a0791f0e --- /dev/null +++ b/tests/models/test_hf_adapter.py @@ -0,0 +1,32 @@ + +from pathlib import Path +import pytest + +from modalities.models.huggingface_adapters.hf_adapter import HFModelAdapterConfig + + +@pytest.fixture() +def hf_model_adapter_config() -> HFModelAdapterConfig: + return HFModelAdapterConfig(config={}) + + +def test_convert_posixpath_to_str(hf_model_adapter_config: HFModelAdapterConfig): + test_data_to_be_formatted = { + "key1": Path("test/path/1"), + "key2": [ + {"key211": Path("test/path/211"), "key212": 1}, + {"key221": 1, "key222": Path("test/path/222")}, + ], + "key3": 1, + } + expected_result = { + "key1": "test/path/1", + "key2": [ + {"key211": "test/path/211", "key212": 1}, + {"key221": 1, "key222": "test/path/222"}, + ], + "key3": 1, + } + result = hf_model_adapter_config._convert_posixpath_to_str(test_data_to_be_formatted) + assert result == expected_result + \ No newline at end of file
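A minimal usage sketch for the checkpoint-conversion flow added in this diff, assuming the click group in src/modalities/__main__.py is exposed as a `modalities` console entry point; all paths below are placeholders, and the Python loading step mirrors the hf_model_from_checkpoint fixture in tests/checkpointing/test_checkpoint_conversion.py.

    # Step 1 (shell) - hypothetical invocation of the new CLI command; the referenced config
    # must contain a checkpointed_model entry whose checkpoint_path points to the saved weights:
    #
    #   modalities convert_pytorch_to_hf_checkpoint \
    #     --config_file_path path/to/model_config.yaml \
    #     --output_hf_checkpoint_dir path/to/converted_hf_checkpoint \
    #     --prediction_key logits
    #
    # Step 2 (Python) - load the converted checkpoint through the HuggingFace auto classes,
    # using the same adapter registration as in the test suite above.
    from transformers import AutoConfig, AutoModelForCausalLM

    from modalities.models.huggingface_adapters.hf_adapter import HFModelAdapter, HFModelAdapterConfig

    # Register the adapter so the auto classes can resolve model_type "modalities".
    AutoConfig.register(model_type="modalities", config=HFModelAdapterConfig)
    AutoModelForCausalLM.register(config_class=HFModelAdapterConfig, model_class=HFModelAdapter)

    hf_model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path="path/to/converted_hf_checkpoint",  # placeholder output dir
        prediction_key="logits",
    )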