diff --git a/llama2_70b_lora/Dockerfile b/llama2_70b_lora/Dockerfile new file mode 100644 index 000000000..c14813613 --- /dev/null +++ b/llama2_70b_lora/Dockerfile @@ -0,0 +1,8 @@ +ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:24.01-py3 +FROM ${FROM_IMAGE_NAME} + +WORKDIR /workspace/ft-llm +ADD . /workspace/ft-llm + +RUN pip install -r requirements.txt +RUN pip install flash-attn==2.4.1 --no-build-isolation diff --git a/llama2_70b_lora/README.md b/llama2_70b_lora/README.md new file mode 100644 index 000000000..2caddad2f --- /dev/null +++ b/llama2_70b_lora/README.md @@ -0,0 +1,100 @@ +# LoRA benchmark + +LoRA benchmark on GPU (Nvidia A100 80GB). Inspired by [this blog post](https://medium.com/@sourabmangrulkar/falcon-180b-finetuning-using-peft-and-deepspeed-b92643091d99) and [this script](https://github.com/pacman100/DHS-LLM-Workshop/blob/main/chat_assistant/training/train.py). + + +## Setup + +Run the following: +```bash +sudo ./run_docker.sh +cd lora +pip install -r requirements.txt +``` + +> The Docker run command contains `-v /home/regis_huggingface_co/workspace:/root/workspace --workdir /root/workspace`. Feel free to change these flags at your own convenience. + +You will also need to run the following to install flash attention: +``` +pip install flash-attn --no-build-isolation +``` + +> For flash attention, make sure that the following command returns 0: +> ``` +> ninja --version >/dev/null && echo $? +> ``` +> If not, run +> ``` +> pip uninstall -y ninja && pip install ninja +> ``` +> and install `flash-attn` again. +> More information [here](https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#installation-and-features). 
+ +Make sure to have requested permission for downloading Llama2 weights on the Hugging Face Hub: https://huggingface.co/meta-llama/Llama-2-7b-hf +Then, you will need to be connected to your Hugging Face account with a read token by running: +``` +huggingface-cli login +``` +Finally, please install the MLPerf logger: +``` +git clone https://github.com/mlperf/logging.git mlperf-logging +pip install -e mlperf-logging +``` +## Download Data and Model +Data can be downloaded from: +[mlperf drive - train data](https://drive.google.com/file/d/1-JgY1mEafcJ7qhggt6UR3OEKAciIPd5s/view?usp=sharing) +[mlperf drive - validation data](https://drive.google.com/file/d/1jrm6Lacrq49AYv0uB_Qy22xRmfPixQvs/view?usp=sharing) +[mlperf drive - llama-v2 model](https://drive.google.com/drive/folders/1sTeuxkPhwkNPKIPFnOLIYCcK53oB3Ypc?usp=sharing) +By default, the scripts assume the model is at ```./llama-v2-fused-qkv``` and both the train and validation data are under the ```dataset``` folder. + +## Llama2-70B on 8 devices + +Run: +```bash +accelerate launch --config_file configs/default_config.yaml scripts/train.py \ +--model_name meta-llama/Llama-2-70b-hf \ +--dataset_name "tau/scrolls" --dataset_config_name "gov_report" \ +--max_seq_len 8192 \ +--bf16 True \ +--logging_steps 1 \ +--eval_steps 22 \ +--output_dir "/tmp/llama-70b" \ +--per_device_train_batch_size 1 \ +--gradient_accumulation_steps 1 \ +--dataset_text_field "input" \ +--lr_scheduler_type "cosine" \ +--learning_rate 1e-3 \ +--warmup_ratio 0.03 \ +--use_gradient_checkpointing True \ +--use_peft_lora True \ +--lora_r 16 \ +--lora_alpha 32 \ +--lora_dropout 0.1 \ +--max_steps 440 \ +--use_flash_attn \ +--lora_target_modules "q_proj,v_proj,k_proj,o_proj" +``` +where the Accelerate config file is [this one](https://github.com/regisss/lora/blob/main/configs/default_config.yaml). + +> Using flash attention with `--use_flash_attn` is necessary for training on 8k-token sequences. 
+ +Learning curves of such a run can be found here: https://huggingface.co/regisss/test_5/tensorboard + + +## Evaluation + +To run evaluation for summarizing texts, you can run: +- Without LoRA adapter weights: + ``` + python scripts/eval.py --model_name meta-llama/Llama-2-70b-hf --max_new_tokens 900 --seq_length 8192 --do_sample --dataset_name "tau/scrolls" --dataset_config_name "gov_report" + ``` +- With LoRA adapter weights: + ``` + python scripts/eval.py --peft_model_name path_to_my_lora_model --max_new_tokens 900 --seq_length 8192 --do_sample --dataset_name "tau/scrolls" --dataset_config_name "gov_report" + ``` +## expected outcome + +A clean output (train and eval loss) of a singel run with 440 steps can be found under +``` + convergence_example.txt +``` \ No newline at end of file diff --git a/llama2_70b_lora/configs/default_config.yaml b/llama2_70b_lora/configs/default_config.yaml new file mode 100644 index 000000000..e422c0364 --- /dev/null +++ b/llama2_70b_lora/configs/default_config.yaml @@ -0,0 +1,22 @@ +compute_environment: LOCAL_MACHINE +debug: false +deepspeed_config: + gradient_accumulation_steps: 1 + offload_optimizer_device: none + offload_param_device: none + zero3_init_flag: true + zero3_save_16bit_model: true + zero_stage: 3 +distributed_type: DEEPSPEED +downcast_bf16: 'no' +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: 8 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false diff --git a/llama2_70b_lora/convergence_example.txt b/llama2_70b_lora/convergence_example.txt new file mode 100644 index 000000000..9d5c9b218 --- /dev/null +++ b/llama2_70b_lora/convergence_example.txt @@ -0,0 +1,508 @@ + 0%| | 0/440 [00:00 0 + and not state.global_step % (state.eval_steps) == 0 + ): + self.mllogger.event( + "train_loss", + value=state.log_history[-1]["loss"], + metadata={"samples_count": state.log_history[-1]["step"]*self.gbs}, + ) + 
control.should_log = True + + if state.global_step % (state.eval_steps) == 0 and state.global_step > 0: + self.mllogger.end( + constants.BLOCK_STOP, + value="", + metadata={"samples_count": state.log_history[-1]["step"]*self.gbs}, + ) + self.mllogger.event( + constants.EVAL_ACCURACY, + value=state.log_history[-1]["eval_loss"], + metadata={"samples_count": state.log_history[-1]["step"]*self.gbs}, + ) + self.mllogger.start( + constants.BLOCK_START, + value="", + metadata={"samples_count": state.log_history[-1]["step"]}, + ) + control.should_log = True + eval_loss_list = [ + sl["eval_loss"] for sl in state.log_history if "eval_loss" in sl + ] + if eval_loss_list and eval_loss_list[-1] <= self.mllogger.target_eval_loss: + control.should_training_stop = True + self.mllogger.end( + constants.RUN_STOP, + value=eval_loss_list[-1], + metadata={ + "samples_count": state.log_history[-1]["step"]*self.gbs, + "status": "success", + }, + ) + if state.global_step >= state.max_steps: + control.should_training_stop = True + self.mllogger.end( + constants.RUN_STOP, + value=eval_loss_list[-1], + metadata={"samples_count": state.log_history[-1]["step"]*self.gbs, "status": "fail"}, + ) + + return control diff --git a/llama2_70b_lora/scripts/train.py b/llama2_70b_lora/scripts/train.py new file mode 100644 index 000000000..afe09912e --- /dev/null +++ b/llama2_70b_lora/scripts/train.py @@ -0,0 +1,203 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass, field +from typing import Optional + +from datasets import load_dataset +from mlperf_logging_utils import LoraLogger, MLPerfCallback +from transformers import HfArgumentParser, Trainer, TrainingArguments +from utils import create_and_prepare_model, peft_module_casting_to_bf16 + + +@dataclass +class ScriptArguments: + """ + These arguments vary depending on how many GPUs you have, what their capacity and features are, and what size model you want to train. + """ + + local_rank: Optional[int] = field( + default=-1, metadata={"help": "Used for multi-gpu"} + ) + per_device_train_batch_size: Optional[int] = field(default=1) + per_device_eval_batch_size: Optional[int] = field(default=1) + gradient_accumulation_steps: Optional[int] = field(default=1) + learning_rate: Optional[float] = field(default=2e-4) + max_grad_norm: Optional[float] = field(default=0.0) + weight_decay: Optional[float] = field(default=0.001) + lora_alpha: Optional[int] = field(default=32) + lora_dropout: Optional[float] = field(default=0.1, metadata={"help": "lora dropout is a fixed to 0.1 in closed submission"}) + lora_r: Optional[int] = field(default=16, metadata={"help": "lora rank is a fixed to 16 in closed submission"}) + lora_target_modules: Optional[str] = field( + default=None, + metadata={ + "help": "comma separated list of target modules to apply LoRA layers to" + }, + ) + max_seq_length: Optional[int] = field(default=8192) + model_path: Optional[str] = field( + default="./llama-v2-fused-qkv", + metadata={"help": "Path to the model directory."}, + ) + dataset_path: Optional[str] = field( + default="./dataset.npy", + metadata={"help": "The path to the downloaded dataset."}, + ) + config_path: Optional[str] = field( + default="./configs/default_config.yaml", + metadata={"help": "path to model config"}, + ) + num_train_epochs: Optional[int] = field( + 
default=1, + metadata={"help": "The number of training epochs for the reward model."}, + ) + fp16: Optional[bool] = field( + default=False, + metadata={"help": "Enables fp16 training."}, + ) + bf16: Optional[bool] = field( + default=False, + metadata={"help": "Enables bf16 training."}, + ) + gradient_checkpointing: Optional[bool] = field( + default=True, + metadata={"help": "Enables gradient checkpointing."}, + ) + optim: Optional[str] = field( + default="adamw_torch", + metadata={"help": "The optimizer to use."}, + ) + lr_scheduler_type: str = field( + default="cosine", + metadata={ + "help": "Learning rate schedule. Constant a bit better than cosine, and has advantage for analysis" + }, + ) + max_steps: int = field( + default=-1, metadata={"help": "How many optimizer update steps to take"} + ) + warmup_ratio: float = field( + default=0.03, metadata={"help": "Fraction of steps to do a warmup for"} + ) + save_steps: int = field( + default=10, metadata={"help": "Save checkpoint every X updates steps."} + ) + eval_steps: int = field(default=22, metadata={"help": "Eval model every X steps."}) + logging_steps: int = field( + default=10, metadata={"help": "Log every X updates steps."} + ) + target_eval_loss: float = field( + default=0.92, metadata={"help": "target eval loss - NOT FINAL."} + ) + output_dir: str = field( + default="results", metadata={"help": "Where to store the final model."} + ) + use_flash_attn: Optional[bool] = field( + default=True, + metadata={"help": "Enables Flash attention for training."}, + ) + use_peft_lora: Optional[bool] = field( + default=True, + metadata={"help": "Enables PEFT LoRA for training."}, + ) + use_gradient_checkpointing: Optional[bool] = field( + default=True, + metadata={"help": "Enables Gradient Checkpointing."}, + ) + push_to_hub: Optional[bool] = field( + default=False, + metadata={"help": "If True, pushes the model to the HF Hub"}, + ) + num_workers: int = field( + default=4, metadata={"help": "Number of dataset workers to 
use."} + ) + debug: Optional[bool] = field( + default=False, + metadata={ + "help": "If True, tests things like proper saving/loading/logging of model" + }, + ) + dataset_config_name: Optional[str] = field(default="gov_report") + hub_model_id: Optional[str] = field(default=None) + seed: Optional[int] = field(default=42) + + +def main(args): + loralogger = LoraLogger(target_eval_loss=args.target_eval_loss) + training_arguments = TrainingArguments( + output_dir=args.output_dir, + per_device_train_batch_size=args.per_device_train_batch_size, + per_device_eval_batch_size=args.per_device_eval_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + optim=args.optim, + learning_rate=args.learning_rate, + fp16=args.fp16, + bf16=args.bf16, + max_grad_norm=args.max_grad_norm, + weight_decay=args.weight_decay, + warmup_ratio=args.warmup_ratio, + lr_scheduler_type=args.lr_scheduler_type, + num_train_epochs=args.num_train_epochs, + evaluation_strategy="steps", + save_strategy="no", + max_steps=args.max_steps, + eval_steps=args.eval_steps, + save_steps=args.save_steps, + logging_steps=args.logging_steps, + push_to_hub=args.push_to_hub, + gradient_checkpointing=args.use_gradient_checkpointing, + hub_model_id=args.hub_model_id, + report_to="tensorboard", + seed=args.seed, + ) + + model = create_and_prepare_model(args) + model.config.use_cache = False + + # datasets + ## ToDo uncomment once drive goes public + # train_url = "https://drive.google.com/file/d/1-JgY1mEafcJ7qhggt6UR3OEKAciIPd5s/view?usp=sharing" + # eval_url = "https://drive.google.com/file/d/1jrm6Lacrq49AYv0uB_Qy22xRmfPixQvs/view?usp=sharing" + # dataset = load_dataset("parquet", data_files={'train': train_url, 'validation': eval_url}) + dataset = load_dataset( + "parquet", + data_files={ + "train": f"{args.dataset_path}/train-00000-of-00001.parquet", + "validation": f"{args.dataset_path}/validation-00000-of-00001.parquet", + }, + ) + train_dataset, eval_dataset = dataset["train"], 
dataset["validation"] + + trainer = Trainer( + model=model, + args=training_arguments, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + callbacks=[MLPerfCallback(loralogger, len(train_dataset), len(eval_dataset),args.lora_alpha)], + ) + trainer.accelerator.print(f"{trainer.model}") + if args.use_peft_lora: + trainer.model.print_trainable_parameters() + + if args.use_peft_lora: + peft_module_casting_to_bf16(trainer.model, args) + + trainer.train() + + +if __name__ == "__main__": + parser = HfArgumentParser(ScriptArguments) + args = parser.parse_args_into_dataclasses()[0] + main(args) diff --git a/llama2_70b_lora/scripts/utils.py b/llama2_70b_lora/scripts/utils.py new file mode 100644 index 000000000..84821ffb4 --- /dev/null +++ b/llama2_70b_lora/scripts/utils.py @@ -0,0 +1,175 @@ +from functools import partial +from itertools import chain + +import torch +from datasets import load_dataset +from peft import LoraConfig, get_peft_model +from peft.tuners.lora import LoraLayer +from transformers import AutoModelForCausalLM + + +def group_texts(examples, block_size): + # Concatenate all texts. + concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict. + # We could add padding if the model supported it instead of this drop, you can customize this part to your needs. + total_length = (total_length // block_size) * block_size + # Split by chunks of max_len. 
+ result = { + k: [t[i : i + block_size] for i in range(0, total_length, block_size)] + for k, t in concatenated_examples.items() + } + if "labels" not in result: + result["labels"] = result["input_ids"].copy() + return result + + +def create_datasets(tokenizer, args): + dataset = load_dataset( + args.dataset_name, + args.dataset_config_name, + use_auth_token=True, + num_proc=args.num_workers, + ) + train_dataset = dataset["train"] + valid_dataset = dataset["validation"] + column_names = train_dataset.features + + def tokenize_function(example, eval=False): + output_texts = [] + mask_labels_sizes = [] + for i in range(len(example["input"])): + if "gov_report" in args.dataset_config_name: + output_texts.append( + f"### Summarize the following text:\n {example['input'][i]}\n ### Summary:\n {example['output'][i]}{tokenizer.eos_token}" + ) + if eval: + mask_labels_sizes.append( + f"### Summarize the following text:\n {example['input'][i]}\n ### Summary:\n" + ) + else: + output_texts.append( + f"### {example['input'][i]}\n ### The answer is:\n {example['output'][i]}{tokenizer.eos_token}" + ) + + input_ids = tokenizer(output_texts).input_ids + + if eval: + labels_ids = tokenizer(mask_labels_sizes).input_ids + masked_labels = [] + for out, lb in zip(input_ids, labels_ids): + ml = out.copy() + ml[: len(lb)] = [-100] * len(lb) + ml[-1] = -100 + masked_labels.append(ml) + return {"input_ids": input_ids, "labels": masked_labels} + else: + return {"input_ids": input_ids} + + train_dataset = train_dataset.map( + tokenize_function, + batched=True, + num_proc=8, + remove_columns=column_names, + ) + valid_dataset = valid_dataset.map( + partial(tokenize_function, eval=True), + batched=True, + num_proc=2, + remove_columns=column_names, + ) + + def filter_function(example): + to_keep = [] + for i in range(len(example["input_ids"])): + if len(example["input_ids"][i]) > args.max_seq_length: + to_keep.append(False) + else: + to_keep.append(True) + return to_keep + + train_dataset = 
train_dataset.filter( + filter_function, + batched=True, + # with_indices=True, + num_proc=8, + # remove_columns=column_names, + ) + valid_dataset = valid_dataset.filter( + filter_function, + batched=True, + # with_indices=True, + num_proc=2, + # remove_columns=column_names, + ) + print( + f"Before packing, Size of the train set: {len(train_dataset)}. Size of the validation set: {len(valid_dataset)}" + ) + + packing_method = partial(group_texts, block_size=args.max_seq_length) + # Packing + train_dataset = train_dataset.map( + packing_method, + batched=True, + num_proc=8, + ) + valid_dataset = valid_dataset.map( + packing_method, + batched=True, + num_proc=2, + ) + + print( + f"Size of the train set: {len(train_dataset)}. Size of the validation set: {len(valid_dataset)}" + ) + + return train_dataset, valid_dataset + + +def create_and_prepare_model(args): + device_map = None + + model = AutoModelForCausalLM.from_pretrained( + args.model_path, + device_map=device_map, + use_cache=not args.use_gradient_checkpointing, + trust_remote_code=True, + attn_implementation="flash_attention_2", + torch_dtype=torch.bfloat16, + max_position_embeddings=8192, + ) + + peft_config = None + if args.use_peft_lora: + peft_config = LoraConfig( + lora_alpha=args.lora_alpha, + lora_dropout=args.lora_dropout, + r=args.lora_r, + bias="none", + task_type="CAUSAL_LM", + target_modules=( + None + if args.lora_target_modules is None + else args.lora_target_modules.split(",") + ), + ) + if args.use_gradient_checkpointing: + model.gradient_checkpointing_enable() + model = get_peft_model(model, peft_config) + model.print_trainable_parameters() + + return model + + +def peft_module_casting_to_bf16(model, args): + for name, module in model.named_modules(): + if isinstance(module, LoraLayer): + if args.bf16: + module = module.to(torch.bfloat16) + if "norm" in name: + module = module.to(torch.float32) + if any(x in name for x in ["lm_head", "embed_tokens", "wte", "wpe"]): + if hasattr(module, 
"weight"): + if args.bf16 and module.weight.dtype == torch.float32: + module = module.to(torch.bfloat16)