Commit 7e7a429 (1 parent: e5ab2fd). Showing 7 changed files with 423 additions and 3 deletions.
@@ -0,0 +1,209 @@
#!/bin/bash
# Please run this script from the project directory of
# https://github.com/shizhediao/llm-ft
# COMMIT: d5fecf30ba8011067b10cf51fede53a5ab6574e4

# Default arguments (overridable via the command-line options parsed below)
model_name_or_path=gpt2
dataset_path=data/alpaca/train_conversation

# Other optional arguments that can improve memory saving
gradient_checkpointing=True
use_flash_attention=0
gradient_accumulation_steps=1
batch_size=1
block_size=256
per_device_train_batch_size=1
conversation_template=llama2
optim=dummy
learning_rate=1e-5
lr_schedule=cosine
beta1=0.9
beta2=0.999
num_epoch=3
use_deepspeed=1
seed=42

# Safety related arguments
trust_remote_code=0

# Enable model parallelism for multiple GPUs; modify this if you prefer
# customized DeepSpeed zero-redundancy optimization settings
num_gpu=$(python -c "import torch; print(torch.cuda.device_count())")
ds_config_file=configs/ds_config_zero0_no_offload.json
if [[ ${num_gpu} -ge 2 ]]; then
  ds_config_file=configs/ds_config_zero2_no_offload.json
fi

while [[ $# -ge 1 ]]; do
  key="$1"
  case ${key} in
    -m|--model_name_or_path)
      model_name_or_path="$2"
      shift
      ;;
    -d|--dataset_path)
      dataset_path="$2"
      shift
      ;;
    -o|--output_model_path)
      output_dir="$2"
      shift
      ;;
    --lisa_activated_layers)
      lisa_activated_layers="$2"
      shift
      ;;
    --lisa_interval_steps)
      lisa_interval_steps="$2"
      shift
      ;;
    --gradient_checkpointing)
      gradient_checkpointing="$2"
      shift
      ;;
    --deepspeed)
      ds_config_file="$2"
      shift
      ;;
    --use_flash_attention)
      use_flash_attention="$2"
      shift
      ;;
    --gradient_accumulation_steps)
      gradient_accumulation_steps="$2"
      shift
      ;;
    --block_size)
      block_size="$2"
      shift
      ;;
    --conversation_template)
      conversation_template="$2"
      shift
      ;;
    --per_device_train_batch_size|--batch_size)
      per_device_train_batch_size="$2"
      batch_size="$2"
      shift
      ;;
    --trust_remote_code)
      trust_remote_code="$2"
      shift
      ;;
    --run_name)
      run_name="$2"
      shift
      ;;
    --optim)
      optim="$2"
      shift
      ;;
    --lr)
      learning_rate="$2"
      shift
      ;;
    --beta1)
      beta1="$2"
      shift
      ;;
    --beta2)
      beta2="$2"
      shift
      ;;
    -n|--num_epoch)
      num_epoch="$2"
      shift
      ;;
    --lr_schedule)
      lr_schedule="$2"
      shift
      ;;
    --use_deepspeed)
      use_deepspeed="$2"
      shift
      ;;
    --seed)
      seed="$2"
      shift
      ;;
    *)
      echo "error: unknown option \"${key}\"" 1>&2
      exit 1
      ;;
  esac
  shift
done

gpu_id=${CUDA_VISIBLE_DEVICES}
deepspeed_args="--master_port=1103${gpu_id::1} --hostfile configs/hostfile --include localhost:${gpu_id}"

# Route the dummy optimizer through LMFlow's customized-optimizer path;
# otherwise fall back to the standard --optim / adam beta flags.
optim_suffix_args=""
if [ "${optim}" == "dummy" ]; then
  optim_suffix_args="--use_customized_optim 1"
  optim_suffix_args+=" --customized_optim ${optim}"
  optim_suffix_args+=" --optim_dummy_beta1 ${beta1}"
  optim_suffix_args+=" --optim_dummy_beta2 ${beta2}"
else
  optim_suffix_args="--optim ${optim}"
  optim_suffix_args+=" --adam_beta1 ${beta1}"
  optim_suffix_args+=" --adam_beta2 ${beta2}"
fi

# Finetune
exp_id=alpaca_${optim}_lr-${learning_rate}_beta1-${beta1}_beta2-${beta2}_lr-sched-${lr_schedule}_model-$(basename ${model_name_or_path})_batch-size-${batch_size}x${gradient_accumulation_steps}_seed-${seed}
echo "$(date): ${exp_id}..."

tmp_dir=tmp
mkdir -p ${tmp_dir}

# Skip runs that have already been launched (marked below).
prefix=${exp_id}
if [ -f ${tmp_dir}/${prefix}.mark ]; then
  exit 0
fi

trap "rm -f ${tmp_dir}/${prefix}.mark" SIGINT SIGTERM
touch ${tmp_dir}/${prefix}.mark

project_dir=$(cd "$(dirname $0)"/..; pwd)
log_dir=${project_dir}/log/${exp_id}
# Respect -o/--output_model_path if it was provided; otherwise derive from exp_id.
output_dir=${output_dir:-output_models/${exp_id}}
mkdir -p ${output_dir} ${log_dir}

exe="deepspeed ${deepspeed_args}"
if [[ ${use_deepspeed} -eq 0 ]]; then
  exe=python
fi
${exe} examples/finetune.py \
  --model_name_or_path ${model_name_or_path} \
  --trust_remote_code ${trust_remote_code} \
  --dataset_path ${dataset_path} \
  --output_dir ${output_dir} --overwrite_output_dir \
  --conversation_template ${conversation_template} \
  --num_train_epochs ${num_epoch} \
  --learning_rate ${learning_rate} \
  --lr_scheduler_type ${lr_schedule} \
  --disable_group_texts 1 \
  --block_size ${block_size} \
  --per_device_train_batch_size ${per_device_train_batch_size} \
  --bf16 \
  --deepspeed ${ds_config_file} \
  --torch_dtype bfloat16 \
  --run_name ${run_name:-${exp_id}} \
  --validation_split_percentage 0 \
  --logging_steps 1 \
  --do_train \
  --ddp_timeout 72000 \
  --save_steps 5000 \
  --dataloader_num_workers 1 \
  --gradient_checkpointing ${gradient_checkpointing} \
  --use_flash_attention ${use_flash_attention} \
  --gradient_accumulation_steps ${gradient_accumulation_steps} \
  --seed ${seed} \
  ${optim_suffix_args} \
  2> ${log_dir}/train.err \
  | tee ${log_dir}/train.log

# ${PIPESTATUS[0]} holds the exit status of the training command, not of tee.
if [[ ${PIPESTATUS[0]} -ne 0 ]]; then
  echo "$(date): failed"
  rm -f ${tmp_dir}/${prefix}.mark
fi
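For context, a typical invocation of this script might look like the sketch below. It is not part of the commit: the script's file name is not shown in this diff (it is assumed here to be saved as scripts/run_finetune_with_dummy_optim.sh), and the model and dataset paths simply make the script's defaults explicit.

# Hypothetical usage sketch: fine-tune GPT-2 with the dummy optimizer on GPU 0,
# without DeepSpeed. The script name and paths are assumptions, not from the diff.
CUDA_VISIBLE_DEVICES=0 bash scripts/run_finetune_with_dummy_optim.sh \
    --model_name_or_path gpt2 \
    --dataset_path data/alpaca/train_conversation \
    --optim dummy \
    --lr 1e-5 \
    --beta1 0.9 --beta2 0.999 \
    --num_epoch 3 \
    --use_deepspeed 0 \
    --seed 42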
Empty file.
@@ -0,0 +1,80 @@
#!/usr/bin/env python
# coding=utf-8
"""Dummy optimizer."""
from typing import Callable, Iterable, Tuple

import torch
from torch import nn
from torch.optim import Optimizer


class Dummy(Optimizer):
    """
    A dummy optimizer that does nothing.

    Parameters:
        params (:obj:`Iterable[nn.parameter.Parameter]`):
            Iterable of parameters to optimize or dictionaries defining parameter groups.
        lr (:obj:`float`, `optional`, defaults to 0):
            The learning rate to use.
        betas (:obj:`Tuple[float, float]`, `optional`, defaults to (0.9, 0.999)):
            Momentum coefficients; accepted and validated but not used by the update.
        weight_decay (:obj:`float`, `optional`, defaults to 0.0):
            Decoupled weight decay applied in :meth:`step`.
    """

    def __init__(
        self,
        params: Iterable[nn.parameter.Parameter],
        lr: float = 0.,
        betas: Tuple[float, float] = (0.9, 0.999),
        weight_decay: float = 0.0,
    ):
        if lr < 0.0:
            raise ValueError(f"Invalid learning rate: {lr} - should be >= 0.0")
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError(f"Invalid beta parameter: {betas[0]} - should be in [0.0, 1.0)")
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError(f"Invalid beta parameter: {betas[1]} - should be in [0.0, 1.0)")
        defaults = {"lr": lr, "betas": betas, "weight_decay": weight_decay}
        super().__init__(params, defaults)

    @torch.no_grad()
    def step(self, closure: Callable = None):
        """
        Performs a single optimization step.

        Arguments:
            closure (:obj:`Callable`, `optional`): A closure that reevaluates the model and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None:
                    continue
                grad = p.grad
                if grad.is_sparse:
                    raise RuntimeError("Dummy does not support sparse gradients yet")

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state["step"] = 0
                    state["exp_avg"] = torch.zeros_like(p)
                    state["exp_avg2"] = torch.zeros_like(p)

                # v := exp_avg, m := exp_avg2; both are tracked but never
                # applied, since this optimizer is a no-op baseline.
                v, m = state["exp_avg"], state["exp_avg2"]
                beta1, beta2 = group["betas"]
                step_size = group["lr"]

                state["step"] += 1

                # Intentional no-op update: alpha is -0.0, so parameters are
                # left untouched apart from the optional weight decay below.
                p.add_(m, alpha=-0.0)
                if group["weight_decay"] > 0.0:
                    p.add_(p, alpha=(-group["lr"] * group["weight_decay"]))
        return loss
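Since the update is deliberately inert, a quick way to sanity-check the class is to confirm that a step leaves the parameters unchanged when weight decay is zero. The snippet below is a minimal sketch, not part of the commit; it assumes the package is installed (or on PYTHONPATH) so that lmflow.optim.dummy is importable, matching the import added later in this diff.

# Minimal sanity-check sketch (assumes lmflow.optim.dummy is importable).
import torch
from lmflow.optim.dummy import Dummy

model = torch.nn.Linear(4, 2)
before = [p.detach().clone() for p in model.parameters()]

opt = Dummy(model.parameters(), lr=1e-5, betas=(0.9, 0.999), weight_decay=0.0)
loss = model(torch.randn(8, 4)).sum()
loss.backward()
opt.step()       # no-op update: p.add_(m, alpha=-0.0) changes nothing
opt.zero_grad()

after = list(model.parameters())
print(all(torch.equal(b, a) for b, a in zip(before, after)))  # True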
@@ -0,0 +1,5 @@
#!/usr/bin/env python
# coding=utf-8
"""All optimizers."""
from lmflow.optim.dummy import Dummy
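Assuming this file is the optimizer package's __init__.py (the file path itself is not shown in this diff), the re-export lets callers import the class from the package level:

from lmflow.optim import Dummy  # equivalent to lmflow.optim.dummy.Dummy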