Commit

Add customized optimizer support
research4pan committed Jun 21, 2024
1 parent e5ab2fd commit 7e7a429
Showing 7 changed files with 423 additions and 3 deletions.
4 changes: 2 additions & 2 deletions scripts/run_finetune.sh
@@ -74,5 +74,5 @@ deepspeed ${deepspeed_args} \
--ddp_timeout 72000 \
--save_steps 5000 \
--dataloader_num_workers 1 \
| tee ${log_dir}/train.log \
2> ${log_dir}/train.err
> >(tee ${log_dir}/train.log) \
2> >(tee ${log_dir}/train.err >&2)
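This change replaces the old pipe-plus-redirect, where the trailing 2> applied to tee (so the trainer's stderr never reached train.err) and $? reported tee's exit status, with process substitutions that tee stdout and stderr into ${log_dir}/train.log and ${log_dir}/train.err while still echoing both streams to the terminal and leaving the training command's exit status intact.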
209 changes: 209 additions & 0 deletions scripts/run_finetune_with_custom_optim.sh
@@ -0,0 +1,209 @@
#!/bin/bash
# Please run this script under ${project_id} in project directory of
# https://github.com/shizhediao/llm-ft
# COMMIT: d5fecf30ba8011067b10cf51fede53a5ab6574e4

# Parses arguments
model_name_or_path=gpt2
dataset_path=data/alpaca/train_conversation

# Other optional arguments, including memory-saving options and training hyperparameters
gradient_checkpointing=True
use_flash_attention=0
gradient_accumulation_steps=1
batch_size=1
block_size=256
per_device_train_batch_size=1
conversation_template=llama2
optim=dummy
learning_rate=1e-5
lr_schedule=cosine
beta1=0.9
beta2=0.999
num_epoch=3
use_deepspeed=1
seed=42

# Safety related arguments
trust_remote_code=0

# Enable model parallelism for multiple GPUs; modify this if you prefer
# customized DeepSpeed zero-redundancy optimization (ZeRO) settings
num_gpu=$(python -c "import torch; print(torch.cuda.device_count())")
ds_config_file=configs/ds_config_zero0_no_offload.json
if [[ ${num_gpu} -ge 2 ]]; then
  ds_config_file=configs/ds_config_zero2_no_offload.json
fi

while [[ $# -ge 1 ]]; do
  key="$1"
  case ${key} in
    -m|--model_name_or_path)
      model_name_or_path="$2"
      shift
      ;;
    -d|--dataset_path)
      dataset_path="$2"
      shift
      ;;
    -o|--output_model_path)
      output_dir="$2"
      shift
      ;;
    --lisa_activated_layers)
      lisa_activated_layers="$2"
      shift
      ;;
    --lisa_interval_steps)
      lisa_interval_steps="$2"
      shift
      ;;
    --gradient_checkpointing)
      gradient_checkpointing="$2"
      shift
      ;;
    --deepspeed)
      ds_config_file="$2"
      shift
      ;;
    --use_flash_attention)
      use_flash_attention="$2"
      shift
      ;;
    --gradient_accumulation_steps)
      gradient_accumulation_steps="$2"
      shift
      ;;
    --block_size)
      block_size="$2"
      shift
      ;;
    --conversation_template)
      conversation_template="$2"
      shift
      ;;
    --per_device_train_batch_size|--batch_size)
      per_device_train_batch_size="$2"
      batch_size="$2"
      shift
      ;;
    --trust_remote_code)
      trust_remote_code="$2"
      shift
      ;;
    --run_name)
      run_name="$2"
      shift
      ;;
    --optim)
      optim="$2"
      shift
      ;;
    --lr)
      learning_rate=$2
      shift
      ;;
    --beta1)
      beta1=$2
      shift
      ;;
    --beta2)
      beta2=$2
      shift
      ;;
    -n|--num_epoch)
      num_epoch=$2
      shift
      ;;
    --lr_schedule)
      lr_schedule=$2
      shift
      ;;
    --use_deepspeed)
      use_deepspeed=$2
      shift
      ;;
    --seed)
      seed=$2
      shift
      ;;
    *)
      echo "error: unknown option \"${key}\"" 1>&2
      exit 1
  esac
  shift
done

gpu_id=${CUDA_VISIBLE_DEVICES}
deepspeed_args="--master_port=1103${gpu_id::1} --hostfile configs/hostfile --include localhost:${gpu_id}"

optim_suffix_args=""
if [ "${optim}" == "dummy" ]; then
optim_suffix_args="--use_customized_optim 1"
optim_suffix_args+=" --customized_optim ${optim}"
optim_suffix_args+=" --optim_dummy_beta1 ${beta1}"
optim_suffix_args+=" --optim_dummy_beta2 ${beta2}"
else
optim_suffix_args="--optim ${optim}"
optim_suffix_args+=" --adam_beta1 ${beta1}"
optim_suffix_args+=" --adam_beta2 ${beta2}"
fi

# Finetune
exp_id=alpaca_${optim}_lr-${learning_rate}_beta1-${beta1}_beta2-${beta2}_lr-sched-${lr_schedule}_model-$(basename ${model_name_or_path})_batch-size-${batch_size}x${gradient_accumulation_steps}_seed-${seed}
echo "$(date): ${exp_id}..."

tmp_dir=tmp
mkdir -p ${tmp_dir}

prefix=${exp_id}
if [ -f ${tmp_dir}/${prefix}.mark ]; then
  exit 0
fi

trap "rm -f ${tmp_dir}/${prefix}.mark" SIGINT SIGTERM SIGKILL
touch ${tmp_dir}/${prefix}.mark

project_dir=$(cd "$(dirname $0)"/..; pwd)
log_dir=${project_dir}/log/${exp_id}
output_dir=output_models/${exp_id}
mkdir -p ${output_dir} ${log_dir}

exe="deepspeed ${deepspeed_args}"
if [[ ${use_deepspeed} -eq 0 ]]; then
  exe=python
fi
${exe} examples/finetune.py \
--model_name_or_path ${model_name_or_path} \
--trust_remote_code ${trust_remote_code} \
--dataset_path ${dataset_path} \
--output_dir ${output_dir} --overwrite_output_dir \
--conversation_template ${conversation_template} \
--num_train_epochs ${num_epoch} \
--learning_rate ${learning_rate} \
--lr_scheduler_type ${lr_schedule} \
--disable_group_texts 1 \
--block_size ${block_size} \
--per_device_train_batch_size ${per_device_train_batch_size} \
--bf16 \
--deepspeed ${ds_config_file} \
--torch_dtype bfloat16 \
--run_name ${exp_id} \
--validation_split_percentage 0 \
--logging_steps 1 \
--do_train \
--ddp_timeout 72000 \
--save_steps 5000 \
--dataloader_num_workers 1 \
--gradient_checkpointing ${gradient_checkpointing} \
--use_flash_attention ${use_flash_attention} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--seed ${seed} \
${optim_suffix_args} \
> >(tee ${log_dir}/train.log) \
2> >(tee ${log_dir}/train.err >&2)

if [[ $? -ne 0 ]]; then
  echo "$(date): failed"
  rm -f ${tmp_dir}/${prefix}.mark
fi
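Usage note: with the defaults above, the script fine-tunes gpt2 on data/alpaca/train_conversation using the dummy optimizer. Invoking it as ./scripts/run_finetune_with_custom_optim.sh --optim dummy --lr 1e-5 --beta1 0.9 --beta2 0.999 exercises the customized-optimizer branch, while any other --optim value (for example a standard Hugging Face optimizer name such as adamw_torch) falls through to the usual --optim/--adam_beta1/--adam_beta2 flags. Note that CUDA_VISIBLE_DEVICES must be set, since it is used to build the deepspeed --master_port and --include arguments.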
34 changes: 34 additions & 0 deletions src/lmflow/args.py
@@ -29,6 +29,10 @@
logger = logging.getLogger(__name__)


class OptimizerNames():
    DUMMY = "dummy"


@dataclass
class ModelArguments:
"""
@@ -645,6 +649,36 @@ class FinetunerArguments(TrainingArguments):
            "help": "where the layer attribute stores, e.g. model.model.layers"
        }
    )
    use_customized_optim: bool = field(
        default=False,
        metadata={
            "help": "Whether to use a customized optimizer."
        }
    )
    customized_optim: str = field(
        default="sign_sgd",
        metadata={
            "help": "Name of the customized optimizer."
        }
    )
    customized_optim_args: str = field(
        default=None,
        metadata={
            "help": "Optional extra arguments supplied to the customized optimizer."
        }
    )
    optim_dummy_beta1: float = field(
        default=0.9,
        metadata={
            "help": "Beta1 for the dummy optimizer; it has no effect and exists only for the tutorial."
        }
    )
    optim_dummy_beta2: float = field(
        default=0.999,
        metadata={
            "help": "Beta2 for the dummy optimizer; it has no effect and exists only for the tutorial."
        }
    )


@dataclass
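These are plain dataclass fields on FinetunerArguments, so they can be exercised directly with HfArgumentParser. A minimal sketch: the parser call mirrors the standard transformers pattern, how examples/finetune.py actually consumes the fields is not shown in this diff, and output_models/test is just a throwaway path.

    # Minimal sketch: parsing the new customized-optimizer flags.
    # Only the dataclass side is demonstrated here; the trainer-side wiring
    # lives in a changed file that is not rendered in this diff.
    from transformers import HfArgumentParser
    from lmflow.args import FinetunerArguments

    parser = HfArgumentParser(FinetunerArguments)
    (finetuner_args,) = parser.parse_args_into_dataclasses(args=[
        "--output_dir", "output_models/test",   # required by TrainingArguments
        "--use_customized_optim", "1",
        "--customized_optim", "dummy",
        "--optim_dummy_beta1", "0.9",
        "--optim_dummy_beta2", "0.999",
    ])
    assert finetuner_args.use_customized_optim
    print(finetuner_args.customized_optim, finetuner_args.optim_dummy_beta1)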
Empty file added src/lmflow/optim/__init__.py
Empty file.
80 changes: 80 additions & 0 deletions src/lmflow/optim/dummy.py
@@ -0,0 +1,80 @@
#!/usr/bin/env python
# coding=utf-8
"""Dummy Optimizer.
"""
import math
import warnings
from typing import Callable, Iterable, Tuple

import torch
from torch import nn
from torch.optim import Optimizer

class Dummy(Optimizer):
    """
    A dummy optimizer that does nothing.

    Parameters:
        params (:obj:`Iterable[nn.parameter.Parameter]`):
            Iterable of parameters to optimize or dictionaries defining parameter groups.
        lr (:obj:`float`, `optional`, defaults to 0):
            The learning rate to use.
        betas (:obj:`Tuple[float, float]`, `optional`, defaults to (0.9, 0.999)):
            Moving-average coefficients; stored in the optimizer state but otherwise unused.
        weight_decay (:obj:`float`, `optional`, defaults to 0.0):
            Decoupled weight decay applied directly to the parameters.
    """

    def __init__(
        self,
        params: Iterable[nn.parameter.Parameter],
        lr: float = 0.,
        betas: Tuple[float, float] = (0.9, 0.999),
        weight_decay: float = 0.0,
    ):
        if lr < 0.0:
            raise ValueError(f"Invalid learning rate: {lr} - should be >= 0.0")
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError(f"Invalid beta parameter: {betas[0]} - should be in [0.0, 1.0)")
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError(f"Invalid beta parameter: {betas[1]} - should be in [0.0, 1.0)")
        defaults = {"lr": lr, "betas": betas, "weight_decay": weight_decay}
        super().__init__(params, defaults)


    @torch.no_grad()
    def step(self, closure: Callable = None):
        """
        Performs a single optimization step.

        Arguments:
            closure (:obj:`Callable`, `optional`): A closure that reevaluates the model and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None:
                    continue
                grad = p.grad
                if grad.is_sparse:
                    raise RuntimeError("Dummy does not support sparse gradients yet")

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state["step"] = 0
                    state["exp_avg"] = torch.zeros_like(p)
                    state["exp_avg2"] = torch.zeros_like(p)

                # v := exp_avg, m := exp_avg2
                v, m = state["exp_avg"], state["exp_avg2"]
                beta1, beta2 = group["betas"]
                step_size = group["lr"]

                state["step"] += 1

                # Intentionally a no-op update: alpha is zero, so the parameters
                # stay unchanged apart from optional weight decay below.
                p.add_(m, alpha=-0.0)
                if group["weight_decay"] > 0.0:
                    p.add_(p, alpha=(-group["lr"] * group["weight_decay"]))
        return loss
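Since the optimizer is self-contained, it can be sanity-checked outside the training pipeline. A minimal sketch; the toy linear model and random batch are illustrative only:

    # Minimal sketch: one (intentionally no-op) step with the Dummy optimizer.
    # The toy model and random data are illustrative only.
    import torch
    from torch import nn
    from lmflow.optim.dummy import Dummy

    model = nn.Linear(4, 2)
    before = model.weight.detach().clone()

    optimizer = Dummy(model.parameters(), lr=1e-5, betas=(0.9, 0.999))
    loss = model(torch.randn(8, 4)).pow(2).mean()
    loss.backward()
    optimizer.step()        # alpha is -0.0, so parameters stay unchanged
    optimizer.zero_grad()

    assert torch.equal(before, model.weight)   # weight_decay defaults to 0.0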
5 changes: 5 additions & 0 deletions src/lmflow/optim/optimizers.py
@@ -0,0 +1,5 @@
#!/usr/bin/env python
# coding=utf-8
"""All optimizers.
"""
from lmflow.optim.dummy import Dummy
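The commit reports seven changed files, but only six diffs are rendered above; the remaining change, presumably the trainer-side code that consumes use_customized_optim, is not shown. One plausible shape for the name-to-class dispatch, sketched purely as an assumption rather than the commit's actual implementation:

    # Hypothetical sketch: mapping customized_optim onto an optimizer class.
    # The real wiring lives in a changed file not shown in this diff, so the
    # function below is an assumption, not the commit's code.
    from lmflow.args import FinetunerArguments, OptimizerNames
    from lmflow.optim.optimizers import Dummy

    def build_customized_optimizer(args: FinetunerArguments, model):
        if args.customized_optim == OptimizerNames.DUMMY:
            return Dummy(
                model.parameters(),
                lr=args.learning_rate,
                betas=(args.optim_dummy_beta1, args.optim_dummy_beta2),
                weight_decay=args.weight_decay,
            )
        raise ValueError(f"Unsupported customized optimizer: {args.customized_optim}")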