Merge pull request #863 from OptimalScale/yizhenjia-vllm-inferencer
[Feature] Add vllm inference example
research4pan authored Jun 20, 2024
2 parents 02f8bcf + 844df44 commit e5ab2fd
Showing 10 changed files with 211 additions and 47 deletions.
35 changes: 26 additions & 9 deletions README.md
@@ -127,7 +127,7 @@ Here is an example to finetune a GPT-2 base model.
```sh
cd data && ./download.sh alpaca && cd -

./scripts/run_finetune.sh \
bash ./scripts/run_finetune.sh \
--model_name_or_path gpt2 \
--dataset_path data/alpaca/train_conversation \
--output_model_path output_models/finetuned_gpt2
@@ -141,7 +141,7 @@ cd data && ./download.sh alpaca && cd -
>```bash
>cd data && ./download.sh alpaca && cd -
>
>./scripts/run_finetune.sh \
>bash ./scripts/run_finetune.sh \
> --model_name_or_path meta-llama/Meta-Llama-3-8B \
> --dataset_path data/alpaca/train_conversation \
> --conversation_template llama3 \
@@ -155,7 +155,7 @@ cd data && ./download.sh alpaca && cd -
```sh
cd data && ./download.sh alpaca && cd -
./scripts/run_finetune_with_lisa.sh \
bash ./scripts/run_finetune_with_lisa.sh \
--model_name_or_path meta-llama/Llama-2-7b-hf \
--dataset_path data/alpaca/train_conversation \
--output_model_path output_models/finetuned_llama2_7b \
@@ -169,7 +169,7 @@ cd data && ./download.sh alpaca && cd -
>```bash
>cd data && ./download.sh alpaca && cd -
>
>./scripts/run_finetune_with_lisa.sh \
>bash ./scripts/run_finetune_with_lisa.sh \
> --model_name_or_path meta-llama/Llama-2-7b-hf \
> --dataset_path data/alpaca/train_conversation \
> --conversation_template llama2 \
@@ -185,7 +185,7 @@ LoRA is a parameter-efficient finetuning algorithm and is more efficient than fu
```sh
cd data && ./download.sh alpaca && cd -
./scripts/run_finetune_with_lora.sh \
bash ./scripts/run_finetune_with_lora.sh \
--model_name_or_path facebook/galactica-1.3b \
--dataset_path data/alpaca/train_conversation \
--output_lora_path output_models/finetuned_galactica_lora
@@ -197,7 +197,7 @@ cd data && ./download.sh alpaca && cd -
>```bash
>cd data && ./download.sh alpaca && cd -
>
>./scripts/run_finetune_with_lora.sh \
>bash ./scripts/run_finetune_with_lora.sh \
> --model_name_or_path meta-llama/Llama-2-7b-hf \
> --dataset_path data/alpaca/train_conversation \
> --conversation_template llama2 \
@@ -209,7 +209,7 @@ cd data && ./download.sh alpaca && cd -
>
>Merge LoRA weight and the base model into one using:
>```sh
>./scripts/run_merge_lora.sh \
>bash ./scripts/run_merge_lora.sh \
> --model_name_or_path Qwen/Qwen1.5-1.8B \
> --lora_model_path output_models/lora \
> --output_model_path output_models/lora_merged \
@@ -219,9 +219,22 @@ cd data && ./download.sh alpaca && cd -
### Inference
After finetuning, you can run the following command to chat with the model.
```sh
./scripts/run_chatbot.sh output_models/finetuned_gpt2
bash ./scripts/run_chatbot.sh output_models/finetuned_gpt2
```
> [!TIP]
> We recommend using vLLM for faster inference.
>
> <details><summary>Faster inference using vLLM</summary>
>
>```bash
>bash ./scripts/run_vllm_inference.sh \
> --model_name_or_path Qwen/Qwen2-0.5B \
> --dataset_path data/alpaca/test_conversation \
>  --output_dir data/inference_results
>```
> </details>
### Deployment
If you want to deploy your own model locally, we provide a Gradio-based UI for building chatbots.
Running the following command will launch the demo for robin-7b:
@@ -240,7 +253,7 @@ You can directly run the LMFlow benchmark evaluation to obtain the results to pa
[LLM comparison](https://docs.google.com/spreadsheets/d/1JYh4_pxNzmNA9I0YM2epgRA7VXBIeIGS64gPJBg5NHA/edit?usp=sharing).
For example, to run GPT2 XL, one may execute
```sh
./scripts/run_benchmark.sh --model_name_or_path gpt2-xl
bash ./scripts/run_benchmark.sh --model_name_or_path gpt2-xl
```
`--model_name_or_path` is required; you may fill in a Hugging Face model name or a local model path here.

@@ -288,6 +301,10 @@ To check the evaluation results, you may check `benchmark.log` in `./output_dir/

LMFlow supports both FlashAttention-1 and the latest FlashAttention-2. Check out [flash_attention](https://github.com/OptimalScale/LMFlow/blob/main/readme/flash_attn2.md) for more details.

* vLLM

Try vLLM for fast and easy-to-use LLM inference and serving. Thanks to the vLLM team for their [great work](https://github.com/vllm-project/vllm)!
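
For readers new to vLLM, here is a minimal standalone sketch (not part of this commit) of vLLM's offline generation API, mirroring the default model and sampling settings of `scripts/run_vllm_inference.sh`. It assumes `vllm` is installed and a GPU is available; in LMFlow itself this is wrapped behind `HFDecoderModel` and the `vllm_inferencer` pipeline.

```python
# Illustrative sketch only: conveys what vLLM offline generation looks like.
from vllm import LLM, SamplingParams

# Settings mirrored from scripts/run_vllm_inference.sh: Qwen/Qwen2-0.5B,
# 2 output sequences per prompt, temperature 1.0, top_p 0.9, up to 1024 new tokens.
llm = LLM(model="Qwen/Qwen2-0.5B", dtype="auto", gpu_memory_utilization=0.95)
params = SamplingParams(n=2, temperature=1.0, top_p=0.9, max_tokens=1024)

prompts = ["List three practical uses of a small language model."]
for request_output in llm.generate(prompts, params):
    for candidate in request_output.outputs:
        print(candidate.text)
```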

</details>

<details> <summary>Long Context</summary>
60 changes: 60 additions & 0 deletions examples/vllm_inference.py
@@ -0,0 +1,60 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved.
import logging
import os
import sys

from transformers import (
HfArgumentParser
)

from lmflow.datasets import Dataset
from lmflow.models.hf_decoder_model import HFDecoderModel
from lmflow.pipeline.auto_pipeline import AutoPipeline
from lmflow.args import (
ModelArguments,
DatasetArguments,
AutoArguments,
)


logger = logging.getLogger(__name__)


def main():
# Parses arguments
pipeline_name = "vllm_inferencer"
PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name)

parser = HfArgumentParser((
ModelArguments,
DatasetArguments,
PipelineArguments
))
if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
# If we pass only one argument to the script and it's the path to a json file,
# let's parse it to get our arguments.
model_args, data_args, pipeline_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
else:
model_args, data_args, pipeline_args = parser.parse_args_into_dataclasses()

dataset = Dataset(data_args)
model = HFDecoderModel(model_args)
inferencer = AutoPipeline.get_pipeline(
pipeline_name=pipeline_name,
model_args=model_args,
data_args=data_args,
pipeline_args=pipeline_args
)

res = inferencer.inference(
model,
dataset,
release_gpu=False,
enable_decode_inference_result=pipeline_args.enable_decode_inference_result,
)


if __name__ == "__main__":
main()
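
Not part of the diff: besides the command-line flags used by `scripts/run_vllm_inference.sh` below, the `HfArgumentParser` branch above also accepts a single path to a JSON file. Here is a minimal sketch of building such a config, reusing the argument names and defaults from the shell script; the file name `vllm_inference_config.json` is arbitrary, and the results directory is assumed to exist.

```python
# Hypothetical config writer: emits a flat JSON file that
# examples/vllm_inference.py can consume via parser.parse_json_file().
# Keys mirror the flags passed in scripts/run_vllm_inference.sh.
import json

config = {
    "model_name_or_path": "Qwen/Qwen2-0.5B",
    "dataset_path": "data/alpaca/test_conversation",
    "preprocessing_num_workers": 16,
    "use_vllm": True,
    "apply_chat_template": True,
    "random_seed": 42,
    "num_output_sequences": 2,
    "use_beam_search": False,
    "temperature": 1.0,
    "top_p": 0.9,
    "max_new_tokens": 1024,
    "save_results": True,
    "results_path": "data/inference_results/vllm_inference/results.json",
    "enable_decode_inference_result": False,
    "vllm_gpu_memory_utilization": 0.95,
    "vllm_tensor_parallel_size": 2,
}

with open("vllm_inference_config.json", "w") as f:
    json.dump(config, f, indent=2)

# Then run, for example:
#   python examples/vllm_inference.py vllm_inference_config.json
```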
77 changes: 77 additions & 0 deletions scripts/run_vllm_inference.sh
@@ -0,0 +1,77 @@
#!/bin/bash
# Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved.

# Parses arguments
run_name=vllm_inference
model_name_or_path='Qwen/Qwen2-0.5B'
dataset_path=data/alpaca/test_conversation
output_dir=data/inference_results
output_file_name=results.json
apply_chat_template=True

# Safety related arguments
trust_remote_code=0

while [[ $# -ge 1 ]]; do
key="$1"
case ${key} in
-r|--run_name)
run_name="$2"
shift
;;
-m|--model_name_or_path)
model_name_or_path="$2"
shift
;;
-d|--dataset_path)
dataset_path="$2"
shift
;;
--output_dir)
output_dir="$2"
shift
;;
--output_file_name)
output_file_name="$2"
shift
;;
--apply_chat_template)
apply_chat_template="$2"
shift
;;
--trust_remote_code)
trust_remote_code="$2"
shift
;;
*)
echo "error: unknown option \"${key}\"" 1>&2
exit 1
esac
shift
done

# inference
project_dir=$(cd "$(dirname $0)"/..; pwd)
log_dir=${project_dir}/log/${run_name}
output_file_path=${output_dir}/${run_name}/${output_file_name}
mkdir -p ${output_dir}/${run_name} ${log_dir}

python examples/vllm_inference.py \
--use_vllm True \
--trust_remote_code ${trust_remote_code} \
--model_name_or_path ${model_name_or_path} \
--dataset_path ${dataset_path} \
--preprocessing_num_workers 16 \
--random_seed 42 \
--apply_chat_template ${apply_chat_template} \
--num_output_sequences 2 \
--use_beam_search False \
--temperature 1.0 \
--top_p 0.9 \
--max_new_tokens 1024 \
--save_results True \
--results_path ${output_file_path} \
--enable_decode_inference_result False \
--vllm_gpu_memory_utilization 0.95 \
--vllm_tensor_parallel_size 2 \
2>&1 | tee ${log_dir}/vllm_inference.log
15 changes: 6 additions & 9 deletions src/lmflow/args.py
@@ -856,16 +856,12 @@ class InferencerArguments:
Whether to save inference results, By default False.
results_path : Optional[str]
The **json file** path of inference results, By default None.
memory_safe_vllm_inference_detokenize : Optional[bool]
Whether to detokenize the memory safe vllm inference results.
enable_decode_inference_result : Optional[bool]
Whether to detokenize the inference results.
NOTE: For iterative align pipelines, whether to detokenize depends on
the homogeneity of the policy model and the reward model
(i.e., if they have the same tokenizer).
The reason why `detokenize` for memory safe vllm inference is
included in args is due to the its implementation (i.e., subprocess
rather than within the python codes, thus have to communicate through
command line arguments).
(i.e., if they have the same tokenizer).
use_vllm: bool, optional
Whether to use VLLM for inference, By default False.
vllm_tensor_parallel_size: int, optional
@@ -964,9 +960,9 @@ class InferencerArguments:
default=True,
metadata={"help": "whether to apply chat template"},
)
memory_safe_vllm_inference_detokenize: Optional[bool] = field(
enable_decode_inference_result: Optional[bool] = field(
default=False,
metadata={"help": "Whether to detokenize the memory safe vllm inference results."},
metadata={"help": "Whether to decode the inference results."},
)

# vllm inference args
@@ -1254,6 +1250,7 @@ class IterativeAlignerArguments(InferencerArguments):
"finetuner": FinetunerArguments,
"evaluator": EvaluatorArguments,
"inferencer": InferencerArguments,
"vllm_inferencer": InferencerArguments,
"raft_aligner": RaftAlignerArguments,
"dpo_aligner": DPOAlignerArguments,
"rm_tuner": RewardModelingArguments,
2 changes: 1 addition & 1 deletion src/lmflow/models/hf_model_mixin.py
@@ -449,7 +449,7 @@ def __prepare_model_for_vllm_inference(
self.backend_model_for_inference = LLM(
model=model_args.model_name_or_path,
tokenizer=model_args.model_name_or_path,
dtype=model_args.torch_dtype,
dtype=model_args.torch_dtype if model_args.torch_dtype else "auto",
load_format="auto",
gpu_memory_utilization=vllm_gpu_memory_utilization,
tensor_parallel_size=vllm_tensor_parallel_size,
2 changes: 2 additions & 0 deletions src/lmflow/pipeline/auto_pipeline.py
@@ -17,12 +17,14 @@ def is_package_version_at_least(package_name, min_version):
from lmflow.pipeline.evaluator import Evaluator
from lmflow.pipeline.finetuner import Finetuner
from lmflow.pipeline.inferencer import Inferencer
from lmflow.pipeline.vllm_inferencer import VLLMInferencer
from lmflow.pipeline.dpo_aligner import DPOAligner
from lmflow.pipeline.rm_tuner import RewardModelingTuner
PIPELINE_MAPPING = {
"evaluator": Evaluator,
"finetuner": Finetuner,
"inferencer": Inferencer,
"vllm_inferencer": VLLMInferencer,
"dpo_aligner": DPOAligner,
"rm_tuner": RewardModelingTuner,
}
6 changes: 3 additions & 3 deletions src/lmflow/pipeline/utils/memory_safe_vllm_inference.py
@@ -31,7 +31,7 @@

def main():
# Parses arguments
pipeline_name = "inferencer"
pipeline_name = "vllm_inferencer"
PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name)

parser = HfArgumentParser((
@@ -48,13 +48,13 @@ def main():

dataset = Dataset(data_args)
model = HFDecoderModel(model_args)
inferencer = VLLMInferencer(model_args, pipeline_args)
inferencer = VLLMInferencer(model_args, data_args, pipeline_args)

res = inferencer.inference(
model,
dataset,
release_gpu=False,
detokenize=pipeline_args.memory_safe_vllm_inference_detokenize,
enable_decode_inference_result=pipeline_args.enable_decode_inference_result,
)

# use this as a flag, stdout will be captured by the pipeline