Merge pull request #863 from OptimalScale/yizhenjia-vllm-inferencer
[Feature] Add vllm inference example
research4pan authored Jun 20, 2024
2 parents 02f8bcf + 844df44 commit e5ab2fd
Showing 10 changed files with 211 additions and 47 deletions.
35 changes: 26 additions & 9 deletions README.md
@@ -127,7 +127,7 @@ Here is an example to finetune a GPT-2 base model.
```sh
cd data && ./download.sh alpaca && cd -

./scripts/run_finetune.sh \
bash ./scripts/run_finetune.sh \
--model_name_or_path gpt2 \
--dataset_path data/alpaca/train_conversation \
--output_model_path output_models/finetuned_gpt2
@@ -141,7 +141,7 @@ cd data && ./download.sh alpaca && cd -
>```bash
>cd data && ./download.sh alpaca && cd -
>
>./scripts/run_finetune.sh \
>bash ./scripts/run_finetune.sh \
> --model_name_or_path meta-llama/Meta-Llama-3-8B \
> --dataset_path data/alpaca/train_conversation \
> --conversation_template llama3 \
@@ -155,7 +155,7 @@ cd data && ./download.sh alpaca && cd -
```sh
cd data && ./download.sh alpaca && cd -
./scripts/run_finetune_with_lisa.sh \
bash ./scripts/run_finetune_with_lisa.sh \
--model_name_or_path meta-llama/Llama-2-7b-hf \
--dataset_path data/alpaca/train_conversation \
--output_model_path output_models/finetuned_llama2_7b \
@@ -169,7 +169,7 @@ cd data && ./download.sh alpaca && cd -
>```bash
>cd data && ./download.sh alpaca && cd -
>
>./scripts/run_finetune_with_lisa.sh \
>bash ./scripts/run_finetune_with_lisa.sh \
> --model_name_or_path meta-llama/Llama-2-7b-hf \
> --dataset_path data/alpaca/train_conversation \
> --conversation_template llama2 \
@@ -185,7 +185,7 @@ LoRA is a parameter-efficient finetuning algorithm and is more efficient than fu
```sh
cd data && ./download.sh alpaca && cd -
./scripts/run_finetune_with_lora.sh \
bash ./scripts/run_finetune_with_lora.sh \
--model_name_or_path facebook/galactica-1.3b \
--dataset_path data/alpaca/train_conversation \
--output_lora_path output_models/finetuned_galactica_lora
@@ -197,7 +197,7 @@ cd data && ./download.sh alpaca && cd -
>```bash
>cd data && ./download.sh alpaca && cd -
>
>./scripts/run_finetune_with_lora.sh \
>bash ./scripts/run_finetune_with_lora.sh \
> --model_name_or_path meta-llama/Llama-2-7b-hf \
> --dataset_path data/alpaca/train_conversation \
> --conversation_template llama2 \
@@ -209,7 +209,7 @@ cd data && ./download.sh alpaca && cd -
>
>Merge LoRA weight and the base model into one using:
>```sh
>./scripts/run_merge_lora.sh \
>bash ./scripts/run_merge_lora.sh \
> --model_name_or_path Qwen/Qwen1.5-1.8B \
> --lora_model_path output_models/lora \
> --output_model_path output_models/lora_merged \
@@ -219,9 +219,22 @@ cd data && ./download.sh alpaca && cd -
### Inference
After finetuning, you can run the following command to chat with the model.
```sh
./scripts/run_chatbot.sh output_models/finetuned_gpt2
bash ./scripts/run_chatbot.sh output_models/finetuned_gpt2
```
> [!TIP]
> We recommend using vLLM for faster inference.
>
> <details><summary>Faster inference using vLLM</summary>
>
>```bash
>bash ./scripts/run_vllm_inference.sh \
> --model_name_or_path Qwen/Qwen2-0.5B \
> --dataset_path data/alpaca/test_conversation \
>  --output_dir data/inference_results
>```
> </details>
### Deployment
If you want to deploy your own model locally, we provide a Gradio-based UI for building chatbots.
Running the following command will launch the demo for robin-7b:
@@ -240,7 +253,7 @@ You can directly run the LMFlow benchmark evaluation to obtain the results to pa
[LLM comparison](https://docs.google.com/spreadsheets/d/1JYh4_pxNzmNA9I0YM2epgRA7VXBIeIGS64gPJBg5NHA/edit?usp=sharing).
For example, to run GPT2 XL, one may execute
```sh
./scripts/run_benchmark.sh --model_name_or_path gpt2-xl
bash ./scripts/run_benchmark.sh --model_name_or_path gpt2-xl
```
`--model_name_or_path` is required; you may fill in a Hugging Face model name or a local model path here.

@@ -288,6 +301,10 @@ To check the evaluation results, you may check `benchmark.log` in `./output_dir/

LMFlow supports both FlashAttention-1 and the latest FlashAttention-2. Check out [flash_attention](https://github.com/OptimalScale/LMFlow/blob/main/readme/flash_attn2.md) for more details.

* vLLM

Try vLLM for fast and easy-to-use LLM inference and serving. Thanks to the vLLM team for their [great work](https://github.com/vllm-project/vllm)!
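
For readers new to vLLM, here is a minimal standalone sketch (not part of this commit) of vLLM's offline generation API, mirroring the default model and sampling settings of `scripts/run_vllm_inference.sh`. It assumes `vllm` is installed and a GPU is available; in LMFlow itself this is wrapped behind `HFDecoderModel` and the `vllm_inferencer` pipeline.

```python
# Illustrative sketch only: conveys what vLLM offline generation looks like.
from vllm import LLM, SamplingParams

# Settings mirrored from scripts/run_vllm_inference.sh: Qwen/Qwen2-0.5B,
# 2 output sequences per prompt, temperature 1.0, top_p 0.9, up to 1024 new tokens.
llm = LLM(model="Qwen/Qwen2-0.5B", dtype="auto", gpu_memory_utilization=0.95)
params = SamplingParams(n=2, temperature=1.0, top_p=0.9, max_tokens=1024)

prompts = ["List three practical uses of a small language model."]
for request_output in llm.generate(prompts, params):
    for candidate in request_output.outputs:
        print(candidate.text)
```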

</details>

<details> <summary>Long Context</summary>
60 changes: 60 additions & 0 deletions examples/vllm_inference.py
@@ -0,0 +1,60 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved.
import logging
import os
import sys

from transformers import (
HfArgumentParser
)

from lmflow.datasets import Dataset
from lmflow.models.hf_decoder_model import HFDecoderModel
from lmflow.pipeline.auto_pipeline import AutoPipeline
from lmflow.args import (
ModelArguments,
DatasetArguments,
AutoArguments,
)


logger = logging.getLogger(__name__)


def main():
# Parses arguments
pipeline_name = "vllm_inferencer"
PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name)

parser = HfArgumentParser((
ModelArguments,
DatasetArguments,
PipelineArguments
))
if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
# If we pass only one argument to the script and it's the path to a json file,
# let's parse it to get our arguments.
model_args, data_args, pipeline_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
else:
model_args, data_args, pipeline_args = parser.parse_args_into_dataclasses()

dataset = Dataset(data_args)
model = HFDecoderModel(model_args)
inferencer = AutoPipeline.get_pipeline(
pipeline_name=pipeline_name,
model_args=model_args,
data_args=data_args,
pipeline_args=pipeline_args
)

res = inferencer.inference(
model,
dataset,
release_gpu=False,
enable_decode_inference_result=pipeline_args.enable_decode_inference_result,
)


if __name__ == "__main__":
main()
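
Not part of the diff: besides the command-line flags used by `scripts/run_vllm_inference.sh` below, the `HfArgumentParser` branch above also accepts a single path to a JSON file. Here is a minimal sketch of building such a config, reusing the argument names and defaults from the shell script; the file name `vllm_inference_config.json` is arbitrary, and the results directory is assumed to exist.

```python
# Hypothetical config writer: emits a flat JSON file that
# examples/vllm_inference.py can consume via parser.parse_json_file().
# Keys mirror the flags passed in scripts/run_vllm_inference.sh.
import json

config = {
    "model_name_or_path": "Qwen/Qwen2-0.5B",
    "dataset_path": "data/alpaca/test_conversation",
    "preprocessing_num_workers": 16,
    "use_vllm": True,
    "apply_chat_template": True,
    "random_seed": 42,
    "num_output_sequences": 2,
    "use_beam_search": False,
    "temperature": 1.0,
    "top_p": 0.9,
    "max_new_tokens": 1024,
    "save_results": True,
    "results_path": "data/inference_results/vllm_inference/results.json",
    "enable_decode_inference_result": False,
    "vllm_gpu_memory_utilization": 0.95,
    "vllm_tensor_parallel_size": 2,
}

with open("vllm_inference_config.json", "w") as f:
    json.dump(config, f, indent=2)

# Then run, for example:
#   python examples/vllm_inference.py vllm_inference_config.json
```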
77 changes: 77 additions & 0 deletions scripts/run_vllm_inference.sh
@@ -0,0 +1,77 @@
#!/bin/bash
# Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved.

# Parses arguments
run_name=vllm_inference
model_name_or_path='Qwen/Qwen2-0.5B'
dataset_path=data/alpaca/test_conversation
output_dir=data/inference_results
output_file_name=results.json
apply_chat_template=True

# Safety related arguments
trust_remote_code=0

while [[ $# -ge 1 ]]; do
key="$1"
case ${key} in
-r|--run_name)
run_name="$2"
shift
;;
-m|--model_name_or_path)
model_name_or_path="$2"
shift
;;
-d|--dataset_path)
dataset_path="$2"
shift
;;
--output_dir)
output_dir="$2"
shift
;;
--output_file_name)
output_file_name="$2"
shift
;;
--apply_chat_template)
apply_chat_template="$2"
shift
;;
--trust_remote_code)
trust_remote_code="$2"
shift
;;
*)
echo "error: unknown option \"${key}\"" 1>&2
exit 1
esac
shift
done

# inference
project_dir=$(cd "$(dirname $0)"/..; pwd)
log_dir=${project_dir}/log/${run_name}
output_file_path=${output_dir}/${run_name}/${output_file_name}
mkdir -p ${output_dir}/${run_name} ${log_dir}

python examples/vllm_inference.py \
--use_vllm True \
--trust_remote_code ${trust_remote_code} \
--model_name_or_path ${model_name_or_path} \
--dataset_path ${dataset_path} \
--preprocessing_num_workers 16 \
--random_seed 42 \
--apply_chat_template ${apply_chat_template} \
--num_output_sequences 2 \
--use_beam_search False \
--temperature 1.0 \
--top_p 0.9 \
--max_new_tokens 1024 \
--save_results True \
--results_path ${output_file_path} \
--enable_decode_inference_result False \
--vllm_gpu_memory_utilization 0.95 \
--vllm_tensor_parallel_size 2 \
2>&1 | tee ${log_dir}/vllm_inference.log
15 changes: 6 additions & 9 deletions src/lmflow/args.py
@@ -856,16 +856,12 @@ class InferencerArguments:
Whether to save inference results, By default False.
results_path : Optional[str]
The **json file** path of inference results, By default None.
memory_safe_vllm_inference_detokenize : Optional[bool]
Whether to detokenize the memory safe vllm inference results.
enable_decode_inference_result : Optional[bool]
Whether to detokenize the inference results.
NOTE: For iterative align pipelines, whether to detokenize depends on
the homogeneity of the policy model and the reward model
(i.e., if they have the same tokenizer).
The reason why `detokenize` for memory safe vllm inference is
included in args is due to the its implementation (i.e., subprocess
rather than within the python codes, thus have to communicate through
command line arguments).
(i.e., if they have the same tokenizer).
use_vllm: bool, optional
Whether to use VLLM for inference, By default False.
vllm_tensor_parallel_size: int, optional
@@ -964,9 +960,9 @@ class InferencerArguments:
default=True,
metadata={"help": "whether to apply chat template"},
)
memory_safe_vllm_inference_detokenize: Optional[bool] = field(
enable_decode_inference_result: Optional[bool] = field(
default=False,
metadata={"help": "Whether to detokenize the memory safe vllm inference results."},
metadata={"help": "Whether to decode the inference results."},
)

# vllm inference args
@@ -1254,6 +1250,7 @@ class IterativeAlignerArguments(InferencerArguments):
"finetuner": FinetunerArguments,
"evaluator": EvaluatorArguments,
"inferencer": InferencerArguments,
"vllm_inferencer": InferencerArguments,
"raft_aligner": RaftAlignerArguments,
"dpo_aligner": DPOAlignerArguments,
"rm_tuner": RewardModelingArguments,
2 changes: 1 addition & 1 deletion src/lmflow/models/hf_model_mixin.py
@@ -449,7 +449,7 @@ def __prepare_model_for_vllm_inference(
self.backend_model_for_inference = LLM(
model=model_args.model_name_or_path,
tokenizer=model_args.model_name_or_path,
dtype=model_args.torch_dtype,
dtype=model_args.torch_dtype if model_args.torch_dtype else "auto",
load_format="auto",
gpu_memory_utilization=vllm_gpu_memory_utilization,
tensor_parallel_size=vllm_tensor_parallel_size,
2 changes: 2 additions & 0 deletions src/lmflow/pipeline/auto_pipeline.py
@@ -17,12 +17,14 @@ def is_package_version_at_least(package_name, min_version):
from lmflow.pipeline.evaluator import Evaluator
from lmflow.pipeline.finetuner import Finetuner
from lmflow.pipeline.inferencer import Inferencer
from lmflow.pipeline.vllm_inferencer import VLLMInferencer
from lmflow.pipeline.dpo_aligner import DPOAligner
from lmflow.pipeline.rm_tuner import RewardModelingTuner
PIPELINE_MAPPING = {
"evaluator": Evaluator,
"finetuner": Finetuner,
"inferencer": Inferencer,
"vllm_inferencer": VLLMInferencer,
"dpo_aligner": DPOAligner,
"rm_tuner": RewardModelingTuner,
}
6 changes: 3 additions & 3 deletions src/lmflow/pipeline/utils/memory_safe_vllm_inference.py
@@ -31,7 +31,7 @@

def main():
# Parses arguments
pipeline_name = "inferencer"
pipeline_name = "vllm_inferencer"
PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name)

parser = HfArgumentParser((
@@ -48,13 +48,13 @@ def main():

dataset = Dataset(data_args)
model = HFDecoderModel(model_args)
inferencer = VLLMInferencer(model_args, pipeline_args)
inferencer = VLLMInferencer(model_args, data_args, pipeline_args)

res = inferencer.inference(
model,
dataset,
release_gpu=False,
detokenize=pipeline_args.memory_safe_vllm_inference_detokenize,
enable_decode_inference_result=pipeline_args.enable_decode_inference_result,
)

# use this as a flag, stdout will be captured by the pipeline