support onnx inference
Tlntin committed Jul 27, 2024
1 parent 0e41d31 commit d760724
Showing 11 changed files with 1,143 additions and 39 deletions.
28 changes: 18 additions & 10 deletions README.md
@@ -13,38 +13,46 @@

### Running step by step
##### Step 1: Compile the model
1. Enter the export folder
1. Enter the export folder and export the ONNX model.
```bash
cd export
python3 export_onnx.py --hf_model_dir="download/[你下载的模型路径]"
cd ..
```
2. Export the ONNX model.

2. Validate the ONNX model: go back to the project root and run cli_chat.py to check that ONNX-based chat works correctly.
```bash
python3 export_onnx.py --hf_model_dir="download/[你下载的模型路径]"
python3 ./cli_chat.py --session_type=onnx
```

3. Modify the ONNX graph: the exported Trilu and Cast operators currently have some issues, so the structure needs to be adjusted.
3. Enter the export folder and modify the ONNX graph: the exported Trilu and Cast operators cannot be recognized by the atc command, so the structure needs to be adjusted.
```bash
cd export
python3 change_node.py
cd ..
```

4. Convert the ONNX model to an OM model
```bash
cd export
python3 onnx2om.py --hf_model_dir="download/[你下载的模型路径]"
```

5. Return to the parent directory
```bash
cd ..
```


##### Step 2: Run the model
- Run the model directly with the following command
```bash
python3 ./cli_chat.py --hf_model_dir="download/[你下载的模型路径]"
```



### Current features
- [x] Export ONNX and OM models
- [ ] Model inference
- [ ] Streaming output
- [x] Model inference, with ONNX inference supported.
- [ ] Model inference, with ACL inference supported.
- [x] Streaming output
- [ ] OpenAI-compatible API server
- [ ] Function call support
- [ ] Model quantization, e.g. weight-only, SmoothQuant, etc.
98 changes: 98 additions & 0 deletions cli_chat.py
@@ -0,0 +1,98 @@
import sys
import argparse
from concurrent.futures import ThreadPoolExecutor
from config import InferenceConfig
from utils.inference import Inference
import os

project_dir = os.path.dirname(os.path.abspath(__file__))
parser = argparse.ArgumentParser()
parser.add_argument(
    '--hf_model_dir',
    type=str,
    help="model and tokenizer path, only supports huggingface models",
    default=os.path.join(project_dir, "download", "Qwen1_5_0_5B_Chat")
)
parser.add_argument(
    "--session_type",
    type=str,
    default="acl",
    help="acl or onnx",
    choices=["acl", "onnx"],
)
parser.add_argument(
    '--onnx_model_path',
    type=str,
    help="onnx model path",
    default=os.path.join(project_dir, "output", "onnx", "qwen1.5_0.5b_chat.onnx")
)
parser.add_argument(
    "--om_model_path",
    help="om model path",
    type=str,
    default=os.path.join(project_dir, "output", "model", "qwen1.5_0.5b_chat.om")
)
parser.add_argument(
    "--max_input_length",
    help="max input length",
    type=int,
    default=512,
)

parser.add_argument(
    "--max_output_length",
    help="max output length (contains input + new tokens)",
    type=int,
    default=1024,
)

args = parser.parse_args()
config = InferenceConfig(
    hf_model_dir=args.hf_model_dir,
    om_model_path=args.om_model_path,
    onnx_model_path=args.onnx_model_path,
    session_type=args.session_type,
    max_output_length=args.max_output_length,
    max_input_length=args.max_input_length,
    kv_cache_length=args.max_output_length,
)
infer_engine = Inference(config)


def inference_cli():
    print("\nWelcome to the Qwen chatbot. Type exit or quit to leave, clear to reset the chat history.")
    history = []
    while True:
        input_text = input("Input: ")
        if input_text in ["exit", "quit", "exit()", "quit()"]:
            break
        if input_text == 'clear':
            history = []
            print("Output: chat history cleared.")
            continue
        print("Output: ", end='')
        response = ""
        is_first = True
        first_token_latency, decode_speed, total_speed = 0, 0, 0
        for (
            new_text,
            first_token_latency,
            decode_speed,
            total_speed
        ) in infer_engine.stream_predict(input_text, history=history):
            if is_first:
                if len(new_text.strip()) == 0:
                    continue
                is_first = False
            print(new_text, end='', flush=True)
            response += new_text
        print("")
        print(
            "[INFO] first_token_latency: {:.4f}s,".format(first_token_latency),
            " decode_speed: {:.2f} token/s, ".format(decode_speed),
            " total_speed(prefill+decode): {:.2f} token/s".format(total_speed),
        )

        history.append({"role": "assistant", "content": response})


if __name__ == '__main__':
    inference_cli()
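The CLI above is interactive; for reference, a minimal non-interactive sketch of the same API follows. It assumes only what is visible in this commit: `InferenceConfig` from config.py, `Inference` from utils/inference.py, and a `stream_predict()` generator yielding `(new_text, first_token_latency, decode_speed, total_speed)`. The model paths are illustrative.

```python
# Hypothetical one-shot driver built on the same classes used by cli_chat.py.
import os
from config import InferenceConfig
from utils.inference import Inference

project_dir = os.path.dirname(os.path.abspath(__file__))
config = InferenceConfig(
    hf_model_dir=os.path.join(project_dir, "download", "Qwen1_5_0_5B_Chat"),
    om_model_path=os.path.join(project_dir, "output", "model", "qwen1.5_0.5b_chat.om"),
    onnx_model_path=os.path.join(project_dir, "output", "onnx", "qwen1.5_0.5b_chat.onnx"),
    session_type="onnx",  # run through onnxruntime instead of the ACL/om path
    max_input_length=512,
    max_output_length=1024,
    kv_cache_length=1024,
)
engine = Inference(config)

answer = ""
# stream_predict() is assumed to yield incremental text plus timing stats,
# exactly as consumed in inference_cli() above.
for new_text, first_token_latency, decode_speed, total_speed in engine.stream_predict(
    "Hello, what can you do?", history=[]
):
    answer += new_text
print(answer)
```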
54 changes: 54 additions & 0 deletions config.py
@@ -0,0 +1,54 @@
import os
from transformers.models.qwen2 import Qwen2Config, Qwen2Tokenizer


class InferenceConfig:
    def __init__(
        self,
        hf_model_dir: str,
        om_model_path: str,
        onnx_model_path: str,
        session_type: str = "acl",  # "acl" (Ascend Computing Language) or "onnx"
        device_id: int = 0,
        sampling_method: str = "top_k",
        sampling_value: float = 10,
        temperature: float = 0.7,
        max_input_length: int = 512,  # maximum input length
        max_output_length: int = 1024,  # maximum output length
        kvcache_method: str = "fixsize",  # kv_cache type: basic, fixsize, streamllm or H2O; see kvcache.py for details
        kv_cache_length: int = 1024,  # maximum kv_cache length
        cache_format: str = 'huggingface-tensor',  # kv_cache format
        dtype: str = "float16",
    ):
        self.tokenizer_dir = hf_model_dir
        self.session_type = session_type
        if self.session_type == "acl":
            assert os.path.exists(om_model_path), f"{om_model_path} not exists"
        elif self.session_type == "onnx":
            assert os.path.exists(onnx_model_path), f"{onnx_model_path} not exists"
        self.om_model_path = om_model_path
        self.onnx_model_path = onnx_model_path
        self.device_id = device_id
        self.sampling_method = sampling_method
        self.sampling_value = sampling_value
        self.temperature = temperature
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length
        self.kvcache_method = kvcache_method
        self.kv_cache_length = kv_cache_length  # max_cache_size
        self.cache_format = cache_format
        self.dtype = dtype
        self.model_config = Qwen2Config.from_pretrained(hf_model_dir)
        self.num_hidden_layers = self.model_config.num_hidden_layers  # n_layer
        self.num_key_value_heads = self.model_config.num_key_value_heads  # head_num
        self.hidden_size = self.model_config.hidden_size  # hidden_dim
        self.num_attention_heads = self.model_config.num_attention_heads
        self.per_head_dim = self.hidden_size // self.num_attention_heads  # head_dim
        self.past_key_value_shape = (
            self.num_hidden_layers,
            2,
            1,
            self.num_key_value_heads,
            self.kv_cache_length,
            self.per_head_dim
        )
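For orientation, `past_key_value_shape` packs the cache as (layers, key/value, batch, kv heads, cache length, head dim). The sketch below spells that out with the values published for Qwen1.5-0.5B-Chat (24 layers, hidden size 1024, 16 attention and KV heads); these numbers are assumptions taken from the upstream config, not read from this repository.

```python
# Hand-computed example of past_key_value_shape, assuming the
# Qwen1.5-0.5B-Chat configuration (values hard-coded for illustration).
num_hidden_layers = 24       # Qwen2Config.num_hidden_layers
num_key_value_heads = 16     # Qwen2Config.num_key_value_heads
hidden_size = 1024           # Qwen2Config.hidden_size
num_attention_heads = 16     # Qwen2Config.num_attention_heads
kv_cache_length = 1024       # InferenceConfig default

per_head_dim = hidden_size // num_attention_heads  # 64
past_key_value_shape = (
    num_hidden_layers,    # one cache slot per transformer layer
    2,                    # key and value tensors
    1,                    # batch size is fixed to 1
    num_key_value_heads,
    kv_cache_length,
    per_head_dim,
)
print(past_key_value_shape)  # (24, 2, 1, 16, 1024, 64)
```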
1 change: 1 addition & 0 deletions export/change_node.py
@@ -32,6 +32,7 @@
    if node.op_type == "Trilu":
        new_node = helper.make_node(
            "Trilu",
            name="MY_" + node.name,
            inputs=[node.input[0]],
            outputs=node.output,
            upper=0
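The hunk above only shows how the replacement Trilu node is built; the load/replace/save scaffolding around it is not part of this diff. The sketch below is a hedged reconstruction of that flow using the standard onnx API, with paths and loop structure assumed rather than taken from change_node.py.

```python
# Assumed surrounding flow for the Trilu rewrite shown above; the real
# change_node.py may structure this differently (paths are illustrative).
import onnx
from onnx import helper

model = onnx.load("output/onnx/qwen1.5_0.5b_chat.onnx")  # assumed input path
graph = model.graph

for i, node in enumerate(list(graph.node)):
    if node.op_type == "Trilu":
        # Rebuild the node with an explicit name, only its first input,
        # and the `upper` attribute pinned to 0, so that atc can parse it.
        new_node = helper.make_node(
            "Trilu",
            name="MY_" + node.name,
            inputs=[node.input[0]],
            outputs=node.output,
            upper=0,
        )
        graph.node.remove(node)
        graph.node.insert(i, new_node)

onnx.save(model, "output/onnx/qwen1.5_0.5b_chat_modified.onnx")  # assumed output path
```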
62 changes: 43 additions & 19 deletions export/export_onnx.py
@@ -16,9 +16,6 @@
import io
import argparse

device_str = "npu"
if device_str == "npu":
    import torch_npu

now_dir = os.path.dirname(os.path.abspath(__file__))
project_dir = os.path.dirname(now_dir)
@@ -32,6 +29,20 @@

def parser_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--device_str",
        type=str,
        choices=["npu", "cuda", "cpu"],
        help="support npu, cuda, cpu",
        default="npu",
    )
    parser.add_argument(
        "--dtype",
        type=str,
        help="supports float16/float32; if using CPU, only float32 is supported",
        choices=["float16", "float32"],
        default="float16",
    )
    parser.add_argument(
        '--hf_model_dir',
        type=str,
@@ -54,17 +65,29 @@ def parser_arguments():


def export_onnx(
    base_model: str,
    output_path: str,
    kv_cache_length: int,
    num_hidden_layers: int,
    num_key_value_heads: int,
    per_head_dim: int,
    device_str,
    dtype: str,
    hf_model_dir: str,
    onnx_model_path: str,
    kv_cache_length: int,
    num_hidden_layers: int,
    num_key_value_heads: int,
    per_head_dim: int,
):
    if device_str == "npu":
        import torch_npu
    if dtype == "float16":
        assert device_str.lower() != "cpu", "cpu does not support fp16"
        torch_dtype = torch.float16
    elif dtype == "float32":
        torch_dtype = torch.float32
    else:
        raise Exception("unsupported dtype")

    device = torch.device(device_str)
    model = Qwen2ForCausalLM.from_pretrained(
        base_model,
        torch_dtype=torch.float16,
        hf_model_dir,
        torch_dtype=torch_dtype,
        trust_remote_code=True
    ).to(device)
    quantize_cfg = {
@@ -95,8 +118,8 @@ def export_onnx(
    output_names = ["logits", "out_key_values"]
    dynamic_axes = {
        "input_ids": {0: "batch_size", 1: "seq_length"},
        "attention_mask": {0: "batch_size", 1: "seq_length+kv_len"},
        "position_ids": {0: "batch_size", 1: "seq_length"},
        "attention_mask": {0: "batch_size", 1: "all_len"},
        "past_key_values": {2: "batch_size", 4: "kv_len"},
    }
    batch_size = 1
@@ -115,7 +138,7 @@ def export_onnx(
            kv_cache_length,
            per_head_dim
        ),
        dtype=torch.float16
        dtype=torch_dtype
    ).to(device)
    input_args = (
        input_ids,
@@ -128,7 +151,6 @@ def export_onnx(
        True,   # output_attentions: Optional[bool] = None,
        None,   # output_hidden_states
        False   # return_dict:

    )
    model.eval()
    with torch.no_grad():
@@ -137,11 +159,11 @@ def export_onnx(
        # print(model)
        torch.onnx.export(
            model,
            f=output_path,
            f=onnx_model_path,
            args=input_args,
            input_names=input_names,
            output_names=output_names,
            # dynamic_axes=dynamic_axes,
            dynamic_axes=dynamic_axes,
            do_constant_folding=False,
            opset_version=14,
            export_params=True
@@ -178,9 +200,11 @@ def export_onnx(
    print("new model config save ok in ", args.hf_model_dir)
    print("begin export onnx")
    export_onnx(
        args.hf_model_dir,
        args.onnx_model_path,
        args.kv_cache_length,
        device_str=args.device_str,
        dtype=args.dtype,
        hf_model_dir=args.hf_model_dir,
        onnx_model_path=args.onnx_model_path,
        kv_cache_length=args.kv_cache_length,
        num_hidden_layers=num_hidden_layers,
        num_key_value_heads=num_key_value_heads,
        per_head_dim=per_head_dim
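Before handing the graph to atc, the export can be smoke-tested with onnxruntime. The sketch below feeds dummy inputs whose names and layout follow the diff above (`input_ids`, `attention_mask`, `position_ids`, `past_key_values` in, `logits` and `out_key_values` out) and the cache shape from config.py. The concrete dimensions and dtypes are assumptions for Qwen1.5-0.5B-Chat and should be checked against `session.get_inputs()`.

```python
# Hedged onnxruntime smoke test; dimensions and dtypes are assumed, not taken
# from the repository, and may need adjusting for a different model size.
import numpy as np
import onnxruntime as ort

onnx_path = "output/onnx/qwen1.5_0.5b_chat.onnx"  # assumed path
num_hidden_layers, num_key_value_heads, per_head_dim = 24, 16, 64
kv_cache_length, seq_length = 1024, 8

session = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])

feeds = {
    "input_ids": np.ones((1, seq_length), dtype=np.int64),
    # the "all_len" axis is current sequence length + cached length
    "attention_mask": np.ones((1, seq_length + kv_cache_length), dtype=np.int64),
    "position_ids": np.arange(seq_length, dtype=np.int64)[None, :],
    "past_key_values": np.zeros(
        (num_hidden_layers, 2, 1, num_key_value_heads, kv_cache_length, per_head_dim),
        dtype=np.float16,
    ),
}
logits, out_key_values = session.run(["logits", "out_key_values"], feeds)
print(logits.shape, out_key_values.shape)
```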

