support onnx inference
Tlntin committed Jul 27, 2024
1 parent 0e41d31 commit d760724
Showing 11 changed files with 1,143 additions and 39 deletions.
28 changes: 18 additions & 10 deletions README.md
@@ -13,38 +13,46 @@

### Running step by step
##### Step 1: Compile the model
1. Enter the export folder
1. Enter the export folder and export the ONNX model.
```bash
cd export
python3 export_onnx.py --hf_model_dir="download/[你下载的模型路径]"
cd ..
```
2. Export the ONNX model.

2. Validate the ONNX model: go back to the project root and run cli_chat.py to check that ONNX-based chat works correctly.
```bash
python3 export_onnx.py --hf_model_dir="download/[你下载的模型路径]"
python3 ./cli_chat.py --session_type=onnx
```

3. Modify the ONNX graph: the exported Trilu and Cast operators currently have some issues, so the structure needs to be adjusted.
3. Enter the export folder and modify the ONNX graph: the exported Trilu and Cast operators cannot be recognized by the atc command, so the structure needs to be adjusted.
```bash
cd export
python3 change_node.py
cd ..
```

4. Convert the ONNX model to an OM model
```bash
cd export
python3 onnx2om.py --hf_model_dir="download/[你下载的模型路径]"
```

5. Return to the parent directory
```bash
cd ..
```


##### Step 2: Run the model
- Run the model directly with the following command
```bash
python3 ./cli_chat.py --hf_model_dir="download/[你下载的模型路径]"
```



### Current features
- [x] Export ONNX and OM models
- [ ] Model inference
- [ ] Streaming output
- [x] Model inference, with ONNX inference supported.
- [ ] Model inference, with ACL inference supported.
- [x] Streaming output
- [ ] OpenAI-compatible API server
- [ ] Function call support
- [ ] Model quantization, e.g. weight-only, SmoothQuant, etc.
98 changes: 98 additions & 0 deletions cli_chat.py
@@ -0,0 +1,98 @@
import sys
import argparse
from concurrent.futures import ThreadPoolExecutor
from config import InferenceConfig
from utils.inference import Inference
import os

project_dir = os.path.dirname(os.path.abspath(__file__))
parser = argparse.ArgumentParser()
parser.add_argument(
    '--hf_model_dir',
    type=str,
    help="model and tokenizer path, only supports huggingface models",
    default=os.path.join(project_dir, "download", "Qwen1_5_0_5B_Chat")
)
parser.add_argument(
    "--session_type",
    type=str,
    default="acl",
    help="acl or onnx",
    choices=["acl", "onnx"],
)
parser.add_argument(
    '--onnx_model_path',
    type=str,
    help="onnx model path",
    default=os.path.join(project_dir, "output", "onnx", "qwen1.5_0.5b_chat.onnx")
)
parser.add_argument(
    "--om_model_path",
    help="om model path",
    type=str,
    default=os.path.join(project_dir, "output", "model", "qwen1.5_0.5b_chat.om")
)
parser.add_argument(
    "--max_input_length",
    help="max input length",
    type=int,
    default=512,
)

parser.add_argument(
    "--max_output_length",
    help="max output length (contains input + new tokens)",
    type=int,
    default=1024,
)

args = parser.parse_args()
config = InferenceConfig(
    hf_model_dir=args.hf_model_dir,
    om_model_path=args.om_model_path,
    onnx_model_path=args.onnx_model_path,
    session_type=args.session_type,
    max_output_length=args.max_output_length,
    max_input_length=args.max_input_length,
    kv_cache_length=args.max_output_length,
)
infer_engine = Inference(config)


def inference_cli():
    print("\nWelcome to the Qwen chatbot. Type exit or quit to leave, clear to reset the chat history.")
    history = []
    while True:
        input_text = input("Input: ")
        if input_text in ["exit", "quit", "exit()", "quit()"]:
            break
        if input_text == 'clear':
            history = []
            print("Output: chat history cleared.")
            continue
        print("Output: ", end='')
        response = ""
        is_first = True
        first_token_latency, decode_speed, total_speed = 0, 0, 0
        for (
            new_text,
            first_token_latency,
            decode_speed,
            total_speed
        ) in infer_engine.stream_predict(input_text, history=history):
            if is_first:
                if len(new_text.strip()) == 0:
                    continue
                is_first = False
            print(new_text, end='', flush=True)
            response += new_text
        print("")
        print(
            "[INFO] first_token_latency: {:.4f}s,".format(first_token_latency),
            " decode_speed: {:.2f} token/s, ".format(decode_speed),
            " total_speed(prefill+decode): {:.2f} token/s".format(total_speed),
        )

        history.append({"role": "assistant", "content": response})


if __name__ == '__main__':
    inference_cli()
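The CLI above is interactive; for reference, a minimal non-interactive sketch of the same API follows. It assumes only what is visible in this commit: `InferenceConfig` from config.py, `Inference` from utils/inference.py, and a `stream_predict()` generator yielding `(new_text, first_token_latency, decode_speed, total_speed)`. The model paths are illustrative.

```python
# Hypothetical one-shot driver built on the same classes used by cli_chat.py.
import os
from config import InferenceConfig
from utils.inference import Inference

project_dir = os.path.dirname(os.path.abspath(__file__))
config = InferenceConfig(
    hf_model_dir=os.path.join(project_dir, "download", "Qwen1_5_0_5B_Chat"),
    om_model_path=os.path.join(project_dir, "output", "model", "qwen1.5_0.5b_chat.om"),
    onnx_model_path=os.path.join(project_dir, "output", "onnx", "qwen1.5_0.5b_chat.onnx"),
    session_type="onnx",  # run through onnxruntime instead of the ACL/om path
    max_input_length=512,
    max_output_length=1024,
    kv_cache_length=1024,
)
engine = Inference(config)

answer = ""
# stream_predict() is assumed to yield incremental text plus timing stats,
# exactly as consumed in inference_cli() above.
for new_text, first_token_latency, decode_speed, total_speed in engine.stream_predict(
    "Hello, what can you do?", history=[]
):
    answer += new_text
print(answer)
```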
54 changes: 54 additions & 0 deletions config.py
@@ -0,0 +1,54 @@
import os
from transformers.models.qwen2 import Qwen2Config, Qwen2Tokenizer


class InferenceConfig:
    def __init__(
        self,
        hf_model_dir: str,
        om_model_path: str,
        onnx_model_path: str,
        session_type: str = "acl",  # "acl" (Ascend Computing Language) or "onnx"
        device_id: int = 0,
        sampling_method: str = "top_k",
        sampling_value: float = 10,
        temperature: float = 0.7,
        max_input_length: int = 512,  # maximum input length
        max_output_length: int = 1024,  # maximum output length
        kvcache_method: str = "fixsize",  # kv_cache type: basic, fixsize, streamllm or H2O; see kvcache.py for details
        kv_cache_length: int = 1024,  # maximum kv_cache length
        cache_format: str = 'huggingface-tensor',  # kv_cache format
        dtype: str = "float16",
    ):
        self.tokenizer_dir = hf_model_dir
        self.session_type = session_type
        if self.session_type == "acl":
            assert os.path.exists(om_model_path), f"{om_model_path} not exists"
        elif self.session_type == "onnx":
            assert os.path.exists(onnx_model_path), f"{onnx_model_path} not exists"
        self.om_model_path = om_model_path
        self.onnx_model_path = onnx_model_path
        self.device_id = device_id
        self.sampling_method = sampling_method
        self.sampling_value = sampling_value
        self.temperature = temperature
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length
        self.kvcache_method = kvcache_method
        self.kv_cache_length = kv_cache_length  # max_cache_size
        self.cache_format = cache_format
        self.dtype = dtype
        self.model_config = Qwen2Config.from_pretrained(hf_model_dir)
        self.num_hidden_layers = self.model_config.num_hidden_layers  # n_layer
        self.num_key_value_heads = self.model_config.num_key_value_heads  # head_num
        self.hidden_size = self.model_config.hidden_size  # hidden_dim
        self.num_attention_heads = self.model_config.num_attention_heads
        self.per_head_dim = self.hidden_size // self.num_attention_heads  # head_dim
        self.past_key_value_shape = (
            self.num_hidden_layers,
            2,
            1,
            self.num_key_value_heads,
            self.kv_cache_length,
            self.per_head_dim
        )
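For orientation, `past_key_value_shape` packs the cache as (layers, key/value, batch, kv heads, cache length, head dim). The sketch below spells that out with the values published for Qwen1.5-0.5B-Chat (24 layers, hidden size 1024, 16 attention and KV heads); these numbers are assumptions taken from the upstream config, not read from this repository.

```python
# Hand-computed example of past_key_value_shape, assuming the
# Qwen1.5-0.5B-Chat configuration (values hard-coded for illustration).
num_hidden_layers = 24       # Qwen2Config.num_hidden_layers
num_key_value_heads = 16     # Qwen2Config.num_key_value_heads
hidden_size = 1024           # Qwen2Config.hidden_size
num_attention_heads = 16     # Qwen2Config.num_attention_heads
kv_cache_length = 1024       # InferenceConfig default

per_head_dim = hidden_size // num_attention_heads  # 64
past_key_value_shape = (
    num_hidden_layers,    # one cache slot per transformer layer
    2,                    # key and value tensors
    1,                    # batch size is fixed to 1
    num_key_value_heads,
    kv_cache_length,
    per_head_dim,
)
print(past_key_value_shape)  # (24, 2, 1, 16, 1024, 64)
```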
1 change: 1 addition & 0 deletions export/change_node.py
@@ -32,6 +32,7 @@
    if node.op_type == "Trilu":
        new_node = helper.make_node(
            "Trilu",
            name="MY_" + node.name,
            inputs=[node.input[0]],
            outputs=node.output,
            upper=0
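The hunk above only shows how the replacement Trilu node is built; the load/replace/save scaffolding around it is not part of this diff. The sketch below is a hedged reconstruction of that flow using the standard onnx API, with paths and loop structure assumed rather than taken from change_node.py.

```python
# Assumed surrounding flow for the Trilu rewrite shown above; the real
# change_node.py may structure this differently (paths are illustrative).
import onnx
from onnx import helper

model = onnx.load("output/onnx/qwen1.5_0.5b_chat.onnx")  # assumed input path
graph = model.graph

for i, node in enumerate(list(graph.node)):
    if node.op_type == "Trilu":
        # Rebuild the node with an explicit name, only its first input,
        # and the `upper` attribute pinned to 0, so that atc can parse it.
        new_node = helper.make_node(
            "Trilu",
            name="MY_" + node.name,
            inputs=[node.input[0]],
            outputs=node.output,
            upper=0,
        )
        graph.node.remove(node)
        graph.node.insert(i, new_node)

onnx.save(model, "output/onnx/qwen1.5_0.5b_chat_modified.onnx")  # assumed output path
```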
62 changes: 43 additions & 19 deletions export/export_onnx.py
@@ -16,9 +16,6 @@
import io
import argparse

device_str = "npu"
if device_str == "npu":
    import torch_npu

now_dir = os.path.dirname(os.path.abspath(__file__))
project_dir = os.path.dirname(now_dir)
@@ -32,6 +29,20 @@

def parser_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--device_str",
        type=str,
        choices=["npu", "cuda", "cpu"],
        help="support npu, cuda, cpu",
        default="npu",
    )
    parser.add_argument(
        "--dtype",
        type=str,
        help="supports float16/float32; if using CPU, only float32 is supported",
        choices=["float16", "float32"],
        default="float16",
    )
    parser.add_argument(
        '--hf_model_dir',
        type=str,
@@ -54,17 +65,29 @@ def parser_arguments():


def export_onnx(
    base_model: str,
    output_path: str,
    kv_cache_length: int,
    num_hidden_layers: int,
    num_key_value_heads: int,
    per_head_dim: int,
    device_str,
    dtype: str,
    hf_model_dir: str,
    onnx_model_path: str,
    kv_cache_length: int,
    num_hidden_layers: int,
    num_key_value_heads: int,
    per_head_dim: int,
):
    if device_str == "npu":
        import torch_npu
    if dtype == "float16":
        assert device_str.lower() != "cpu", "cpu does not support fp16"
        torch_dtype = torch.float16
    elif dtype == "float32":
        torch_dtype = torch.float32
    else:
        raise Exception("unsupported dtype")

    device = torch.device(device_str)
    model = Qwen2ForCausalLM.from_pretrained(
        base_model,
        torch_dtype=torch.float16,
        hf_model_dir,
        torch_dtype=torch_dtype,
        trust_remote_code=True
    ).to(device)
    quantize_cfg = {
@@ -95,8 +118,8 @@ def export_onnx(
    output_names = ["logits", "out_key_values"]
    dynamic_axes = {
        "input_ids": {0: "batch_size", 1: "seq_length"},
        "attention_mask": {0: "batch_size", 1: "seq_length+kv_len"},
        "position_ids": {0: "batch_size", 1: "seq_length"},
        "attention_mask": {0: "batch_size", 1: "all_len"},
        "past_key_values": {2: "batch_size", 4: "kv_len"},
    }
    batch_size = 1
@@ -115,7 +138,7 @@ def export_onnx(
            kv_cache_length,
            per_head_dim
        ),
        dtype=torch.float16
        dtype=torch_dtype
    ).to(device)
    input_args = (
        input_ids,
@@ -128,7 +151,6 @@ def export_onnx(
        True,   # output_attentions: Optional[bool] = None,
        None,   # output_hidden_states
        False   # return_dict:

    )
    model.eval()
    with torch.no_grad():
@@ -137,11 +159,11 @@ def export_onnx(
        # print(model)
        torch.onnx.export(
            model,
            f=output_path,
            f=onnx_model_path,
            args=input_args,
            input_names=input_names,
            output_names=output_names,
            # dynamic_axes=dynamic_axes,
            dynamic_axes=dynamic_axes,
            do_constant_folding=False,
            opset_version=14,
            export_params=True
@@ -178,9 +200,11 @@ def export_onnx(
    print("new model config save ok in ", args.hf_model_dir)
    print("begin export onnx")
    export_onnx(
        args.hf_model_dir,
        args.onnx_model_path,
        args.kv_cache_length,
        device_str=args.device_str,
        dtype=args.dtype,
        hf_model_dir=args.hf_model_dir,
        onnx_model_path=args.onnx_model_path,
        kv_cache_length=args.kv_cache_length,
        num_hidden_layers=num_hidden_layers,
        num_key_value_heads=num_key_value_heads,
        per_head_dim=per_head_dim
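Before handing the graph to atc, the export can be smoke-tested with onnxruntime. The sketch below feeds dummy inputs whose names and layout follow the diff above (`input_ids`, `attention_mask`, `position_ids`, `past_key_values` in, `logits` and `out_key_values` out) and the cache shape from config.py. The concrete dimensions and dtypes are assumptions for Qwen1.5-0.5B-Chat and should be checked against `session.get_inputs()`.

```python
# Hedged onnxruntime smoke test; dimensions and dtypes are assumed, not taken
# from the repository, and may need adjusting for a different model size.
import numpy as np
import onnxruntime as ort

onnx_path = "output/onnx/qwen1.5_0.5b_chat.onnx"  # assumed path
num_hidden_layers, num_key_value_heads, per_head_dim = 24, 16, 64
kv_cache_length, seq_length = 1024, 8

session = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])

feeds = {
    "input_ids": np.ones((1, seq_length), dtype=np.int64),
    # the "all_len" axis is current sequence length + cached length
    "attention_mask": np.ones((1, seq_length + kv_cache_length), dtype=np.int64),
    "position_ids": np.arange(seq_length, dtype=np.int64)[None, :],
    "past_key_values": np.zeros(
        (num_hidden_layers, 2, 1, num_key_value_heads, kv_cache_length, per_head_dim),
        dtype=np.float16,
    ),
}
logits, out_key_values = session.run(["logits", "out_key_values"], feeds)
print(logits.shape, out_key_values.shape)
```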

