diff --git a/README.md b/README.md
index 5ee85cf..e3f8bf8 100644
--- a/README.md
+++ b/README.md
@@ -46,12 +46,14 @@ python3 ./cli_chat.py --hf_model_dir="download/[path to your downloaded model]"
 ```
+- Demo (demo model: qwen1.5-0.5b-chat)
+![](./image/qwen1.5_0.5b_chat.gif)
 ### Current features
 - [x] Export ONNX and OM models
-- [x] Model inference with ONNX support.
-- [ ] Model inference with ACL support.
+- [x] Model inference with ONNX support (CPU only).
+- [x] Model inference with ACL support.
 - [x] Streaming output
 - [ ] OpenAI-compatible API server
 - [ ] Function call support
diff --git a/export/onnx2om.py b/export/onnx2om.py
index 6a967f1..de20d8c 100644
--- a/export/onnx2om.py
+++ b/export/onnx2om.py
@@ -65,6 +65,7 @@ def get_soc_version():
         if soc_version is not None:
             break
     assert soc_version is not None, print("soc_version", soc_version)
+    print("SoC Version is ", soc_version)
     return soc_version
diff --git a/image/qwen1.5_0.5b_chat.gif b/image/qwen1.5_0.5b_chat.gif
new file mode 100644
index 0000000..3ece139
Binary files /dev/null and b/image/qwen1.5_0.5b_chat.gif differ
diff --git a/utils/engine.py b/utils/engine.py
index 7375834..2db22f6 100644
--- a/utils/engine.py
+++ b/utils/engine.py
@@ -3,9 +3,13 @@
 import acl
 import numpy as np
 import os
+from functools import reduce
+from operator import mul
 import ctypes
 from config import InferenceConfig
 from ctypes import c_void_p, c_int, c_size_t, c_ulong, c_int64,POINTER
+
+
 ACL_MEM_MALLOC_HUGE_FIRST = 0
 ACL_MEMCPY_HOST_TO_DEVICE = 1
 ACL_MEMCPY_DEVICE_TO_HOST = 2
diff --git a/utils/session.py b/utils/session.py
index 7258d2e..436d520 100644
--- a/utils/session.py
+++ b/utils/session.py
@@ -66,10 +66,10 @@ def __init__(self,config:InferenceConfig)->None:
         options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL
         # options.execution_mode = ort.ExecutionMode.ORT_PARALLEL
         self.llm_session = ort.InferenceSession(
-            config.onnx_model_path,
-            sess_options=options,
-            providers=[
-                (
+            config.onnx_model_path,
+            sess_options=options,
+            providers=[
+                (
                     "CANNExecutionProvider",
                     {
                         "device_id": 0,
@@ -80,9 +80,9 @@ def __init__(self,config:InferenceConfig)->None:
                         "enable_cann_graph": True
                     },
                 ),
-                "CPUExecutionProvider",
-            ]
-        )
+                "CPUExecutionProvider",
+            ]
+        )
 
     def run(self, input_ids:np.ndarray):
         seq_len=input_ids.shape[-1]
@@ -131,7 +131,7 @@ def run_all_logits(self, input_ids: np.ndarray):
             end = i + 16 if i+16 < seq_len else seq_len
             cache,mask,pos_ids = self.kv_cache.get_inputs(16)
             self.input_ids[0:end-i] = input_ids[i:end]
-            result:List[np.ndarray] = self.model.inference([self.input_ids,pos_ids,mask,cache])
+            result:List[np.ndarray] = self.model.inference([self.input_ids, mask, pos_ids, cache])
             self.kv_cache.update(end-i,result[1])
             logits.append(result[0][0:end-i].reshape(1,-1))
         return [np.concatenate(logits).reshape(1,1,-1)]
@@ -140,12 +140,17 @@ def run_one(self, input_ids: np.ndarray):
         self.run_times += 1
         cache, mask, pos_ids = self.kv_cache.get_inputs(1)
         result:List[np.ndarray] = self.model.inference(
-            [input_ids, pos_ids, mask, cache]
+            [input_ids, mask, pos_ids, cache]
         )
-        # new_kv_cache = result[1]
-        # print(" == Debug == ")
-        # print("new_kv_cache: shape", new_kv_cache.shape)
-        # print("new_kv_cache: mean: ", new_kv_cache.astype(np.float32).mean().item())
-        # print("new_kv_cache: max: ", new_kv_cache.astype(np.float32).max().item())
+        # if self.run_times <= 2:
+        #     print(" == Debug == ")
+        #     logits = result[0]
+        #     new_kv_cache = result[1]
+        #     print("logits shape: ", logits.shape)
+        #     print("logits mean: ", logits.astype(np.float32).mean().item())
+        #     print("logits max: ", logits.astype(np.float32).max().item())
+        #     print("new_kv_cache: shape", new_kv_cache.shape)
+        #     print("new_kv_cache: mean: ", new_kv_cache.astype(np.float32).mean().item())
+        #     print("new_kv_cache: max: ", new_kv_cache.astype(np.float32).max().item())
         self.kv_cache.update(1,result[1])
         return result[0].reshape(1,1,-1)