From d824498b1988eb79a4fcc4ea6d57246663b6a899 Mon Sep 17 00:00:00 2001 From: Tlntin Date: Mon, 21 Oct 2024 22:57:00 +0800 Subject: [PATCH] code optimization --- README.md | 2 + cli_chat.py | 3 +- config.py | 6 +- export/change_node.py | 164 +- export/export_onnx.py | 432 +++--- export/modeling_qwen2.py | 2998 ++++++++++++++++++------------------ export/onnx2om.py | 360 +++-- export/test_onnx_run.py | 16 +- export/test_pytorch_run.py | 19 +- utils/engine.py | 167 +- utils/inference.py | 7 +- utils/kvcache.py | 16 +- utils/session.py | 68 +- 13 files changed, 2167 insertions(+), 2091 deletions(-) diff --git a/README.md b/README.md index a25fcbf..f28a839 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,8 @@ 2. 导出onnx,默认kv-cache长度为1024,可以根据自己的内存、显存来设置更大参数。 ```bash python3 export/export_onnx.py \ + --device_str=npu \ + --dtype=float16 \ --hf_model_dir="./download/Qwen2-1.5B-Instruct" \ --onnx_model_path="./output/onnx/qwen2_1.5b_chat.onnx" \ --kv_cache_length=1024 diff --git a/cli_chat.py b/cli_chat.py index 7010972..d88025a 100644 --- a/cli_chat.py +++ b/cli_chat.py @@ -75,12 +75,13 @@ def inference_cli(): break if input_text == 'clear': history = [] + infer_engine.session.reset() print("Output: 已清理历史对话信息。") continue print("Output: ", end='') response = "" is_first = True - first_token_lantency, decode_speed = 0, 0 + first_token_lantency, decode_speed, total_speed = 0, 0, 0.0 for ( new_text, first_token_lantency, diff --git a/config.py b/config.py index ff31a45..694024c 100644 --- a/config.py +++ b/config.py @@ -8,6 +8,7 @@ def __init__( hf_model_dir: str, om_model_path: str, onnx_model_path: str, + cpu_thread: int = 4, # CPU线程数 session_type: str = "acl", # 支持acl和onnx两种,acl即Ascend C Language device_id: int = 0, sampling_method: str = "top_p", # 支持 greedy, top_p, top_k @@ -30,6 +31,7 @@ def __init__( assert os.path.exists(onnx_model_path), print(onnx_model_path, "not exists") self.om_model_path = om_model_path self.onnx_model_path = onnx_model_path + self.cpu_thread = cpu_thread self.device_id = device_id self.sampling_method = sampling_method self.sampling_value = sampling_value @@ -48,11 +50,9 @@ def __init__( self.num_attention_heads = self.model_config.num_attention_heads self.per_head_dim = self.hidden_size // self.num_attention_heads # head_dim self.past_key_value_shape = ( - self.num_hidden_layers, - 2, self.max_batch, - self.num_key_value_heads, self.kv_cache_length, + self.num_hidden_layers * 2 * self.num_key_value_heads, self.per_head_dim ) self.max_prefill_length = max_prefill_length diff --git a/export/change_node.py b/export/change_node.py index 1469c83..b9122a3 100644 --- a/export/change_node.py +++ b/export/change_node.py @@ -1,82 +1,82 @@ -import os -import onnx -import onnx.helper as helper -from onnx import TensorProto -from tqdm import tqdm -import argparse - - -now_dir = os.path.dirname(os.path.abspath(__file__)) -project_dir = os.path.dirname(now_dir) -output_dir = os.path.join(project_dir, "output") -if not os.path.exists(output_dir): - os.mkdir(output_dir) -old_onnx_dir = os.path.join(output_dir, "onnx") -if not os.path.exists(old_onnx_dir): - os.mkdir(old_onnx_dir) -new_onnx_dir = os.path.join(output_dir, "onnx2") -if not os.path.exists(new_onnx_dir): - os.mkdir(new_onnx_dir) - -now_dir = os.path.dirname(os.path.abspath(__file__)) -project_dir = os.path.dirname(now_dir) -model_name = "qwen2_1.5b_chat.onnx" - -parser = argparse.ArgumentParser() -parser.add_argument( - '--input_model_path', - type=str, - help="raw onnx model convert by pytroch", - 
default=os.path.join(old_onnx_dir, model_name) -) -parser.add_argument( - "--output_model_path", - help="output onnx model path", - type=str, - default=os.path.join(new_onnx_dir, model_name) -) - -args = parser.parse_args() - -model = onnx.load(args.input_model_path) -new_nodes = [] - -for node in tqdm(model.graph.node, desc="replace node..."): - # 判断节点类型 - new_node = node - if node.op_type == "Trilu": - new_node = helper.make_node( - "Trilu", - name="MY_" + node.name, - inputs=[node.input[0]], - outputs=node.output, - upper=0 - ) - if node.op_type == "Cast": - # 替换为新的算子类型 - to_attribute = next(attr for attr in node.attribute if attr.name == "to") - if to_attribute.i == TensorProto.INT8: - new_node = helper.make_node( - "AscendQuant", - inputs=node.input, - outputs=node.output, - offset=0., - scale=1., - ) - new_nodes.append(new_node) -print("make new graph") -new_graph = helper.make_graph( - new_nodes, - "new_graph", - inputs=model.graph.input, - outputs=model.graph.output, - value_info=model.graph.value_info, - initializer=model.graph.initializer -) -print("make new model") -new_model = helper.make_model(new_graph, producer_name=model.producer_name,opset_imports=model.opset_import,ir_version = model.ir_version) -# new_model.ir_version = model.ir_version -# new_model.opset_import = model.opset_import -# new_model.metadata_props = model.metadata_props -print("will save model in ", args.output_model_path) -onnx.save(new_model, args.output_model_path, save_as_external_data=True) +import os +import onnx +import onnx.helper as helper +from onnx import TensorProto +from tqdm import tqdm +import argparse + + +now_dir = os.path.dirname(os.path.abspath(__file__)) +project_dir = os.path.dirname(now_dir) +output_dir = os.path.join(project_dir, "output") +if not os.path.exists(output_dir): + os.mkdir(output_dir) +old_onnx_dir = os.path.join(output_dir, "onnx") +if not os.path.exists(old_onnx_dir): + os.mkdir(old_onnx_dir) +new_onnx_dir = os.path.join(output_dir, "onnx2") +if not os.path.exists(new_onnx_dir): + os.mkdir(new_onnx_dir) + +now_dir = os.path.dirname(os.path.abspath(__file__)) +project_dir = os.path.dirname(now_dir) +model_name = "qwen2_1.5b_chat.onnx" + +parser = argparse.ArgumentParser() +parser.add_argument( + '--input_model_path', + type=str, + help="raw onnx model convert by pytroch", + default=os.path.join(old_onnx_dir, model_name) +) +parser.add_argument( + "--output_model_path", + help="output onnx model path", + type=str, + default=os.path.join(new_onnx_dir, model_name) +) + +args = parser.parse_args() + +model = onnx.load(args.input_model_path) +new_nodes = [] + +for node in tqdm(model.graph.node, desc="replace node..."): + # 判断节点类型 + new_node = node + if node.op_type == "Trilu": + new_node = helper.make_node( + "Trilu", + name="MY_" + node.name, + inputs=[node.input[0]], + outputs=node.output, + upper=0 + ) + if node.op_type == "Cast": + # 替换为新的算子类型 + to_attribute = next(attr for attr in node.attribute if attr.name == "to") + if to_attribute.i == TensorProto.INT8: + new_node = helper.make_node( + "AscendQuant", + inputs=node.input, + outputs=node.output, + offset=0., + scale=1., + ) + new_nodes.append(new_node) +print("make new graph") +new_graph = helper.make_graph( + new_nodes, + "new_graph", + inputs=model.graph.input, + outputs=model.graph.output, + value_info=model.graph.value_info, + initializer=model.graph.initializer +) +print("make new model") +new_model = helper.make_model(new_graph, producer_name=model.producer_name,opset_imports=model.opset_import,ir_version = 
model.ir_version) +# new_model.ir_version = model.ir_version +# new_model.opset_import = model.opset_import +# new_model.metadata_props = model.metadata_props +print("will save model in ", args.output_model_path) +onnx.save(new_model, args.output_model_path, save_as_external_data=True) diff --git a/export/export_onnx.py b/export/export_onnx.py index d52e1e9..3d6f7e4 100644 --- a/export/export_onnx.py +++ b/export/export_onnx.py @@ -1,217 +1,215 @@ -"""_summary_ -qwen2 modeling_qwen2.py download: https://github.com/huggingface/transformers/blob/v4.37.0/src/transformers/models/qwen2/modeling_qwen2.py -""" - -import os -import json -import sys -from typing import List -import torch -import shutil -# from transformers import AutoModel, Qwen2Config -from transformers.models.qwen2 import Qwen2Config -from modeling_qwen2 import Qwen2ForCausalLM - -import onnx -import io -import argparse - - -now_dir = os.path.dirname(os.path.abspath(__file__)) -project_dir = os.path.dirname(now_dir) -output_dir = os.path.join(project_dir, "output") -if not os.path.exists(output_dir): - os.mkdir(output_dir) -onnx_model_dir = os.path.join(output_dir, "onnx") -if not os.path.exists(onnx_model_dir): - os.mkdir(onnx_model_dir) -if len(os.listdir(onnx_model_dir)) > 0: - print("found some file in {}, will clear it".format(onnx_model_dir)) - for temp_file in os.listdir(onnx_model_dir): - temp_path = os.path.join(onnx_model_dir, temp_file) - os.remove(temp_path) - - -def parser_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--device_str", - type=str, - choices=["npu", "cuda", "cpu"], - help="support npu, cuda, cpu", - default="npu", - ) - parser.add_argument( - "--dtype" , - type=str, - help="support float16/float32, if use CPU, only support fp32", - choices=["float16", "float32"], - default="float16", - ) - parser.add_argument( - '--hf_model_dir', - type=str, - help="model and tokenizer path, only support huggingface model", - default=os.path.join(project_dir, "download", "Qwen2-1.5B-Instruct") - ) - parser.add_argument( - "--onnx_model_path", - help="output onnx path", - type=str, - default=os.path.join(onnx_model_dir, "qwen2_1.5b_chat.onnx") - ) - parser.add_argument( - "--kv_cache_length", - help="kv-cache length", - type=int, - default=1024, - ) - return parser.parse_args() - - -def export_onnx( - device_str, - dtype: str, - hf_model_dir: str, - onnx_model_path: str, - kv_cache_length: int, - num_hidden_layers: int, - num_key_value_heads: int, - per_head_dim: int, -): - if device_str == "npu": - import torch_npu - if dtype == "float16": - assert device_str.lower() != "cpu", print("cpu not support fp16") - torch_dtype = torch.float16 - elif dtype == "float32": - torch_dtype = torch.float32 - else: - raise Exception("unsupport dtype") - - device = torch.device(device_str) - model = Qwen2ForCausalLM.from_pretrained( - hf_model_dir, - torch_dtype=torch_dtype, - # trust_remote_code=True - ).to(device) - quantize_cfg = { - "query_key_value": { - "type": "W8X8", - "act_scale": False - }, - "dense": { - "type": "W8X8", - "act_scale": False - }, - "dense_h_to_4h": { - "type": "W8X8", - "act_scale": False - }, - "dense_4h_to_h": { - "type": "W8X8", - "act_scale": False - } - } - quantize_cfg = {} - input_names = [ - "input_ids", - "attention_mask", - "position_ids", - "past_key_values" - ] - output_names = ["logits", "out_key_values"] - dynamic_axes = { - "input_ids": {0: "batch_size", 1: "seq_length"}, - "attention_mask": {0: "batch_size", 1: "seq_length+kv_len"}, - "position_ids": {0: 
"batch_size", 1: "seq_length"}, - "past_key_values": {2: "batch_size", 4: "kv_len"}, - } - batch_size = 1 - seq_len = 1 - all_len = seq_len + kv_cache_length - - input_ids = torch.zeros((batch_size, seq_len)).long().to(device) - attention_mask = torch.zeros((batch_size, all_len)).long().to(device) - position_ids = torch.zeros((batch_size, seq_len)).long().to(device) - past_key_values = torch.rand( - ( - num_hidden_layers, - 2, - 1, - num_key_value_heads, - kv_cache_length, - per_head_dim - ), - dtype=torch_dtype - ).to(device) - input_args = ( - input_ids, - attention_mask, - position_ids, - past_key_values, - # None, # inputs_embeds: Optional[torch.FloatTensor] = None, - # None, # labels: Optional[torch.LongTensor] = None, - # True, # use_cache: Optional[bool] = None, - # True, # output_attentions: Optional[bool] = None, - # None, # output_hidden_states - # False # return_dict: - ) - model.eval() - with torch.no_grad(): - # from quantize import quantize - # quantize(model, cfg=quantize_cfg) - # print(model) - torch.onnx.export( - model, - f=onnx_model_path, - args=input_args, - input_names=input_names, - output_names=output_names, - dynamic_axes=dynamic_axes, - do_constant_folding=False, - opset_version=14, - export_params=True - ) - - -if __name__ == "__main__": - args = parser_arguments() - # model_config = Qwen2Config.from_pretrained(args.hf_model_dir) - # copy modeling_qwen2.py to model dir - src_file_path = os.path.join(now_dir, "modeling_qwen2.py") - target_file_path = os.path.join(args.hf_model_dir, "modeling_qwen2.py") - shutil.copy(src_file_path, target_file_path) - # print(model_config) - config_json = os.path.join(args.hf_model_dir, "config.json") - with open(config_json, "rt", encoding="utf-8") as f: - model_config = json.load(f) - model_config["auto_map"] = { - "AutoModel": "modeling_qwen2.Qwen2ForCausalLM", - "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM", - "AutoModelForSeq2SeqLM": "modeling_qwen2.Qwen2ForCausalLM", - "AutoModelForSequenceClassification": "modeling_qwen2.Qwen2ForSequenceClassification" - } - with open(config_json, "wt", encoding="utf-8") as f: - json.dump(model_config, f, indent=4) - test_model_config = Qwen2Config.from_pretrained(args.hf_model_dir) - # print(test_model_config) - test_model_config.torch_dtype = "float16" - test_model_config.save_pretrained(args.hf_model_dir) - num_hidden_layers = test_model_config.num_hidden_layers - num_attention_heads = test_model_config.num_attention_heads - num_key_value_heads = test_model_config.num_key_value_heads - hidden_size = test_model_config.hidden_size - per_head_dim = hidden_size // num_attention_heads - print("new model config save ok in ", args.hf_model_dir) - print("begin export onnx") - export_onnx( - device_str=args.device_str, - dtype=args.dtype, - hf_model_dir=args.hf_model_dir, - onnx_model_path=args.onnx_model_path, - kv_cache_length=args.kv_cache_length, - num_hidden_layers=num_hidden_layers, - num_key_value_heads=num_key_value_heads, - per_head_dim=per_head_dim - ) +"""_summary_ +qwen2 modeling_qwen2.py download: https://github.com/huggingface/transformers/blob/v4.37.0/src/transformers/models/qwen2/modeling_qwen2.py +""" + +import os +import json +import sys +from typing import List +import torch +import shutil +# from transformers import AutoModel, Qwen2Config +from transformers.models.qwen2 import Qwen2Config +from modeling_qwen2 import Qwen2ForCausalLM + +import onnx +import io +import argparse + + +now_dir = os.path.dirname(os.path.abspath(__file__)) +project_dir = 
os.path.dirname(now_dir) +output_dir = os.path.join(project_dir, "output") +if not os.path.exists(output_dir): + os.mkdir(output_dir) +onnx_model_dir = os.path.join(output_dir, "onnx") +if not os.path.exists(onnx_model_dir): + os.mkdir(onnx_model_dir) +if len(os.listdir(onnx_model_dir)) > 0: + print("found some file in {}, will clear it".format(onnx_model_dir)) + for temp_file in os.listdir(onnx_model_dir): + temp_path = os.path.join(onnx_model_dir, temp_file) + os.remove(temp_path) + + +def parser_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--device_str", + type=str, + choices=["npu", "cuda", "cpu"], + help="support npu, cuda, cpu", + default="cpu", + ) + parser.add_argument( + "--dtype" , + type=str, + help="support float16/float32, if use CPU, only support fp32", + choices=["float16", "float32"], + default="float32", + ) + parser.add_argument( + '--hf_model_dir', + type=str, + help="model and tokenizer path, only support huggingface model", + default=os.path.join(project_dir, "download", "Qwen2-1.5B-Instruct") + ) + parser.add_argument( + "--onnx_model_path", + help="output onnx path", + type=str, + default=os.path.join(onnx_model_dir, "qwen2_1.5b_chat.onnx") + ) + parser.add_argument( + "--kv_cache_length", + help="kv-cache length", + type=int, + default=1024, + ) + return parser.parse_args() + + +def export_onnx( + device_str, + dtype: str, + hf_model_dir: str, + onnx_model_path: str, + kv_cache_length: int, + num_hidden_layers: int, + num_key_value_heads: int, + per_head_dim: int, +): + if device_str == "npu": + import torch_npu + if dtype == "float16": + assert device_str.lower() != "cpu", print("cpu not support fp16") + torch_dtype = torch.float16 + elif dtype == "float32": + torch_dtype = torch.float32 + else: + raise Exception("unsupport dtype") + + device = torch.device(device_str) + model = Qwen2ForCausalLM.from_pretrained( + hf_model_dir, + torch_dtype=torch_dtype, + # trust_remote_code=True + ).to(device) + quantize_cfg = { + "query_key_value": { + "type": "W8X8", + "act_scale": False + }, + "dense": { + "type": "W8X8", + "act_scale": False + }, + "dense_h_to_4h": { + "type": "W8X8", + "act_scale": False + }, + "dense_4h_to_h": { + "type": "W8X8", + "act_scale": False + } + } + quantize_cfg = {} + input_names = [ + "input_ids", + "attention_mask", + "position_ids", + "past_key_values" + ] + output_names = ["logits", "out_key_values"] + dynamic_axes = { + "input_ids": {0: "batch_size", 1: "seq_length"}, + "attention_mask": {0: "batch_size", 1: "seq_length+kv_len"}, + "position_ids": {0: "batch_size", 1: "seq_length"}, + "past_key_values": {0: "batch_size", 1: "kv_len"}, + } + batch_size = 1 + seq_len = 1 + all_len = seq_len + kv_cache_length + + input_ids = torch.zeros((batch_size, seq_len)).long().to(device) + attention_mask = torch.zeros((batch_size, all_len)).long().to(device) + position_ids = torch.zeros((batch_size, seq_len)).long().to(device) + past_key_values = torch.rand( + ( + 1, + kv_cache_length, + num_hidden_layers * 2 * num_key_value_heads, + per_head_dim + ), + dtype=torch_dtype + ).to(device) + input_args = ( + input_ids, + attention_mask, + position_ids, + past_key_values, + # None, # inputs_embeds: Optional[torch.FloatTensor] = None, + # None, # labels: Optional[torch.LongTensor] = None, + # True, # use_cache: Optional[bool] = None, + # True, # output_attentions: Optional[bool] = None, + # None, # output_hidden_states + # False # return_dict: + ) + model.eval() + with torch.no_grad(): + # from quantize import quantize + # 
quantize(model, cfg=quantize_cfg) + # print(model) + torch.onnx.export( + model, + f=onnx_model_path, + args=input_args, + input_names=input_names, + output_names=output_names, + dynamic_axes=dynamic_axes, + do_constant_folding=False, + opset_version=14, + export_params=True + ) + + +if __name__ == "__main__": + args = parser_arguments() + # model_config = Qwen2Config.from_pretrained(args.hf_model_dir) + # copy modeling_qwen2.py to model dir + src_file_path = os.path.join(now_dir, "modeling_qwen2.py") + target_file_path = os.path.join(args.hf_model_dir, "modeling_qwen2.py") + shutil.copy(src_file_path, target_file_path) + # print(model_config) + config_json = os.path.join(args.hf_model_dir, "config.json") + with open(config_json, "rt", encoding="utf-8") as f: + model_config = json.load(f) + model_config["auto_map"] = { + "AutoModel": "modeling_qwen2.Qwen2ForCausalLM", + "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM", + "AutoModelForSeq2SeqLM": "modeling_qwen2.Qwen2ForCausalLM", + "AutoModelForSequenceClassification": "modeling_qwen2.Qwen2ForSequenceClassification" + } + with open(config_json, "wt", encoding="utf-8") as f: + json.dump(model_config, f, indent=4) + test_model_config = Qwen2Config.from_pretrained(args.hf_model_dir) + # print(test_model_config) + test_model_config.torch_dtype = "float16" + test_model_config.save_pretrained(args.hf_model_dir) + num_hidden_layers = test_model_config.num_hidden_layers + num_attention_heads = test_model_config.num_attention_heads + num_key_value_heads = test_model_config.num_key_value_heads + hidden_size = test_model_config.hidden_size + per_head_dim = hidden_size // num_attention_heads + print("new model config save ok in ", args.hf_model_dir) + print("begin export onnx") + export_onnx( + device_str=args.device_str, + dtype=args.dtype, + hf_model_dir=args.hf_model_dir, + onnx_model_path=args.onnx_model_path, + kv_cache_length=args.kv_cache_length, + num_hidden_layers=num_hidden_layers, + num_key_value_heads=num_key_value_heads, + per_head_dim=per_head_dim + ) diff --git a/export/modeling_qwen2.py b/export/modeling_qwen2.py index 735bea1..fcc96a8 100644 --- a/export/modeling_qwen2.py +++ b/export/modeling_qwen2.py @@ -1,1488 +1,1510 @@ -# coding=utf-8 -# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
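A note for reviewers (outside the patch itself): the config.py and export_onnx.py hunks above replace the per-layer 6-D `past_key_values` layout with a flattened 4-D layout `(batch, kv_len, num_hidden_layers * 2 * num_key_value_heads, per_head_dim)`, so only axes 0 and 1 remain dynamic (`batch_size` / `kv_len`) in the exported ONNX graph. A minimal shape sketch of the two layouts may help before reading further into the modeling_qwen2.py hunk; the Qwen2-1.5B-Instruct values used here (28 layers, 12 attention heads, 2 KV heads, hidden size 1536) are assumptions and should be read from the model's config.json in practice.

```python
# Sketch only: compare the old and new past_key_values layouts.
# Config values below are assumed for Qwen2-1.5B-Instruct; check config.json for your model.
num_hidden_layers = 28
num_attention_heads = 12
num_key_value_heads = 2
hidden_size = 1536
per_head_dim = hidden_size // num_attention_heads   # 128
kv_cache_length = 1024
max_batch = 1

# old layout: one axis per concept (layer, key/value, batch, kv head, position, head dim)
old_shape = (num_hidden_layers, 2, max_batch,
             num_key_value_heads, kv_cache_length, per_head_dim)

# new layout in this patch: batch and sequence lead, layer/kv-head axes are flattened,
# keeping the ONNX input at rank 4 with dynamic axes 0 ("batch_size") and 1 ("kv_len")
new_shape = (max_batch, kv_cache_length,
             num_hidden_layers * 2 * num_key_value_heads, per_head_dim)

print(old_shape)  # (28, 2, 1, 2, 1024, 128)
print(new_shape)  # (1, 1024, 112, 128) -- same element count, different axis order
```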
-""" PyTorch Qwen2 model.""" -import inspect -import math -import warnings -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.cache_utils import Cache, DynamicCache -from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import ( - add_start_docstrings, - add_start_docstrings_to_model_forward, - is_flash_attn_2_available, - is_flash_attn_greater_or_equal_2_10, - logging, - replace_return_docstrings, -) -from transformers.models.qwen2.configuration_qwen2 import Qwen2Config - - -# if is_flash_attn_2_available(): -# from flash_attn import flash_attn_func, flash_attn_varlen_func -# from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa -# -# _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) - - -logger = logging.get_logger(__name__) - - -_CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B-beta" -_CONFIG_FOR_DOC = "Qwen2Config" - -QWEN2_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "Qwen/Qwen2-7B-beta", - # See all Qwen2 models at https://huggingface.co/models?filter=qwen2 -] - - -# Copied from transformers.models.llama.modeling_llama._get_unpad_data -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen2 -class Qwen2RMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - Qwen2RMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Qwen2 -class Qwen2RotaryEmbedding(nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. 
- self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.outer(t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:seq_len].to(dtype=x.dtype), - self.sin_cached[:seq_len].to(dtype=x.dtype), - ) - - -# Copied from transformers.models.llama.modeling_llama.rotate_half -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb -def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and key tensors. For example, this can be - used to pass offsetted position ids when working with a KV-cache. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
- """ - cos = cos[position_ids].unsqueeze(unsqueeze_dim) - sin = sin[position_ids].unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2 -class Qwen2MLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - -# Copied from transformers.models.llama.modeling_llama.repeat_kv -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class Qwen2Attention(nn.Module): - """ - Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer - and "Generating Long Sequences with Sparse Transformers". - """ - - def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None): - super().__init__() - self.config = config - self.layer_idx = layer_idx - if layer_idx is None: - logger.warning_once( - f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will " - "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class." - ) - - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - self.is_causal = True - self.attention_dropout = config.attention_dropout - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." 
- ) - self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True) - self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) - self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - - self.rotary_emb = Qwen2RotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - # output_attentions: bool = False, - # use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." 
- ) - # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - kv_seq_len += past_key_value[self.layer_idx].shape[3] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - out_cache = (key_states, value_states) - if past_key_value is not None: - # cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - cache_key = past_key_value[self.layer_idx][0] - cache_value = past_key_value[self.layer_idx][1] - key_states = torch.cat((cache_key, key_states), dim=2) - value_states = torch.cat((cache_value, value_states), dim=2) - # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - # if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - # raise ValueError( - # f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - # f" {attn_weights.size()}" - # ) - - # if attention_mask is not None: - # if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - # raise ValueError( - # f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - # ) - # attn_weights = attn_weights + attention_mask - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - - # if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - # raise ValueError( - # f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - # f" {attn_output.size()}" - # ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - # if not output_attentions: - # attn_weights = None - - # return attn_output, attn_weights, past_key_value - return attn_output, attn_weights, out_cache - - -''' -class Qwen2FlashAttention2(Qwen2Attention): - """ - Qwen2 flash attention module, following Qwen2 attention module. This module inherits from `Qwen2Attention` - as the weights of the module stays untouched. The only required change would be on the forward pass - where it needs to correctly call the public API of flash attention and deal with padding tokens - in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom - config.max_window_layers layers. - """ - - # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. 
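A note for reviewers (outside the patch itself): the removed `Qwen2Attention.forward` above shows how the pre-patch code consumed the 6-D cache: `kv_seq_len` is extended by `past_key_value[self.layer_idx].shape[3]`, the layer's cached key/value pair is looked up by index, and the fresh states are concatenated along `dim=2` (the position axis). The shape bookkeeping below is a sketch with illustrative values only, intended as a baseline to compare the rewrite against; the flattened 4-D layout introduced by this patch has to recover the same per-layer slices from a single merged axis, and the corresponding "+" side of this file is not shown in this excerpt.

```python
# Sketch only: pre-patch (6-D) cache update removed above.
# Illustrative values: batch=1, 2 KV heads, head_dim=128, kv_cache_length=1024, q_len=1.
import torch

past_key_values = torch.zeros(28, 2, 1, 2, 1024, 128)  # (layer, k/v, batch, kv_head, kv_len, head_dim)
layer_idx = 0
key_states = torch.zeros(1, 2, 1, 128)                  # (batch, kv_head, q_len, head_dim)

kv_seq_len = key_states.shape[-2] + past_key_values[layer_idx].shape[3]  # 1 + 1024 = 1025
cache_key = past_key_values[layer_idx][0]                # (1, 2, 1024, 128)
key_states = torch.cat((cache_key, key_states), dim=2)   # (1, 2, 1025, 128)

# 1025 matches the "seq_length+kv_len" dynamic axis declared for attention_mask
# in export_onnx.py, which is why the mask covers past plus current positions.
print(kv_seq_len, key_states.shape)
```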
- # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). - self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ): - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - - # overwrite attention_mask with padding_mask - attention_mask = kwargs.pop("padding_mask") - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - - # Because the input can be padded, the absolute sequence length depends on the max position id. - rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 - cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - use_sliding_windows = ( - _flash_supports_window_size - and getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and self.config.use_sliding_window - ) - - if not _flash_supports_window_size: - logger.warning_once( - "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" - " make sure to upgrade flash-attn library." 
- ) - - if past_key_value is not None: - # Activate slicing cache only if the config has a value `sliding_windows` attribute - cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 - if ( - getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and cache_has_contents - ): - slicing_tokens = 1 - self.config.sliding_window - - past_key = past_key_value[self.layer_idx][0] - past_value = past_key_value[self.layer_idx][1] - - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - if attention_mask is not None: - attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) - - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - dropout_rate = 0.0 if not self.training else self.attention_dropout - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in float16 just to be sure everything works as expected. - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." - ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - # Reashape to the expected shape for Flash Attention - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - attn_output = self._flash_attention_forward( - query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate, - use_sliding_windows=use_sliding_windows, - ) - - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - def _flash_attention_forward( - self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None, - use_sliding_windows=False, - ): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. 
- - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`int`, *optional*): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) - use_sliding_windows (`bool`, *optional*): - Whether to activate sliding window attention. - """ - if not self._flash_attn_uses_top_left_mask: - causal = self.is_causal - else: - # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. - causal = self.is_causal and query_length != 1 - - # Decide whether to use SWA or not by layer index. - if use_sliding_windows and self.layer_idx >= self.config.max_window_layers: - use_sliding_windows = False - - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, query_length - ) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - if not use_sliding_windows: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) - else: - if not use_sliding_windows: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - return attn_output - - # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): - batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape - - # On the first iteration we need to properly re-create the padding mask - # by slicing it on the proper place - if kv_seq_len != attention_mask.shape[-1]: - attention_mask_num_tokens = attention_mask.shape[-1] - attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] - - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) - - key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, 
head_dim), indices_k) - value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - - if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k - ) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. - attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) -''' - - -# Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Qwen2 -class Qwen2SdpaAttention(Qwen2Attention): - """ - Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from - `Qwen2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to - SDPA API. - """ - - # Adapted from Qwen2Attention.forward - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - # output_attentions: bool = False, - # use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - # if output_attentions: - # # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. - # # logger.warning_once( - # # "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - # # 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
- # # ) - # return super().forward( - # hidden_states=hidden_states, - # attention_mask=attention_mask, - # position_ids=position_ids, - # past_key_value=past_key_value, - # # output_attentions=output_attentions, - # # use_cache=use_cache, - # ) - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - kv_seq_len += past_key_value[self.layer_idx].shape[3] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - output_cache = (key_states, value_states) - if past_key_value is not None: - # cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - cache_key = past_key_value[self.layer_idx][0] - cache_value = past_key_value[self.layer_idx][1] - key_states = torch.cat((cache_key, key_states), dim=2) - value_states = torch.cat((cache_value, value_states), dim=2) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - # if attention_mask is not None: - # if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - # raise ValueError( - # f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - # ) - - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, - # Reference: https://github.com/pytorch/pytorch/issues/112577. - if query_states.device.type == "cuda" and attention_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - # copy from chatglm3-6b - # attention_mask = ~attention_mask - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - # dropout_p=self.attention_dropout if self.training else 0.0, - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. 
- # is_causal=self.is_causal and attention_mask is None and q_len > 1, - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - return attn_output, None, output_cache - - -QWEN2_ATTENTION_CLASSES = { - "eager": Qwen2Attention, - # "flash_attention_2": Qwen2FlashAttention2, - "sdpa": Qwen2SdpaAttention, -} - - -class Qwen2DecoderLayer(nn.Module): - def __init__(self, config: Qwen2Config, layer_idx: int): - super().__init__() - self.hidden_size = config.hidden_size - - if config.use_sliding_window and config._attn_implementation != "flash_attention_2": - logger.warning_once( - f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; " - "unexpected results may be encountered." - ) - # self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) - self.self_attn = Qwen2SdpaAttention(config, layer_idx) - # self.self_attn = Qwen2Attention(config, layer_idx) - self.mlp = Qwen2MLP(config) - self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - # output_attentions: Optional[bool] = False, - # use_cache: Optional[bool] = False, - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. " - "Please make sure use `attention_mask` instead.`" - ) - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, sequence_length)` where padding elements are indicated by 0. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - # output_attentions=output_attentions, - # use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - # if output_attentions: - # outputs += (self_attn_weights,) - - # if use_cache: - # outputs += (present_key_value,) - outputs += (present_key_value,) - - return outputs - - -QWEN2_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. 
Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`Qwen2Config`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - - -@add_start_docstrings( - "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", - QWEN2_START_DOCSTRING, -) -class Qwen2PreTrainedModel(PreTrainedModel): - config_class = Qwen2Config - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["Qwen2DecoderLayer"] - _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True - _supports_sdpa = True - _supports_cache_class = True - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - -QWEN2_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): - Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used to speed up sequential decoding. 
This typically consists in the `past_key_values` - returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. - - Two formats are allowed: - - a [`~cache_utils.Cache`] instance; - - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy - cache format. - - The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the - legacy cache format will be returned. - - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", - QWEN2_START_DOCSTRING, -) -class Qwen2Model(Qwen2PreTrainedModel): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`Qwen2DecoderLayer`] - - Args: - config: Qwen2Config - """ - - def __init__(self, config: Qwen2Config): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList( - [Qwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] - ) - self._attn_implementation = config._attn_implementation - self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - @staticmethod - def get_masks(input_ids, past_key_values, padding_mask=None): - batch_size, seq_length = input_ids.shape - full_attention_mask = torch.ones( - batch_size, - seq_length, - seq_length, - device=input_ids.device, - # dtype=torch.int64 - ) - full_attention_mask.tril_() - past_length = past_key_values.shape[4] - # if past_length is not None: - full_attention_mask = torch.cat( - ( - torch.ones( - batch_size, - seq_length, - past_length, - device=input_ids.device, - # dtype=torch.int64 - ), - full_attention_mask - ), - dim=-1 - ) - if padding_mask is not None: - full_attention_mask = full_attention_mask * padding_mask.unsqueeze( - 1) - # if not past_length and padding_mask is not None: - # full_attention_mask -= padding_mask.unsqueeze(-1) - 1 - full_attention_mask = (full_attention_mask < 0.5).bool() - full_attention_mask.unsqueeze_(1) - return full_attention_mask - - @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[torch.FloatTensor] = None, - # inputs_embeds: Optional[torch.FloatTensor] = None, - # use_cache: Optional[bool] = None, - # output_attentions: Optional[bool] = None, - # output_hidden_states: Optional[bool] = None, - # return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - # output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - # output_hidden_states = ( - # output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - # ) - # use_cache = use_cache if use_cache is not None else self.config.use_cache - - # return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - batch_size, seq_length = input_ids.shape - # if input_ids is not None and inputs_embeds is not None: - # raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - # elif input_ids is not None: - # batch_size, seq_length = input_ids.shape - # elif inputs_embeds is not None: - # batch_size, seq_length, _ = inputs_embeds.shape - # else: - # raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - # if self.gradient_checkpointing and self.training: - # if use_cache: - # logger.warning_once( - # "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- # ) - # use_cache = False - # if past_key_values is not None: - # past_key_values_length = past_key_values.shape[4] - # else: - # past_key_values_length = 0 - - # if use_cache: - # use_legacy_cache = not isinstance(past_key_values, Cache) - # if use_legacy_cache: - # past_key_values = DynamicCache.from_legacy_cache(past_key_values) - # past_key_values_length = past_key_values.get_usable_length(seq_length) - - # if position_ids is None: - # device = input_ids.device if input_ids is not None else inputs_embeds.device - # position_ids = torch.arange( - # past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - # ) - # position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - # else: - # position_ids = position_ids.view(-1, seq_length).long() - position_ids = position_ids.view(-1, seq_length).long() - - # if inputs_embeds is None: - # inputs_embeds = self.embed_tokens(input_ids) - inputs_embeds = self.embed_tokens(input_ids) - """ - if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: - is_padding_right = attention_mask[:, -1].sum().item() != batch_size - if is_padding_right: - raise ValueError( - "You are attempting to perform batched generation with padding_side='right'" - " this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to " - " call `tokenizer.padding_side = 'left'` before tokenizing the input. " - ) - - if self._attn_implementation == "flash_attention_2": - # 2d mask is passed through the layers - attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None - elif self._attn_implementation == "sdpa" and not output_attentions: - # output_attentions=True can not be supported when using SDPA, and we fall back on - # the manual implementation that requires a 4D causal mask in all cases. 
- attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - ) - else: - # 4d mask is passed through the layers - # [1, 1, 2, 1026], value=-65504 - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - sliding_window=self.config.sliding_window, - ) - """ - # copy from chatglm3-6b for onnx export - full_attention_mask = self.get_masks( - input_ids, - past_key_values, - attention_mask, - ) - # === if use Qwen2Attention === - # dtype = past_key_values.dtype - # device = input_ids.device - # attention_mask = torch.zeros_like(full_attention_mask, dtype=dtype).to(device) - # attention_mask.masked_fill_(full_attention_mask, torch.finfo(dtype).min) - - # == if use Qwen2SdpaAttention === - # copy from chatglm3-6b - attention_mask = ~full_attention_mask - - hidden_states = inputs_embeds - - - # decoder layers - # all_hidden_states = () if output_hidden_states else None - # all_self_attns = () if output_attentions else None - # next_decoder_cache = None - presents = [] - for decoder_layer in self.layers: - # if output_hidden_states: - # all_hidden_states += (hidden_states,) - - # if self.gradient_checkpointing and self.training: - # layer_outputs = self._gradient_checkpointing_func( - # decoder_layer.__call__, - # hidden_states, - # attention_mask, - # position_ids, - # past_key_values, - # output_attentions, - # use_cache, - # ) - # else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_values, - # output_attentions=output_attentions, - # use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - # if use_cache: - # next_decoder_cache = layer_outputs[2 if output_attentions else 1] - presents.extend(layer_outputs[1]) - - # if output_attentions: - # all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - # if output_hidden_states: - # all_hidden_states += (hidden_states,) - - # next_cache = None - # if use_cache: - # next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache - one_shape = [len(presents) // 2, 2] + list(presents[0].shape) - presents = torch.concat(presents).reshape(one_shape) - return ( - hidden_states, - presents, - # all_hidden_states, - # all_self_attns - ) - - # if not return_dict: - # return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - # return BaseModelOutputWithPast( - # last_hidden_state=hidden_states, - # past_key_values=next_cache, - # hidden_states=all_hidden_states, - # attentions=all_self_attns, - # ) - - -class Qwen2ForCausalLM(Qwen2PreTrainedModel): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config): - super().__init__(config) - self.model = Qwen2Model(config) - self.vocab_size = config.vocab_size - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def 
get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[torch.FloatTensor] = None, - # inputs_embeds: Optional[torch.FloatTensor] = None, - # labels: Optional[torch.LongTensor] = None, - # use_cache: Optional[bool] = None, - # output_attentions: Optional[bool] = None, - # output_hidden_states: Optional[bool] = None, - # return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, Qwen2ForCausalLM - - >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - # print("input_ids:", input_ids.shape) # [1, 1] - # print("position_ids:", position_ids.shape) # [1, 1] - # print("attention_mask:", attention_mask.shape) # [1, 21] - # if past_key_values is not None: - # print( - # "past_key_values.shape:", len(past_key_values), - # len(past_key_values[0]), past_key_values[0][0].shape - # ) - # # [24, 2, 1, 16, 20, 64] - # output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - # output_hidden_states = ( - # output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - # ) - # return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - # inputs_embeds=inputs_embeds, - # use_cache=use_cache, - # output_attentions=output_attentions, - # output_hidden_states=output_hidden_states, - # return_dict=return_dict, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - logits = logits.float() - - # loss = None - # if labels is not None: - # # Shift so that tokens < n predict n - # shift_logits = logits[..., :-1, :].contiguous() - # shift_labels = labels[..., 1:].contiguous() - # # Flatten the tokens - # loss_fct = CrossEntropyLoss() - # shift_logits = shift_logits.view(-1, self.config.vocab_size) - # shift_labels = shift_labels.view(-1) - # # Enable model parallelism - # shift_labels = shift_labels.to(shift_logits.device) - # loss = loss_fct(shift_logits, shift_labels) - - # if not return_dict: - output = (logits,) + outputs[1:] - # return (loss,) + output if loss is not None else output - return output - - # return CausalLMOutputWithPast( - # loss=loss, - # logits=logits, - # past_key_values=outputs.past_key_values, - # hidden_states=outputs.hidden_states, - # attentions=outputs.attentions, - # ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, 
inputs_embeds=None, **kwargs - ): - # Omit tokens covered by past_key_values - if past_key_values is not None: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - max_cache_length = past_key_values.get_max_length() - else: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. - if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - -@add_start_docstrings( - """ - The Qwen2 Model transformer with a sequence classification head on top (linear layer). - - [`Qwen2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). 
- """, - QWEN2_START_DOCSTRING, -) -class Qwen2ForSequenceClassification(Qwen2PreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = Qwen2Model(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility - sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 - sequence_lengths = sequence_lengths % input_ids.shape[-1] - sequence_lengths = sequence_lengths.to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif 
self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) +# coding=utf-8 +# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Qwen2 model.""" +import inspect +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from transformers.models.qwen2.configuration_qwen2 import Qwen2Config + + +# if is_flash_attn_2_available(): +# from flash_attn import flash_attn_func, flash_attn_varlen_func +# from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa +# +# _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) + + +logger = logging.get_logger(__name__) + + +_CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B-beta" +_CONFIG_FOR_DOC = "Qwen2Config" + +QWEN2_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "Qwen/Qwen2-7B-beta", + # See all Qwen2 models at https://huggingface.co/models?filter=qwen2 +] + + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +# Copied from 
transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen2 +class Qwen2RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + Qwen2RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Qwen2 +class Qwen2RotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. + self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. 
Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2 +class Qwen2MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class Qwen2Attention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". + """ + + def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will " + "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.attention_dropout = config.attention_dropout + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." 
+ ) + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + + self.rotary_emb = Qwen2RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[torch.Tensor] = None, + # output_attentions: bool = False, + # use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." 
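+                    # Note on the cache layout used below (an explanatory comment, not part of the
+                    # upstream HuggingFace code): `past_key_value` is a single tensor of shape
+                    # [batch, kv_len, num_layers * 2 * num_kv_heads, head_dim], matching
+                    # `past_key_value_shape` in config.py, so dim 1 gives the number of cached
+                    # positions and each layer's keys/values are sliced out by head index.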
+ ) + # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + kv_seq_len += past_key_value.shape[1] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + output_cache = ( + key_states.transpose(1, 2), + value_states.transpose(1, 2) + ) + if past_key_value is not None: + # cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + cache_key = past_key_value[ + :, + :, + self.layer_idx * 2 * self.num_key_value_heads: (self.layer_idx * 2 + 1) * self.num_key_value_heads + ].transpose(1, 2) + cache_value = past_key_value[ + :, + :, + (self.layer_idx * 2 + 1) * self.num_key_value_heads: (self.layer_idx * 2 + 2) * self.num_key_value_heads + ].transpose(1, 2) + key_states = torch.cat((cache_key, key_states), dim=2) + value_states = torch.cat((cache_value, value_states), dim=2) + # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + # if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + # raise ValueError( + # f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + # f" {attn_weights.size()}" + # ) + + # if attention_mask is not None: + # if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + # raise ValueError( + # f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + # ) + # attn_weights = attn_weights + attention_mask + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + # if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + # raise ValueError( + # f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + # f" {attn_output.size()}" + # ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + # if not output_attentions: + # attn_weights = None + + # return attn_output, attn_weights, past_key_value + return attn_output, attn_weights, output_cache + + +''' +class Qwen2FlashAttention2(Qwen2Attention): + """ + Qwen2 flash attention module, following Qwen2 attention module. This module inherits from `Qwen2Attention` + as the weights of the module stays untouched. The only required change would be on the forward pass + where it needs to correctly call the public API of flash attention and deal with padding tokens + in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom + config.max_window_layers layers. + """ + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. 
+ # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ): + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + # Because the input can be padded, the absolute sequence length depends on the max position id. + rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 + cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + use_sliding_windows = ( + _flash_supports_window_size + and getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and self.config.use_sliding_window + ) + + if not _flash_supports_window_size: + logger.warning_once( + "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" + " make sure to upgrade flash-attn library." 
+ ) + + if past_key_value is not None: + # Activate slicing cache only if the config has a value `sliding_windows` attribute + cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 + if ( + getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and cache_has_contents + ): + slicing_tokens = 1 - self.config.sliding_window + + past_key = past_key_value[self.layer_idx][0] + past_value = past_key_value[self.layer_idx][1] + + past_key = past_key[:, :, slicing_tokens:, :].contiguous() + past_value = past_value[:, :, slicing_tokens:, :].contiguous() + + if past_key.shape[-2] != self.config.sliding_window - 1: + raise ValueError( + f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" + f" {past_key.shape}" + ) + + if attention_mask is not None: + attention_mask = attention_mask[:, slicing_tokens:] + attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) + + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + dropout_rate = 0.0 if not self.training else self.attention_dropout + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in float16 just to be sure everything works as expected. + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Reashape to the expected shape for Flash Attention + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + use_sliding_windows=use_sliding_windows, + ) + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + use_sliding_windows=False, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. 
+ + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`int`, *optional*): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + use_sliding_windows (`bool`, *optional*): + Whether to activate sliding window attention. + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. + causal = self.is_causal and query_length != 1 + + # Decide whether to use SWA or not by layer index. + if use_sliding_windows and self.layer_idx >= self.config.max_window_layers: + use_sliding_windows = False + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + if not use_sliding_windows: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + if not use_sliding_windows: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + return attn_output + + # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape + + # On the first iteration we need to properly re-create the padding mask + # by slicing it on the proper place + if kv_seq_len != attention_mask.shape[-1]: + attention_mask_num_tokens = attention_mask.shape[-1] + attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] + + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, 
head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) +''' + + +# Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Qwen2 +class Qwen2SdpaAttention(Qwen2Attention): + """ + Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `Qwen2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from Qwen2Attention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[torch.Tensor] = None, + # output_attentions: bool = False, + # use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + # if output_attentions: + # # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + # # logger.warning_once( + # # "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + # # 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
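+        # The commented-out `output_attentions` fallback is disabled for export: this
+        # forward always takes the SDPA path, reading previously cached keys/values from
+        # the flattened `past_key_value` tensor and returning only the current step's
+        # K/V in `output_cache`, which the model-level forward gathers into the cache output.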
+ # # ) + # return super().forward( + # hidden_states=hidden_states, + # attention_mask=attention_mask, + # position_ids=position_ids, + # past_key_value=past_key_value, + # # output_attentions=output_attentions, + # # use_cache=use_cache, + # ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + kv_seq_len += past_key_value.shape[1] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + output_cache = ( + key_states.transpose(1, 2), + value_states.transpose(1, 2) + ) + if past_key_value is not None: + # cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + cache_key = past_key_value[ + :, + :, + self.layer_idx * 2 * self.num_key_value_heads: (self.layer_idx * 2 + 1) * self.num_key_value_heads + ].transpose(1, 2) + cache_value = past_key_value[ + :, + :, + (self.layer_idx * 2 + 1) * self.num_key_value_heads: (self.layer_idx * 2 + 2) * self.num_key_value_heads + ].transpose(1, 2) + key_states = torch.cat((cache_key, key_states), dim=2) + value_states = torch.cat((cache_value, value_states), dim=2) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + # if attention_mask is not None: + # if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + # raise ValueError( + # f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + # ) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + # copy from chatglm3-6b + # attention_mask = ~attention_mask + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + # dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. 
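+            # `attention_mask` here is the boolean mask built by Qwen2Model.get_masks and then
+            # inverted, so True marks positions that may be attended to (the format SDPA expects
+            # for a bool attn_mask). `dropout_p` and `is_causal` keep their defaults: causality
+            # is already encoded in the mask, and export targets inference only.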
+ # is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + return attn_output, None, output_cache + + +QWEN2_ATTENTION_CLASSES = { + "eager": Qwen2Attention, + # "flash_attention_2": Qwen2FlashAttention2, + "sdpa": Qwen2SdpaAttention, +} + + +class Qwen2DecoderLayer(nn.Module): + def __init__(self, config: Qwen2Config, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + if config.use_sliding_window and config._attn_implementation != "flash_attention_2": + logger.warning_once( + f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; " + "unexpected results may be encountered." + ) + # self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) + self.self_attn = Qwen2SdpaAttention(config, layer_idx) + # self.self_attn = Qwen2Attention(config, layer_idx) + self.mlp = Qwen2MLP(config) + self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + # output_attentions: Optional[bool] = False, + # use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. " + "Please make sure use `attention_mask` instead.`" + ) + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + # output_attentions=output_attentions, + # use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + # if output_attentions: + # outputs += (self_attn_weights,) + + # if use_cache: + # outputs += (present_key_value,) + outputs += (present_key_value,) + + return outputs + + +QWEN2_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. 
Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`Qwen2Config`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", + QWEN2_START_DOCSTRING, +) +class Qwen2PreTrainedModel(PreTrainedModel): + config_class = Qwen2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["Qwen2DecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +QWEN2_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. 
This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", + QWEN2_START_DOCSTRING, +) +class Qwen2Model(Qwen2PreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`Qwen2DecoderLayer`] + + Args: + config: Qwen2Config + """ + + def __init__(self, config: Qwen2Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [Qwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self._attn_implementation = config._attn_implementation + self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @staticmethod + def get_masks(input_ids, past_key_values, padding_mask=None): + batch_size, seq_length = input_ids.shape + full_attention_mask = torch.ones( + batch_size, + seq_length, + seq_length, + device=input_ids.device, + # dtype=torch.int64 + ) + full_attention_mask.tril_() + past_length = past_key_values.shape[1] + # if past_length is not None: + full_attention_mask = torch.cat( + ( + torch.ones( + batch_size, + seq_length, + past_length, + device=input_ids.device, + # dtype=torch.int64 + ), + full_attention_mask + ), + dim=-1 + ) + if padding_mask is not None: + full_attention_mask = full_attention_mask * padding_mask.unsqueeze( + 1) + # if not past_length and padding_mask is not None: + # full_attention_mask -= padding_mask.unsqueeze(-1) - 1 + full_attention_mask = (full_attention_mask < 0.5).bool() + full_attention_mask.unsqueeze_(1) + return full_attention_mask + + @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[torch.FloatTensor] = None, + # inputs_embeds: Optional[torch.FloatTensor] = None, + # use_cache: Optional[bool] = None, + # output_attentions: Optional[bool] = None, + # output_hidden_states: Optional[bool] = None, + # return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + # output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + # output_hidden_states = ( + # output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + # ) + # use_cache = use_cache if use_cache is not None else self.config.use_cache + + # return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + batch_size, seq_length = input_ids.shape + # if input_ids is not None and inputs_embeds is not None: + # raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + # elif input_ids is not None: + # batch_size, seq_length = input_ids.shape + # elif inputs_embeds is not None: + # batch_size, seq_length, _ = inputs_embeds.shape + # else: + # raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + # if self.gradient_checkpointing and self.training: + # if use_cache: + # logger.warning_once( + # "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ # ) + # use_cache = False + # if past_key_values is not None: + # past_key_values_length = past_key_values.shape[4] + # else: + # past_key_values_length = 0 + + # if use_cache: + # use_legacy_cache = not isinstance(past_key_values, Cache) + # if use_legacy_cache: + # past_key_values = DynamicCache.from_legacy_cache(past_key_values) + # past_key_values_length = past_key_values.get_usable_length(seq_length) + + # if position_ids is None: + # device = input_ids.device if input_ids is not None else inputs_embeds.device + # position_ids = torch.arange( + # past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + # ) + # position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + # else: + # position_ids = position_ids.view(-1, seq_length).long() + position_ids = position_ids.view(-1, seq_length).long() + + # if inputs_embeds is None: + # inputs_embeds = self.embed_tokens(input_ids) + inputs_embeds = self.embed_tokens(input_ids) + """ + if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: + is_padding_right = attention_mask[:, -1].sum().item() != batch_size + if is_padding_right: + raise ValueError( + "You are attempting to perform batched generation with padding_side='right'" + " this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to " + " call `tokenizer.padding_side = 'left'` before tokenizing the input. " + ) + + if self._attn_implementation == "flash_attention_2": + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif self._attn_implementation == "sdpa" and not output_attentions: + # output_attentions=True can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. 
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + # [1, 1, 2, 1026], value=-65504 + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + """ + # copy from chatglm3-6b for onnx export + full_attention_mask = self.get_masks( + input_ids, + past_key_values, + attention_mask, + ) + # === if use Qwen2Attention === + # dtype = past_key_values.dtype + # device = input_ids.device + # attention_mask = torch.zeros_like(full_attention_mask, dtype=dtype).to(device) + # attention_mask.masked_fill_(full_attention_mask, torch.finfo(dtype).min) + + # == if use Qwen2SdpaAttention === + # copy from chatglm3-6b + attention_mask = ~full_attention_mask + + hidden_states = inputs_embeds + + + # decoder layers + # all_hidden_states = () if output_hidden_states else None + # all_self_attns = () if output_attentions else None + # next_decoder_cache = None + presents = [] + for decoder_layer in self.layers: + # if output_hidden_states: + # all_hidden_states += (hidden_states,) + + # if self.gradient_checkpointing and self.training: + # layer_outputs = self._gradient_checkpointing_func( + # decoder_layer.__call__, + # hidden_states, + # attention_mask, + # position_ids, + # past_key_values, + # output_attentions, + # use_cache, + # ) + # else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + # output_attentions=output_attentions, + # use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + # if use_cache: + # next_decoder_cache = layer_outputs[2 if output_attentions else 1] + presents.extend(layer_outputs[1]) + + # if output_attentions: + # all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + # if output_hidden_states: + # all_hidden_states += (hidden_states,) + + # next_cache = None + # if use_cache: + # next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + one_shape = list(presents[0].shape) + one_shape[2] = one_shape[2] * len(presents) + presents = torch.concat(presents, dim=2) + return ( + hidden_states, + presents, + # all_hidden_states, + # all_self_attns + ) + + # if not return_dict: + # return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + # return BaseModelOutputWithPast( + # last_hidden_state=hidden_states, + # past_key_values=next_cache, + # hidden_states=all_hidden_states, + # attentions=all_self_attns, + # ) + + +class Qwen2ForCausalLM(Qwen2PreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = Qwen2Model(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def 
get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[torch.FloatTensor] = None, + # inputs_embeds: Optional[torch.FloatTensor] = None, + # labels: Optional[torch.LongTensor] = None, + # use_cache: Optional[bool] = None, + # output_attentions: Optional[bool] = None, + # output_hidden_states: Optional[bool] = None, + # return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, Qwen2ForCausalLM + + >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + # print("input_ids:", input_ids.shape) # [1, 1] + # print("position_ids:", position_ids.shape) # [1, 1] + # print("attention_mask:", attention_mask.shape) # [1, 21] + # if past_key_values is not None: + # print( + # "past_key_values.shape:", len(past_key_values), + # len(past_key_values[0]), past_key_values[0][0].shape + # ) + # # [24, 2, 1, 16, 20, 64] + # output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + # output_hidden_states = ( + # output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + # ) + # return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + # inputs_embeds=inputs_embeds, + # use_cache=use_cache, + # output_attentions=output_attentions, + # output_hidden_states=output_hidden_states, + # return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + # loss = None + # if labels is not None: + # # Shift so that tokens < n predict n + # shift_logits = logits[..., :-1, :].contiguous() + # shift_labels = labels[..., 1:].contiguous() + # # Flatten the tokens + # loss_fct = CrossEntropyLoss() + # shift_logits = shift_logits.view(-1, self.config.vocab_size) + # shift_labels = shift_labels.view(-1) + # # Enable model parallelism + # shift_labels = shift_labels.to(shift_logits.device) + # loss = loss_fct(shift_logits, shift_labels) + + # if not return_dict: + output = (logits,) + outputs[1:] + # return (loss,) + output if loss is not None else output + return output + + # return CausalLMOutputWithPast( + # loss=loss, + # logits=logits, + # past_key_values=outputs.past_key_values, + # hidden_states=outputs.hidden_states, + # attentions=outputs.attentions, + # ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, 
inputs_embeds=None, **kwargs + ): + # Omit tokens covered by past_key_values + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +@add_start_docstrings( + """ + The Qwen2 Model transformer with a sequence classification head on top (linear layer). + + [`Qwen2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). 
+ """, + QWEN2_START_DOCSTRING, +) +class Qwen2ForSequenceClassification(Qwen2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = Qwen2Model(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif 
self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/export/onnx2om.py b/export/onnx2om.py index 2fe17a6..701d0ce 100644 --- a/export/onnx2om.py +++ b/export/onnx2om.py @@ -1,182 +1,180 @@ -import os -import ctypes -import subprocess -import argparse -import math -from transformers.models.qwen2 import Qwen2Config - -now_dir = os.path.dirname(os.path.abspath(__file__)) -project_dir = os.path.dirname(now_dir) -output_dir = os.path.join(project_dir, "output") -if not os.path.exists(output_dir): - os.mkdir(output_dir) -onnx_model_dir = os.path.join(output_dir, "onnx2") -if not os.path.exists(onnx_model_dir): - os.mkdir(onnx_model_dir) -model_dir = os.path.join(output_dir, "model") -if not os.path.exists(model_dir): - os.mkdir(model_dir) - -parser = argparse.ArgumentParser() -parser.add_argument( - '--soc_version', - type=str, - default="auto", - help="NPU full name, like Ascend310B1、Ascend310B4、Ascend310P1、Ascend910A、Ascend910B..., default is `auto`, will auto detect soc version.", -) -parser.add_argument( - '--hf_model_dir', - type=str, - help="model and tokenizer path, only support huggingface model", - default=os.path.join(project_dir, "download", "Qwen2-1.5B-Instruct") -) -parser.add_argument( - "--onnx_model_path", - help="output onnx path", - type=str, - default=os.path.join(onnx_model_dir, "qwen2_1.5b_chat.onnx") -) -parser.add_argument( - "--om_model_path", - help=".om model path", - type=str, - default= os.path.join(model_dir, "qwen2_1.5b_chat") -) -parser.add_argument( - "--max_batch", - help="max batch", - type=int, - default=1, -) -parser.add_argument( - "--max_prefill_length", - help="max prefill length in first inference. " - "Attention max_prefill_length + max_output_length <= kv_cache_length. " - "the number must by 2^xx, like 1, 2, 4, 8, 16, 32, 64, 128, 256... " - "Note! 
The higher this number, the longer it will take to compile.", - type=int, - default=8, -) -parser.add_argument( - "--kv_cache_length", - help="kv-cache length", - type=int, - default=1024, -) - - -args = parser.parse_args() - - -def get_soc_version(): - """ - _summary_ - 获取芯片信息,返回具体的芯片型号 - Returns: - _type_: _description_ - """ - max_len = 512 - rtsdll = ctypes.CDLL(f"libruntime.so") - c_char_t = ctypes.create_string_buffer(b"\xff" * max_len, max_len) - rtsdll.rtGetSocVersion.restype = ctypes.c_uint64 - rt_error = rtsdll.rtGetSocVersion(c_char_t, ctypes.c_uint32(max_len)) - if rt_error: - print("rt_error:", rt_error) - return "" - soc_full_name = c_char_t.value.decode("utf-8") - find_str = "Short_SoC_version=" - ascend_home_dir = os.environ.get("ASCEND_HOME_PATH") - assert ascend_home_dir is not None, \ - print("ASCEND_HOME_PATH is None, you need run `source /usr/local/Ascend/ascend-toolkit/set_env.sh`") - with open(f"{ascend_home_dir}/compiler/data/platform_config/{soc_full_name}.ini", "r") as f: - for line in f: - if find_str in line: - start_index = line.find(find_str) - soc_short_name = line[start_index + len(find_str):].strip() - return { - "soc_full_name": soc_full_name, - "soc_short_name": soc_short_name - } - raise Exception("can't get you soc version") - -max_batch = args.max_batch -model_config = Qwen2Config.from_pretrained(args.hf_model_dir) -num_hidden_layers = model_config.num_hidden_layers -num_key_value_heads = model_config.num_key_value_heads -hidden_size = model_config.hidden_size -num_attention_heads = model_config.num_attention_heads -per_head_dim = hidden_size // num_attention_heads -kv_cache_length = args.kv_cache_length -max_prefill_log2 = int(math.log2(args.max_prefill_length)) -max_prefill_length = 2 ** max_prefill_log2 -prefill_length_range = list(range(0, max_prefill_log2 + 1)) -prefill_length_range = [2 ** idx for idx in prefill_length_range] -assert (max_prefill_length < kv_cache_length), \ - print("max_input_length max be smaller than kv_cache_length, because max_input_length + max_output_length <= kv_cache") -input_ids_length_range = prefill_length_range -attention_length_range = [ - length + kv_cache_length - for length in prefill_length_range -] -position_length_range = prefill_length_range -input_ids_shape = [ - f"1~{max_batch}" if max_batch > 1 else "1", - "-1" if max_prefill_length > 1 else "1", -] -attention_mask_shape = [ - f"1~{max_batch}" if max_batch > 1 else "1", - "-1" if max_prefill_length > 1 else str(1 + kv_cache_length) -] -position_ids_shape = [ - f"1~{max_batch}" if max_batch > 1 else "1", - "-1" if max_prefill_length > 1 else "1" -] -dynamic_dims = [] -for dynamic_dim in zip( - input_ids_length_range, attention_length_range, position_length_range -): - dynamic_dim = [str(dim) for dim in dynamic_dim] - dynamic_dims.append(",".join(dynamic_dim)) -past_key_values_shape = [ - num_hidden_layers, - 2, - f"1~{max_batch}" if max_batch > 1 else "1", - num_key_value_heads, - kv_cache_length, - per_head_dim -] -past_key_values_shape = [str(x) for x in past_key_values_shape] -if args.soc_version == "auto": - print("[INFO] soc_version is `auto`, will auto detect soc version") - soc_dict = get_soc_version() - print("[INFO] {}".format(soc_dict)) - soc_version = soc_dict["soc_full_name"] -else: - soc_version = args.soc_version -command_lines = [ - "atc", - "--framework=5", - '--model="{}"'.format(args.onnx_model_path), - '--output="{}"'.format(args.om_model_path), - "--soc_version={}".format(soc_version), - "--precision_mode=must_keep_origin_dtype", - 
"--input_format=ND", - '--input_shape="input_ids:{};attention_mask:{};position_ids:{};past_key_values:{}"'.format( - ",".join(input_ids_shape), - ",".join(attention_mask_shape), - ",".join(position_ids_shape), - ",".join(past_key_values_shape) - ), -] -if max_prefill_length > 1: - command_lines.append( - "--dynamic_dims \"{}\"".format(";".join(dynamic_dims)) - ) -print("============ run command ==============") -print(" ".join(command_lines)) -print("=======================================") -subprocess.run( - " ".join(command_lines), - shell=True, - check=True, +import os +import ctypes +import subprocess +import argparse +import math +from transformers.models.qwen2 import Qwen2Config + +now_dir = os.path.dirname(os.path.abspath(__file__)) +project_dir = os.path.dirname(now_dir) +output_dir = os.path.join(project_dir, "output") +if not os.path.exists(output_dir): + os.mkdir(output_dir) +onnx_model_dir = os.path.join(output_dir, "onnx2") +if not os.path.exists(onnx_model_dir): + os.mkdir(onnx_model_dir) +model_dir = os.path.join(output_dir, "model") +if not os.path.exists(model_dir): + os.mkdir(model_dir) + +parser = argparse.ArgumentParser() +parser.add_argument( + '--soc_version', + type=str, + default="auto", + help="NPU full name, like Ascend310B1、Ascend310B4、Ascend310P1、Ascend910A、Ascend910B..., default is `auto`, will auto detect soc version.", +) +parser.add_argument( + '--hf_model_dir', + type=str, + help="model and tokenizer path, only support huggingface model", + default=os.path.join(project_dir, "download", "Qwen2-1.5B-Instruct") +) +parser.add_argument( + "--onnx_model_path", + help="output onnx path", + type=str, + default=os.path.join(onnx_model_dir, "qwen2_1.5b_chat.onnx") +) +parser.add_argument( + "--om_model_path", + help=".om model path", + type=str, + default= os.path.join(model_dir, "qwen2_1.5b_chat") +) +parser.add_argument( + "--max_batch", + help="max batch", + type=int, + default=1, +) +parser.add_argument( + "--max_prefill_length", + help="max prefill length in first inference. " + "Attention max_prefill_length + max_output_length <= kv_cache_length. " + "the number must by 2^xx, like 1, 2, 4, 8, 16, 32, 64, 128, 256... " + "Note! 
The higher this number, the longer it will take to compile.", + type=int, + default=8, +) +parser.add_argument( + "--kv_cache_length", + help="kv-cache length", + type=int, + default=1024, +) + + +args = parser.parse_args() + + +def get_soc_version(): + """ + _summary_ + 获取芯片信息,返回具体的芯片型号 + Returns: + _type_: _description_ + """ + max_len = 512 + rtsdll = ctypes.CDLL(f"libruntime.so") + c_char_t = ctypes.create_string_buffer(b"\xff" * max_len, max_len) + rtsdll.rtGetSocVersion.restype = ctypes.c_uint64 + rt_error = rtsdll.rtGetSocVersion(c_char_t, ctypes.c_uint32(max_len)) + if rt_error: + print("rt_error:", rt_error) + return "" + soc_full_name = c_char_t.value.decode("utf-8") + find_str = "Short_SoC_version=" + ascend_home_dir = os.environ.get("ASCEND_HOME_PATH") + assert ascend_home_dir is not None, \ + print("ASCEND_HOME_PATH is None, you need run `source /usr/local/Ascend/ascend-toolkit/set_env.sh`") + with open(f"{ascend_home_dir}/compiler/data/platform_config/{soc_full_name}.ini", "r") as f: + for line in f: + if find_str in line: + start_index = line.find(find_str) + soc_short_name = line[start_index + len(find_str):].strip() + return { + "soc_full_name": soc_full_name, + "soc_short_name": soc_short_name + } + raise Exception("can't get you soc version") + +max_batch = args.max_batch +model_config = Qwen2Config.from_pretrained(args.hf_model_dir) +num_hidden_layers = model_config.num_hidden_layers +num_key_value_heads = model_config.num_key_value_heads +hidden_size = model_config.hidden_size +num_attention_heads = model_config.num_attention_heads +per_head_dim = hidden_size // num_attention_heads +kv_cache_length = args.kv_cache_length +max_prefill_log2 = int(math.log2(args.max_prefill_length)) +max_prefill_length = 2 ** max_prefill_log2 +prefill_length_range = list(range(0, max_prefill_log2 + 1)) +prefill_length_range = [2 ** idx for idx in prefill_length_range] +assert (max_prefill_length < kv_cache_length), \ + print("max_input_length max be smaller than kv_cache_length, because max_input_length + max_output_length <= kv_cache") +input_ids_length_range = prefill_length_range +attention_length_range = [ + length + kv_cache_length + for length in prefill_length_range +] +position_length_range = prefill_length_range +input_ids_shape = [ + f"1~{max_batch}" if max_batch > 1 else "1", + "-1" if max_prefill_length > 1 else "1", +] +attention_mask_shape = [ + f"1~{max_batch}" if max_batch > 1 else "1", + "-1" if max_prefill_length > 1 else str(1 + kv_cache_length) +] +position_ids_shape = [ + f"1~{max_batch}" if max_batch > 1 else "1", + "-1" if max_prefill_length > 1 else "1" +] +dynamic_dims = [] +for dynamic_dim in zip( + input_ids_length_range, attention_length_range, position_length_range +): + dynamic_dim = [str(dim) for dim in dynamic_dim] + dynamic_dims.append(",".join(dynamic_dim)) +past_key_values_shape = [ + f"1~{max_batch}" if max_batch > 1 else "1", + kv_cache_length, + num_hidden_layers * 2 * num_key_value_heads, + per_head_dim +] +past_key_values_shape = [str(x) for x in past_key_values_shape] +if args.soc_version == "auto": + print("[INFO] soc_version is `auto`, will auto detect soc version") + soc_dict = get_soc_version() + print("[INFO] {}".format(soc_dict)) + soc_version = soc_dict["soc_full_name"] +else: + soc_version = args.soc_version +command_lines = [ + "atc", + "--framework=5", + '--model="{}"'.format(args.onnx_model_path), + '--output="{}"'.format(args.om_model_path), + "--soc_version={}".format(soc_version), + "--precision_mode=must_keep_origin_dtype", + 
"--input_format=ND", + '--input_shape="input_ids:{};attention_mask:{};position_ids:{};past_key_values:{}"'.format( + ",".join(input_ids_shape), + ",".join(attention_mask_shape), + ",".join(position_ids_shape), + ",".join(past_key_values_shape) + ), +] +if max_prefill_length > 1: + command_lines.append( + "--dynamic_dims \"{}\"".format(";".join(dynamic_dims)) + ) +print("============ run command ==============") +print(" ".join(command_lines)) +print("=======================================") +subprocess.run( + " ".join(command_lines), + shell=True, + check=True, ) \ No newline at end of file diff --git a/export/test_onnx_run.py b/export/test_onnx_run.py index 35cf339..065d080 100644 --- a/export/test_onnx_run.py +++ b/export/test_onnx_run.py @@ -41,11 +41,9 @@ def create_kv_cache(config: Qwen2Config, kv_cache_length=1024): return np.zeros( [ - config.num_hidden_layers, - 2, 1, - config.num_key_value_heads, kv_cache_length, + config.num_hidden_layers * 2 * config.num_key_value_heads, config.hidden_size // config.num_attention_heads ], dtype=np_dtype @@ -68,15 +66,13 @@ def get_inputs(kv_cache, seq_len: int, real_kv_size=0, input_pos=0, past_kv_size """ self.kv_cache shape ( - self.num_hidden_layers, - 2, 1, - self.num_key_value_heads, self.kv_cache_length, + self.num_hidden_layers * 2 * self.num_key_value_heads, self.per_head_dim ) """ - cache = kv_cache[:, :, :, :, :past_kv_size] + cache = kv_cache[:, :past_kv_size] mask = np.ones((1, past_kv_size + seq_len), dtype=np.int64) mask[:, real_kv_size: past_kv_size] = 0 pos_id = np.arange( @@ -108,6 +104,10 @@ def get_inputs(kv_cache, seq_len: int, real_kv_size=0, input_pos=0, past_kv_size print("input_ids", input_ids) options = onnxruntime.SessionOptions() +options.intra_op_num_threads = 4 +options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL +options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL + llm_session = onnxruntime.InferenceSession( args.onnx_model_path, sess_options=options, @@ -138,5 +138,3 @@ def get_inputs(kv_cache, seq_len: int, real_kv_size=0, input_pos=0, past_kv_size print("new_kv_cache: shape", new_kv_cache.shape) print("new_kv_cache: mean: ", new_kv_cache.astype(np.float32).mean().item()) print("new_kv_cache: max: ", new_kv_cache.astype(np.float32).max().item()) - - diff --git a/export/test_pytorch_run.py b/export/test_pytorch_run.py index 669a479..943e4d8 100644 --- a/export/test_pytorch_run.py +++ b/export/test_pytorch_run.py @@ -45,11 +45,9 @@ def create_kv_cache(config: Qwen2Config, kv_cache_length=1024): return torch.zeros( [ - config.num_hidden_layers, - 2, 1, - config.num_key_value_heads, kv_cache_length, + config.num_hidden_layers * 2 * config.num_key_value_heads, config.hidden_size // config.num_attention_heads ], dtype=torch_dtype @@ -72,15 +70,13 @@ def get_inputs(kv_cache, seq_len: int, real_kv_size=0, input_pos=0, past_kv_size """ self.kv_cache shape ( - self.num_hidden_layers, - 2, 1, - self.num_key_value_heads, self.kv_cache_length, + self.num_hidden_layers * 2 * self.num_key_value_heads, self.per_head_dim ) """ - cache = kv_cache[:, :, :, :, :past_kv_size] + cache = kv_cache[:, :past_kv_size] mask = torch.ones((1, past_kv_size + seq_len), dtype=torch.long).to(device_str) mask[:, real_kv_size: past_kv_size] = 0 pos_id = torch.arange( @@ -115,12 +111,12 @@ def get_inputs(kv_cache, seq_len: int, real_kv_size=0, input_pos=0, past_kv_size )["input_ids"].to(device_str) print("input_ids", input_ids) kv_cache1 = create_kv_cache(model_config) -now_kv_cache, attn_mask, 
position_ids = get_inputs(kv_cache1, 2, ) +now_kv_cache, attn_mask, position_ids = get_inputs(kv_cache1, 1) print("now_kv_cache shape: ", now_kv_cache.shape) print("attention_mask shape: ", attn_mask.shape) print("position_ids shape: ", position_ids.shape) outputs = model.forward( - input_ids[:, :2], + input_ids[:, :1], attn_mask, position_ids, now_kv_cache, @@ -129,13 +125,13 @@ def get_inputs(kv_cache, seq_len: int, real_kv_size=0, input_pos=0, past_kv_size ) print("==== pytorch runtime ====") print("output length: ", len(outputs)) -logits = outputs[0][:, :-1, :] # 1: -0.10800 +logits = outputs[0] # 1: -0.10800 # logits = outputs[0][:, -1:, :] # 2: -0.008756 print("logits shape: ", logits.shape) print("logits mean: ", logits.float().mean().item()) print("logits max: ", logits.float().max().item()) -new_kv_cache = outputs[1][:, :, :, :, :-1, :] # 1: 0.0009: +new_kv_cache = outputs[1] # 1: 0.0009: # new_kv_cache = outputs[1][:, :, :, :, -1:, :] # 2: 0.003526 print("new_kv_cache: shape:", new_kv_cache.shape) @@ -143,4 +139,3 @@ def get_inputs(kv_cache, seq_len: int, real_kv_size=0, input_pos=0, past_kv_size print("new_kv_cache: mean: ", new_kv_cache.float().mean().item()) # print("new_kv_cache: max: ", new_kv_cache.astype(np.float32).max().item()) print("new_kv_cache: max: ", new_kv_cache.float().max().item()) - diff --git a/utils/engine.py b/utils/engine.py index d3795d3..b4f44af 100644 --- a/utils/engine.py +++ b/utils/engine.py @@ -13,6 +13,7 @@ ACL_MEM_MALLOC_HUGE_FIRST = 0 ACL_MEMCPY_HOST_TO_DEVICE = 1 ACL_MEMCPY_DEVICE_TO_HOST = 2 +ACL_MEMCPY_DEVICE_TO_DEVICE = 3 ACL_MEM_MALLOC_NORMAL_ONLY = 2 NPY_FLOAT32 = 11 @@ -78,7 +79,11 @@ def __init__(self, config: InferenceConfig, context=None,callback=None): self.exit_flag = False self.max_batch = config.max_batch self.kv_cache_length = config.kv_cache_length - self.kv_cache = np.zeros(config.past_key_value_shape, dtype=np.float16) + # kv_cache的长度和max_output_length的长度一样 + self.past_kv_size=self.kv_cache_length + self.input_pos = 0 + self.real_kv_size = 0 + # self.kv_cache = np.zeros(config.past_key_value_shape, dtype=np.float16) self.input_dataset, self.output_dataset = None, None self.inputs:List[Dict[str,]] = [] self.outputs:List[Dict[str,]] = [] @@ -93,6 +98,75 @@ def __init__(self, config: InferenceConfig, context=None,callback=None): check_ret("acl.util.start_thread", ret) ret = acl.rt.subscribe_report(self.tid, self.stream) check_ret("acl.rt.subscribe_report", ret) + + def get_inputs(self, seq_len: int) -> List[np.ndarray]: + """ + 获取指定长度的kv_cache, 顺便生成mask和position_id + Args: + seq_len (int): 待获取的kv-cache长度 + + Returns: + List[np.ndarray]: _description_ + """ + + """ + self.kv_cache shape ( + 1, + self.kv_cache_length, + self.num_hidden_layers * 2 * self.num_key_value_heads, + self.per_head_dim + ) + """ + mask = np.ones((1,self.past_kv_size + seq_len), dtype=np.int64) + mask[:, self.real_kv_size: self.past_kv_size] = 0 + pos_id =np.arange( + self.input_pos, + self.input_pos + seq_len, + dtype=np.int64 + ).reshape(1,-1) + return mask, pos_id + + def reset(self): + # 重置kv-cache + self.input_pos=0 + self.real_kv_size=0 + ret = acl.rt.memset( + self.inputs[3]["buffer"], # 内存的起始地址。 + self.inputs[3]["size"], # 内存的最大长度,单位Byte。 + 0, + self.inputs[3]["size"] # 需要设置为指定值的内存长度,单位Byte。 + ) + check_ret("reset device kv-cache", ret) + + def update_kv_cache(self, seq_len): + self.input_pos = self.real_kv_size + seq_len + if seq_len + self.real_kv_size > self.kv_cache_length: + seq_len = self.kv_cache_length - self.real_kv_size + if seq_len <= 0: + 
return + # 用device memory完成下面的操作 + # self.kv_cache[:, self.real_kv_size: self.real_kv_size + seq_len] = new_kv_cache[:, 0: seq_len] + # kv-cache shape + """ + new_kv_cache_shape = [ + self.max_batch, + seq_length, + self.config.num_hidden_layers * 2 * self.config.num_key_value_heads, + self.config.per_head_dim + ] + """ + base_size = self.config.num_hidden_layers * 2 * self.config.num_key_value_heads * self.config.per_head_dim + # print("base_size: ", base_size) + # 默认是void指针,想要往前切片,需要将数据个数 * 2(代表float16)偏移 + ret = acl.rt.memcpy( + self.inputs[3]["buffer"] + (base_size * self.real_kv_size * self.max_batch) * 2, # 目的内存地址指针地址。 + base_size * (self.kv_cache_length - self.real_kv_size) * 2, # 目的内存地址的最大内存长度,单位Byte。 + self.outputs[1]["buffer"], + base_size * seq_len * 2, + ACL_MEMCPY_DEVICE_TO_DEVICE + ) + check_ret("update device cache", ret) + self.real_kv_size += seq_len def unload(self): if self.callback_func: @@ -138,18 +212,10 @@ def allocate_memory(self): self.input_dataset = acl.mdl.create_dataset() input_size = acl.mdl.get_num_inputs(self.model_desc) self.inputs = [] + # 给输入分配Device内存 for i in range(input_size): buffer_size = acl.mdl.get_input_size_by_index(self.model_desc, i) - # if i == 3: - # buffer, ret = acl.rt.malloc(buffer_size, ACL_MEM_MALLOC_HUGE_FIRST) - # self.kv_cache = acl.util.ptr_to_numpy( - # buffer, self.config.past_key_value_shape, 23 # 23:NPY_HALF,NPY_FLOAT16 - # ) - # data = acl.create_data_buffer(buffer, buffer_size) - # _, ret = acl.mdl.add_dataset_buffer(self.input_dataset, data) - # check_ret("add_dataset_buffer",ret) - # self.inputs.append({"buffer": buffer, "size": buffer_size}) - # else: + # print(f"input[{i}], buffer size = {buffer_size}") buffer, ret = acl.rt.malloc(buffer_size, ACL_MEM_MALLOC_HUGE_FIRST) check_ret("alloc input memory",ret) data = acl.create_data_buffer(buffer, buffer_size) @@ -160,16 +226,22 @@ def allocate_memory(self): self.output_dataset = acl.mdl.create_dataset() output_size = acl.mdl.get_num_outputs(self.model_desc) self.outputs = [] + # 给输出分配device和host内存 for i in range(output_size): buffer_size = acl.mdl.get_output_size_by_index(self.model_desc, i) + # print(f"output[{i}], buffer size = {buffer_size}") data_type = acl.mdl.get_output_data_type(self.model_desc, i) buffer, ret = acl.rt.malloc(buffer_size, ACL_MEM_MALLOC_HUGE_FIRST) check_ret("alloc output memory",ret) data = acl.create_data_buffer(buffer, buffer_size) _, ret = acl.mdl.add_dataset_buffer(self.output_dataset, data) check_ret("add_dataset_buffer",ret) - buffer_host, ret = acl.rt.malloc_host(buffer_size) - check_ret("alloc output host memory",ret) + if i == 0: + buffer_host, ret = acl.rt.malloc_host(buffer_size) + check_ret("alloc output host memory",ret) + # 对于new_kv_cache,不需要分配host内存,后面直接在device内存进行更新,节省内存 + else: + buffer_host = None self.outputs.append( { "buffer": buffer, @@ -183,20 +255,26 @@ def free_memory(self): """ 释放内存 """ - for item in self.input_data: + for i, item in enumerate(self.input_data): ret = acl.rt.free(item["buffer"]) + check_ret(f"free input[{i}] device memory",ret) ret = acl.mdl.destroy_dataset(self.input_dataset) - for item in self.output_data: + for i, item in enumerate(self.output_data): ret = acl.rt.free(item["buffer"]) - ret = acl.rt.free_host(item["buffer_host"]) + check_ret("free output device memory",ret) + # 分配结果只分配了logitst的CPU内存,所以释放的时候也只释放logists的 + if i == 0: + ret = acl.rt.free_host(item["buffer_host"]) ret = acl.mdl.destroy_dataset(self.output_dataset) - def inference(self, input_data_list: List[np.ndarray], seq_length=1, 
is_dynamic=False) -> List[np.ndarray]: + def inference(self, input_data_list: List[np.ndarray], seq_length=1, is_dynamic=False, is_prefill=False) -> List[np.ndarray]: """ 执行推理,同步方式 Args: input_data_list (_type_): _description_ seq_length: 推理长度 + is_dynamic: 是否动态推理 + is_prefill: 是否是prefill阶段 Returns: List[np.ndarray]: _description_ @@ -204,9 +282,7 @@ def inference(self, input_data_list: List[np.ndarray], seq_length=1, is_dynamic= start = time.time() acl.rt.set_context(self.context) for i in range(len(input_data_list)): - # if i == 3: - # continue - # else: + # 内存拷贝,忽略kv_cache,待会直接在device侧更新 input_data = input_data_list[i] input_size = input_data.size input_itemsize = input_data.itemsize @@ -270,16 +346,6 @@ def inference(self, input_data_list: List[np.ndarray], seq_length=1, is_dynamic= new_kv_cache_itemsize = new_kv_cache_size * output_itemsize2 output_sizes = [logits_size, new_kv_cache_size] output_itemsizes = [logits_itemsize, new_kv_cache_itemsize] - logits_shape = [self.max_batch, seq_length, self.config.vocab_size] - new_kv_cache_shape = [ - self.config.num_hidden_layers, - 2, - self.max_batch, - self.config.num_key_value_heads, - seq_length, - self.config.per_head_dim - ] - output_shapes = [logits_shape, new_kv_cache_shape] ret = acl.mdl.execute( self.model_id, @@ -287,31 +353,42 @@ def inference(self, input_data_list: List[np.ndarray], seq_length=1, is_dynamic= self.output_dataset ) check_ret("model_execute", ret) - inference_result = [] - for output_idx, out in enumerate(self.outputs): + """ + 获取输出结果, 从GPU拷贝输出数据到CPU + # 输出结果1:logits + # 输出结果2:new_kv_cache + prefill结果可以跳过logits的拷贝 + """ + # == update device kv cache == + self.update_kv_cache(seq_len=seq_length) + # 非prefill阶段才拷贝logits作为输出 + if not is_prefill: + # === update logits === if is_dynamic: - output_itemsize = output_itemsizes[output_idx] - output_size = output_sizes[output_idx] + output_itemsize = output_itemsizes[0] + output_size = output_sizes[0] else: - output_itemsize = out["size"] - output_size = output_itemsize // np.dtype(out["dtype"]).itemsize + output_itemsize = self.outputs[0]["size"] + output_size = output_itemsize // np.dtype(self.outputs[0]["dtype"]).itemsize + logits_shape = [self.max_batch, seq_length, self.config.vocab_size] ret = acl.rt.memcpy( - out['buffer_host'], - out["size"], - out["buffer"], + self.outputs[0]['buffer_host'], + self.outputs[0]["size"], + self.outputs[0]["buffer"], output_itemsize, ACL_MEMCPY_DEVICE_TO_HOST ) check_ret("memcpy output", ret) - bytes_out = acl.util.ptr_to_bytes(out['buffer_host'], out["size"]) - out_data = np.frombuffer( + bytes_out = acl.util.ptr_to_bytes(self.outputs[0]['buffer_host'], self.outputs[0]["size"]) + logits = np.frombuffer( bytes_out, - dtype=out['dtype'], + dtype=self.outputs[0]['dtype'], count=output_size, - ).reshape(output_shapes[output_idx]) - inference_result.append(out_data) - return inference_result + ).reshape(logits_shape) + return logits + else: + return None def inference_async(self, data, other_args) -> List[np.ndarray]: """ diff --git a/utils/inference.py b/utils/inference.py index 89b30a0..f654e2f 100644 --- a/utils/inference.py +++ b/utils/inference.py @@ -166,14 +166,13 @@ def stream_predict( if show_progress: prefill_show_progress = True # reset counter - self.session.run_times = 0 - self.session.kv_cache.real_kv_size = 0 + self.session.reset() else: prefill_show_progress = False logits = self.session.run( input_ids, - show_progress=prefill_show_progress - )[0] + show_progress=prefill_show_progress, + ) input_ids = self.sample_logits( 
logits[0][-1:], self.sampling_method, diff --git a/utils/kvcache.py b/utils/kvcache.py index ad77f42..ec78590 100644 --- a/utils/kvcache.py +++ b/utils/kvcache.py @@ -65,8 +65,8 @@ def get_inputs(self, seq_len: int) -> List[np.ndarray]: self.per_head_dim ) """ - cache = self.kv_cache[:, :, :, :, :self.past_kv_size] - mask = np.ones((1,self.past_kv_size + seq_len),dtype=np.int64) + cache = self.kv_cache[:, :self.past_kv_size] + mask = np.ones((1,self.past_kv_size + seq_len), dtype=np.int64) mask[:, self.real_kv_size: self.past_kv_size] = 0 pos_id =np.arange( self.input_pos, @@ -146,11 +146,9 @@ def update( ) -> None: """ self.kv_cache shape ( - self.num_hidden_layers, - 2, 1, - self.num_key_value_heads, self.kv_cache_length, + self.num_hidden_layers * 2 * self.num_key_value_heads, self.per_head_dim ) """ @@ -161,10 +159,10 @@ def update( return if self.cache_format=="huggingface-tensor": temp_shape = list(self.past_key_value_shape) - temp_shape[-2] = -1 + temp_shape[1] = -1 new_kv_cache = new_kv_cache.reshape(temp_shape) - self.kv_cache[:, :, :, :, self.real_kv_size: self.real_kv_size + seq_len] = \ - new_kv_cache[:, :, :, :, 0: seq_len] + self.kv_cache[:, self.real_kv_size: self.real_kv_size + seq_len] = \ + new_kv_cache[:, 0: seq_len] self.real_kv_size += seq_len class FixSizeStreamLLM(KVCacheManger): @@ -180,7 +178,7 @@ def update( score:Optional[np.ndarray] = None ): self.input_pos+=seq_len - while self.past_len+ seq_len > self.kv_cache_length: + while self.past_len+ seq_len > self.kv_cache_length: self.update_part(new_kv_cache, self.past_len, self.kv_cache_length - self.past_len) seq_len -= (self.kv_cache_length-self.past_len) self.past_len= self.head_len diff --git a/utils/session.py b/utils/session.py index 2b4f81f..c8a9839 100644 --- a/utils/session.py +++ b/utils/session.py @@ -12,7 +12,6 @@ class Session: def __init__(self, config: InferenceConfig) -> None: - self.kv_cache = create_kv_cache(config) self.run_times = 0 def run(self,input_ids:np.ndarray, show_progress: bool = False): @@ -39,8 +38,12 @@ def rollback(self,seq_len): class OnnxSession(Session): def __init__(self,config:InferenceConfig)->None: super().__init__(config) + self.kv_cache = create_kv_cache(config) import onnxruntime options = onnxruntime.SessionOptions() + options.intra_op_num_threads = config.cpu_thread + options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL + options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL self.llm_session = onnxruntime.InferenceSession( config.onnx_model_path, sess_options=options, @@ -118,12 +121,15 @@ def __init__(self, config:InferenceConfig): self.model = ACLModel(config, self.context) self.max_batch = config.max_batch self.input_ids = np.zeros((1,16),dtype=np.int64) - self.kv_cache.kv_cache = self.model.kv_cache + # self.kv_cache = create_kv_cache(config) + # self.kv_cache.kv_cache = self.model.kv_cache self.max_prefill_length = config.max_prefill_length self.prefill_log2_number = int(math.log2(self.max_prefill_length)) self.prefill_log2_list = list(range(self.prefill_log2_number, -1, -1)) self.prefill_log2_list = [2**index for index in self.prefill_log2_list] + def reset(self): + self.model.reset(); def __del__(self): destroy_resource(self.device_id, self.context) @@ -145,9 +151,10 @@ def decompose_number(self, n, start_index=0): return [power] + self.decompose_number(n - power, i) return [] - def run(self, input_ids: np.ndarray, show_progress:bool=False): + def run(self, input_ids: np.ndarray, show_progress: bool = False): seq_len = 
input_ids.shape[-1] logits = None + is_prefill = True is_dynamic = bool(self.max_prefill_length > 1) # dynamic inference if is_dynamic: @@ -155,12 +162,15 @@ def run(self, input_ids: np.ndarray, show_progress:bool=False): if show_progress: seq_list = tqdm(seq_list, desc="prefill") start_i = 0 - for seq in seq_list: + for (ii, seq) in enumerate(seq_list): end_i = start_i + seq + if (ii == len(seq_list) - 1): + is_prefill = False logits = self.run_some( input_ids[:, start_i: end_i], seq, is_dynamic, + is_prefill=is_prefill ) start_i += seq # if show_progress: @@ -172,46 +182,24 @@ def run(self, input_ids: np.ndarray, show_progress:bool=False): else: idx_list = range(seq_len) for i in idx_list: - logits = self.run_some(input_ids[:,i]) - return [logits] + if (i == len(idx_list) - 1): + is_prefill = False + logits = self.run_some(input_ids[:,i], is_prefill=is_prefill) + return logits def run_some( self, input_ids: np.ndarray, seq_length: int = 1, - is_dynamic: bool = False + is_dynamic: bool = False, + is_prefill: bool = False, ): - # print( - # "self.run_times: ", self.run_times, - # "real kv size: ", self.kv_cache.real_kv_size - # ) self.run_times += seq_length - cache, mask, pos_ids = self.kv_cache.get_inputs(seq_length) - result:List[np.ndarray] = self.model.inference( - [input_ids, mask, pos_ids, cache], seq_length, is_dynamic - ) - # if self.run_times <= 20: - # print(" === Debug === ") - # print("run times: ", self.run_times) - # logits = result[0] - # new_kv_cache = result[1] - # print("logits shape: ", logits.shape) - # print("logits mean: ", logits.astype(np.float32).mean().item()) - # print("logits max: ", logits.astype(np.float32).max().item()) - # print("new_kv_cache: shape", new_kv_cache.shape) - # print("new_kv_cache: mean: ", new_kv_cache.astype(np.float32).mean().item()) - # print("new_kv_cache: max: ", new_kv_cache.astype(np.float32).max().item()) - self.kv_cache.update(seq_length, result[1]) - return result[0].reshape(self.max_batch, seq_length,-1) - - def run_all_logits(self, input_ids: np.ndarray): - seq_len, i = input_ids.shape[-1], 0 - logits = [] - while i < seq_len: - end = i + 16 if i+16 < seq_len else seq_len - cache,mask,pos_ids = self.kv_cache.get_inputs(16) - self.input_ids[0:end-i] = input_ids[i:end] - result:List[np.ndarray] = self.model.inference([self.input_ids, mask, pos_ids, cache]) - self.kv_cache.update(end-i,result[1]) - logits.append(result[0][0:end-i].reshape(1,-1)) - return [np.concatenate(logits).reshape(1,1,-1)] \ No newline at end of file + mask, pos_ids = self.model.get_inputs(seq_length) + logits = self.model.inference( + [input_ids, mask, pos_ids], seq_length, is_dynamic, is_prefill=is_prefill + ) + if not is_prefill: + return logits.reshape(self.max_batch, seq_length,-1) + else: + return None
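
Note on the cache layout used throughout this patch: the per-layer HF-style cache of shape `(num_hidden_layers, 2, batch, num_key_value_heads, kv_len, per_head_dim)` is flattened into a single tensor of shape `(batch, kv_cache_length, num_hidden_layers * 2 * num_key_value_heads, per_head_dim)`, which is what `config.past_key_value_shape`, the ATC `--input_shape` argument, and the updated test scripts all describe. The snippet below is a minimal NumPy sketch of the host-side bookkeeping around that layout (attention mask, position ids, rolling write offset), using made-up sizes; it mirrors the logic in `utils/kvcache.py` and the test scripts but is not the engine code itself.

```python
# Minimal sketch of the flattened KV-cache bookkeeping; sizes are illustrative,
# not taken from a real Qwen2 config.
import numpy as np

num_hidden_layers = 2       # illustrative
num_key_value_heads = 2     # illustrative
per_head_dim = 4            # illustrative
kv_cache_length = 16        # analogous to --kv_cache_length
flat_heads = num_hidden_layers * 2 * num_key_value_heads

# persistent cache: (batch, kv_cache_length, layers * 2 * kv_heads, head_dim)
kv_cache = np.zeros((1, kv_cache_length, flat_heads, per_head_dim), dtype=np.float16)
real_kv_size = 0            # number of cache slots that hold valid data
input_pos = 0               # absolute position of the next token


def get_inputs(seq_len: int):
    """Build attention_mask / position_ids the way the runtime scripts do."""
    # the mask spans the whole cache window plus the new tokens;
    # slots that are not filled yet are masked out with 0
    mask = np.ones((1, kv_cache_length + seq_len), dtype=np.int64)
    mask[:, real_kv_size:kv_cache_length] = 0
    pos_ids = np.arange(input_pos, input_pos + seq_len, dtype=np.int64).reshape(1, -1)
    return mask, pos_ids


def update(new_kv_cache: np.ndarray, seq_len: int):
    """Write this step's keys/values into the persistent cache in place."""
    global real_kv_size, input_pos
    kv_cache[:, real_kv_size:real_kv_size + seq_len] = new_kv_cache[:, :seq_len]
    real_kv_size += seq_len
    input_pos += seq_len


# one decode step of length 1: the model returns new_kv_cache with shape
# (batch, seq_len, layers * 2 * kv_heads, head_dim)
mask, pos_ids = get_inputs(seq_len=1)
step_cache = np.random.rand(1, 1, flat_heads, per_head_dim).astype(np.float16)
update(step_cache, seq_len=1)
print(mask.shape, pos_ids.tolist(), real_kv_size)  # (1, 17) [[0]] 1
```

The ACL engine performs the same slice assignment directly in device memory: `update_kv_cache` computes the destination offset in bytes from `real_kv_size` (element count scaled by 2 for the float16 item size) and issues an `ACL_MEMCPY_DEVICE_TO_DEVICE` copy from the model's `new_kv_cache` output buffer, so the cache never round-trips through host memory, and during prefill only the last chunk copies logits back to the host.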