From d824498b1988eb79a4fcc4ea6d57246663b6a899 Mon Sep 17 00:00:00 2001 From: Tlntin Date: Mon, 21 Oct 2024 22:57:00 +0800 Subject: [PATCH] code optimization --- README.md | 2 + cli_chat.py | 3 +- config.py | 6 +- export/change_node.py | 164 +- export/export_onnx.py | 432 +++--- export/modeling_qwen2.py | 2998 ++++++++++++++++++------------------ export/onnx2om.py | 360 +++-- export/test_onnx_run.py | 16 +- export/test_pytorch_run.py | 19 +- utils/engine.py | 167 +- utils/inference.py | 7 +- utils/kvcache.py | 16 +- utils/session.py | 68 +- 13 files changed, 2167 insertions(+), 2091 deletions(-) diff --git a/README.md b/README.md index a25fcbf..f28a839 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,8 @@ 2. 导出onnx,默认kv-cache长度为1024,可以根据自己的内存、显存来设置更大参数。 ```bash python3 export/export_onnx.py \ + --device_str=npu \ + --dtype=float16 \ --hf_model_dir="./download/Qwen2-1.5B-Instruct" \ --onnx_model_path="./output/onnx/qwen2_1.5b_chat.onnx" \ --kv_cache_length=1024 diff --git a/cli_chat.py b/cli_chat.py index 7010972..d88025a 100644 --- a/cli_chat.py +++ b/cli_chat.py @@ -75,12 +75,13 @@ def inference_cli(): break if input_text == 'clear': history = [] + infer_engine.session.reset() print("Output: 已清理历史对话信息。") continue print("Output: ", end='') response = "" is_first = True - first_token_lantency, decode_speed = 0, 0 + first_token_lantency, decode_speed, total_speed = 0, 0, 0.0 for ( new_text, first_token_lantency, diff --git a/config.py b/config.py index ff31a45..694024c 100644 --- a/config.py +++ b/config.py @@ -8,6 +8,7 @@ def __init__( hf_model_dir: str, om_model_path: str, onnx_model_path: str, + cpu_thread: int = 4, # CPU线程数 session_type: str = "acl", # 支持acl和onnx两种,acl即Ascend C Language device_id: int = 0, sampling_method: str = "top_p", # 支持 greedy, top_p, top_k @@ -30,6 +31,7 @@ def __init__( assert os.path.exists(onnx_model_path), print(onnx_model_path, "not exists") self.om_model_path = om_model_path self.onnx_model_path = onnx_model_path + self.cpu_thread = cpu_thread self.device_id = device_id self.sampling_method = sampling_method self.sampling_value = sampling_value @@ -48,11 +50,9 @@ def __init__( self.num_attention_heads = self.model_config.num_attention_heads self.per_head_dim = self.hidden_size // self.num_attention_heads # head_dim self.past_key_value_shape = ( - self.num_hidden_layers, - 2, self.max_batch, - self.num_key_value_heads, self.kv_cache_length, + self.num_hidden_layers * 2 * self.num_key_value_heads, self.per_head_dim ) self.max_prefill_length = max_prefill_length diff --git a/export/change_node.py b/export/change_node.py index 1469c83..b9122a3 100644 --- a/export/change_node.py +++ b/export/change_node.py @@ -1,82 +1,82 @@ -import os -import onnx -import onnx.helper as helper -from onnx import TensorProto -from tqdm import tqdm -import argparse - - -now_dir = os.path.dirname(os.path.abspath(__file__)) -project_dir = os.path.dirname(now_dir) -output_dir = os.path.join(project_dir, "output") -if not os.path.exists(output_dir): - os.mkdir(output_dir) -old_onnx_dir = os.path.join(output_dir, "onnx") -if not os.path.exists(old_onnx_dir): - os.mkdir(old_onnx_dir) -new_onnx_dir = os.path.join(output_dir, "onnx2") -if not os.path.exists(new_onnx_dir): - os.mkdir(new_onnx_dir) - -now_dir = os.path.dirname(os.path.abspath(__file__)) -project_dir = os.path.dirname(now_dir) -model_name = "qwen2_1.5b_chat.onnx" - -parser = argparse.ArgumentParser() -parser.add_argument( - '--input_model_path', - type=str, - help="raw onnx model convert by pytroch", - 
default=os.path.join(old_onnx_dir, model_name) -) -parser.add_argument( - "--output_model_path", - help="output onnx model path", - type=str, - default=os.path.join(new_onnx_dir, model_name) -) - -args = parser.parse_args() - -model = onnx.load(args.input_model_path) -new_nodes = [] - -for node in tqdm(model.graph.node, desc="replace node..."): - # 判断节点类型 - new_node = node - if node.op_type == "Trilu": - new_node = helper.make_node( - "Trilu", - name="MY_" + node.name, - inputs=[node.input[0]], - outputs=node.output, - upper=0 - ) - if node.op_type == "Cast": - # 替换为新的算子类型 - to_attribute = next(attr for attr in node.attribute if attr.name == "to") - if to_attribute.i == TensorProto.INT8: - new_node = helper.make_node( - "AscendQuant", - inputs=node.input, - outputs=node.output, - offset=0., - scale=1., - ) - new_nodes.append(new_node) -print("make new graph") -new_graph = helper.make_graph( - new_nodes, - "new_graph", - inputs=model.graph.input, - outputs=model.graph.output, - value_info=model.graph.value_info, - initializer=model.graph.initializer -) -print("make new model") -new_model = helper.make_model(new_graph, producer_name=model.producer_name,opset_imports=model.opset_import,ir_version = model.ir_version) -# new_model.ir_version = model.ir_version -# new_model.opset_import = model.opset_import -# new_model.metadata_props = model.metadata_props -print("will save model in ", args.output_model_path) -onnx.save(new_model, args.output_model_path, save_as_external_data=True) +import os +import onnx +import onnx.helper as helper +from onnx import TensorProto +from tqdm import tqdm +import argparse + + +now_dir = os.path.dirname(os.path.abspath(__file__)) +project_dir = os.path.dirname(now_dir) +output_dir = os.path.join(project_dir, "output") +if not os.path.exists(output_dir): + os.mkdir(output_dir) +old_onnx_dir = os.path.join(output_dir, "onnx") +if not os.path.exists(old_onnx_dir): + os.mkdir(old_onnx_dir) +new_onnx_dir = os.path.join(output_dir, "onnx2") +if not os.path.exists(new_onnx_dir): + os.mkdir(new_onnx_dir) + +now_dir = os.path.dirname(os.path.abspath(__file__)) +project_dir = os.path.dirname(now_dir) +model_name = "qwen2_1.5b_chat.onnx" + +parser = argparse.ArgumentParser() +parser.add_argument( + '--input_model_path', + type=str, + help="raw onnx model convert by pytroch", + default=os.path.join(old_onnx_dir, model_name) +) +parser.add_argument( + "--output_model_path", + help="output onnx model path", + type=str, + default=os.path.join(new_onnx_dir, model_name) +) + +args = parser.parse_args() + +model = onnx.load(args.input_model_path) +new_nodes = [] + +for node in tqdm(model.graph.node, desc="replace node..."): + # 判断节点类型 + new_node = node + if node.op_type == "Trilu": + new_node = helper.make_node( + "Trilu", + name="MY_" + node.name, + inputs=[node.input[0]], + outputs=node.output, + upper=0 + ) + if node.op_type == "Cast": + # 替换为新的算子类型 + to_attribute = next(attr for attr in node.attribute if attr.name == "to") + if to_attribute.i == TensorProto.INT8: + new_node = helper.make_node( + "AscendQuant", + inputs=node.input, + outputs=node.output, + offset=0., + scale=1., + ) + new_nodes.append(new_node) +print("make new graph") +new_graph = helper.make_graph( + new_nodes, + "new_graph", + inputs=model.graph.input, + outputs=model.graph.output, + value_info=model.graph.value_info, + initializer=model.graph.initializer +) +print("make new model") +new_model = helper.make_model(new_graph, producer_name=model.producer_name,opset_imports=model.opset_import,ir_version = 
model.ir_version) +# new_model.ir_version = model.ir_version +# new_model.opset_import = model.opset_import +# new_model.metadata_props = model.metadata_props +print("will save model in ", args.output_model_path) +onnx.save(new_model, args.output_model_path, save_as_external_data=True) diff --git a/export/export_onnx.py b/export/export_onnx.py index d52e1e9..3d6f7e4 100644 --- a/export/export_onnx.py +++ b/export/export_onnx.py @@ -1,217 +1,215 @@ -"""_summary_ -qwen2 modeling_qwen2.py download: https://github.com/huggingface/transformers/blob/v4.37.0/src/transformers/models/qwen2/modeling_qwen2.py -""" - -import os -import json -import sys -from typing import List -import torch -import shutil -# from transformers import AutoModel, Qwen2Config -from transformers.models.qwen2 import Qwen2Config -from modeling_qwen2 import Qwen2ForCausalLM - -import onnx -import io -import argparse - - -now_dir = os.path.dirname(os.path.abspath(__file__)) -project_dir = os.path.dirname(now_dir) -output_dir = os.path.join(project_dir, "output") -if not os.path.exists(output_dir): - os.mkdir(output_dir) -onnx_model_dir = os.path.join(output_dir, "onnx") -if not os.path.exists(onnx_model_dir): - os.mkdir(onnx_model_dir) -if len(os.listdir(onnx_model_dir)) > 0: - print("found some file in {}, will clear it".format(onnx_model_dir)) - for temp_file in os.listdir(onnx_model_dir): - temp_path = os.path.join(onnx_model_dir, temp_file) - os.remove(temp_path) - - -def parser_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--device_str", - type=str, - choices=["npu", "cuda", "cpu"], - help="support npu, cuda, cpu", - default="npu", - ) - parser.add_argument( - "--dtype" , - type=str, - help="support float16/float32, if use CPU, only support fp32", - choices=["float16", "float32"], - default="float16", - ) - parser.add_argument( - '--hf_model_dir', - type=str, - help="model and tokenizer path, only support huggingface model", - default=os.path.join(project_dir, "download", "Qwen2-1.5B-Instruct") - ) - parser.add_argument( - "--onnx_model_path", - help="output onnx path", - type=str, - default=os.path.join(onnx_model_dir, "qwen2_1.5b_chat.onnx") - ) - parser.add_argument( - "--kv_cache_length", - help="kv-cache length", - type=int, - default=1024, - ) - return parser.parse_args() - - -def export_onnx( - device_str, - dtype: str, - hf_model_dir: str, - onnx_model_path: str, - kv_cache_length: int, - num_hidden_layers: int, - num_key_value_heads: int, - per_head_dim: int, -): - if device_str == "npu": - import torch_npu - if dtype == "float16": - assert device_str.lower() != "cpu", print("cpu not support fp16") - torch_dtype = torch.float16 - elif dtype == "float32": - torch_dtype = torch.float32 - else: - raise Exception("unsupport dtype") - - device = torch.device(device_str) - model = Qwen2ForCausalLM.from_pretrained( - hf_model_dir, - torch_dtype=torch_dtype, - # trust_remote_code=True - ).to(device) - quantize_cfg = { - "query_key_value": { - "type": "W8X8", - "act_scale": False - }, - "dense": { - "type": "W8X8", - "act_scale": False - }, - "dense_h_to_4h": { - "type": "W8X8", - "act_scale": False - }, - "dense_4h_to_h": { - "type": "W8X8", - "act_scale": False - } - } - quantize_cfg = {} - input_names = [ - "input_ids", - "attention_mask", - "position_ids", - "past_key_values" - ] - output_names = ["logits", "out_key_values"] - dynamic_axes = { - "input_ids": {0: "batch_size", 1: "seq_length"}, - "attention_mask": {0: "batch_size", 1: "seq_length+kv_len"}, - "position_ids": {0: 
"batch_size", 1: "seq_length"}, - "past_key_values": {2: "batch_size", 4: "kv_len"}, - } - batch_size = 1 - seq_len = 1 - all_len = seq_len + kv_cache_length - - input_ids = torch.zeros((batch_size, seq_len)).long().to(device) - attention_mask = torch.zeros((batch_size, all_len)).long().to(device) - position_ids = torch.zeros((batch_size, seq_len)).long().to(device) - past_key_values = torch.rand( - ( - num_hidden_layers, - 2, - 1, - num_key_value_heads, - kv_cache_length, - per_head_dim - ), - dtype=torch_dtype - ).to(device) - input_args = ( - input_ids, - attention_mask, - position_ids, - past_key_values, - # None, # inputs_embeds: Optional[torch.FloatTensor] = None, - # None, # labels: Optional[torch.LongTensor] = None, - # True, # use_cache: Optional[bool] = None, - # True, # output_attentions: Optional[bool] = None, - # None, # output_hidden_states - # False # return_dict: - ) - model.eval() - with torch.no_grad(): - # from quantize import quantize - # quantize(model, cfg=quantize_cfg) - # print(model) - torch.onnx.export( - model, - f=onnx_model_path, - args=input_args, - input_names=input_names, - output_names=output_names, - dynamic_axes=dynamic_axes, - do_constant_folding=False, - opset_version=14, - export_params=True - ) - - -if __name__ == "__main__": - args = parser_arguments() - # model_config = Qwen2Config.from_pretrained(args.hf_model_dir) - # copy modeling_qwen2.py to model dir - src_file_path = os.path.join(now_dir, "modeling_qwen2.py") - target_file_path = os.path.join(args.hf_model_dir, "modeling_qwen2.py") - shutil.copy(src_file_path, target_file_path) - # print(model_config) - config_json = os.path.join(args.hf_model_dir, "config.json") - with open(config_json, "rt", encoding="utf-8") as f: - model_config = json.load(f) - model_config["auto_map"] = { - "AutoModel": "modeling_qwen2.Qwen2ForCausalLM", - "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM", - "AutoModelForSeq2SeqLM": "modeling_qwen2.Qwen2ForCausalLM", - "AutoModelForSequenceClassification": "modeling_qwen2.Qwen2ForSequenceClassification" - } - with open(config_json, "wt", encoding="utf-8") as f: - json.dump(model_config, f, indent=4) - test_model_config = Qwen2Config.from_pretrained(args.hf_model_dir) - # print(test_model_config) - test_model_config.torch_dtype = "float16" - test_model_config.save_pretrained(args.hf_model_dir) - num_hidden_layers = test_model_config.num_hidden_layers - num_attention_heads = test_model_config.num_attention_heads - num_key_value_heads = test_model_config.num_key_value_heads - hidden_size = test_model_config.hidden_size - per_head_dim = hidden_size // num_attention_heads - print("new model config save ok in ", args.hf_model_dir) - print("begin export onnx") - export_onnx( - device_str=args.device_str, - dtype=args.dtype, - hf_model_dir=args.hf_model_dir, - onnx_model_path=args.onnx_model_path, - kv_cache_length=args.kv_cache_length, - num_hidden_layers=num_hidden_layers, - num_key_value_heads=num_key_value_heads, - per_head_dim=per_head_dim - ) +"""_summary_ +qwen2 modeling_qwen2.py download: https://github.com/huggingface/transformers/blob/v4.37.0/src/transformers/models/qwen2/modeling_qwen2.py +""" + +import os +import json +import sys +from typing import List +import torch +import shutil +# from transformers import AutoModel, Qwen2Config +from transformers.models.qwen2 import Qwen2Config +from modeling_qwen2 import Qwen2ForCausalLM + +import onnx +import io +import argparse + + +now_dir = os.path.dirname(os.path.abspath(__file__)) +project_dir = 
os.path.dirname(now_dir) +output_dir = os.path.join(project_dir, "output") +if not os.path.exists(output_dir): + os.mkdir(output_dir) +onnx_model_dir = os.path.join(output_dir, "onnx") +if not os.path.exists(onnx_model_dir): + os.mkdir(onnx_model_dir) +if len(os.listdir(onnx_model_dir)) > 0: + print("found some file in {}, will clear it".format(onnx_model_dir)) + for temp_file in os.listdir(onnx_model_dir): + temp_path = os.path.join(onnx_model_dir, temp_file) + os.remove(temp_path) + + +def parser_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--device_str", + type=str, + choices=["npu", "cuda", "cpu"], + help="support npu, cuda, cpu", + default="cpu", + ) + parser.add_argument( + "--dtype" , + type=str, + help="support float16/float32, if use CPU, only support fp32", + choices=["float16", "float32"], + default="float32", + ) + parser.add_argument( + '--hf_model_dir', + type=str, + help="model and tokenizer path, only support huggingface model", + default=os.path.join(project_dir, "download", "Qwen2-1.5B-Instruct") + ) + parser.add_argument( + "--onnx_model_path", + help="output onnx path", + type=str, + default=os.path.join(onnx_model_dir, "qwen2_1.5b_chat.onnx") + ) + parser.add_argument( + "--kv_cache_length", + help="kv-cache length", + type=int, + default=1024, + ) + return parser.parse_args() + + +def export_onnx( + device_str, + dtype: str, + hf_model_dir: str, + onnx_model_path: str, + kv_cache_length: int, + num_hidden_layers: int, + num_key_value_heads: int, + per_head_dim: int, +): + if device_str == "npu": + import torch_npu + if dtype == "float16": + assert device_str.lower() != "cpu", print("cpu not support fp16") + torch_dtype = torch.float16 + elif dtype == "float32": + torch_dtype = torch.float32 + else: + raise Exception("unsupport dtype") + + device = torch.device(device_str) + model = Qwen2ForCausalLM.from_pretrained( + hf_model_dir, + torch_dtype=torch_dtype, + # trust_remote_code=True + ).to(device) + quantize_cfg = { + "query_key_value": { + "type": "W8X8", + "act_scale": False + }, + "dense": { + "type": "W8X8", + "act_scale": False + }, + "dense_h_to_4h": { + "type": "W8X8", + "act_scale": False + }, + "dense_4h_to_h": { + "type": "W8X8", + "act_scale": False + } + } + quantize_cfg = {} + input_names = [ + "input_ids", + "attention_mask", + "position_ids", + "past_key_values" + ] + output_names = ["logits", "out_key_values"] + dynamic_axes = { + "input_ids": {0: "batch_size", 1: "seq_length"}, + "attention_mask": {0: "batch_size", 1: "seq_length+kv_len"}, + "position_ids": {0: "batch_size", 1: "seq_length"}, + "past_key_values": {0: "batch_size", 1: "kv_len"}, + } + batch_size = 1 + seq_len = 1 + all_len = seq_len + kv_cache_length + + input_ids = torch.zeros((batch_size, seq_len)).long().to(device) + attention_mask = torch.zeros((batch_size, all_len)).long().to(device) + position_ids = torch.zeros((batch_size, seq_len)).long().to(device) + past_key_values = torch.rand( + ( + 1, + kv_cache_length, + num_hidden_layers * 2 * num_key_value_heads, + per_head_dim + ), + dtype=torch_dtype + ).to(device) + input_args = ( + input_ids, + attention_mask, + position_ids, + past_key_values, + # None, # inputs_embeds: Optional[torch.FloatTensor] = None, + # None, # labels: Optional[torch.LongTensor] = None, + # True, # use_cache: Optional[bool] = None, + # True, # output_attentions: Optional[bool] = None, + # None, # output_hidden_states + # False # return_dict: + ) + model.eval() + with torch.no_grad(): + # from quantize import quantize + # 
quantize(model, cfg=quantize_cfg) + # print(model) + torch.onnx.export( + model, + f=onnx_model_path, + args=input_args, + input_names=input_names, + output_names=output_names, + dynamic_axes=dynamic_axes, + do_constant_folding=False, + opset_version=14, + export_params=True + ) + + +if __name__ == "__main__": + args = parser_arguments() + # model_config = Qwen2Config.from_pretrained(args.hf_model_dir) + # copy modeling_qwen2.py to model dir + src_file_path = os.path.join(now_dir, "modeling_qwen2.py") + target_file_path = os.path.join(args.hf_model_dir, "modeling_qwen2.py") + shutil.copy(src_file_path, target_file_path) + # print(model_config) + config_json = os.path.join(args.hf_model_dir, "config.json") + with open(config_json, "rt", encoding="utf-8") as f: + model_config = json.load(f) + model_config["auto_map"] = { + "AutoModel": "modeling_qwen2.Qwen2ForCausalLM", + "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM", + "AutoModelForSeq2SeqLM": "modeling_qwen2.Qwen2ForCausalLM", + "AutoModelForSequenceClassification": "modeling_qwen2.Qwen2ForSequenceClassification" + } + with open(config_json, "wt", encoding="utf-8") as f: + json.dump(model_config, f, indent=4) + test_model_config = Qwen2Config.from_pretrained(args.hf_model_dir) + # print(test_model_config) + test_model_config.torch_dtype = "float16" + test_model_config.save_pretrained(args.hf_model_dir) + num_hidden_layers = test_model_config.num_hidden_layers + num_attention_heads = test_model_config.num_attention_heads + num_key_value_heads = test_model_config.num_key_value_heads + hidden_size = test_model_config.hidden_size + per_head_dim = hidden_size // num_attention_heads + print("new model config save ok in ", args.hf_model_dir) + print("begin export onnx") + export_onnx( + device_str=args.device_str, + dtype=args.dtype, + hf_model_dir=args.hf_model_dir, + onnx_model_path=args.onnx_model_path, + kv_cache_length=args.kv_cache_length, + num_hidden_layers=num_hidden_layers, + num_key_value_heads=num_key_value_heads, + per_head_dim=per_head_dim + ) diff --git a/export/modeling_qwen2.py b/export/modeling_qwen2.py index 735bea1..fcc96a8 100644 --- a/export/modeling_qwen2.py +++ b/export/modeling_qwen2.py @@ -1,1488 +1,1510 @@ -# coding=utf-8 -# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
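A note for reviewers (outside the patch itself): the config.py and export_onnx.py hunks above replace the per-layer 6-D `past_key_values` layout with a flattened 4-D layout `(batch, kv_len, num_hidden_layers * 2 * num_key_value_heads, per_head_dim)`, so only axes 0 and 1 remain dynamic (`batch_size` / `kv_len`) in the exported ONNX graph. A minimal shape sketch of the two layouts may help before reading further into the modeling_qwen2.py hunk; the Qwen2-1.5B-Instruct values used here (28 layers, 12 attention heads, 2 KV heads, hidden size 1536) are assumptions and should be read from the model's config.json in practice.

```python
# Sketch only: compare the old and new past_key_values layouts.
# Config values below are assumed for Qwen2-1.5B-Instruct; check config.json for your model.
num_hidden_layers = 28
num_attention_heads = 12
num_key_value_heads = 2
hidden_size = 1536
per_head_dim = hidden_size // num_attention_heads   # 128
kv_cache_length = 1024
max_batch = 1

# old layout: one axis per concept (layer, key/value, batch, kv head, position, head dim)
old_shape = (num_hidden_layers, 2, max_batch,
             num_key_value_heads, kv_cache_length, per_head_dim)

# new layout in this patch: batch and sequence lead, layer/kv-head axes are flattened,
# keeping the ONNX input at rank 4 with dynamic axes 0 ("batch_size") and 1 ("kv_len")
new_shape = (max_batch, kv_cache_length,
             num_hidden_layers * 2 * num_key_value_heads, per_head_dim)

print(old_shape)  # (28, 2, 1, 2, 1024, 128)
print(new_shape)  # (1, 1024, 112, 128) -- same element count, different axis order
```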
-""" PyTorch Qwen2 model.""" -import inspect -import math -import warnings -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.cache_utils import Cache, DynamicCache -from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import ( - add_start_docstrings, - add_start_docstrings_to_model_forward, - is_flash_attn_2_available, - is_flash_attn_greater_or_equal_2_10, - logging, - replace_return_docstrings, -) -from transformers.models.qwen2.configuration_qwen2 import Qwen2Config - - -# if is_flash_attn_2_available(): -# from flash_attn import flash_attn_func, flash_attn_varlen_func -# from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa -# -# _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) - - -logger = logging.get_logger(__name__) - - -_CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B-beta" -_CONFIG_FOR_DOC = "Qwen2Config" - -QWEN2_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "Qwen/Qwen2-7B-beta", - # See all Qwen2 models at https://huggingface.co/models?filter=qwen2 -] - - -# Copied from transformers.models.llama.modeling_llama._get_unpad_data -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen2 -class Qwen2RMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - Qwen2RMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Qwen2 -class Qwen2RotaryEmbedding(nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. 
- self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.outer(t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:seq_len].to(dtype=x.dtype), - self.sin_cached[:seq_len].to(dtype=x.dtype), - ) - - -# Copied from transformers.models.llama.modeling_llama.rotate_half -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb -def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and key tensors. For example, this can be - used to pass offsetted position ids when working with a KV-cache. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
- """ - cos = cos[position_ids].unsqueeze(unsqueeze_dim) - sin = sin[position_ids].unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2 -class Qwen2MLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - -# Copied from transformers.models.llama.modeling_llama.repeat_kv -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class Qwen2Attention(nn.Module): - """ - Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer - and "Generating Long Sequences with Sparse Transformers". - """ - - def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None): - super().__init__() - self.config = config - self.layer_idx = layer_idx - if layer_idx is None: - logger.warning_once( - f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will " - "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class." - ) - - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - self.is_causal = True - self.attention_dropout = config.attention_dropout - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." 
- ) - self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True) - self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) - self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - - self.rotary_emb = Qwen2RotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - # output_attentions: bool = False, - # use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." 
- ) - # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - kv_seq_len += past_key_value[self.layer_idx].shape[3] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - out_cache = (key_states, value_states) - if past_key_value is not None: - # cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - cache_key = past_key_value[self.layer_idx][0] - cache_value = past_key_value[self.layer_idx][1] - key_states = torch.cat((cache_key, key_states), dim=2) - value_states = torch.cat((cache_value, value_states), dim=2) - # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - # if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - # raise ValueError( - # f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - # f" {attn_weights.size()}" - # ) - - # if attention_mask is not None: - # if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - # raise ValueError( - # f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - # ) - # attn_weights = attn_weights + attention_mask - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - - # if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - # raise ValueError( - # f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - # f" {attn_output.size()}" - # ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - # if not output_attentions: - # attn_weights = None - - # return attn_output, attn_weights, past_key_value - return attn_output, attn_weights, out_cache - - -''' -class Qwen2FlashAttention2(Qwen2Attention): - """ - Qwen2 flash attention module, following Qwen2 attention module. This module inherits from `Qwen2Attention` - as the weights of the module stays untouched. The only required change would be on the forward pass - where it needs to correctly call the public API of flash attention and deal with padding tokens - in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom - config.max_window_layers layers. - """ - - # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. 
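A note for reviewers (outside the patch itself): the removed `Qwen2Attention.forward` above shows how the pre-patch code consumed the 6-D cache: `kv_seq_len` is extended by `past_key_value[self.layer_idx].shape[3]`, the layer's cached key/value pair is looked up by index, and the fresh states are concatenated along `dim=2` (the position axis). The shape bookkeeping below is a sketch with illustrative values only, intended as a baseline to compare the rewrite against; the flattened 4-D layout introduced by this patch has to recover the same per-layer slices from a single merged axis, and the corresponding "+" side of this file is not shown in this excerpt.

```python
# Sketch only: pre-patch (6-D) cache update removed above.
# Illustrative values: batch=1, 2 KV heads, head_dim=128, kv_cache_length=1024, q_len=1.
import torch

past_key_values = torch.zeros(28, 2, 1, 2, 1024, 128)  # (layer, k/v, batch, kv_head, kv_len, head_dim)
layer_idx = 0
key_states = torch.zeros(1, 2, 1, 128)                  # (batch, kv_head, q_len, head_dim)

kv_seq_len = key_states.shape[-2] + past_key_values[layer_idx].shape[3]  # 1 + 1024 = 1025
cache_key = past_key_values[layer_idx][0]                # (1, 2, 1024, 128)
key_states = torch.cat((cache_key, key_states), dim=2)   # (1, 2, 1025, 128)

# 1025 matches the "seq_length+kv_len" dynamic axis declared for attention_mask
# in export_onnx.py, which is why the mask covers past plus current positions.
print(kv_seq_len, key_states.shape)
```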
- # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). - self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ): - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - - # overwrite attention_mask with padding_mask - attention_mask = kwargs.pop("padding_mask") - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - - # Because the input can be padded, the absolute sequence length depends on the max position id. - rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 - cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - use_sliding_windows = ( - _flash_supports_window_size - and getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and self.config.use_sliding_window - ) - - if not _flash_supports_window_size: - logger.warning_once( - "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" - " make sure to upgrade flash-attn library." 
- ) - - if past_key_value is not None: - # Activate slicing cache only if the config has a value `sliding_windows` attribute - cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 - if ( - getattr(self.config, "sliding_window", None) is not None - and kv_seq_len > self.config.sliding_window - and cache_has_contents - ): - slicing_tokens = 1 - self.config.sliding_window - - past_key = past_key_value[self.layer_idx][0] - past_value = past_key_value[self.layer_idx][1] - - past_key = past_key[:, :, slicing_tokens:, :].contiguous() - past_value = past_value[:, :, slicing_tokens:, :].contiguous() - - if past_key.shape[-2] != self.config.sliding_window - 1: - raise ValueError( - f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" - f" {past_key.shape}" - ) - - if attention_mask is not None: - attention_mask = attention_mask[:, slicing_tokens:] - attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) - - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - dropout_rate = 0.0 if not self.training else self.attention_dropout - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in float16 just to be sure everything works as expected. - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." - ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - # Reashape to the expected shape for Flash Attention - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - attn_output = self._flash_attention_forward( - query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate, - use_sliding_windows=use_sliding_windows, - ) - - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - def _flash_attention_forward( - self, - query_states, - key_states, - value_states, - attention_mask, - query_length, - dropout=0.0, - softmax_scale=None, - use_sliding_windows=False, - ): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. 
- - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`int`, *optional*): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) - use_sliding_windows (`bool`, *optional*): - Whether to activate sliding window attention. - """ - if not self._flash_attn_uses_top_left_mask: - causal = self.is_causal - else: - # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. - causal = self.is_causal and query_length != 1 - - # Decide whether to use SWA or not by layer index. - if use_sliding_windows and self.layer_idx >= self.config.max_window_layers: - use_sliding_windows = False - - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, query_length - ) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - if not use_sliding_windows: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) - else: - if not use_sliding_windows: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - ) - else: - attn_output = flash_attn_func( - query_states, - key_states, - value_states, - dropout, - softmax_scale=softmax_scale, - causal=causal, - window_size=(self.config.sliding_window, self.config.sliding_window), - ) - - return attn_output - - # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): - batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape - - # On the first iteration we need to properly re-create the padding mask - # by slicing it on the proper place - if kv_seq_len != attention_mask.shape[-1]: - attention_mask_num_tokens = attention_mask.shape[-1] - attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] - - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) - - key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, 
head_dim), indices_k) - value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) - - if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k - ) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. - attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) -''' - - -# Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Qwen2 -class Qwen2SdpaAttention(Qwen2Attention): - """ - Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from - `Qwen2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to - SDPA API. - """ - - # Adapted from Qwen2Attention.forward - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - # output_attentions: bool = False, - # use_cache: bool = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - # if output_attentions: - # # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. - # # logger.warning_once( - # # "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - # # 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
- # # ) - # return super().forward( - # hidden_states=hidden_states, - # attention_mask=attention_mask, - # position_ids=position_ids, - # past_key_value=past_key_value, - # # output_attentions=output_attentions, - # # use_cache=use_cache, - # ) - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - kv_seq_len += past_key_value[self.layer_idx].shape[3] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - output_cache = (key_states, value_states) - if past_key_value is not None: - # cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - cache_key = past_key_value[self.layer_idx][0] - cache_value = past_key_value[self.layer_idx][1] - key_states = torch.cat((cache_key, key_states), dim=2) - value_states = torch.cat((cache_value, value_states), dim=2) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - # if attention_mask is not None: - # if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - # raise ValueError( - # f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - # ) - - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, - # Reference: https://github.com/pytorch/pytorch/issues/112577. - if query_states.device.type == "cuda" and attention_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - # copy from chatglm3-6b - # attention_mask = ~attention_mask - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=attention_mask, - # dropout_p=self.attention_dropout if self.training else 0.0, - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. 
- # is_causal=self.is_causal and attention_mask is None and q_len > 1, - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - return attn_output, None, output_cache - - -QWEN2_ATTENTION_CLASSES = { - "eager": Qwen2Attention, - # "flash_attention_2": Qwen2FlashAttention2, - "sdpa": Qwen2SdpaAttention, -} - - -class Qwen2DecoderLayer(nn.Module): - def __init__(self, config: Qwen2Config, layer_idx: int): - super().__init__() - self.hidden_size = config.hidden_size - - if config.use_sliding_window and config._attn_implementation != "flash_attention_2": - logger.warning_once( - f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; " - "unexpected results may be encountered." - ) - # self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) - self.self_attn = Qwen2SdpaAttention(config, layer_idx) - # self.self_attn = Qwen2Attention(config, layer_idx) - self.mlp = Qwen2MLP(config) - self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - # output_attentions: Optional[bool] = False, - # use_cache: Optional[bool] = False, - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. " - "Please make sure use `attention_mask` instead.`" - ) - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, sequence_length)` where padding elements are indicated by 0. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - # output_attentions=output_attentions, - # use_cache=use_cache, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - # if output_attentions: - # outputs += (self_attn_weights,) - - # if use_cache: - # outputs += (present_key_value,) - outputs += (present_key_value,) - - return outputs - - -QWEN2_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. 
Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`Qwen2Config`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - - -@add_start_docstrings( - "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", - QWEN2_START_DOCSTRING, -) -class Qwen2PreTrainedModel(PreTrainedModel): - config_class = Qwen2Config - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["Qwen2DecoderLayer"] - _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True - _supports_sdpa = True - _supports_cache_class = True - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - -QWEN2_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): - Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used to speed up sequential decoding. 
This typically consists in the `past_key_values` - returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. - - Two formats are allowed: - - a [`~cache_utils.Cache`] instance; - - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy - cache format. - - The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the - legacy cache format will be returned. - - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", - QWEN2_START_DOCSTRING, -) -class Qwen2Model(Qwen2PreTrainedModel): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`Qwen2DecoderLayer`] - - Args: - config: Qwen2Config - """ - - def __init__(self, config: Qwen2Config): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList( - [Qwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] - ) - self._attn_implementation = config._attn_implementation - self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - @staticmethod - def get_masks(input_ids, past_key_values, padding_mask=None): - batch_size, seq_length = input_ids.shape - full_attention_mask = torch.ones( - batch_size, - seq_length, - seq_length, - device=input_ids.device, - # dtype=torch.int64 - ) - full_attention_mask.tril_() - past_length = past_key_values.shape[4] - # if past_length is not None: - full_attention_mask = torch.cat( - ( - torch.ones( - batch_size, - seq_length, - past_length, - device=input_ids.device, - # dtype=torch.int64 - ), - full_attention_mask - ), - dim=-1 - ) - if padding_mask is not None: - full_attention_mask = full_attention_mask * padding_mask.unsqueeze( - 1) - # if not past_length and padding_mask is not None: - # full_attention_mask -= padding_mask.unsqueeze(-1) - 1 - full_attention_mask = (full_attention_mask < 0.5).bool() - full_attention_mask.unsqueeze_(1) - return full_attention_mask - - @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[torch.FloatTensor] = None, - # inputs_embeds: Optional[torch.FloatTensor] = None, - # use_cache: Optional[bool] = None, - # output_attentions: Optional[bool] = None, - # output_hidden_states: Optional[bool] = None, - # return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - # output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - # output_hidden_states = ( - # output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - # ) - # use_cache = use_cache if use_cache is not None else self.config.use_cache - - # return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - batch_size, seq_length = input_ids.shape - # if input_ids is not None and inputs_embeds is not None: - # raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - # elif input_ids is not None: - # batch_size, seq_length = input_ids.shape - # elif inputs_embeds is not None: - # batch_size, seq_length, _ = inputs_embeds.shape - # else: - # raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - # if self.gradient_checkpointing and self.training: - # if use_cache: - # logger.warning_once( - # "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- # ) - # use_cache = False - # if past_key_values is not None: - # past_key_values_length = past_key_values.shape[4] - # else: - # past_key_values_length = 0 - - # if use_cache: - # use_legacy_cache = not isinstance(past_key_values, Cache) - # if use_legacy_cache: - # past_key_values = DynamicCache.from_legacy_cache(past_key_values) - # past_key_values_length = past_key_values.get_usable_length(seq_length) - - # if position_ids is None: - # device = input_ids.device if input_ids is not None else inputs_embeds.device - # position_ids = torch.arange( - # past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - # ) - # position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - # else: - # position_ids = position_ids.view(-1, seq_length).long() - position_ids = position_ids.view(-1, seq_length).long() - - # if inputs_embeds is None: - # inputs_embeds = self.embed_tokens(input_ids) - inputs_embeds = self.embed_tokens(input_ids) - """ - if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: - is_padding_right = attention_mask[:, -1].sum().item() != batch_size - if is_padding_right: - raise ValueError( - "You are attempting to perform batched generation with padding_side='right'" - " this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to " - " call `tokenizer.padding_side = 'left'` before tokenizing the input. " - ) - - if self._attn_implementation == "flash_attention_2": - # 2d mask is passed through the layers - attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None - elif self._attn_implementation == "sdpa" and not output_attentions: - # output_attentions=True can not be supported when using SDPA, and we fall back on - # the manual implementation that requires a 4D causal mask in all cases. 
- attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - ) - else: - # 4d mask is passed through the layers - # [1, 1, 2, 1026], value=-65504 - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - sliding_window=self.config.sliding_window, - ) - """ - # copy from chatglm3-6b for onnx export - full_attention_mask = self.get_masks( - input_ids, - past_key_values, - attention_mask, - ) - # === if use Qwen2Attention === - # dtype = past_key_values.dtype - # device = input_ids.device - # attention_mask = torch.zeros_like(full_attention_mask, dtype=dtype).to(device) - # attention_mask.masked_fill_(full_attention_mask, torch.finfo(dtype).min) - - # == if use Qwen2SdpaAttention === - # copy from chatglm3-6b - attention_mask = ~full_attention_mask - - hidden_states = inputs_embeds - - - # decoder layers - # all_hidden_states = () if output_hidden_states else None - # all_self_attns = () if output_attentions else None - # next_decoder_cache = None - presents = [] - for decoder_layer in self.layers: - # if output_hidden_states: - # all_hidden_states += (hidden_states,) - - # if self.gradient_checkpointing and self.training: - # layer_outputs = self._gradient_checkpointing_func( - # decoder_layer.__call__, - # hidden_states, - # attention_mask, - # position_ids, - # past_key_values, - # output_attentions, - # use_cache, - # ) - # else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_values, - # output_attentions=output_attentions, - # use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - # if use_cache: - # next_decoder_cache = layer_outputs[2 if output_attentions else 1] - presents.extend(layer_outputs[1]) - - # if output_attentions: - # all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - # if output_hidden_states: - # all_hidden_states += (hidden_states,) - - # next_cache = None - # if use_cache: - # next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache - one_shape = [len(presents) // 2, 2] + list(presents[0].shape) - presents = torch.concat(presents).reshape(one_shape) - return ( - hidden_states, - presents, - # all_hidden_states, - # all_self_attns - ) - - # if not return_dict: - # return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - # return BaseModelOutputWithPast( - # last_hidden_state=hidden_states, - # past_key_values=next_cache, - # hidden_states=all_hidden_states, - # attentions=all_self_attns, - # ) - - -class Qwen2ForCausalLM(Qwen2PreTrainedModel): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config): - super().__init__(config) - self.model = Qwen2Model(config) - self.vocab_size = config.vocab_size - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def 
get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[torch.FloatTensor] = None, - # inputs_embeds: Optional[torch.FloatTensor] = None, - # labels: Optional[torch.LongTensor] = None, - # use_cache: Optional[bool] = None, - # output_attentions: Optional[bool] = None, - # output_hidden_states: Optional[bool] = None, - # return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, Qwen2ForCausalLM - - >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - # print("input_ids:", input_ids.shape) # [1, 1] - # print("position_ids:", position_ids.shape) # [1, 1] - # print("attention_mask:", attention_mask.shape) # [1, 21] - # if past_key_values is not None: - # print( - # "past_key_values.shape:", len(past_key_values), - # len(past_key_values[0]), past_key_values[0][0].shape - # ) - # # [24, 2, 1, 16, 20, 64] - # output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - # output_hidden_states = ( - # output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - # ) - # return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - # inputs_embeds=inputs_embeds, - # use_cache=use_cache, - # output_attentions=output_attentions, - # output_hidden_states=output_hidden_states, - # return_dict=return_dict, - ) - - hidden_states = outputs[0] - logits = self.lm_head(hidden_states) - logits = logits.float() - - # loss = None - # if labels is not None: - # # Shift so that tokens < n predict n - # shift_logits = logits[..., :-1, :].contiguous() - # shift_labels = labels[..., 1:].contiguous() - # # Flatten the tokens - # loss_fct = CrossEntropyLoss() - # shift_logits = shift_logits.view(-1, self.config.vocab_size) - # shift_labels = shift_labels.view(-1) - # # Enable model parallelism - # shift_labels = shift_labels.to(shift_logits.device) - # loss = loss_fct(shift_logits, shift_labels) - - # if not return_dict: - output = (logits,) + outputs[1:] - # return (loss,) + output if loss is not None else output - return output - - # return CausalLMOutputWithPast( - # loss=loss, - # logits=logits, - # past_key_values=outputs.past_key_values, - # hidden_states=outputs.hidden_states, - # attentions=outputs.attentions, - # ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, 
inputs_embeds=None, **kwargs - ): - # Omit tokens covered by past_key_values - if past_key_values is not None: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - max_cache_length = past_key_values.get_max_length() - else: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. - if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - -@add_start_docstrings( - """ - The Qwen2 Model transformer with a sequence classification head on top (linear layer). - - [`Qwen2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). 
- """, - QWEN2_START_DOCSTRING, -) -class Qwen2ForSequenceClassification(Qwen2PreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = Qwen2Model(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility - sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 - sequence_lengths = sequence_lengths % input_ids.shape[-1] - sequence_lengths = sequence_lengths.to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif 
self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) +# coding=utf-8 +# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Qwen2 model.""" +import inspect +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from transformers.models.qwen2.configuration_qwen2 import Qwen2Config + + +# if is_flash_attn_2_available(): +# from flash_attn import flash_attn_func, flash_attn_varlen_func +# from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa +# +# _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) + + +logger = logging.get_logger(__name__) + + +_CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B-beta" +_CONFIG_FOR_DOC = "Qwen2Config" + +QWEN2_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "Qwen/Qwen2-7B-beta", + # See all Qwen2 models at https://huggingface.co/models?filter=qwen2 +] + + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +# Copied from 
transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen2 +class Qwen2RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + Qwen2RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Qwen2 +class Qwen2RotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. + self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. 
Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2 +class Qwen2MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class Qwen2Attention(nn.Module): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". + """ + + def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will " + "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.attention_dropout = config.attention_dropout + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." 
+ ) + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + + self.rotary_emb = Qwen2RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[torch.Tensor] = None, + # output_attentions: bool = False, + # use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." 
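+                    # Note on the cache layout used below (an explanatory comment, not part of the
+                    # upstream HuggingFace code): `past_key_value` is a single tensor of shape
+                    # [batch, kv_len, num_layers * 2 * num_kv_heads, head_dim], matching
+                    # `past_key_value_shape` in config.py, so dim 1 gives the number of cached
+                    # positions and each layer's keys/values are sliced out by head index.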
+ ) + # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + kv_seq_len += past_key_value.shape[1] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + output_cache = ( + key_states.transpose(1, 2), + value_states.transpose(1, 2) + ) + if past_key_value is not None: + # cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + cache_key = past_key_value[ + :, + :, + self.layer_idx * 2 * self.num_key_value_heads: (self.layer_idx * 2 + 1) * self.num_key_value_heads + ].transpose(1, 2) + cache_value = past_key_value[ + :, + :, + (self.layer_idx * 2 + 1) * self.num_key_value_heads: (self.layer_idx * 2 + 2) * self.num_key_value_heads + ].transpose(1, 2) + key_states = torch.cat((cache_key, key_states), dim=2) + value_states = torch.cat((cache_value, value_states), dim=2) + # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + # if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + # raise ValueError( + # f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + # f" {attn_weights.size()}" + # ) + + # if attention_mask is not None: + # if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + # raise ValueError( + # f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + # ) + # attn_weights = attn_weights + attention_mask + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + # if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + # raise ValueError( + # f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + # f" {attn_output.size()}" + # ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + # if not output_attentions: + # attn_weights = None + + # return attn_output, attn_weights, past_key_value + return attn_output, attn_weights, output_cache + + +''' +class Qwen2FlashAttention2(Qwen2Attention): + """ + Qwen2 flash attention module, following Qwen2 attention module. This module inherits from `Qwen2Attention` + as the weights of the module stays untouched. The only required change would be on the forward pass + where it needs to correctly call the public API of flash attention and deal with padding tokens + in case the input contains any of them. Additionally, for sliding window attention, we apply SWA only to the bottom + config.max_window_layers layers. + """ + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. 
+ # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ): + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + # Because the input can be padded, the absolute sequence length depends on the max position id. + rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 + cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + use_sliding_windows = ( + _flash_supports_window_size + and getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and self.config.use_sliding_window + ) + + if not _flash_supports_window_size: + logger.warning_once( + "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" + " make sure to upgrade flash-attn library." 
+ ) + + if past_key_value is not None: + # Activate slicing cache only if the config has a value `sliding_windows` attribute + cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 + if ( + getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and cache_has_contents + ): + slicing_tokens = 1 - self.config.sliding_window + + past_key = past_key_value[self.layer_idx][0] + past_value = past_key_value[self.layer_idx][1] + + past_key = past_key[:, :, slicing_tokens:, :].contiguous() + past_value = past_value[:, :, slicing_tokens:, :].contiguous() + + if past_key.shape[-2] != self.config.sliding_window - 1: + raise ValueError( + f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" + f" {past_key.shape}" + ) + + if attention_mask is not None: + attention_mask = attention_mask[:, slicing_tokens:] + attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1) + + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + dropout_rate = 0.0 if not self.training else self.attention_dropout + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in float16 just to be sure everything works as expected. + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Reashape to the expected shape for Flash Attention + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + use_sliding_windows=use_sliding_windows, + ) + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + use_sliding_windows=False, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. 
+ + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`int`, *optional*): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + use_sliding_windows (`bool`, *optional*): + Whether to activate sliding window attention. + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. + causal = self.is_causal and query_length != 1 + + # Decide whether to use SWA or not by layer index. + if use_sliding_windows and self.layer_idx >= self.config.max_window_layers: + use_sliding_windows = False + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + if not use_sliding_windows: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + if not use_sliding_windows: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + return attn_output + + # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape + + # On the first iteration we need to properly re-create the padding mask + # by slicing it on the proper place + if kv_seq_len != attention_mask.shape[-1]: + attention_mask_num_tokens = attention_mask.shape[-1] + attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] + + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, 
head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) +''' + + +# Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Qwen2 +class Qwen2SdpaAttention(Qwen2Attention): + """ + Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `Qwen2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from Qwen2Attention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[torch.Tensor] = None, + # output_attentions: bool = False, + # use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + # if output_attentions: + # # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + # # logger.warning_once( + # # "Qwen2Model is using Qwen2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + # # 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
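+        # The commented-out `output_attentions` fallback is disabled for export: this
+        # forward always takes the SDPA path, reading previously cached keys/values from
+        # the flattened `past_key_value` tensor and returning only the current step's
+        # K/V in `output_cache`, which the model-level forward gathers into the cache output.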
+ # # ) + # return super().forward( + # hidden_states=hidden_states, + # attention_mask=attention_mask, + # position_ids=position_ids, + # past_key_value=past_key_value, + # # output_attentions=output_attentions, + # # use_cache=use_cache, + # ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + kv_seq_len += past_key_value.shape[1] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + output_cache = ( + key_states.transpose(1, 2), + value_states.transpose(1, 2) + ) + if past_key_value is not None: + # cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + cache_key = past_key_value[ + :, + :, + self.layer_idx * 2 * self.num_key_value_heads: (self.layer_idx * 2 + 1) * self.num_key_value_heads + ].transpose(1, 2) + cache_value = past_key_value[ + :, + :, + (self.layer_idx * 2 + 1) * self.num_key_value_heads: (self.layer_idx * 2 + 2) * self.num_key_value_heads + ].transpose(1, 2) + key_states = torch.cat((cache_key, key_states), dim=2) + value_states = torch.cat((cache_value, value_states), dim=2) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + # if attention_mask is not None: + # if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + # raise ValueError( + # f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + # ) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + # copy from chatglm3-6b + # attention_mask = ~attention_mask + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + # dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. 
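+            # `attention_mask` here is the boolean mask built by Qwen2Model.get_masks and then
+            # inverted, so True marks positions that may be attended to (the format SDPA expects
+            # for a bool attn_mask). `dropout_p` and `is_causal` keep their defaults: causality
+            # is already encoded in the mask, and export targets inference only.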
+ # is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + return attn_output, None, output_cache + + +QWEN2_ATTENTION_CLASSES = { + "eager": Qwen2Attention, + # "flash_attention_2": Qwen2FlashAttention2, + "sdpa": Qwen2SdpaAttention, +} + + +class Qwen2DecoderLayer(nn.Module): + def __init__(self, config: Qwen2Config, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + if config.use_sliding_window and config._attn_implementation != "flash_attention_2": + logger.warning_once( + f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; " + "unexpected results may be encountered." + ) + # self.self_attn = QWEN2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) + self.self_attn = Qwen2SdpaAttention(config, layer_idx) + # self.self_attn = Qwen2Attention(config, layer_idx) + self.mlp = Qwen2MLP(config) + self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + # output_attentions: Optional[bool] = False, + # use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. " + "Please make sure use `attention_mask` instead.`" + ) + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + # output_attentions=output_attentions, + # use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + # if output_attentions: + # outputs += (self_attn_weights,) + + # if use_cache: + # outputs += (present_key_value,) + outputs += (present_key_value,) + + return outputs + + +QWEN2_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. 
Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`Qwen2Config`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", + QWEN2_START_DOCSTRING, +) +class Qwen2PreTrainedModel(PreTrainedModel): + config_class = Qwen2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["Qwen2DecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +QWEN2_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. 
This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.", + QWEN2_START_DOCSTRING, +) +class Qwen2Model(Qwen2PreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`Qwen2DecoderLayer`] + + Args: + config: Qwen2Config + """ + + def __init__(self, config: Qwen2Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [Qwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self._attn_implementation = config._attn_implementation + self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @staticmethod + def get_masks(input_ids, past_key_values, padding_mask=None): + batch_size, seq_length = input_ids.shape + full_attention_mask = torch.ones( + batch_size, + seq_length, + seq_length, + device=input_ids.device, + # dtype=torch.int64 + ) + full_attention_mask.tril_() + past_length = past_key_values.shape[1] + # if past_length is not None: + full_attention_mask = torch.cat( + ( + torch.ones( + batch_size, + seq_length, + past_length, + device=input_ids.device, + # dtype=torch.int64 + ), + full_attention_mask + ), + dim=-1 + ) + if padding_mask is not None: + full_attention_mask = full_attention_mask * padding_mask.unsqueeze( + 1) + # if not past_length and padding_mask is not None: + # full_attention_mask -= padding_mask.unsqueeze(-1) - 1 + full_attention_mask = (full_attention_mask < 0.5).bool() + full_attention_mask.unsqueeze_(1) + return full_attention_mask + + @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[torch.FloatTensor] = None, + # inputs_embeds: Optional[torch.FloatTensor] = None, + # use_cache: Optional[bool] = None, + # output_attentions: Optional[bool] = None, + # output_hidden_states: Optional[bool] = None, + # return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + # output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + # output_hidden_states = ( + # output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + # ) + # use_cache = use_cache if use_cache is not None else self.config.use_cache + + # return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + batch_size, seq_length = input_ids.shape + # if input_ids is not None and inputs_embeds is not None: + # raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + # elif input_ids is not None: + # batch_size, seq_length = input_ids.shape + # elif inputs_embeds is not None: + # batch_size, seq_length, _ = inputs_embeds.shape + # else: + # raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + # if self.gradient_checkpointing and self.training: + # if use_cache: + # logger.warning_once( + # "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ # ) + # use_cache = False + # if past_key_values is not None: + # past_key_values_length = past_key_values.shape[4] + # else: + # past_key_values_length = 0 + + # if use_cache: + # use_legacy_cache = not isinstance(past_key_values, Cache) + # if use_legacy_cache: + # past_key_values = DynamicCache.from_legacy_cache(past_key_values) + # past_key_values_length = past_key_values.get_usable_length(seq_length) + + # if position_ids is None: + # device = input_ids.device if input_ids is not None else inputs_embeds.device + # position_ids = torch.arange( + # past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + # ) + # position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + # else: + # position_ids = position_ids.view(-1, seq_length).long() + position_ids = position_ids.view(-1, seq_length).long() + + # if inputs_embeds is None: + # inputs_embeds = self.embed_tokens(input_ids) + inputs_embeds = self.embed_tokens(input_ids) + """ + if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: + is_padding_right = attention_mask[:, -1].sum().item() != batch_size + if is_padding_right: + raise ValueError( + "You are attempting to perform batched generation with padding_side='right'" + " this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to " + " call `tokenizer.padding_side = 'left'` before tokenizing the input. " + ) + + if self._attn_implementation == "flash_attention_2": + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif self._attn_implementation == "sdpa" and not output_attentions: + # output_attentions=True can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. 
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + # [1, 1, 2, 1026], value=-65504 + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + """ + # copy from chatglm3-6b for onnx export + full_attention_mask = self.get_masks( + input_ids, + past_key_values, + attention_mask, + ) + # === if use Qwen2Attention === + # dtype = past_key_values.dtype + # device = input_ids.device + # attention_mask = torch.zeros_like(full_attention_mask, dtype=dtype).to(device) + # attention_mask.masked_fill_(full_attention_mask, torch.finfo(dtype).min) + + # == if use Qwen2SdpaAttention === + # copy from chatglm3-6b + attention_mask = ~full_attention_mask + + hidden_states = inputs_embeds + + + # decoder layers + # all_hidden_states = () if output_hidden_states else None + # all_self_attns = () if output_attentions else None + # next_decoder_cache = None + presents = [] + for decoder_layer in self.layers: + # if output_hidden_states: + # all_hidden_states += (hidden_states,) + + # if self.gradient_checkpointing and self.training: + # layer_outputs = self._gradient_checkpointing_func( + # decoder_layer.__call__, + # hidden_states, + # attention_mask, + # position_ids, + # past_key_values, + # output_attentions, + # use_cache, + # ) + # else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + # output_attentions=output_attentions, + # use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + # if use_cache: + # next_decoder_cache = layer_outputs[2 if output_attentions else 1] + presents.extend(layer_outputs[1]) + + # if output_attentions: + # all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + # if output_hidden_states: + # all_hidden_states += (hidden_states,) + + # next_cache = None + # if use_cache: + # next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + one_shape = list(presents[0].shape) + one_shape[2] = one_shape[2] * len(presents) + presents = torch.concat(presents, dim=2) + return ( + hidden_states, + presents, + # all_hidden_states, + # all_self_attns + ) + + # if not return_dict: + # return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + # return BaseModelOutputWithPast( + # last_hidden_state=hidden_states, + # past_key_values=next_cache, + # hidden_states=all_hidden_states, + # attentions=all_self_attns, + # ) + + +class Qwen2ForCausalLM(Qwen2PreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = Qwen2Model(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def 
get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[torch.FloatTensor] = None, + # inputs_embeds: Optional[torch.FloatTensor] = None, + # labels: Optional[torch.LongTensor] = None, + # use_cache: Optional[bool] = None, + # output_attentions: Optional[bool] = None, + # output_hidden_states: Optional[bool] = None, + # return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, Qwen2ForCausalLM + + >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + # print("input_ids:", input_ids.shape) # [1, 1] + # print("position_ids:", position_ids.shape) # [1, 1] + # print("attention_mask:", attention_mask.shape) # [1, 21] + # if past_key_values is not None: + # print( + # "past_key_values.shape:", len(past_key_values), + # len(past_key_values[0]), past_key_values[0][0].shape + # ) + # # [24, 2, 1, 16, 20, 64] + # output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + # output_hidden_states = ( + # output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + # ) + # return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + # inputs_embeds=inputs_embeds, + # use_cache=use_cache, + # output_attentions=output_attentions, + # output_hidden_states=output_hidden_states, + # return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + # loss = None + # if labels is not None: + # # Shift so that tokens < n predict n + # shift_logits = logits[..., :-1, :].contiguous() + # shift_labels = labels[..., 1:].contiguous() + # # Flatten the tokens + # loss_fct = CrossEntropyLoss() + # shift_logits = shift_logits.view(-1, self.config.vocab_size) + # shift_labels = shift_labels.view(-1) + # # Enable model parallelism + # shift_labels = shift_labels.to(shift_logits.device) + # loss = loss_fct(shift_logits, shift_labels) + + # if not return_dict: + output = (logits,) + outputs[1:] + # return (loss,) + output if loss is not None else output + return output + + # return CausalLMOutputWithPast( + # loss=loss, + # logits=logits, + # past_key_values=outputs.past_key_values, + # hidden_states=outputs.hidden_states, + # attentions=outputs.attentions, + # ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, 
inputs_embeds=None, **kwargs + ): + # Omit tokens covered by past_key_values + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +@add_start_docstrings( + """ + The Qwen2 Model transformer with a sequence classification head on top (linear layer). + + [`Qwen2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). 
+ """, + QWEN2_START_DOCSTRING, +) +class Qwen2ForSequenceClassification(Qwen2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = Qwen2Model(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif 
self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/export/onnx2om.py b/export/onnx2om.py index 2fe17a6..701d0ce 100644 --- a/export/onnx2om.py +++ b/export/onnx2om.py @@ -1,182 +1,180 @@ -import os -import ctypes -import subprocess -import argparse -import math -from transformers.models.qwen2 import Qwen2Config - -now_dir = os.path.dirname(os.path.abspath(__file__)) -project_dir = os.path.dirname(now_dir) -output_dir = os.path.join(project_dir, "output") -if not os.path.exists(output_dir): - os.mkdir(output_dir) -onnx_model_dir = os.path.join(output_dir, "onnx2") -if not os.path.exists(onnx_model_dir): - os.mkdir(onnx_model_dir) -model_dir = os.path.join(output_dir, "model") -if not os.path.exists(model_dir): - os.mkdir(model_dir) - -parser = argparse.ArgumentParser() -parser.add_argument( - '--soc_version', - type=str, - default="auto", - help="NPU full name, like Ascend310B1、Ascend310B4、Ascend310P1、Ascend910A、Ascend910B..., default is `auto`, will auto detect soc version.", -) -parser.add_argument( - '--hf_model_dir', - type=str, - help="model and tokenizer path, only support huggingface model", - default=os.path.join(project_dir, "download", "Qwen2-1.5B-Instruct") -) -parser.add_argument( - "--onnx_model_path", - help="output onnx path", - type=str, - default=os.path.join(onnx_model_dir, "qwen2_1.5b_chat.onnx") -) -parser.add_argument( - "--om_model_path", - help=".om model path", - type=str, - default= os.path.join(model_dir, "qwen2_1.5b_chat") -) -parser.add_argument( - "--max_batch", - help="max batch", - type=int, - default=1, -) -parser.add_argument( - "--max_prefill_length", - help="max prefill length in first inference. " - "Attention max_prefill_length + max_output_length <= kv_cache_length. " - "the number must by 2^xx, like 1, 2, 4, 8, 16, 32, 64, 128, 256... " - "Note! 
The higher this number, the longer it will take to compile.", - type=int, - default=8, -) -parser.add_argument( - "--kv_cache_length", - help="kv-cache length", - type=int, - default=1024, -) - - -args = parser.parse_args() - - -def get_soc_version(): - """ - _summary_ - 获取芯片信息,返回具体的芯片型号 - Returns: - _type_: _description_ - """ - max_len = 512 - rtsdll = ctypes.CDLL(f"libruntime.so") - c_char_t = ctypes.create_string_buffer(b"\xff" * max_len, max_len) - rtsdll.rtGetSocVersion.restype = ctypes.c_uint64 - rt_error = rtsdll.rtGetSocVersion(c_char_t, ctypes.c_uint32(max_len)) - if rt_error: - print("rt_error:", rt_error) - return "" - soc_full_name = c_char_t.value.decode("utf-8") - find_str = "Short_SoC_version=" - ascend_home_dir = os.environ.get("ASCEND_HOME_PATH") - assert ascend_home_dir is not None, \ - print("ASCEND_HOME_PATH is None, you need run `source /usr/local/Ascend/ascend-toolkit/set_env.sh`") - with open(f"{ascend_home_dir}/compiler/data/platform_config/{soc_full_name}.ini", "r") as f: - for line in f: - if find_str in line: - start_index = line.find(find_str) - soc_short_name = line[start_index + len(find_str):].strip() - return { - "soc_full_name": soc_full_name, - "soc_short_name": soc_short_name - } - raise Exception("can't get you soc version") - -max_batch = args.max_batch -model_config = Qwen2Config.from_pretrained(args.hf_model_dir) -num_hidden_layers = model_config.num_hidden_layers -num_key_value_heads = model_config.num_key_value_heads -hidden_size = model_config.hidden_size -num_attention_heads = model_config.num_attention_heads -per_head_dim = hidden_size // num_attention_heads -kv_cache_length = args.kv_cache_length -max_prefill_log2 = int(math.log2(args.max_prefill_length)) -max_prefill_length = 2 ** max_prefill_log2 -prefill_length_range = list(range(0, max_prefill_log2 + 1)) -prefill_length_range = [2 ** idx for idx in prefill_length_range] -assert (max_prefill_length < kv_cache_length), \ - print("max_input_length max be smaller than kv_cache_length, because max_input_length + max_output_length <= kv_cache") -input_ids_length_range = prefill_length_range -attention_length_range = [ - length + kv_cache_length - for length in prefill_length_range -] -position_length_range = prefill_length_range -input_ids_shape = [ - f"1~{max_batch}" if max_batch > 1 else "1", - "-1" if max_prefill_length > 1 else "1", -] -attention_mask_shape = [ - f"1~{max_batch}" if max_batch > 1 else "1", - "-1" if max_prefill_length > 1 else str(1 + kv_cache_length) -] -position_ids_shape = [ - f"1~{max_batch}" if max_batch > 1 else "1", - "-1" if max_prefill_length > 1 else "1" -] -dynamic_dims = [] -for dynamic_dim in zip( - input_ids_length_range, attention_length_range, position_length_range -): - dynamic_dim = [str(dim) for dim in dynamic_dim] - dynamic_dims.append(",".join(dynamic_dim)) -past_key_values_shape = [ - num_hidden_layers, - 2, - f"1~{max_batch}" if max_batch > 1 else "1", - num_key_value_heads, - kv_cache_length, - per_head_dim -] -past_key_values_shape = [str(x) for x in past_key_values_shape] -if args.soc_version == "auto": - print("[INFO] soc_version is `auto`, will auto detect soc version") - soc_dict = get_soc_version() - print("[INFO] {}".format(soc_dict)) - soc_version = soc_dict["soc_full_name"] -else: - soc_version = args.soc_version -command_lines = [ - "atc", - "--framework=5", - '--model="{}"'.format(args.onnx_model_path), - '--output="{}"'.format(args.om_model_path), - "--soc_version={}".format(soc_version), - "--precision_mode=must_keep_origin_dtype", - 
"--input_format=ND", - '--input_shape="input_ids:{};attention_mask:{};position_ids:{};past_key_values:{}"'.format( - ",".join(input_ids_shape), - ",".join(attention_mask_shape), - ",".join(position_ids_shape), - ",".join(past_key_values_shape) - ), -] -if max_prefill_length > 1: - command_lines.append( - "--dynamic_dims \"{}\"".format(";".join(dynamic_dims)) - ) -print("============ run command ==============") -print(" ".join(command_lines)) -print("=======================================") -subprocess.run( - " ".join(command_lines), - shell=True, - check=True, +import os +import ctypes +import subprocess +import argparse +import math +from transformers.models.qwen2 import Qwen2Config + +now_dir = os.path.dirname(os.path.abspath(__file__)) +project_dir = os.path.dirname(now_dir) +output_dir = os.path.join(project_dir, "output") +if not os.path.exists(output_dir): + os.mkdir(output_dir) +onnx_model_dir = os.path.join(output_dir, "onnx2") +if not os.path.exists(onnx_model_dir): + os.mkdir(onnx_model_dir) +model_dir = os.path.join(output_dir, "model") +if not os.path.exists(model_dir): + os.mkdir(model_dir) + +parser = argparse.ArgumentParser() +parser.add_argument( + '--soc_version', + type=str, + default="auto", + help="NPU full name, like Ascend310B1、Ascend310B4、Ascend310P1、Ascend910A、Ascend910B..., default is `auto`, will auto detect soc version.", +) +parser.add_argument( + '--hf_model_dir', + type=str, + help="model and tokenizer path, only support huggingface model", + default=os.path.join(project_dir, "download", "Qwen2-1.5B-Instruct") +) +parser.add_argument( + "--onnx_model_path", + help="output onnx path", + type=str, + default=os.path.join(onnx_model_dir, "qwen2_1.5b_chat.onnx") +) +parser.add_argument( + "--om_model_path", + help=".om model path", + type=str, + default= os.path.join(model_dir, "qwen2_1.5b_chat") +) +parser.add_argument( + "--max_batch", + help="max batch", + type=int, + default=1, +) +parser.add_argument( + "--max_prefill_length", + help="max prefill length in first inference. " + "Attention max_prefill_length + max_output_length <= kv_cache_length. " + "the number must by 2^xx, like 1, 2, 4, 8, 16, 32, 64, 128, 256... " + "Note! 
The higher this number, the longer it will take to compile.", + type=int, + default=8, +) +parser.add_argument( + "--kv_cache_length", + help="kv-cache length", + type=int, + default=1024, +) + + +args = parser.parse_args() + + +def get_soc_version(): + """ + _summary_ + 获取芯片信息,返回具体的芯片型号 + Returns: + _type_: _description_ + """ + max_len = 512 + rtsdll = ctypes.CDLL(f"libruntime.so") + c_char_t = ctypes.create_string_buffer(b"\xff" * max_len, max_len) + rtsdll.rtGetSocVersion.restype = ctypes.c_uint64 + rt_error = rtsdll.rtGetSocVersion(c_char_t, ctypes.c_uint32(max_len)) + if rt_error: + print("rt_error:", rt_error) + return "" + soc_full_name = c_char_t.value.decode("utf-8") + find_str = "Short_SoC_version=" + ascend_home_dir = os.environ.get("ASCEND_HOME_PATH") + assert ascend_home_dir is not None, \ + print("ASCEND_HOME_PATH is None, you need run `source /usr/local/Ascend/ascend-toolkit/set_env.sh`") + with open(f"{ascend_home_dir}/compiler/data/platform_config/{soc_full_name}.ini", "r") as f: + for line in f: + if find_str in line: + start_index = line.find(find_str) + soc_short_name = line[start_index + len(find_str):].strip() + return { + "soc_full_name": soc_full_name, + "soc_short_name": soc_short_name + } + raise Exception("can't get you soc version") + +max_batch = args.max_batch +model_config = Qwen2Config.from_pretrained(args.hf_model_dir) +num_hidden_layers = model_config.num_hidden_layers +num_key_value_heads = model_config.num_key_value_heads +hidden_size = model_config.hidden_size +num_attention_heads = model_config.num_attention_heads +per_head_dim = hidden_size // num_attention_heads +kv_cache_length = args.kv_cache_length +max_prefill_log2 = int(math.log2(args.max_prefill_length)) +max_prefill_length = 2 ** max_prefill_log2 +prefill_length_range = list(range(0, max_prefill_log2 + 1)) +prefill_length_range = [2 ** idx for idx in prefill_length_range] +assert (max_prefill_length < kv_cache_length), \ + print("max_input_length max be smaller than kv_cache_length, because max_input_length + max_output_length <= kv_cache") +input_ids_length_range = prefill_length_range +attention_length_range = [ + length + kv_cache_length + for length in prefill_length_range +] +position_length_range = prefill_length_range +input_ids_shape = [ + f"1~{max_batch}" if max_batch > 1 else "1", + "-1" if max_prefill_length > 1 else "1", +] +attention_mask_shape = [ + f"1~{max_batch}" if max_batch > 1 else "1", + "-1" if max_prefill_length > 1 else str(1 + kv_cache_length) +] +position_ids_shape = [ + f"1~{max_batch}" if max_batch > 1 else "1", + "-1" if max_prefill_length > 1 else "1" +] +dynamic_dims = [] +for dynamic_dim in zip( + input_ids_length_range, attention_length_range, position_length_range +): + dynamic_dim = [str(dim) for dim in dynamic_dim] + dynamic_dims.append(",".join(dynamic_dim)) +past_key_values_shape = [ + f"1~{max_batch}" if max_batch > 1 else "1", + kv_cache_length, + num_hidden_layers * 2 * num_key_value_heads, + per_head_dim +] +past_key_values_shape = [str(x) for x in past_key_values_shape] +if args.soc_version == "auto": + print("[INFO] soc_version is `auto`, will auto detect soc version") + soc_dict = get_soc_version() + print("[INFO] {}".format(soc_dict)) + soc_version = soc_dict["soc_full_name"] +else: + soc_version = args.soc_version +command_lines = [ + "atc", + "--framework=5", + '--model="{}"'.format(args.onnx_model_path), + '--output="{}"'.format(args.om_model_path), + "--soc_version={}".format(soc_version), + "--precision_mode=must_keep_origin_dtype", + 
"--input_format=ND", + '--input_shape="input_ids:{};attention_mask:{};position_ids:{};past_key_values:{}"'.format( + ",".join(input_ids_shape), + ",".join(attention_mask_shape), + ",".join(position_ids_shape), + ",".join(past_key_values_shape) + ), +] +if max_prefill_length > 1: + command_lines.append( + "--dynamic_dims \"{}\"".format(";".join(dynamic_dims)) + ) +print("============ run command ==============") +print(" ".join(command_lines)) +print("=======================================") +subprocess.run( + " ".join(command_lines), + shell=True, + check=True, ) \ No newline at end of file diff --git a/export/test_onnx_run.py b/export/test_onnx_run.py index 35cf339..065d080 100644 --- a/export/test_onnx_run.py +++ b/export/test_onnx_run.py @@ -41,11 +41,9 @@ def create_kv_cache(config: Qwen2Config, kv_cache_length=1024): return np.zeros( [ - config.num_hidden_layers, - 2, 1, - config.num_key_value_heads, kv_cache_length, + config.num_hidden_layers * 2 * config.num_key_value_heads, config.hidden_size // config.num_attention_heads ], dtype=np_dtype @@ -68,15 +66,13 @@ def get_inputs(kv_cache, seq_len: int, real_kv_size=0, input_pos=0, past_kv_size """ self.kv_cache shape ( - self.num_hidden_layers, - 2, 1, - self.num_key_value_heads, self.kv_cache_length, + self.num_hidden_layers * 2 * self.num_key_value_heads, self.per_head_dim ) """ - cache = kv_cache[:, :, :, :, :past_kv_size] + cache = kv_cache[:, :past_kv_size] mask = np.ones((1, past_kv_size + seq_len), dtype=np.int64) mask[:, real_kv_size: past_kv_size] = 0 pos_id = np.arange( @@ -108,6 +104,10 @@ def get_inputs(kv_cache, seq_len: int, real_kv_size=0, input_pos=0, past_kv_size print("input_ids", input_ids) options = onnxruntime.SessionOptions() +options.intra_op_num_threads = 4 +options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL +options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL + llm_session = onnxruntime.InferenceSession( args.onnx_model_path, sess_options=options, @@ -138,5 +138,3 @@ def get_inputs(kv_cache, seq_len: int, real_kv_size=0, input_pos=0, past_kv_size print("new_kv_cache: shape", new_kv_cache.shape) print("new_kv_cache: mean: ", new_kv_cache.astype(np.float32).mean().item()) print("new_kv_cache: max: ", new_kv_cache.astype(np.float32).max().item()) - - diff --git a/export/test_pytorch_run.py b/export/test_pytorch_run.py index 669a479..943e4d8 100644 --- a/export/test_pytorch_run.py +++ b/export/test_pytorch_run.py @@ -45,11 +45,9 @@ def create_kv_cache(config: Qwen2Config, kv_cache_length=1024): return torch.zeros( [ - config.num_hidden_layers, - 2, 1, - config.num_key_value_heads, kv_cache_length, + config.num_hidden_layers * 2 * config.num_key_value_heads, config.hidden_size // config.num_attention_heads ], dtype=torch_dtype @@ -72,15 +70,13 @@ def get_inputs(kv_cache, seq_len: int, real_kv_size=0, input_pos=0, past_kv_size """ self.kv_cache shape ( - self.num_hidden_layers, - 2, 1, - self.num_key_value_heads, self.kv_cache_length, + self.num_hidden_layers * 2 * self.num_key_value_heads, self.per_head_dim ) """ - cache = kv_cache[:, :, :, :, :past_kv_size] + cache = kv_cache[:, :past_kv_size] mask = torch.ones((1, past_kv_size + seq_len), dtype=torch.long).to(device_str) mask[:, real_kv_size: past_kv_size] = 0 pos_id = torch.arange( @@ -115,12 +111,12 @@ def get_inputs(kv_cache, seq_len: int, real_kv_size=0, input_pos=0, past_kv_size )["input_ids"].to(device_str) print("input_ids", input_ids) kv_cache1 = create_kv_cache(model_config) -now_kv_cache, attn_mask, 
position_ids = get_inputs(kv_cache1, 2, ) +now_kv_cache, attn_mask, position_ids = get_inputs(kv_cache1, 1) print("now_kv_cache shape: ", now_kv_cache.shape) print("attention_mask shape: ", attn_mask.shape) print("position_ids shape: ", position_ids.shape) outputs = model.forward( - input_ids[:, :2], + input_ids[:, :1], attn_mask, position_ids, now_kv_cache, @@ -129,13 +125,13 @@ def get_inputs(kv_cache, seq_len: int, real_kv_size=0, input_pos=0, past_kv_size ) print("==== pytorch runtime ====") print("output length: ", len(outputs)) -logits = outputs[0][:, :-1, :] # 1: -0.10800 +logits = outputs[0] # 1: -0.10800 # logits = outputs[0][:, -1:, :] # 2: -0.008756 print("logits shape: ", logits.shape) print("logits mean: ", logits.float().mean().item()) print("logits max: ", logits.float().max().item()) -new_kv_cache = outputs[1][:, :, :, :, :-1, :] # 1: 0.0009: +new_kv_cache = outputs[1] # 1: 0.0009: # new_kv_cache = outputs[1][:, :, :, :, -1:, :] # 2: 0.003526 print("new_kv_cache: shape:", new_kv_cache.shape) @@ -143,4 +139,3 @@ def get_inputs(kv_cache, seq_len: int, real_kv_size=0, input_pos=0, past_kv_size print("new_kv_cache: mean: ", new_kv_cache.float().mean().item()) # print("new_kv_cache: max: ", new_kv_cache.astype(np.float32).max().item()) print("new_kv_cache: max: ", new_kv_cache.float().max().item()) - diff --git a/utils/engine.py b/utils/engine.py index d3795d3..b4f44af 100644 --- a/utils/engine.py +++ b/utils/engine.py @@ -13,6 +13,7 @@ ACL_MEM_MALLOC_HUGE_FIRST = 0 ACL_MEMCPY_HOST_TO_DEVICE = 1 ACL_MEMCPY_DEVICE_TO_HOST = 2 +ACL_MEMCPY_DEVICE_TO_DEVICE = 3 ACL_MEM_MALLOC_NORMAL_ONLY = 2 NPY_FLOAT32 = 11 @@ -78,7 +79,11 @@ def __init__(self, config: InferenceConfig, context=None,callback=None): self.exit_flag = False self.max_batch = config.max_batch self.kv_cache_length = config.kv_cache_length - self.kv_cache = np.zeros(config.past_key_value_shape, dtype=np.float16) + # kv_cache的长度和max_output_length的长度一样 + self.past_kv_size=self.kv_cache_length + self.input_pos = 0 + self.real_kv_size = 0 + # self.kv_cache = np.zeros(config.past_key_value_shape, dtype=np.float16) self.input_dataset, self.output_dataset = None, None self.inputs:List[Dict[str,]] = [] self.outputs:List[Dict[str,]] = [] @@ -93,6 +98,75 @@ def __init__(self, config: InferenceConfig, context=None,callback=None): check_ret("acl.util.start_thread", ret) ret = acl.rt.subscribe_report(self.tid, self.stream) check_ret("acl.rt.subscribe_report", ret) + + def get_inputs(self, seq_len: int) -> List[np.ndarray]: + """ + 获取指定长度的kv_cache, 顺便生成mask和position_id + Args: + seq_len (int): 待获取的kv-cache长度 + + Returns: + List[np.ndarray]: _description_ + """ + + """ + self.kv_cache shape ( + 1, + self.kv_cache_length, + self.num_hidden_layers * 2 * self.num_key_value_heads, + self.per_head_dim + ) + """ + mask = np.ones((1,self.past_kv_size + seq_len), dtype=np.int64) + mask[:, self.real_kv_size: self.past_kv_size] = 0 + pos_id =np.arange( + self.input_pos, + self.input_pos + seq_len, + dtype=np.int64 + ).reshape(1,-1) + return mask, pos_id + + def reset(self): + # 重置kv-cache + self.input_pos=0 + self.real_kv_size=0 + ret = acl.rt.memset( + self.inputs[3]["buffer"], # 内存的起始地址。 + self.inputs[3]["size"], # 内存的最大长度,单位Byte。 + 0, + self.inputs[3]["size"] # 需要设置为指定值的内存长度,单位Byte。 + ) + check_ret("reset device kv-cache", ret) + + def update_kv_cache(self, seq_len): + self.input_pos = self.real_kv_size + seq_len + if seq_len + self.real_kv_size > self.kv_cache_length: + seq_len = self.kv_cache_length - self.real_kv_size + if seq_len <= 0: + 
return + # 用device memory完成下面的操作 + # self.kv_cache[:, self.real_kv_size: self.real_kv_size + seq_len] = new_kv_cache[:, 0: seq_len] + # kv-cache shape + """ + new_kv_cache_shape = [ + self.max_batch, + seq_length, + self.config.num_hidden_layers * 2 * self.config.num_key_value_heads, + self.config.per_head_dim + ] + """ + base_size = self.config.num_hidden_layers * 2 * self.config.num_key_value_heads * self.config.per_head_dim + # print("base_size: ", base_size) + # 默认是void指针,想要往前切片,需要将数据个数 * 2(代表float16)偏移 + ret = acl.rt.memcpy( + self.inputs[3]["buffer"] + (base_size * self.real_kv_size * self.max_batch) * 2, # 目的内存地址指针地址。 + base_size * (self.kv_cache_length - self.real_kv_size) * 2, # 目的内存地址的最大内存长度,单位Byte。 + self.outputs[1]["buffer"], + base_size * seq_len * 2, + ACL_MEMCPY_DEVICE_TO_DEVICE + ) + check_ret("update device cache", ret) + self.real_kv_size += seq_len def unload(self): if self.callback_func: @@ -138,18 +212,10 @@ def allocate_memory(self): self.input_dataset = acl.mdl.create_dataset() input_size = acl.mdl.get_num_inputs(self.model_desc) self.inputs = [] + # 给输入分配Device内存 for i in range(input_size): buffer_size = acl.mdl.get_input_size_by_index(self.model_desc, i) - # if i == 3: - # buffer, ret = acl.rt.malloc(buffer_size, ACL_MEM_MALLOC_HUGE_FIRST) - # self.kv_cache = acl.util.ptr_to_numpy( - # buffer, self.config.past_key_value_shape, 23 # 23:NPY_HALF,NPY_FLOAT16 - # ) - # data = acl.create_data_buffer(buffer, buffer_size) - # _, ret = acl.mdl.add_dataset_buffer(self.input_dataset, data) - # check_ret("add_dataset_buffer",ret) - # self.inputs.append({"buffer": buffer, "size": buffer_size}) - # else: + # print(f"input[{i}], buffer size = {buffer_size}") buffer, ret = acl.rt.malloc(buffer_size, ACL_MEM_MALLOC_HUGE_FIRST) check_ret("alloc input memory",ret) data = acl.create_data_buffer(buffer, buffer_size) @@ -160,16 +226,22 @@ def allocate_memory(self): self.output_dataset = acl.mdl.create_dataset() output_size = acl.mdl.get_num_outputs(self.model_desc) self.outputs = [] + # 给输出分配device和host内存 for i in range(output_size): buffer_size = acl.mdl.get_output_size_by_index(self.model_desc, i) + # print(f"output[{i}], buffer size = {buffer_size}") data_type = acl.mdl.get_output_data_type(self.model_desc, i) buffer, ret = acl.rt.malloc(buffer_size, ACL_MEM_MALLOC_HUGE_FIRST) check_ret("alloc output memory",ret) data = acl.create_data_buffer(buffer, buffer_size) _, ret = acl.mdl.add_dataset_buffer(self.output_dataset, data) check_ret("add_dataset_buffer",ret) - buffer_host, ret = acl.rt.malloc_host(buffer_size) - check_ret("alloc output host memory",ret) + if i == 0: + buffer_host, ret = acl.rt.malloc_host(buffer_size) + check_ret("alloc output host memory",ret) + # 对于new_kv_cache,不需要分配host内存,后面直接在device内存进行更新,节省内存 + else: + buffer_host = None self.outputs.append( { "buffer": buffer, @@ -183,20 +255,26 @@ def free_memory(self): """ 释放内存 """ - for item in self.input_data: + for i, item in enumerate(self.input_data): ret = acl.rt.free(item["buffer"]) + check_ret(f"free input[{i}] device memory",ret) ret = acl.mdl.destroy_dataset(self.input_dataset) - for item in self.output_data: + for i, item in enumerate(self.output_data): ret = acl.rt.free(item["buffer"]) - ret = acl.rt.free_host(item["buffer_host"]) + check_ret("free output device memory",ret) + # 分配结果只分配了logitst的CPU内存,所以释放的时候也只释放logists的 + if i == 0: + ret = acl.rt.free_host(item["buffer_host"]) ret = acl.mdl.destroy_dataset(self.output_dataset) - def inference(self, input_data_list: List[np.ndarray], seq_length=1, 
is_dynamic=False) -> List[np.ndarray]: + def inference(self, input_data_list: List[np.ndarray], seq_length=1, is_dynamic=False, is_prefill=False) -> List[np.ndarray]: """ 执行推理,同步方式 Args: input_data_list (_type_): _description_ seq_length: 推理长度 + is_dynamic: 是否动态推理 + is_prefill: 是否是prefill阶段 Returns: List[np.ndarray]: _description_ @@ -204,9 +282,7 @@ def inference(self, input_data_list: List[np.ndarray], seq_length=1, is_dynamic= start = time.time() acl.rt.set_context(self.context) for i in range(len(input_data_list)): - # if i == 3: - # continue - # else: + # 内存拷贝,忽略kv_cache,待会直接在device侧更新 input_data = input_data_list[i] input_size = input_data.size input_itemsize = input_data.itemsize @@ -270,16 +346,6 @@ def inference(self, input_data_list: List[np.ndarray], seq_length=1, is_dynamic= new_kv_cache_itemsize = new_kv_cache_size * output_itemsize2 output_sizes = [logits_size, new_kv_cache_size] output_itemsizes = [logits_itemsize, new_kv_cache_itemsize] - logits_shape = [self.max_batch, seq_length, self.config.vocab_size] - new_kv_cache_shape = [ - self.config.num_hidden_layers, - 2, - self.max_batch, - self.config.num_key_value_heads, - seq_length, - self.config.per_head_dim - ] - output_shapes = [logits_shape, new_kv_cache_shape] ret = acl.mdl.execute( self.model_id, @@ -287,31 +353,42 @@ def inference(self, input_data_list: List[np.ndarray], seq_length=1, is_dynamic= self.output_dataset ) check_ret("model_execute", ret) - inference_result = [] - for output_idx, out in enumerate(self.outputs): + """ + 获取输出结果, 从GPU拷贝输出数据到CPU + # 输出结果1:logits + # 输出结果2:new_kv_cache + prefill结果可以跳过logits的拷贝 + """ + # == update device kv cache == + self.update_kv_cache(seq_len=seq_length) + # 非prefill阶段才拷贝logits作为输出 + if not is_prefill: + # === update logits === if is_dynamic: - output_itemsize = output_itemsizes[output_idx] - output_size = output_sizes[output_idx] + output_itemsize = output_itemsizes[0] + output_size = output_sizes[0] else: - output_itemsize = out["size"] - output_size = output_itemsize // np.dtype(out["dtype"]).itemsize + output_itemsize = self.outputs[0]["size"] + output_size = output_itemsize // np.dtype(self.outputs[0]["dtype"]).itemsize + logits_shape = [self.max_batch, seq_length, self.config.vocab_size] ret = acl.rt.memcpy( - out['buffer_host'], - out["size"], - out["buffer"], + self.outputs[0]['buffer_host'], + self.outputs[0]["size"], + self.outputs[0]["buffer"], output_itemsize, ACL_MEMCPY_DEVICE_TO_HOST ) check_ret("memcpy output", ret) - bytes_out = acl.util.ptr_to_bytes(out['buffer_host'], out["size"]) - out_data = np.frombuffer( + bytes_out = acl.util.ptr_to_bytes(self.outputs[0]['buffer_host'], self.outputs[0]["size"]) + logits = np.frombuffer( bytes_out, - dtype=out['dtype'], + dtype=self.outputs[0]['dtype'], count=output_size, - ).reshape(output_shapes[output_idx]) - inference_result.append(out_data) - return inference_result + ).reshape(logits_shape) + return logits + else: + return None def inference_async(self, data, other_args) -> List[np.ndarray]: """ diff --git a/utils/inference.py b/utils/inference.py index 89b30a0..f654e2f 100644 --- a/utils/inference.py +++ b/utils/inference.py @@ -166,14 +166,13 @@ def stream_predict( if show_progress: prefill_show_progress = True # reset counter - self.session.run_times = 0 - self.session.kv_cache.real_kv_size = 0 + self.session.reset() else: prefill_show_progress = False logits = self.session.run( input_ids, - show_progress=prefill_show_progress - )[0] + show_progress=prefill_show_progress, + ) input_ids = self.sample_logits( 
logits[0][-1:], self.sampling_method, diff --git a/utils/kvcache.py b/utils/kvcache.py index ad77f42..ec78590 100644 --- a/utils/kvcache.py +++ b/utils/kvcache.py @@ -65,8 +65,8 @@ def get_inputs(self, seq_len: int) -> List[np.ndarray]: self.per_head_dim ) """ - cache = self.kv_cache[:, :, :, :, :self.past_kv_size] - mask = np.ones((1,self.past_kv_size + seq_len),dtype=np.int64) + cache = self.kv_cache[:, :self.past_kv_size] + mask = np.ones((1,self.past_kv_size + seq_len), dtype=np.int64) mask[:, self.real_kv_size: self.past_kv_size] = 0 pos_id =np.arange( self.input_pos, @@ -146,11 +146,9 @@ def update( ) -> None: """ self.kv_cache shape ( - self.num_hidden_layers, - 2, 1, - self.num_key_value_heads, self.kv_cache_length, + self.num_hidden_layers * 2 * self.num_key_value_heads, self.per_head_dim ) """ @@ -161,10 +159,10 @@ def update( return if self.cache_format=="huggingface-tensor": temp_shape = list(self.past_key_value_shape) - temp_shape[-2] = -1 + temp_shape[1] = -1 new_kv_cache = new_kv_cache.reshape(temp_shape) - self.kv_cache[:, :, :, :, self.real_kv_size: self.real_kv_size + seq_len] = \ - new_kv_cache[:, :, :, :, 0: seq_len] + self.kv_cache[:, self.real_kv_size: self.real_kv_size + seq_len] = \ + new_kv_cache[:, 0: seq_len] self.real_kv_size += seq_len class FixSizeStreamLLM(KVCacheManger): @@ -180,7 +178,7 @@ def update( score:Optional[np.ndarray] = None ): self.input_pos+=seq_len - while self.past_len+ seq_len > self.kv_cache_length: + while self.past_len+ seq_len > self.kv_cache_length: self.update_part(new_kv_cache, self.past_len, self.kv_cache_length - self.past_len) seq_len -= (self.kv_cache_length-self.past_len) self.past_len= self.head_len diff --git a/utils/session.py b/utils/session.py index 2b4f81f..c8a9839 100644 --- a/utils/session.py +++ b/utils/session.py @@ -12,7 +12,6 @@ class Session: def __init__(self, config: InferenceConfig) -> None: - self.kv_cache = create_kv_cache(config) self.run_times = 0 def run(self,input_ids:np.ndarray, show_progress: bool = False): @@ -39,8 +38,12 @@ def rollback(self,seq_len): class OnnxSession(Session): def __init__(self,config:InferenceConfig)->None: super().__init__(config) + self.kv_cache = create_kv_cache(config) import onnxruntime options = onnxruntime.SessionOptions() + options.intra_op_num_threads = config.cpu_thread + options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL + options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL self.llm_session = onnxruntime.InferenceSession( config.onnx_model_path, sess_options=options, @@ -118,12 +121,15 @@ def __init__(self, config:InferenceConfig): self.model = ACLModel(config, self.context) self.max_batch = config.max_batch self.input_ids = np.zeros((1,16),dtype=np.int64) - self.kv_cache.kv_cache = self.model.kv_cache + # self.kv_cache = create_kv_cache(config) + # self.kv_cache.kv_cache = self.model.kv_cache self.max_prefill_length = config.max_prefill_length self.prefill_log2_number = int(math.log2(self.max_prefill_length)) self.prefill_log2_list = list(range(self.prefill_log2_number, -1, -1)) self.prefill_log2_list = [2**index for index in self.prefill_log2_list] + def reset(self): + self.model.reset(); def __del__(self): destroy_resource(self.device_id, self.context) @@ -145,9 +151,10 @@ def decompose_number(self, n, start_index=0): return [power] + self.decompose_number(n - power, i) return [] - def run(self, input_ids: np.ndarray, show_progress:bool=False): + def run(self, input_ids: np.ndarray, show_progress: bool = False): seq_len = 
input_ids.shape[-1] logits = None + is_prefill = True is_dynamic = bool(self.max_prefill_length > 1) # dynamic inference if is_dynamic: @@ -155,12 +162,15 @@ def run(self, input_ids: np.ndarray, show_progress:bool=False): if show_progress: seq_list = tqdm(seq_list, desc="prefill") start_i = 0 - for seq in seq_list: + for (ii, seq) in enumerate(seq_list): end_i = start_i + seq + if (ii == len(seq_list) - 1): + is_prefill = False logits = self.run_some( input_ids[:, start_i: end_i], seq, is_dynamic, + is_prefill=is_prefill ) start_i += seq # if show_progress: @@ -172,46 +182,24 @@ def run(self, input_ids: np.ndarray, show_progress:bool=False): else: idx_list = range(seq_len) for i in idx_list: - logits = self.run_some(input_ids[:,i]) - return [logits] + if (i == len(idx_list) - 1): + is_prefill = False + logits = self.run_some(input_ids[:,i], is_prefill=is_prefill) + return logits def run_some( self, input_ids: np.ndarray, seq_length: int = 1, - is_dynamic: bool = False + is_dynamic: bool = False, + is_prefill: bool = False, ): - # print( - # "self.run_times: ", self.run_times, - # "real kv size: ", self.kv_cache.real_kv_size - # ) self.run_times += seq_length - cache, mask, pos_ids = self.kv_cache.get_inputs(seq_length) - result:List[np.ndarray] = self.model.inference( - [input_ids, mask, pos_ids, cache], seq_length, is_dynamic - ) - # if self.run_times <= 20: - # print(" === Debug === ") - # print("run times: ", self.run_times) - # logits = result[0] - # new_kv_cache = result[1] - # print("logits shape: ", logits.shape) - # print("logits mean: ", logits.astype(np.float32).mean().item()) - # print("logits max: ", logits.astype(np.float32).max().item()) - # print("new_kv_cache: shape", new_kv_cache.shape) - # print("new_kv_cache: mean: ", new_kv_cache.astype(np.float32).mean().item()) - # print("new_kv_cache: max: ", new_kv_cache.astype(np.float32).max().item()) - self.kv_cache.update(seq_length, result[1]) - return result[0].reshape(self.max_batch, seq_length,-1) - - def run_all_logits(self, input_ids: np.ndarray): - seq_len, i = input_ids.shape[-1], 0 - logits = [] - while i < seq_len: - end = i + 16 if i+16 < seq_len else seq_len - cache,mask,pos_ids = self.kv_cache.get_inputs(16) - self.input_ids[0:end-i] = input_ids[i:end] - result:List[np.ndarray] = self.model.inference([self.input_ids, mask, pos_ids, cache]) - self.kv_cache.update(end-i,result[1]) - logits.append(result[0][0:end-i].reshape(1,-1)) - return [np.concatenate(logits).reshape(1,1,-1)] \ No newline at end of file + mask, pos_ids = self.model.get_inputs(seq_length) + logits = self.model.inference( + [input_ids, mask, pos_ids], seq_length, is_dynamic, is_prefill=is_prefill + ) + if not is_prefill: + return logits.reshape(self.max_batch, seq_length,-1) + else: + return None
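
Note on the cache layout used throughout this patch: the per-layer HF-style cache of shape `(num_hidden_layers, 2, batch, num_key_value_heads, kv_len, per_head_dim)` is flattened into a single tensor of shape `(batch, kv_cache_length, num_hidden_layers * 2 * num_key_value_heads, per_head_dim)`, which is what `config.past_key_value_shape`, the ATC `--input_shape` argument, and the updated test scripts all describe. The snippet below is a minimal NumPy sketch of the host-side bookkeeping around that layout (attention mask, position ids, rolling write offset), using made-up sizes; it mirrors the logic in `utils/kvcache.py` and the test scripts but is not the engine code itself.

```python
# Minimal sketch of the flattened KV-cache bookkeeping; sizes are illustrative,
# not taken from a real Qwen2 config.
import numpy as np

num_hidden_layers = 2       # illustrative
num_key_value_heads = 2     # illustrative
per_head_dim = 4            # illustrative
kv_cache_length = 16        # analogous to --kv_cache_length
flat_heads = num_hidden_layers * 2 * num_key_value_heads

# persistent cache: (batch, kv_cache_length, layers * 2 * kv_heads, head_dim)
kv_cache = np.zeros((1, kv_cache_length, flat_heads, per_head_dim), dtype=np.float16)
real_kv_size = 0            # number of cache slots that hold valid data
input_pos = 0               # absolute position of the next token


def get_inputs(seq_len: int):
    """Build attention_mask / position_ids the way the runtime scripts do."""
    # the mask spans the whole cache window plus the new tokens;
    # slots that are not filled yet are masked out with 0
    mask = np.ones((1, kv_cache_length + seq_len), dtype=np.int64)
    mask[:, real_kv_size:kv_cache_length] = 0
    pos_ids = np.arange(input_pos, input_pos + seq_len, dtype=np.int64).reshape(1, -1)
    return mask, pos_ids


def update(new_kv_cache: np.ndarray, seq_len: int):
    """Write this step's keys/values into the persistent cache in place."""
    global real_kv_size, input_pos
    kv_cache[:, real_kv_size:real_kv_size + seq_len] = new_kv_cache[:, :seq_len]
    real_kv_size += seq_len
    input_pos += seq_len


# one decode step of length 1: the model returns new_kv_cache with shape
# (batch, seq_len, layers * 2 * kv_heads, head_dim)
mask, pos_ids = get_inputs(seq_len=1)
step_cache = np.random.rand(1, 1, flat_heads, per_head_dim).astype(np.float16)
update(step_cache, seq_len=1)
print(mask.shape, pos_ids.tolist(), real_kv_size)  # (1, 17) [[0]] 1
```

The ACL engine performs the same slice assignment directly in device memory: `update_kv_cache` computes the destination offset in bytes from `real_kv_size` (element count scaled by 2 for the float16 item size) and issues an `ACL_MEMCPY_DEVICE_TO_DEVICE` copy from the model's `new_kv_cache` output buffer, so the cache never round-trips through host memory, and during prefill only the last chunk copies logits back to the host.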