diff --git a/DeepSeek-R1-Distill-Qwen-1.5B b/DeepSeek-R1-Distill-Qwen-1.5B
new file mode 160000
index 00000000..ad9f0ae0
--- /dev/null
+++ b/DeepSeek-R1-Distill-Qwen-1.5B
@@ -0,0 +1 @@
+Subproject commit ad9f0ae0864d7fbcd1cd905e3c6c5b069cc8b562
diff --git a/include/llaisys/models/qwen2.h b/include/llaisys/models/qwen2.h
index 7054626d..c4dd10d9 100644
--- a/include/llaisys/models/qwen2.h
+++ b/include/llaisys/models/qwen2.h
@@ -38,5 +38,7 @@ __C {
     __export struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model * model);
 
     __export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken);
+
+    __export void llaisysQwen2ModelResetCache(struct LlaisysQwen2Model * model);
 }
 #endif // LLAISYS_MODELS_QWEN2_H
diff --git a/python/llaisys/libllaisys/__init__.py b/python/llaisys/libllaisys/__init__.py
index f536fb52..29d57c4c 100644
--- a/python/llaisys/libllaisys/__init__.py
+++ b/python/llaisys/libllaisys/__init__.py
@@ -12,6 +12,8 @@
 from .tensor import llaisysTensor_t
 from .tensor import load_tensor
 from .ops import load_ops
+from .models import load_models
+from .models import LlaisysQwen2Meta, LlaisysQwen2Weights, llaisysQwen2Model_t
 
 
 def load_shared_library():
@@ -38,6 +40,7 @@ def load_shared_library():
 load_runtime(LIB_LLAISYS)
 load_tensor(LIB_LLAISYS)
 load_ops(LIB_LLAISYS)
+load_models(LIB_LLAISYS)
 
 
 __all__ = [
@@ -52,4 +55,7 @@ def load_shared_library():
     "llaisysMemcpyKind_t",
     "MemcpyKind",
     "llaisysStream_t",
+    "LlaisysQwen2Meta",
+    "LlaisysQwen2Weights",
+    "llaisysQwen2Model_t",
 ]
diff --git a/python/llaisys/libllaisys/models.py b/python/llaisys/libllaisys/models.py
new file mode 100644
index 00000000..fe625dc8
--- /dev/null
+++ b/python/llaisys/libllaisys/models.py
@@ -0,0 +1,74 @@
+from ctypes import POINTER, c_void_p, c_size_t, c_int64, c_int, c_float, Structure
+from .llaisys_types import llaisysDataType_t, llaisysDeviceType_t
+from .tensor import llaisysTensor_t
+
+
+class LlaisysQwen2Meta(Structure):  # ctypes view of the hyper-parameters handed to llaisysQwen2ModelCreate
+    _fields_ = [
+        ("dtype", llaisysDataType_t),  # element type used for weights and activations
+        ("nlayer", c_size_t),  # number of transformer layers
+        ("hs", c_size_t),  # hidden size
+        ("nh", c_size_t),  # number of attention (query) heads
+        ("nkvh", c_size_t),  # number of key/value heads
+        ("dh", c_size_t),  # dimension of one attention head
+        ("di", c_size_t),  # MLP intermediate size
+        ("maxseq", c_size_t),  # maximum sequence length (number of positions)
+        ("voc", c_size_t),  # vocabulary size
+        ("epsilon", c_float),  # RMS-norm epsilon
+        ("theta", c_float),  # RoPE theta
+        ("end_token", c_int64),  # token id that ends generation
+    ]
+
+
+class LlaisysQwen2Weights(Structure):  # ctypes view of the weight-handle table returned by llaisysQwen2ModelWeights
+    _fields_ = [
+        ("in_embed", llaisysTensor_t),  # input token embedding
+        ("out_embed", llaisysTensor_t),  # output projection (loaded from lm_head.weight)
+        ("out_norm_w", llaisysTensor_t),  # final norm weight
+        ("attn_norm_w", POINTER(llaisysTensor_t)),  # this and every field below: array indexed by layer
+        ("attn_q_w", POINTER(llaisysTensor_t)),  # Q projection weight
+        ("attn_q_b", POINTER(llaisysTensor_t)),  # Q projection bias
+        ("attn_k_w", POINTER(llaisysTensor_t)),  # K projection weight
+        ("attn_k_b", POINTER(llaisysTensor_t)),  # K projection bias
+        ("attn_v_w", POINTER(llaisysTensor_t)),  # V projection weight
+        ("attn_v_b", POINTER(llaisysTensor_t)),  # V projection bias
+        ("attn_o_w", POINTER(llaisysTensor_t)),  # attention output projection weight
+        ("mlp_norm_w", POINTER(llaisysTensor_t)),  # post-attention norm weight
+        ("mlp_gate_w", POINTER(llaisysTensor_t)),  # MLP gate projection weight
+        ("mlp_up_w", POINTER(llaisysTensor_t)),  # MLP up projection weight
+        ("mlp_down_w", POINTER(llaisysTensor_t)),  # MLP down projection weight
+    ]
+
+
+llaisysQwen2Model_t = c_void_p
+
+
+def load_models(lib):
+    """Declare ctypes argument and return types for the Qwen2 model functions of ``lib``."""
+    lib.llaisysQwen2ModelCreate.argtypes = [
+        POINTER(LlaisysQwen2Meta),  # meta
+        llaisysDeviceType_t,  # device type
+        POINTER(c_int),  # device id array
+        c_int,  # number of device ids
+    ]
+    lib.llaisysQwen2ModelCreate.restype = llaisysQwen2Model_t
+
+    # llaisysQwen2ModelDestroy: takes a model handle, returns nothing
+    lib.llaisysQwen2ModelDestroy.argtypes = [llaisysQwen2Model_t]
+    lib.llaisysQwen2ModelDestroy.restype = None
+
+    # llaisysQwen2ModelWeights: returns a pointer to the model's weight-handle table
+    lib.llaisysQwen2ModelWeights.argtypes = [llaisysQwen2Model_t]
+    lib.llaisysQwen2ModelWeights.restype = POINTER(LlaisysQwen2Weights)
+
+    # llaisysQwen2ModelInfer: (model, token id array, token count) -> next token id
+    lib.llaisysQwen2ModelInfer.argtypes = [
+        llaisysQwen2Model_t,
+        POINTER(c_int64),
+        c_size_t,
+    ]
+    lib.llaisysQwen2ModelInfer.restype = c_int64
+
+    # llaisysQwen2ModelResetCache: takes a model handle, returns nothing
+    lib.llaisysQwen2ModelResetCache.argtypes = [llaisysQwen2Model_t]
+    lib.llaisysQwen2ModelResetCache.restype = None
diff --git a/python/llaisys/models/qwen2.py b/python/llaisys/models/qwen2.py
index 0d07b0b2..e6890bb8 100644
--- a/python/llaisys/models/qwen2.py
+++ b/python/llaisys/models/qwen2.py
@@ -1,23 +1,244 @@
 from typing import Sequence
 from ..libllaisys import LIB_LLAISYS
-from ..libllaisys import DeviceType
+from ..libllaisys import DeviceType, DataType
+from ..libllaisys import LlaisysQwen2Meta, llaisysQwen2Model_t
+from ..tensor import Tensor
 
 from pathlib import Path
-import safetensors
+import json
+import numpy as np
+from ctypes import c_int64, c_size_t, c_int, c_float, pointer, byref, POINTER, cast
+import struct
 
 
 class Qwen2:
-
     def __init__(self, model_path, device: DeviceType = DeviceType.CPU):
-        # TODO: Implement model constructor
-
         model_path = Path(model_path)
 
-        for file in sorted(model_path.glob("*.safetensors")):
-            data_ = safetensors.safe_open(file, framework="numpy", device="cpu")
-            for name_ in data_.keys():
-                ## TODO: load the model weights
-                pass
+        # Read the model configuration file
+        config_path = model_path / "config.json"
+        with open(config_path, "r") as f:
+            config = json.load(f)
+
+        # Extract the model hyper-parameters (optional keys fall back to defaults)
+        self.hidden_size = config["hidden_size"]
+        self.num_hidden_layers = config["num_hidden_layers"]
+        self.num_attention_heads = config["num_attention_heads"]
+        self.num_key_value_heads = config.get("num_key_value_heads", self.num_attention_heads)
+        self.intermediate_size = config["intermediate_size"]
+        self.vocab_size = config["vocab_size"]
+        self.rms_norm_eps = config.get("rms_norm_eps", 1e-6)
+        self.rope_theta = config.get("rope_theta", 10000.0)
+        self.max_position_embeddings = config.get("max_position_embeddings", 131072)
+
+        # Per-head dimension
+        self.head_dim = self.hidden_size // self.num_attention_heads
+
+        # Pick the data type; anything other than bfloat16/float16 falls back to F32
+        torch_dtype = config.get("torch_dtype", "bfloat16")
+        if torch_dtype == "bfloat16":
+            self.dtype = DataType.BF16
+        elif torch_dtype == "float16":
+            self.dtype = DataType.F16
+        else:
+            self.dtype = DataType.F32
+
+        # Build the model metadata passed to the native library
+        meta = LlaisysQwen2Meta()
+        meta.dtype = self.dtype
+        meta.nlayer = self.num_hidden_layers
+        meta.hs = self.hidden_size
+        meta.nh = self.num_attention_heads
+        meta.nkvh = self.num_key_value_heads
+        meta.dh = self.head_dim
+        meta.di = self.intermediate_size
+        meta.maxseq = self.max_position_embeddings
+        meta.voc = self.vocab_size
+        meta.epsilon = self.rms_norm_eps
+        meta.theta = self.rope_theta
+        meta.end_token = config.get("eos_token_id", 151643)
+
+        # Device id array: a single entry, device 0
+        device_ids = (c_int * 1)(0)
+
+        # Create the native model
+        self.model = LIB_LLAISYS.llaisysQwen2ModelCreate(
+            byref(meta),
+            device,
+            device_ids,
+            1
+        )
+
+        if not self.model:
+            raise RuntimeError("Failed to create Qwen2 model")
+
+        # Fetch the weight-handle structure owned by the native model
+        weights_ptr = LIB_LLAISYS.llaisysQwen2ModelWeights(self.model)
+        self.weights = weights_ptr.contents
+
+        # Load the weights from the safetensors files
+        self._load_weights(model_path)
+
+        # Remember the device
+        self.device = device
+
+        # Remember the end token; generate() stops when it is produced
+        self.end_token = meta.end_token
+
+    def _load_weights(self, model_path: Path):
+        """Load all ``*.safetensors`` files found directly in ``model_path``, in sorted order."""
+        safetensor_files = sorted(model_path.glob("*.safetensors"))
+        # Fail loudly: with no file to read, the weight tensors would never be
+        # filled and inference would run on whatever they happen to contain.
+        if not safetensor_files:
+            raise FileNotFoundError(f"No .safetensors files found in {model_path}")
+        for file in safetensor_files:
+            self._load_weights_from_file(file)
+
+    def _load_weights_from_file(self, file_path: Path):
+        """Load every model weight that is present in one safetensors file."""
+        # Read the file header
+        with open(file_path, 'rb') as f:
+            # Length prefix: 8 bytes, little-endian unsigned 64-bit integer
+            length_bytes = f.read(8)
+            header_len = struct.unpack('<Q', length_bytes)[0]
+
+            # JSON header describing each tensor (dtype, shape, data_offsets)
+            header_bytes = f.read(header_len)
+            header = json.loads(header_bytes.decode('utf-8'))
+
+            # Tensor data starts right after the header
+            data_start = 8 + header_len
+
+            # Input/output embeddings and the final norm weight
+            self._load_tensor_from_file(f, header, data_start, "model.embed_tokens.weight", self.weights.in_embed)
+            self._load_tensor_from_file(f, header, data_start, "lm_head.weight", self.weights.out_embed)
+            self._load_tensor_from_file(f, header, data_start, "model.norm.weight", self.weights.out_norm_w)
+
+            # Per-layer weights; names missing from this file are skipped by the loader
+            for layer_idx in range(self.num_hidden_layers):
+                prefix = f"model.layers.{layer_idx}."
+
+                # Attention norm
+                self._load_tensor_from_file(
+                    f, header, data_start,
+                    f"{prefix}input_layernorm.weight",
+                    self.weights.attn_norm_w[layer_idx]
+                )
+
+                # Q projection
+                self._load_tensor_from_file(
+                    f, header, data_start,
+                    f"{prefix}self_attn.q_proj.weight",
+                    self.weights.attn_q_w[layer_idx]
+                )
+                self._load_tensor_from_file(
+                    f, header, data_start,
+                    f"{prefix}self_attn.q_proj.bias",
+                    self.weights.attn_q_b[layer_idx]
+                )
+
+                # K projection
+                self._load_tensor_from_file(
+                    f, header, data_start,
+                    f"{prefix}self_attn.k_proj.weight",
+                    self.weights.attn_k_w[layer_idx]
+                )
+                self._load_tensor_from_file(
+                    f, header, data_start,
+                    f"{prefix}self_attn.k_proj.bias",
+                    self.weights.attn_k_b[layer_idx]
+                )
+
+                # V projection
+                self._load_tensor_from_file(
+                    f, header, data_start,
+                    f"{prefix}self_attn.v_proj.weight",
+                    self.weights.attn_v_w[layer_idx]
+                )
+                self._load_tensor_from_file(
+                    f, header, data_start,
+                    f"{prefix}self_attn.v_proj.bias",
+                    self.weights.attn_v_b[layer_idx]
+                )
+
+                # O projection
+                self._load_tensor_from_file(
+                    f, header, data_start,
+                    f"{prefix}self_attn.o_proj.weight",
+                    self.weights.attn_o_w[layer_idx]
+                )
+
+                # MLP norm
+                self._load_tensor_from_file(
+                    f, header, data_start,
+                    f"{prefix}post_attention_layernorm.weight",
+                    self.weights.mlp_norm_w[layer_idx]
+                )
+
+                # Gate projection
+                self._load_tensor_from_file(
+                    f, header, data_start,
+                    f"{prefix}mlp.gate_proj.weight",
+                    self.weights.mlp_gate_w[layer_idx]
+                )
+
+                # Up projection
+                self._load_tensor_from_file(
+                    f, header, data_start,
+                    f"{prefix}mlp.up_proj.weight",
+                    self.weights.mlp_up_w[layer_idx]
+                )
+
+                # Down projection
+                self._load_tensor_from_file(
+                    f, header, data_start,
+                    f"{prefix}mlp.down_proj.weight",
+                    self.weights.mlp_down_w[layer_idx]
+                )
+
+    def _load_tensor_from_file(self, f, header: dict, data_start: int, name: str, tensor_handle):
+        """Copy tensor ``name`` from an open safetensors file into ``tensor_handle``.
+
+        Args:
+            f: seekable binary file object for the safetensors file.
+            header: parsed JSON header mapping tensor names to dtype/shape/data_offsets.
+            data_start: absolute file offset of the first tensor data byte.
+            name: tensor to load; a name missing from ``header`` is skipped silently
+                because every file is probed for every weight name.
+            tensor_handle: destination llaisys tensor handle.
+
+        Raises:
+            ValueError: the dtype is unsupported, or the bytes read do not match the
+                size implied by the header's dtype and shape (e.g. truncated file).
+        """
+        if name not in header:
+            return
+
+        info = header[name]
+        dtype = info['dtype']
+        shape = info['shape']
+        begin, end = info['data_offsets']
+
+        # BF16 has no numpy equivalent, so its 2-byte elements travel as uint16.
+        numpy_dtypes = {
+            'BF16': np.uint16, 'F16': np.float16, 'F32': np.float32,
+            'F64': np.float64, 'I64': np.int64, 'I32': np.int32,
+        }
+        if dtype not in numpy_dtypes:
+            raise ValueError(f"Unsupported dtype: {dtype}")
+        element = np.dtype(numpy_dtypes[dtype])
+
+        f.seek(data_start + begin)
+        raw_data = f.read(end - begin)
+        expected = int(np.prod(shape, dtype=np.int64)) * element.itemsize
+        if len(raw_data) != expected:
+            raise ValueError(f"Tensor {name}: got {len(raw_data)} bytes, expected {expected}")
+
+        # frombuffer always yields a C-contiguous array, so no extra copy is needed.
+        # NOTE(review): dtype and shape are not checked against the destination tensor.
+        numpy_array = np.frombuffer(raw_data, dtype=element).reshape(shape)
+        LIB_LLAISYS.tensorLoad(tensor_handle, numpy_array.ctypes.data)
 
     def generate(
         self,
@@ -27,7 +248,51 @@ def generate(
         top_p: float = 0.8,
         temperature: float = 0.8,
     ):
+        """生成文本，使用KV-Cache进行增量推理"""
+        if max_new_tokens is None:
+            max_new_tokens = 128
+
+        # 重置KV Cache
+        LIB_LLAISYS.llaisysQwen2ModelResetCache(self.model)
+
+        # 准备输入token
+        tokens = list(inputs)
+
+        # 第一步：处理完整的prompt（所有输入token）
+        token_array = (c_int64 * len(tokens))(*tokens)
+        next_token = LIB_LLAISYS.llaisysQwen2ModelInfer(
+            self.model,
+            token_array,
+            len(tokens)
+        )
+        tokens.append(next_token)
+
+        # 检查是否生成结束token
+        if next_token == self.end_token:
+            return tokens
+
+        # 后续步骤：每次只传入一个新token，利用KV-Cache
+        for _ in range(max_new_tokens - 1):
+            # 只传入最后一个token
+            token_array = (c_int64 * 1)(tokens[-1])
+
+            # 调用模型推理（使用KV-Cache）
+            next_token = LIB_LLAISYS.llaisysQwen2ModelInfer(
+                self.model,
+                token_array,
+                1
+            )
+
+            tokens.append(next_token)
+
+            # 检查是否生成结束token
+            if next_token == self.end_token:
+                break
 
-        # TODO: Implement generate function
+        return tokens
 
-        return []
+    def __del__(self):
+        """Destroy the native model handle if one was created, then clear it."""
+        if getattr(self, 'model', None):
+            LIB_LLAISYS.llaisysQwen2ModelDestroy(self.model)
+            self.model = None
diff --git a/src/llaisys/models/qwen2.cpp b/src/llaisys/models/qwen2.cpp
new file mode 100644
index 00000000..118cacb0
--- /dev/null
+++ b/src/llaisys/models/qwen2.cpp
@@ -0,0 +1,392 @@
+#include "llaisys/models/qwen2.h"
+#include "../llaisys_tensor.hpp"
+#include "../../ops/linear/op.hpp"
+#include "../../ops/embedding/op.hpp"
+#include "../../ops/rms_norm/op.hpp"
+#include "../../ops/rope/op.hpp"
+#include "../../ops/self_attention/op.hpp"
+#include "../../ops/swiglu/op.hpp"
+#include "../../ops/argmax/op.hpp"
+#include "../../ops/add/op.hpp"
+
+#include <vector>
+#include <cmath>
+#include <cstring>
+
+using namespace llaisys;
+
+// Qwen2 model state: hyper-parameters, weights, scratch tensors and KV cache
+struct LlaisysQwen2Model {
+    LlaisysQwen2Meta meta;
+    llaisysDeviceType_t device_type;
+    int device_id;
+    
+    // Weights
+    LlaisysQwen2Weights weights;
+    
+    // Intermediate tensors (kept on the model so their memory is reused)
+    tensor_t hidden_states;
+    tensor_t residual;
+    tensor_t q_proj;
+    tensor_t k_proj;
+    tensor_t v_proj;
+    tensor_t o_proj;
+    tensor_t q_rotated;
+    tensor_t k_rotated;
+    tensor_t attn_output;
+    tensor_t gate_proj;
+    tensor_t up_proj;
+    tensor_t mlp_output;
+    tensor_t logits;
+    tensor_t max_val;
+    tensor_t max_idx;
+    
+    // KV cache, one tensor per layer
+    std::vector<tensor_t> k_cache;
+    std::vector<tensor_t> v_cache;
+    
+    // Position-id tensor
+    tensor_t pos_ids;
+    
+    // KV cache state
+    size_t cache_pos;  // current cache position (number of tokens already cached)
+    size_t total_len;  // total token count; NOTE(review): only written, never read, in the visible code
+};
+
+// Unwraps a C-API tensor handle; a null handle yields a null tensor.
+inline tensor_t get_tensor(llaisysTensor_t handle) {
+    return handle == nullptr ? nullptr : handle->tensor;
+}
+
+// Creates a Qwen2 model: copies `meta`, then allocates scratch buffers, weight
+// tensors and a per-layer KV cache on `device`. Weight contents are not filled in
+// here; the handles are exposed through llaisysQwen2ModelWeights().
+// Only device_ids[0] is used (0 when none is given).
+// Returns nullptr when `meta` is null.
+__C struct LlaisysQwen2Model *llaisysQwen2ModelCreate(
+    const LlaisysQwen2Meta *meta,
+    llaisysDeviceType_t device,
+    int *device_ids,
+    int ndevice) {
+    // A null meta cannot be copied; report failure instead of dereferencing it.
+    if (!meta) return nullptr;
+
+    auto *model = new LlaisysQwen2Model();
+    model->meta = *meta;
+    model->device_type = device;
+    // Tolerate a null id array even if ndevice claims entries.
+    model->device_id = (ndevice > 0 && device_ids) ? device_ids[0] : 0;
+
+    size_t hs = meta->hs;
+    size_t maxseq = meta->maxseq;
+
+    // Scratch activations. NOTE(review): every buffer below has maxseq rows, so the
+    // footprint scales with maxseq * (hs + di + voc) - confirm for long contexts.
+    model->hidden_states = Tensor::create({maxseq, hs}, meta->dtype, device, model->device_id);
+    model->residual = Tensor::create({maxseq, hs}, meta->dtype, device, model->device_id);
+
+    // Q/K/V projection outputs: [maxseq, nh * dh] and [maxseq, nkvh * dh]
+    size_t q_size = meta->nh * meta->dh;
+    size_t kv_size = meta->nkvh * meta->dh;
+    model->q_proj = Tensor::create({maxseq, q_size}, meta->dtype, device, model->device_id);
+    model->k_proj = Tensor::create({maxseq, kv_size}, meta->dtype, device, model->device_id);
+    model->v_proj = Tensor::create({maxseq, kv_size}, meta->dtype, device, model->device_id);
+
+    // Rotated Q/K: [maxseq, nh, dh] and [maxseq, nkvh, dh]
+    model->q_rotated = Tensor::create({maxseq, meta->nh, meta->dh}, meta->dtype, device, model->device_id);
+    model->k_rotated = Tensor::create({maxseq, meta->nkvh, meta->dh}, meta->dtype, device, model->device_id);
+
+    // Attention output: [maxseq, nh, dh]
+    model->attn_output = Tensor::create({maxseq, meta->nh, meta->dh}, meta->dtype, device, model->device_id);
+
+    // O projection output: [maxseq, hs]
+    model->o_proj = Tensor::create({maxseq, hs}, meta->dtype, device, model->device_id);
+
+    // MLP intermediates: [maxseq, di]
+    model->gate_proj = Tensor::create({maxseq, meta->di}, meta->dtype, device, model->device_id);
+    model->up_proj = Tensor::create({maxseq, meta->di}, meta->dtype, device, model->device_id);
+    model->mlp_output = Tensor::create({maxseq, meta->di}, meta->dtype, device, model->device_id);
+
+    // Output logits: [maxseq, voc]
+    model->logits = Tensor::create({maxseq, meta->voc}, meta->dtype, device, model->device_id);
+
+    // argmax outputs: best value (model dtype) and its index (int64)
+    model->max_val = Tensor::create({1}, meta->dtype, device, model->device_id);
+    model->max_idx = Tensor::create({1}, LLAISYS_DTYPE_I64, device, model->device_id);
+
+    // Model-wide weights: embeddings [voc, hs] and final norm weight [hs]
+    model->weights.in_embed = new LlaisysTensor{Tensor::create({meta->voc, hs}, meta->dtype, device, model->device_id)};
+    model->weights.out_embed = new LlaisysTensor{Tensor::create({meta->voc, hs}, meta->dtype, device, model->device_id)};
+    model->weights.out_norm_w = new LlaisysTensor{Tensor::create({hs}, meta->dtype, device, model->device_id)};
+
+    // Per-layer handle tables; the trailing () value-initialises every slot.
+    model->weights.attn_norm_w = new llaisysTensor_t[meta->nlayer]();
+    model->weights.attn_q_w = new llaisysTensor_t[meta->nlayer]();
+    model->weights.attn_q_b = new llaisysTensor_t[meta->nlayer]();
+    model->weights.attn_k_w = new llaisysTensor_t[meta->nlayer]();
+    model->weights.attn_k_b = new llaisysTensor_t[meta->nlayer]();
+    model->weights.attn_v_w = new llaisysTensor_t[meta->nlayer]();
+    model->weights.attn_v_b = new llaisysTensor_t[meta->nlayer]();
+    model->weights.attn_o_w = new llaisysTensor_t[meta->nlayer]();
+    model->weights.mlp_norm_w = new llaisysTensor_t[meta->nlayer]();
+    model->weights.mlp_gate_w = new llaisysTensor_t[meta->nlayer]();
+    model->weights.mlp_up_w = new llaisysTensor_t[meta->nlayer]();
+    model->weights.mlp_down_w = new llaisysTensor_t[meta->nlayer]();
+
+    // Per-layer weight tensors
+    for (size_t i = 0; i < meta->nlayer; i++) {
+        // Attention norm weight [hs]
+        model->weights.attn_norm_w[i] = new LlaisysTensor{Tensor::create({hs}, meta->dtype, device, model->device_id)};
+
+        // Q projection weight [nh*dh, hs] and bias [nh*dh]
+        model->weights.attn_q_w[i] = new LlaisysTensor{Tensor::create({meta->nh * meta->dh, hs}, meta->dtype, device, model->device_id)};
+        model->weights.attn_q_b[i] = new LlaisysTensor{Tensor::create({meta->nh * meta->dh}, meta->dtype, device, model->device_id)};
+
+        // K projection weight [nkvh*dh, hs] and bias [nkvh*dh]
+        model->weights.attn_k_w[i] = new LlaisysTensor{Tensor::create({meta->nkvh * meta->dh, hs}, meta->dtype, device, model->device_id)};
+        model->weights.attn_k_b[i] = new LlaisysTensor{Tensor::create({meta->nkvh * meta->dh}, meta->dtype, device, model->device_id)};
+
+        // V projection weight [nkvh*dh, hs] and bias [nkvh*dh]
+        model->weights.attn_v_w[i] = new LlaisysTensor{Tensor::create({meta->nkvh * meta->dh, hs}, meta->dtype, device, model->device_id)};
+        model->weights.attn_v_b[i] = new LlaisysTensor{Tensor::create({meta->nkvh * meta->dh}, meta->dtype, device, model->device_id)};
+
+        // O projection weight [hs, nh*dh]
+        model->weights.attn_o_w[i] = new LlaisysTensor{Tensor::create({hs, meta->nh * meta->dh}, meta->dtype, device, model->device_id)};
+
+        // MLP: norm weight [hs], gate and up projections [di, hs],
+        // down projection [hs, di]
+        model->weights.mlp_norm_w[i] = new LlaisysTensor{Tensor::create({hs}, meta->dtype, device, model->device_id)};
+        model->weights.mlp_gate_w[i] = new LlaisysTensor{Tensor::create({meta->di, hs}, meta->dtype, device, model->device_id)};
+        model->weights.mlp_up_w[i] = new LlaisysTensor{Tensor::create({meta->di, hs}, meta->dtype, device, model->device_id)};
+        model->weights.mlp_down_w[i] = new LlaisysTensor{Tensor::create({hs, meta->di}, meta->dtype, device, model->device_id)};
+    }
+
+    // KV cache: one [maxseq, nkvh, dh] tensor per layer for keys and for values
+    model->k_cache.resize(meta->nlayer);
+    model->v_cache.resize(meta->nlayer);
+    for (size_t i = 0; i < meta->nlayer; i++) {
+        model->k_cache[i] = Tensor::create({maxseq, meta->nkvh, meta->dh}, meta->dtype, device, model->device_id);
+        model->v_cache[i] = Tensor::create({maxseq, meta->nkvh, meta->dh}, meta->dtype, device, model->device_id);
+    }
+
+    // Position ids (int64, one slot per position)
+    model->pos_ids = Tensor::create({maxseq}, LLAISYS_DTYPE_I64, device, model->device_id);
+
+    // Nothing is cached yet
+    model->cache_pos = 0;
+    model->total_len = 0;
+
+    return model;
+}
+
+// Destroys a model created by llaisysQwen2ModelCreate. Passing nullptr is a no-op.
+__C void llaisysQwen2ModelDestroy(struct LlaisysQwen2Model *model) {
+    if (model == nullptr) {
+        return;
+    }
+
+    LlaisysQwen2Weights &w = model->weights;
+    const size_t nlayer = model->meta.nlayer;
+
+    // Model-wide weight wrappers.
+    delete w.in_embed;
+    delete w.out_embed;
+    delete w.out_norm_w;
+
+    // Every per-layer table is a new[]-allocated array of nlayer heap-allocated
+    // wrappers: free the wrappers first, then the table that held them.
+    llaisysTensor_t *const per_layer_tables[] = {
+        w.attn_norm_w,
+        w.attn_q_w,
+        w.attn_q_b,
+        w.attn_k_w,
+        w.attn_k_b,
+        w.attn_v_w,
+        w.attn_v_b,
+        w.attn_o_w,
+        w.mlp_norm_w,
+        w.mlp_gate_w,
+        w.mlp_up_w,
+        w.mlp_down_w,
+    };
+    for (llaisysTensor_t *table : per_layer_tables) {
+        for (size_t i = 0; i < nlayer; ++i) {
+            delete table[i];
+        }
+        delete[] table;
+    }
+
+    // The remaining members (scratch tensors, KV-cache vectors) are destroyed
+    // together with the struct.
+    delete model;
+}
+
+// Returns the weight-handle table owned by `model`, or nullptr when `model` is null.
+__C struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model *model) {
+    return model ? &model->weights : nullptr;
+}
+
+// Copies `len` rows of `src` into rows [start_pos, start_pos + len) of `cache`.
+// The byte count is len * src->shape()[1] * src->shape()[2] * elementSize(), so
+// `src` must be 3-D. NOTE(review): the raw memcpy presumably requires
+// host-accessible, contiguous storage on both sides - confirm for non-CPU devices.
+static void copy_to_kv_cache(tensor_t cache, tensor_t src, size_t start_pos, size_t len) {
+    const auto &src_shape = src->shape();
+    const size_t row_bytes = src_shape[1] * src_shape[2] * src->elementSize();
+
+    // Destination window inside the cache along dimension 0.
+    auto dst_rows = cache->slice(0, start_pos, start_pos + len);
+
+    std::byte *dst = dst_rows->data();
+    const std::byte *from = src->data();
+    std::memcpy(dst, from, len * row_bytes);
+}
+
+// Runs one forward pass over `ntoken` new tokens and returns the greedy (argmax)
+// next token id. Positions continue from model->cache_pos, so after Create/ResetCache
+// the tokens sit at [0, ntoken) and later calls append behind the cached ones.
+// Returns -1 on invalid arguments (null model/token_ids, ntoken == 0, a token id
+// outside [0, voc)) and meta.end_token when the new tokens would not fit into maxseq
+// positions; the cache state is left untouched in both cases.
+__C int64_t llaisysQwen2ModelInfer(
+    struct LlaisysQwen2Model *model,
+    int64_t *token_ids,
+    size_t ntoken) {
+    if (!model || !token_ids || ntoken == 0) return -1;
+
+    const auto &meta = model->meta;
+    size_t nh = meta.nh;
+    size_t nkvh = meta.nkvh;
+    size_t dh = meta.dh;
+    size_t voc = meta.voc;
+    size_t nlayer = meta.nlayer;
+    float epsilon = meta.epsilon;
+    float theta = meta.theta;
+
+    // The new tokens occupy rows [start_pos, total_len) of every maxseq-row buffer,
+    // so refuse work that would run past the end of those buffers.
+    size_t start_pos = model->cache_pos;
+    if (start_pos >= meta.maxseq || ntoken > meta.maxseq - start_pos) return meta.end_token;
+    size_t total_len = start_pos + ntoken;
+
+    // Token ids index rows of the [voc, hs] embedding table.
+    for (size_t i = 0; i < ntoken; i++) {
+        if (token_ids[i] < 0 || static_cast<size_t>(token_ids[i]) >= voc) return -1;
+    }
+
+    auto hidden = model->hidden_states->slice(0, start_pos, total_len);
+    // 1. Embedding (new tokens only)
+    auto token_ids_tensor = Tensor::create({ntoken}, LLAISYS_DTYPE_I64, model->device_type, model->device_id);
+    token_ids_tensor->load(token_ids);
+    ops::embedding(hidden, token_ids_tensor, get_tensor(model->weights.in_embed));
+
+    // Absolute position of every new token
+    auto pos_ids_slice = model->pos_ids->slice(0, start_pos, total_len);
+    std::vector<int64_t> pos_ids_data(ntoken);
+    for (size_t i = 0; i < ntoken; i++) {
+        pos_ids_data[i] = static_cast<int64_t>(start_pos + i);
+    }
+    pos_ids_slice->load(pos_ids_data.data());
+
+    // Attention scaling factor; identical for every layer.
+    const float scale = 1.0f / std::sqrt(static_cast<float>(dh));
+
+    // Transformer layers
+    for (size_t layer = 0; layer < nlayer; layer++) {
+        // Keep a copy of the layer input for the first residual connection.
+        auto residual_slice = model->residual->slice(0, start_pos, total_len);
+        std::memcpy(residual_slice->data(), hidden->data(),
+                    ntoken * meta.hs * hidden->elementSize());
+
+        // 1. RMS norm (input_layernorm), in place
+        ops::rms_norm(hidden, hidden, get_tensor(model->weights.attn_norm_w[layer]), epsilon);
+
+        // 2. Q/K/V projections
+        auto q_proj_slice = model->q_proj->slice(0, start_pos, total_len);
+        auto k_proj_slice = model->k_proj->slice(0, start_pos, total_len);
+        auto v_proj_slice = model->v_proj->slice(0, start_pos, total_len);
+        ops::linear(q_proj_slice, hidden, get_tensor(model->weights.attn_q_w[layer]),
+                    get_tensor(model->weights.attn_q_b[layer]));
+        ops::linear(k_proj_slice, hidden, get_tensor(model->weights.attn_k_w[layer]),
+                    get_tensor(model->weights.attn_k_b[layer]));
+        ops::linear(v_proj_slice, hidden, get_tensor(model->weights.attn_v_w[layer]),
+                    get_tensor(model->weights.attn_v_b[layer]));
+
+        // 3. Reshape to [ntoken, nh, dh] and [ntoken, nkvh, dh]
+        auto q_reshaped = q_proj_slice->view({ntoken, nh, dh});
+        auto k_reshaped = k_proj_slice->view({ntoken, nkvh, dh});
+        auto v_reshaped = v_proj_slice->view({ntoken, nkvh, dh});
+
+        // 4. RoPE on Q and K
+        auto q_rotated_slice = model->q_rotated->slice(0, start_pos, total_len);
+        auto k_rotated_slice = model->k_rotated->slice(0, start_pos, total_len);
+        ops::rope(q_rotated_slice, q_reshaped, pos_ids_slice, theta);
+        ops::rope(k_rotated_slice, k_reshaped, pos_ids_slice, theta);
+
+        // 5. Append the new keys/values to the KV cache
+        copy_to_kv_cache(model->k_cache[layer], k_rotated_slice, start_pos, ntoken);
+        copy_to_kv_cache(model->v_cache[layer], v_reshaped, start_pos, ntoken);
+
+        // 6. Self attention over all cached positions [0, total_len)
+        auto k_cache_slice = model->k_cache[layer]->slice(0, 0, total_len);
+        auto v_cache_slice = model->v_cache[layer]->slice(0, 0, total_len);
+        auto attn_output_slice = model->attn_output->slice(0, start_pos, total_len);
+        ops::self_attention(attn_output_slice, q_rotated_slice, k_cache_slice, v_cache_slice, scale);
+
+        // 7. O projection
+        auto o_proj_slice = model->o_proj->slice(0, start_pos, total_len);
+        auto attn_flat = attn_output_slice->view({ntoken, nh * dh});
+        ops::linear(o_proj_slice, attn_flat, get_tensor(model->weights.attn_o_w[layer]), nullptr);
+
+        // 8. Residual connection: o_proj + residual
+        ops::add(o_proj_slice, o_proj_slice, residual_slice);
+
+        // 9. Keep the sum as the residual for the MLP block
+        std::memcpy(residual_slice->data(), o_proj_slice->data(),
+                    ntoken * meta.hs * o_proj_slice->elementSize());
+
+        // 10. RMS norm (post_attention_layernorm), in place
+        ops::rms_norm(o_proj_slice, o_proj_slice, get_tensor(model->weights.mlp_norm_w[layer]), epsilon);
+
+        // 11. MLP: gate and up projections followed by SwiGLU
+        auto gate_slice = model->gate_proj->slice(0, start_pos, total_len);
+        auto up_slice = model->up_proj->slice(0, start_pos, total_len);
+        ops::linear(gate_slice, o_proj_slice, get_tensor(model->weights.mlp_gate_w[layer]), nullptr);
+        ops::linear(up_slice, o_proj_slice, get_tensor(model->weights.mlp_up_w[layer]), nullptr);
+        auto mlp_out_slice = model->mlp_output->slice(0, start_pos, total_len);
+        ops::swiglu(mlp_out_slice, gate_slice, up_slice);
+
+        // 12. Down projection (written back into hidden)
+        ops::linear(hidden, mlp_out_slice, get_tensor(model->weights.mlp_down_w[layer]), nullptr);
+
+        // 13. Residual connection: hidden + residual
+        ops::add(hidden, hidden, residual_slice);
+    }
+
+    // Only the last new token predicts the next one, so the final norm and the
+    // vocabulary projection run on that single row instead of all ntoken rows.
+    auto last_hidden = hidden->slice(0, ntoken - 1, ntoken);
+    ops::rms_norm(last_hidden, last_hidden, get_tensor(model->weights.out_norm_w), epsilon);
+    auto last_logits = model->logits->slice(0, total_len - 1, total_len);
+    ops::linear(last_logits, last_hidden, get_tensor(model->weights.out_embed), nullptr);
+
+    // Greedy decoding: argmax over the vocabulary.
+    auto last_logits_flat = last_logits->view({voc});
+    ops::argmax(model->max_idx, model->max_val, last_logits_flat);
+
+    // NOTE(review): raw read of max_idx; presumably needs host-accessible memory.
+    int64_t result;
+    std::memcpy(&result, model->max_idx->data(), sizeof(int64_t));
+
+    // Commit the new sequence length only after a completed pass.
+    model->cache_pos = total_len;
+    model->total_len = total_len;
+    return result;
+}
+
+// Rewinds the KV-cache counters so the next Infer call starts at position 0. Null is a no-op.
+__C void llaisysQwen2ModelResetCache(struct LlaisysQwen2Model *model) {
+    if (model != nullptr) {
+        model->cache_pos = model->total_len = 0;
+    }
+}
diff --git a/src/llaisys/models/qwen2.hpp b/src/llaisys/models/qwen2.hpp
new file mode 100644
index 00000000..dedab2da
--- /dev/null
+++ b/src/llaisys/models/qwen2.hpp
@@ -0,0 +1,11 @@
+#pragma once
+
+#include "llaisys/models/qwen2.h"
+#include "../llaisys_tensor.hpp"
+
+namespace llaisys {
+
+// Forward declaration. NOTE(review): this names llaisys::LlaisysQwen2Model, while qwen2.cpp defines the struct at global scope - confirm which is intended.
+struct LlaisysQwen2Model;
+
+} // namespace llaisys
diff --git a/src/ops/argmax/cpu/argmax_cpu.cpp b/src/ops/argmax/cpu/argmax_cpu.cpp
new file mode 100644
index 00000000..a332be19
--- /dev/null
+++ b/src/ops/argmax/cpu/argmax_cpu.cpp
@@ -0,0 +1,113 @@
+#include "argmax_cpu.hpp"
+
+#include "../../../utils.hpp"
+
+#include <cstddef>
+
+namespace llaisys::ops::cpu {
+
+template <typename T> // Generic kernel: finds the largest of `size` values of T; stores the value in max_val[0] and its index in max_idx[0] as int64.
+void argmax_impl(std::byte *max_idx, std::byte *max_val, const std::byte *vals, size_t size) {
+    const T* vals_data = reinterpret_cast<const T*>(vals);
+    T* max_val_data = reinterpret_cast<T*>(max_val);
+    int64_t* max_idx_data = reinterpret_cast<int64_t*>(max_idx);
+    
+    // Seed the running maximum with element 0 (vals[0] is read unconditionally, so size == 0 is not handled here).
+    T max_val_val = vals_data[0];
+    size_t max_idx_val = 0;
+    
+    // Scan the remaining elements; the strict '>' keeps the earliest index when values tie.
+    for (size_t i = 1; i < size; i++) {
+        if (vals_data[i] > max_val_val) {
+            max_val_val = vals_data[i];
+            max_idx_val = i;
+        }
+    }
+    
+    // Store the winning value and its index.
+    max_val_data[0] = max_val_val;
+    max_idx_data[0] = static_cast<int64_t>(max_idx_val);
+}
+
+// FP16 specialization: elements are widened to float for comparison and the maximum is cast back to fp16_t when stored.
+template <>
+void argmax_impl<llaisys::fp16_t>(std::byte *max_idx, std::byte *max_val, const std::byte *vals, size_t size) {
+    const llaisys::fp16_t* vals_data = reinterpret_cast<const llaisys::fp16_t*>(vals);
+    llaisys::fp16_t* max_val_data = reinterpret_cast<llaisys::fp16_t*>(max_val);
+    int64_t* max_idx_data = reinterpret_cast<int64_t*>(max_idx);
+    
+    // Seed the running maximum with element 0 (read unconditionally, so size == 0 is not handled here).
+    float max_val_val = llaisys::utils::cast<float>(vals_data[0]);
+    size_t max_idx_val = 0;
+    
+    // Scan the remaining elements; the strict '>' keeps the earliest index when values tie.
+    for (size_t i = 1; i < size; i++) {
+        float current_val = llaisys::utils::cast<float>(vals_data[i]);
+        if (current_val > max_val_val) {
+            max_val_val = current_val;
+            max_idx_val = i;
+        }
+    }
+    
+    // Store the winning value (converted back to fp16_t) and its index.
+    max_val_data[0] = llaisys::utils::cast<llaisys::fp16_t>(max_val_val);
+    max_idx_data[0] = static_cast<int64_t>(max_idx_val);
+}
+
+// BF16 specialization: elements are widened to float for comparison and the maximum is cast back to bf16_t when stored.
+template <>
+void argmax_impl<llaisys::bf16_t>(std::byte *max_idx, std::byte *max_val, const std::byte *vals, size_t size) {
+    const llaisys::bf16_t* vals_data = reinterpret_cast<const llaisys::bf16_t*>(vals);
+    llaisys::bf16_t* max_val_data = reinterpret_cast<llaisys::bf16_t*>(max_val);
+    int64_t* max_idx_data = reinterpret_cast<int64_t*>(max_idx);
+    
+    // Seed the running maximum with element 0 (read unconditionally, so size == 0 is not handled here).
+    float max_val_val = llaisys::utils::cast<float>(vals_data[0]);
+    size_t max_idx_val = 0;
+    
+    // Scan the remaining elements; the strict '>' keeps the earliest index when values tie.
+    for (size_t i = 1; i < size; i++) {
+        float current_val = llaisys::utils::cast<float>(vals_data[i]);
+        if (current_val > max_val_val) {
+            max_val_val = current_val;
+            max_idx_val = i;
+        }
+    }
+    
+    // Store the winning value (converted back to bf16_t) and its index.
+    max_val_data[0] = llaisys::utils::cast<llaisys::bf16_t>(max_val_val);
+    max_idx_data[0] = static_cast<int64_t>(max_idx_val);
+}
+
+void argmax(std::byte *max_idx, std::byte *max_val, const std::byte *vals, llaisysDataType_t type, size_t size) {
+    switch (type) { // route the runtime dtype tag to the matching argmax_impl instantiation
+    case LLAISYS_DTYPE_F32:
+        return argmax_impl<float>(max_idx, max_val, vals, size);
+    case LLAISYS_DTYPE_F64:
+        return argmax_impl<double>(max_idx, max_val, vals, size);
+    case LLAISYS_DTYPE_I8:
+        return argmax_impl<int8_t>(max_idx, max_val, vals, size);
+    case LLAISYS_DTYPE_I16:
+        return argmax_impl<int16_t>(max_idx, max_val, vals, size);
+    case LLAISYS_DTYPE_I32:
+        return argmax_impl<int32_t>(max_idx, max_val, vals, size);
+    case LLAISYS_DTYPE_I64:
+        return argmax_impl<int64_t>(max_idx, max_val, vals, size);
+    case LLAISYS_DTYPE_U8:
+        return argmax_impl<uint8_t>(max_idx, max_val, vals, size);
+    case LLAISYS_DTYPE_U16:
+        return argmax_impl<uint16_t>(max_idx, max_val, vals, size);
+    case LLAISYS_DTYPE_U32:
+        return argmax_impl<uint32_t>(max_idx, max_val, vals, size);
+    case LLAISYS_DTYPE_U64:
+        return argmax_impl<uint64_t>(max_idx, max_val, vals, size);
+    case LLAISYS_DTYPE_F16:
+        return argmax_impl<llaisys::fp16_t>(max_idx, max_val, vals, size);
+    case LLAISYS_DTYPE_BF16:
+        return argmax_impl<llaisys::bf16_t>(max_idx, max_val, vals, size);
+    default: // any dtype not listed above goes through the unsupported-datatype macro
+        EXCEPTION_UNSUPPORTED_DATATYPE(type);
+    }
+}
+
+} // namespace llaisys::ops::cpu
\ No newline at end of file
diff --git a/src/ops/argmax/cpu/argmax_cpu.hpp b/src/ops/argmax/cpu/argmax_cpu.hpp
new file mode 100644
index 00000000..0c362ee4
--- /dev/null
+++ b/src/ops/argmax/cpu/argmax_cpu.hpp
@@ -0,0 +1,8 @@
+#pragma once
+#include "llaisys.h"
+
+#include <cstddef>
+
+namespace llaisys::ops::cpu { // CPU argmax entry point: `type` selects how vals and max_val are interpreted; `size` is the element count of vals
+void argmax(std::byte *max_idx, std::byte *max_val, const std::byte *vals, llaisysDataType_t type, size_t size);
+}
\ No newline at end of file
diff --git a/src/ops/argmax/op.cpp b/src/ops/argmax/op.cpp
index 6dc37d42..5622a993 100644
--- a/src/ops/argmax/op.cpp
+++ b/src/ops/argmax/op.cpp
@@ -1,7 +1,37 @@
 #include "op.hpp"
 
+#include "../../core/llaisys_core.hpp"
+#include "../../utils.hpp"
+
+#include "cpu/argmax_cpu.hpp"
+
 namespace llaisys::ops {
+
 void argmax(tensor_t max_idx, tensor_t max_val, tensor_t vals) {
-    TO_BE_IMPLEMENTED();
+    // Argument checks. max_idx is validated for rank and dtype only; its length is not checked here.
+    CHECK_ARGUMENT(max_idx->ndim() == 1, "max_idx must be a 1D tensor");
+    CHECK_ARGUMENT(max_val->ndim() == 1 && max_val->shape()[0] == 1, "max_val must be a 1D tensor with one element");
+    CHECK_ARGUMENT(max_val->dtype() == vals->dtype(), "max_val must have the same dtype as vals");
+    CHECK_ARGUMENT(max_idx->dtype() == LLAISYS_DTYPE_I64, "max_idx must have dtype I64");
+    CHECK_ARGUMENT(vals->isContiguous(), "argmax: vals tensor must be contiguous.");
+
+    // CPU is always supported: dispatch directly. Because of this early return, the CPU case in the switch below is not reached.
+    if (vals->deviceType() == LLAISYS_DEVICE_CPU) {
+        return cpu::argmax(max_idx->data(), max_val->data(), vals->data(), vals->dtype(), vals->numel());
+    }
+
+    llaisys::core::context().setDevice(vals->deviceType(), vals->deviceId());
+
+    switch (vals->deviceType()) {
+    case LLAISYS_DEVICE_CPU:
+        return cpu::argmax(max_idx->data(), max_val->data(), vals->data(), vals->dtype(), vals->numel());
+#ifdef ENABLE_NVIDIA_API
+    case LLAISYS_DEVICE_NVIDIA:
+        TO_BE_IMPLEMENTED();
+        return;
+#endif
+    default:
+        EXCEPTION_UNSUPPORTED_DEVICE;
+    }
 }
-} // namespace llaisys::ops
+} // namespace llaisys::ops
\ No newline at end of file
diff --git a/src/ops/embedding/cpu/embedding_cpu.cpp b/src/ops/embedding/cpu/embedding_cpu.cpp
new file mode 100644
index 00000000..130ccc3c
--- /dev/null
+++ b/src/ops/embedding/cpu/embedding_cpu.cpp
@@ -0,0 +1,92 @@
+#include "embedding_cpu.hpp"
+
+#include "../../../utils.hpp"
+
+#include <cstddef>
+
+namespace llaisys::ops::cpu {
+
+template <typename T> // Generic kernel: for each id in `index`, copies one weight row of weight_dim1 elements into the matching row of `out`.
+void embedding_impl(std::byte *out, const std::byte *index, const std::byte *weight, size_t out_size, size_t weight_dim1) {
+    const int64_t* index_data = reinterpret_cast<const int64_t*>(index);
+    const T* weight_data = reinterpret_cast<const T*>(weight);
+    T* out_data = reinterpret_cast<T*>(out);
+    
+    size_t batch_size = out_size / weight_dim1; // number of output rows, i.e. how many ids are read from index
+    
+    for (size_t i = 0; i < batch_size; i++) {
+        int64_t idx = index_data[i];
+        // Negative ids are wrapped by weight_dim1 (the row width). NOTE(review): wrapping by the row count looks intended - confirm.
+        if (idx < 0) {
+            idx = weight_dim1 + idx;
+        }
+        
+        // Copy row idx of weight into row i of out (idx is not bounds-checked here).
+        for (size_t j = 0; j < weight_dim1; j++) {
+            out_data[i * weight_dim1 + j] = weight_data[idx * weight_dim1 + j];
+        }
+    }
+}
+
+// FP16 specialization: the same element-wise row copy as the generic template, with fp16_t elements (no numeric conversion).
+template <>
+void embedding_impl<llaisys::fp16_t>(std::byte *out, const std::byte *index, const std::byte *weight, size_t out_size, size_t weight_dim1) {
+    const int64_t* index_data = reinterpret_cast<const int64_t*>(index);
+    const llaisys::fp16_t* weight_data = reinterpret_cast<const llaisys::fp16_t*>(weight);
+    llaisys::fp16_t* out_data = reinterpret_cast<llaisys::fp16_t*>(out);
+    
+    size_t batch_size = out_size / weight_dim1; // number of output rows, i.e. how many ids are read from index
+    
+    for (size_t i = 0; i < batch_size; i++) {
+        int64_t idx = index_data[i];
+        // Negative ids are wrapped by weight_dim1 (the row width). NOTE(review): wrapping by the row count looks intended - confirm.
+        if (idx < 0) {
+            idx = weight_dim1 + idx;
+        }
+        
+        // Copy row idx of weight into row i of out (idx is not bounds-checked here).
+        for (size_t j = 0; j < weight_dim1; j++) {
+            out_data[i * weight_dim1 + j] = weight_data[idx * weight_dim1 + j];
+        }
+    }
+}
+
+// BF16 specialization: the same element-wise row copy as the generic template, with bf16_t elements (no numeric conversion).
+template <>
+void embedding_impl<llaisys::bf16_t>(std::byte *out, const std::byte *index, const std::byte *weight, size_t out_size, size_t weight_dim1) {
+    const int64_t* index_data = reinterpret_cast<const int64_t*>(index);
+    const llaisys::bf16_t* weight_data = reinterpret_cast<const llaisys::bf16_t*>(weight);
+    llaisys::bf16_t* out_data = reinterpret_cast<llaisys::bf16_t*>(out);
+    
+    size_t batch_size = out_size / weight_dim1; // number of output rows, i.e. how many ids are read from index
+    
+    for (size_t i = 0; i < batch_size; i++) {
+        int64_t idx = index_data[i];
+        // Negative ids are wrapped by weight_dim1 (the row width). NOTE(review): wrapping by the row count looks intended - confirm.
+        if (idx < 0) {
+            idx = weight_dim1 + idx;
+        }
+        
+        // Copy row idx of weight into row i of out (idx is not bounds-checked here).
+        for (size_t j = 0; j < weight_dim1; j++) {
+            out_data[i * weight_dim1 + j] = weight_data[idx * weight_dim1 + j];
+        }
+    }
+}
+
+void embedding(std::byte *out, const std::byte *index, const std::byte *weight, llaisysDataType_t type, size_t out_size, size_t weight_dim1) {
+    switch (type) { // route the runtime dtype tag to the matching embedding_impl instantiation (floating-point dtypes only)
+    case LLAISYS_DTYPE_F32:
+        return embedding_impl<float>(out, index, weight, out_size, weight_dim1);
+    case LLAISYS_DTYPE_F64:
+        return embedding_impl<double>(out, index, weight, out_size, weight_dim1);
+    case LLAISYS_DTYPE_F16:
+        return embedding_impl<llaisys::fp16_t>(out, index, weight, out_size, weight_dim1);
+    case LLAISYS_DTYPE_BF16:
+        return embedding_impl<llaisys::bf16_t>(out, index, weight, out_size, weight_dim1);
+    default: // any dtype not listed above goes through the unsupported-datatype macro
+        EXCEPTION_UNSUPPORTED_DATATYPE(type);
+    }
+}
+
+} // namespace llaisys::ops::cpu
\ No newline at end of file
diff --git a/src/ops/embedding/cpu/embedding_cpu.hpp b/src/ops/embedding/cpu/embedding_cpu.hpp
new file mode 100644
index 00000000..b5d2a0f6
--- /dev/null
+++ b/src/ops/embedding/cpu/embedding_cpu.hpp
@@ -0,0 +1,8 @@
+#pragma once
+#include "llaisys.h"
+
+#include <cstddef>
+
+namespace llaisys::ops::cpu { // CPU embedding entry point: index holds int64 ids, out_size is the element count of out, weight_dim1 the row width
+void embedding(std::byte *out, const std::byte *index, const std::byte *weight, llaisysDataType_t type, size_t out_size, size_t weight_dim1);
+}
\ No newline at end of file
diff --git a/src/ops/embedding/op.cpp b/src/ops/embedding/op.cpp
index 84b9a5d0..e5e8e5ba 100644
--- a/src/ops/embedding/op.cpp
+++ b/src/ops/embedding/op.cpp
@@ -1,7 +1,39 @@
 #include "op.hpp"
 
+#include "../../core/llaisys_core.hpp"
+#include "../../utils.hpp"
+
+#include "cpu/embedding_cpu.hpp"
+
 namespace llaisys::ops {
 void embedding(tensor_t out, tensor_t index, tensor_t weight) {
-    TO_BE_IMPLEMENTED();
+    CHECK_ARGUMENT(out->ndim() == 2, "out must be a 2D tensor"); // shape/dtype/contiguity checks follow; weight->dtype() is not compared with out->dtype()
+    CHECK_ARGUMENT(index->ndim() == 1, "index must be a 1D tensor");
+    CHECK_ARGUMENT(weight->ndim() == 2, "weight must be a 2D tensor");
+    CHECK_ARGUMENT(index->dtype() == LLAISYS_DTYPE_I64, "index must have dtype I64");
+    CHECK_ARGUMENT(out->shape()[0] == index->shape()[0], "out must have the same shape as index");
+    CHECK_ARGUMENT(out->shape()[1] == weight->shape()[1], "out must have the same shape as weight");
+    CHECK_ARGUMENT(out->isContiguous(), "embedding: out tensor must be contiguous.");
+    CHECK_ARGUMENT(index->isContiguous(), "embedding: index tensor must be contiguous.");
+    CHECK_ARGUMENT(weight->isContiguous(), "embedding: weight tensor must be contiguous.");
+
+    // CPU is always supported: dispatch directly. Because of this early return, the CPU case in the switch below is not reached.
+    if (out->deviceType() == LLAISYS_DEVICE_CPU) {
+        return cpu::embedding(out->data(), index->data(), weight->data(), out->dtype(), out->numel(), weight->shape()[1]);
+    }
+
+    llaisys::core::context().setDevice(out->deviceType(), out->deviceId());
+
+    switch (out->deviceType()) {
+    case LLAISYS_DEVICE_CPU:
+        return cpu::embedding(out->data(), index->data(), weight->data(), out->dtype(), out->numel(), weight->shape()[1]);
+#ifdef ENABLE_NVIDIA_API
+    case LLAISYS_DEVICE_NVIDIA:
+        TO_BE_IMPLEMENTED();
+        return;
+#endif
+    default:
+        EXCEPTION_UNSUPPORTED_DEVICE;
+    }
 }
 } // namespace llaisys::ops
diff --git a/src/ops/linear/cpu/linear_cpu.cpp b/src/ops/linear/cpu/linear_cpu.cpp
new file mode 100644
index 00000000..284f8e54
--- /dev/null
+++ b/src/ops/linear/cpu/linear_cpu.cpp
@@ -0,0 +1,112 @@
+#include "linear_cpu.hpp"
+
+#include "../../../utils.hpp"
+
+#include <cstddef>
+
+namespace llaisys::ops::cpu {
+
+template <typename T> // Generic kernel: out[i][j] = sum_k in[i][k] * weight[j][k], plus bias[j] when bias is non-null; all arithmetic is done in T.
+void linear_impl(std::byte *out, const std::byte *in, const std::byte *weight, const std::byte *bias, size_t batch_size, size_t in_features, size_t out_features) {
+    const T* in_data = reinterpret_cast<const T*>(in);
+    const T* weight_data = reinterpret_cast<const T*>(weight);
+    T* out_data = reinterpret_cast<T*>(out);
+    
+    // Matrix product Y = X * W^T: weight is indexed row-major as [out_features][in_features].
+    for (size_t i = 0; i < batch_size; i++) {
+        for (size_t j = 0; j < out_features; j++) {
+            T sum = 0;
+            for (size_t k = 0; k < in_features; k++) {
+                sum += in_data[i * in_features + k] * weight_data[j * in_features + k];
+            }
+            out_data[i * out_features + j] = sum;
+        }
+    }
+    
+    // Add the optional bias: one value per output feature, applied to every batch row.
+    if (bias != nullptr) {
+        const T* bias_data = reinterpret_cast<const T*>(bias);
+        for (size_t i = 0; i < batch_size; i++) {
+            for (size_t j = 0; j < out_features; j++) {
+                out_data[i * out_features + j] += bias_data[j];
+            }
+        }
+    }
+}
+
+// FP16 specialization: each dot product is accumulated in float and converted to fp16_t once per output element.
+template <>
+void linear_impl<llaisys::fp16_t>(std::byte *out, const std::byte *in, const std::byte *weight, const std::byte *bias, size_t batch_size, size_t in_features, size_t out_features) {
+    const llaisys::fp16_t* in_data = reinterpret_cast<const llaisys::fp16_t*>(in);
+    const llaisys::fp16_t* weight_data = reinterpret_cast<const llaisys::fp16_t*>(weight);
+    llaisys::fp16_t* out_data = reinterpret_cast<llaisys::fp16_t*>(out);
+    
+    // Matrix product Y = X * W^T: weight is indexed row-major as [out_features][in_features].
+    for (size_t i = 0; i < batch_size; i++) {
+        for (size_t j = 0; j < out_features; j++) {
+            float sum = 0;
+            for (size_t k = 0; k < in_features; k++) {
+                sum += llaisys::utils::cast<float>(in_data[i * in_features + k]) * llaisys::utils::cast<float>(weight_data[j * in_features + k]);
+            }
+            out_data[i * out_features + j] = llaisys::utils::cast<llaisys::fp16_t>(sum);
+        }
+    }
+    
+    // Add the optional bias: the already-converted output is widened to float, summed with the bias, and converted to fp16_t again.
+    if (bias != nullptr) {
+        const llaisys::fp16_t* bias_data = reinterpret_cast<const llaisys::fp16_t*>(bias);
+        for (size_t i = 0; i < batch_size; i++) {
+            for (size_t j = 0; j < out_features; j++) {
+                float val = llaisys::utils::cast<float>(out_data[i * out_features + j]) + llaisys::utils::cast<float>(bias_data[j]);
+                out_data[i * out_features + j] = llaisys::utils::cast<llaisys::fp16_t>(val);
+            }
+        }
+    }
+}
+
+// BF16 specialization: each dot product is accumulated in float and converted to bf16_t once per output element.
+template <>
+void linear_impl<llaisys::bf16_t>(std::byte *out, const std::byte *in, const std::byte *weight, const std::byte *bias, size_t batch_size, size_t in_features, size_t out_features) {
+    const llaisys::bf16_t* in_data = reinterpret_cast<const llaisys::bf16_t*>(in);
+    const llaisys::bf16_t* weight_data = reinterpret_cast<const llaisys::bf16_t*>(weight);
+    llaisys::bf16_t* out_data = reinterpret_cast<llaisys::bf16_t*>(out);
+    
+    // Matrix product Y = X * W^T: weight is indexed row-major as [out_features][in_features].
+    for (size_t i = 0; i < batch_size; i++) {
+        for (size_t j = 0; j < out_features; j++) {
+            float sum = 0;
+            for (size_t k = 0; k < in_features; k++) {
+                sum += llaisys::utils::cast<float>(in_data[i * in_features + k]) * llaisys::utils::cast<float>(weight_data[j * in_features + k]);
+            }
+            out_data[i * out_features + j] = llaisys::utils::cast<llaisys::bf16_t>(sum);
+        }
+    }
+    
+    // Add the optional bias: the already-converted output is widened to float, summed with the bias, and converted to bf16_t again.
+    if (bias != nullptr) {
+        const llaisys::bf16_t* bias_data = reinterpret_cast<const llaisys::bf16_t*>(bias);
+        for (size_t i = 0; i < batch_size; i++) {
+            for (size_t j = 0; j < out_features; j++) {
+                float val = llaisys::utils::cast<float>(out_data[i * out_features + j]) + llaisys::utils::cast<float>(bias_data[j]);
+                out_data[i * out_features + j] = llaisys::utils::cast<llaisys::bf16_t>(val);
+            }
+        }
+    }
+}
+
+void linear(std::byte *out, const std::byte *in, const std::byte *weight, const std::byte *bias, llaisysDataType_t type, size_t batch_size, size_t in_features, size_t out_features) {
+    switch (type) { // route the runtime dtype tag to the matching linear_impl instantiation (floating-point dtypes only)
+    case LLAISYS_DTYPE_F32:
+        return linear_impl<float>(out, in, weight, bias, batch_size, in_features, out_features);
+    case LLAISYS_DTYPE_F64:
+        return linear_impl<double>(out, in, weight, bias, batch_size, in_features, out_features);
+    case LLAISYS_DTYPE_F16:
+        return linear_impl<llaisys::fp16_t>(out, in, weight, bias, batch_size, in_features, out_features);
+    case LLAISYS_DTYPE_BF16:
+        return linear_impl<llaisys::bf16_t>(out, in, weight, bias, batch_size, in_features, out_features);
+    default: // any dtype not listed above goes through the unsupported-datatype macro
+        EXCEPTION_UNSUPPORTED_DATATYPE(type);
+    }
+}
+
+} // namespace llaisys::ops::cpu
\ No newline at end of file
diff --git a/src/ops/linear/cpu/linear_cpu.hpp b/src/ops/linear/cpu/linear_cpu.hpp
new file mode 100644
index 00000000..3d01b1c5
--- /dev/null
+++ b/src/ops/linear/cpu/linear_cpu.hpp
@@ -0,0 +1,8 @@
+#pragma once
+#include "llaisys.h"
+
+#include <cstddef>
+
+namespace llaisys::ops::cpu { // CPU linear entry point: out = in * weight^T (+ bias); bias may be nullptr, all buffers are interpreted as `type`
+void linear(std::byte *out, const std::byte *in, const std::byte *weight, const std::byte *bias, llaisysDataType_t type, size_t batch_size, size_t in_features, size_t out_features);
+}
\ No newline at end of file
diff --git a/src/ops/linear/op.cpp b/src/ops/linear/op.cpp
index 97d1f865..5500f23b 100644
--- a/src/ops/linear/op.cpp
+++ b/src/ops/linear/op.cpp
@@ -1,7 +1,68 @@
 #include "op.hpp"
 
+#include "../../core/llaisys_core.hpp"
+#include "../../utils.hpp"
+
+#include "cpu/linear_cpu.hpp"
+
 namespace llaisys::ops {
 void linear(tensor_t out, tensor_t in, tensor_t weight, tensor_t bias) {
-    TO_BE_IMPLEMENTED();
+    CHECK_ARGUMENT(out->ndim() == 2, "out must be a 2D tensor"); // shape and contiguity checks follow; tensor dtypes are not compared with each other here
+    CHECK_ARGUMENT(in->ndim() == 2, "in must be a 2D tensor");
+    CHECK_ARGUMENT(weight->ndim() == 2, "weight must be a 2D tensor");
+    CHECK_ARGUMENT(out->shape()[0] == in->shape()[0], "out and in must have the same batch size");
+    CHECK_ARGUMENT(out->shape()[1] == weight->shape()[0], "out features must match weight rows");
+    CHECK_ARGUMENT(in->shape()[1] == weight->shape()[1], "in features must match weight columns");
+    if (bias != nullptr) {
+        CHECK_ARGUMENT(bias->ndim() == 1, "bias must be a 1D tensor");
+        CHECK_ARGUMENT(bias->shape()[0] == out->shape()[1], "bias size must match out features");
+    }
+    CHECK_ARGUMENT(out->isContiguous(), "linear: out tensor must be contiguous.");
+    CHECK_ARGUMENT(in->isContiguous(), "linear: in tensor must be contiguous.");
+    CHECK_ARGUMENT(weight->isContiguous(), "linear: weight tensor must be contiguous.");
+    if (bias != nullptr) {
+        CHECK_ARGUMENT(bias->isContiguous(), "linear: bias tensor must be contiguous.");
+    }
+
+    size_t batch_size = in->shape()[0];
+    size_t in_features = in->shape()[1];
+    size_t out_features = out->shape()[1];
+
+    // CPU is always supported: dispatch directly. Because of this early return, the CPU case in the switch below is not reached.
+    if (out->deviceType() == LLAISYS_DEVICE_CPU) {
+        return cpu::linear(
+            out->data(), 
+            in->data(), 
+            weight->data(), 
+            bias ? bias->data() : nullptr, 
+            out->dtype(), 
+            batch_size, 
+            in_features, 
+            out_features
+        );
+    }
+
+    llaisys::core::context().setDevice(out->deviceType(), out->deviceId());
+
+    switch (out->deviceType()) {
+    case LLAISYS_DEVICE_CPU:
+        return cpu::linear(
+            out->data(), 
+            in->data(), 
+            weight->data(), 
+            bias ? bias->data() : nullptr, 
+            out->dtype(), 
+            batch_size, 
+            in_features, 
+            out_features
+        );
+#ifdef ENABLE_NVIDIA_API
+    case LLAISYS_DEVICE_NVIDIA:
+        TO_BE_IMPLEMENTED();
+        return;
+#endif
+    default:
+        EXCEPTION_UNSUPPORTED_DEVICE;
+    }
 }
 } // namespace llaisys::ops
diff --git a/src/ops/rms_norm/cpu/rms_norm_cpu.cpp b/src/ops/rms_norm/cpu/rms_norm_cpu.cpp
new file mode 100644
index 00000000..f7cd4094
--- /dev/null
+++ b/src/ops/rms_norm/cpu/rms_norm_cpu.cpp
@@ -0,0 +1,105 @@
+#include "rms_norm_cpu.hpp"
+
+#include "../../../utils.hpp"
+
+#include <cmath>
+#include <cstddef>
+
+namespace llaisys::ops::cpu {
+
+template <typename T> // Generic kernel: per row, out = weight * in / sqrt(mean(in^2) + eps); intermediate math is done in float for every T.
+void rms_norm_impl(std::byte *out, const std::byte *in, const std::byte *weight, float eps, size_t batch_size, size_t hidden_size) {
+    const T* in_data = reinterpret_cast<const T*>(in);
+    const T* weight_data = reinterpret_cast<const T*>(weight);
+    T* out_data = reinterpret_cast<T*>(out);
+    
+    for (size_t i = 0; i < batch_size; i++) {
+        // Sum of squares of row i, accumulated in float (this also applies when T is double).
+        float sum_sq = 0.0f;
+        for (size_t j = 0; j < hidden_size; j++) {
+            float val = static_cast<float>(in_data[i * hidden_size + j]);
+            sum_sq += val * val;
+        }
+        
+        // RMS of the row; eps is added to the mean square inside the square root.
+        float rms = std::sqrt(sum_sq / static_cast<float>(hidden_size) + eps);
+        
+        // Output: scale each element by its weight and divide by the row RMS, then cast back to T.
+        for (size_t j = 0; j < hidden_size; j++) {
+            float val = static_cast<float>(in_data[i * hidden_size + j]);
+            float w = static_cast<float>(weight_data[j]);
+            out_data[i * hidden_size + j] = static_cast<T>((w * val) / rms);
+        }
+    }
+}
+
+// FP16 specialization: same formula as the generic template, using utils::cast to convert between fp16_t and float.
+template <>
+void rms_norm_impl<llaisys::fp16_t>(std::byte *out, const std::byte *in, const std::byte *weight, float eps, size_t batch_size, size_t hidden_size) {
+    const llaisys::fp16_t* in_data = reinterpret_cast<const llaisys::fp16_t*>(in);
+    const llaisys::fp16_t* weight_data = reinterpret_cast<const llaisys::fp16_t*>(weight);
+    llaisys::fp16_t* out_data = reinterpret_cast<llaisys::fp16_t*>(out);
+    
+    for (size_t i = 0; i < batch_size; i++) {
+        // Sum of squares of row i, accumulated in float.
+        float sum_sq = 0.0f;
+        for (size_t j = 0; j < hidden_size; j++) {
+            float val = llaisys::utils::cast<float>(in_data[i * hidden_size + j]);
+            sum_sq += val * val;
+        }
+        
+        // RMS of the row; eps is added to the mean square inside the square root.
+        float rms = std::sqrt(sum_sq / static_cast<float>(hidden_size) + eps);
+        
+        // Output: scale each element by its weight and divide by the row RMS, then convert back to fp16_t.
+        for (size_t j = 0; j < hidden_size; j++) {
+            float val = llaisys::utils::cast<float>(in_data[i * hidden_size + j]);
+            float w = llaisys::utils::cast<float>(weight_data[j]);
+            out_data[i * hidden_size + j] = llaisys::utils::cast<llaisys::fp16_t>((w * val) / rms);
+        }
+    }
+}
+
+// BF16 specialization: same formula as the generic template, using utils::cast to convert between bf16_t and float.
+template <>
+void rms_norm_impl<llaisys::bf16_t>(std::byte *out, const std::byte *in, const std::byte *weight, float eps, size_t batch_size, size_t hidden_size) {
+    const llaisys::bf16_t* in_data = reinterpret_cast<const llaisys::bf16_t*>(in);
+    const llaisys::bf16_t* weight_data = reinterpret_cast<const llaisys::bf16_t*>(weight);
+    llaisys::bf16_t* out_data = reinterpret_cast<llaisys::bf16_t*>(out);
+    
+    for (size_t i = 0; i < batch_size; i++) {
+        // Sum of squares of row i, accumulated in float.
+        float sum_sq = 0.0f;
+        for (size_t j = 0; j < hidden_size; j++) {
+            float val = llaisys::utils::cast<float>(in_data[i * hidden_size + j]);
+            sum_sq += val * val;
+        }
+        
+        // RMS of the row; eps is added to the mean square inside the square root.
+        float rms = std::sqrt(sum_sq / static_cast<float>(hidden_size) + eps);
+        
+        // Output: scale each element by its weight and divide by the row RMS, then convert back to bf16_t.
+        for (size_t j = 0; j < hidden_size; j++) {
+            float val = llaisys::utils::cast<float>(in_data[i * hidden_size + j]);
+            float w = llaisys::utils::cast<float>(weight_data[j]);
+            out_data[i * hidden_size + j] = llaisys::utils::cast<llaisys::bf16_t>((w * val) / rms);
+        }
+    }
+}
+
+void rms_norm(std::byte *out, const std::byte *in, const std::byte *weight, float eps, llaisysDataType_t type, size_t batch_size, size_t hidden_size) {
+    switch (type) { // route the runtime dtype tag to the matching rms_norm_impl instantiation (floating-point dtypes only)
+    case LLAISYS_DTYPE_F32:
+        return rms_norm_impl<float>(out, in, weight, eps, batch_size, hidden_size);
+    case LLAISYS_DTYPE_F64:
+        return rms_norm_impl<double>(out, in, weight, eps, batch_size, hidden_size);
+    case LLAISYS_DTYPE_F16:
+        return rms_norm_impl<llaisys::fp16_t>(out, in, weight, eps, batch_size, hidden_size);
+    case LLAISYS_DTYPE_BF16:
+        return rms_norm_impl<llaisys::bf16_t>(out, in, weight, eps, batch_size, hidden_size);
+    default: // any dtype not listed above goes through the unsupported-datatype macro
+        EXCEPTION_UNSUPPORTED_DATATYPE(type);
+    }
+}
+
+} // namespace llaisys::ops::cpu
\ No newline at end of file
diff --git a/src/ops/rms_norm/cpu/rms_norm_cpu.hpp b/src/ops/rms_norm/cpu/rms_norm_cpu.hpp
new file mode 100644
index 00000000..7a4db6df
--- /dev/null
+++ b/src/ops/rms_norm/cpu/rms_norm_cpu.hpp
@@ -0,0 +1,8 @@
+#pragma once
+#include "llaisys.h"
+
+#include <cstddef>
+
+namespace llaisys::ops::cpu { // CPU RMS-norm entry point: normalizes batch_size rows of hidden_size elements; weight holds hidden_size scale values
+void rms_norm(std::byte *out, const std::byte *in, const std::byte *weight, float eps, llaisysDataType_t type, size_t batch_size, size_t hidden_size);
+}
\ No newline at end of file
diff --git a/src/ops/rms_norm/op.cpp b/src/ops/rms_norm/op.cpp
index 529553d9..f9016a3c 100644
--- a/src/ops/rms_norm/op.cpp
+++ b/src/ops/rms_norm/op.cpp
@@ -1,7 +1,58 @@
 #include "op.hpp"
 
+#include "../../core/llaisys_core.hpp"
+#include "../../utils.hpp"
+
+#include "cpu/rms_norm_cpu.hpp"
+
 namespace llaisys::ops {
 void rms_norm(tensor_t out, tensor_t in, tensor_t weight, float eps) {
-    TO_BE_IMPLEMENTED();
+    CHECK_ARGUMENT(out->ndim() == 2, "out must be a 2D tensor"); // shape and contiguity checks follow; tensor dtypes are not compared with each other here
+    CHECK_ARGUMENT(in->ndim() == 2, "in must be a 2D tensor");
+    CHECK_ARGUMENT(weight->ndim() == 1, "weight must be a 1D tensor");
+    CHECK_ARGUMENT(out->shape()[0] == in->shape()[0], "out and in must have the same batch size");
+    CHECK_ARGUMENT(out->shape()[1] == in->shape()[1], "out and in must have the same hidden size");
+    CHECK_ARGUMENT(weight->shape()[0] == in->shape()[1], "weight size must match hidden size");
+    CHECK_ARGUMENT(out->isContiguous(), "rms_norm: out tensor must be contiguous.");
+    CHECK_ARGUMENT(in->isContiguous(), "rms_norm: in tensor must be contiguous.");
+    CHECK_ARGUMENT(weight->isContiguous(), "rms_norm: weight tensor must be contiguous.");
+
+    size_t batch_size = in->shape()[0];
+    size_t hidden_size = in->shape()[1];
+
+    // CPU is always supported: dispatch directly. Because of this early return, the CPU case in the switch below is not reached.
+    if (out->deviceType() == LLAISYS_DEVICE_CPU) {
+        return cpu::rms_norm(
+            out->data(), 
+            in->data(), 
+            weight->data(), 
+            eps, 
+            out->dtype(), 
+            batch_size, 
+            hidden_size
+        );
+    }
+
+    llaisys::core::context().setDevice(out->deviceType(), out->deviceId());
+
+    switch (out->deviceType()) {
+    case LLAISYS_DEVICE_CPU:
+        return cpu::rms_norm(
+            out->data(), 
+            in->data(), 
+            weight->data(), 
+            eps, 
+            out->dtype(), 
+            batch_size, 
+            hidden_size
+        );
+#ifdef ENABLE_NVIDIA_API
+    case LLAISYS_DEVICE_NVIDIA:
+        TO_BE_IMPLEMENTED();
+        return;
+#endif
+    default:
+        EXCEPTION_UNSUPPORTED_DEVICE;
+    }
 }
 } // namespace llaisys::ops
diff --git a/src/ops/rope/cpu/rope_cpu.cpp b/src/ops/rope/cpu/rope_cpu.cpp
new file mode 100644
index 00000000..b8a0ca18
--- /dev/null
+++ b/src/ops/rope/cpu/rope_cpu.cpp
@@ -0,0 +1,132 @@
+#include "rope_cpu.hpp"
+
+#include "../../../utils.hpp"
+
+#include <cmath>
+#include <cstddef>
+
+namespace llaisys::ops::cpu {
+
+template <typename T> // Generic RoPE kernel: rotates the pair (x[k], x[k + d/2]) of every head by an angle derived from the token position; math is done in float.
+void rope_impl(std::byte *out, const std::byte *in, const std::byte *pos_ids, float theta, size_t seqlen, size_t nhead, size_t d) {
+    const T* in_data = reinterpret_cast<const T*>(in);
+    const int64_t* pos_ids_data = reinterpret_cast<const int64_t*>(pos_ids);
+    T* out_data = reinterpret_cast<T*>(out);
+    
+    size_t d_half = d / 2; // integer division: for odd d the last element of each head is never written
+    
+    for (size_t i = 0; i < seqlen; i++) {
+        int64_t p = pos_ids_data[i];
+        
+        for (size_t j = 0; j < nhead; j++) {
+            for (size_t k = 0; k < d_half; k++) {
+                // Rotation angle: phi = p / theta^(2k/d)
+                // Same order of operations as PyTorch, for numerical consistency (per the original author's note)
+                float exp_value = 2.0f * static_cast<float>(k) / static_cast<float>(d);
+                float theta_exp = std::pow(theta, exp_value);
+                float phi = static_cast<float>(p) / theta_exp;
+                float cos_phi = std::cos(phi);
+                float sin_phi = std::sin(phi);
+                
+                // Load the pair to rotate: element k and element k + d/2 of head j at sequence index i
+                size_t idx = i * nhead * d + j * d + k;
+                size_t idx_b = idx + d_half;
+                float a = static_cast<float>(in_data[idx]);
+                float b = static_cast<float>(in_data[idx_b]);
+                
+                // Write the rotated pair
+                out_data[idx] = static_cast<T>(a * cos_phi - b * sin_phi);
+                out_data[idx_b] = static_cast<T>(b * cos_phi + a * sin_phi);
+            }
+        }
+    }
+}
+
+// FP16 specialization: same rotation as the generic template, using utils::cast to convert between fp16_t and float.
+template <>
+void rope_impl<llaisys::fp16_t>(std::byte *out, const std::byte *in, const std::byte *pos_ids, float theta, size_t seqlen, size_t nhead, size_t d) {
+    const llaisys::fp16_t* in_data = reinterpret_cast<const llaisys::fp16_t*>(in);
+    const int64_t* pos_ids_data = reinterpret_cast<const int64_t*>(pos_ids);
+    llaisys::fp16_t* out_data = reinterpret_cast<llaisys::fp16_t*>(out);
+    
+    size_t d_half = d / 2; // integer division: for odd d the last element of each head is never written
+    
+    for (size_t i = 0; i < seqlen; i++) {
+        int64_t p = pos_ids_data[i];
+        
+        for (size_t j = 0; j < nhead; j++) {
+            for (size_t k = 0; k < d_half; k++) {
+                // Rotation angle: phi = p / theta^(2k/d)
+                // Same order of operations as PyTorch, for numerical consistency (per the original author's note)
+                float exp_value = 2.0f * static_cast<float>(k) / static_cast<float>(d);
+                float theta_exp = std::pow(theta, exp_value);
+                float phi = static_cast<float>(p) / theta_exp;
+                float cos_phi = std::cos(phi);
+                float sin_phi = std::sin(phi);
+                
+                // Load the pair to rotate: element k and element k + d/2 of head j at sequence index i
+                size_t idx = i * nhead * d + j * d + k;
+                size_t idx_b = idx + d_half;
+                float a = llaisys::utils::cast<float>(in_data[idx]);
+                float b = llaisys::utils::cast<float>(in_data[idx_b]);
+                
+                // Write the rotated pair, converted back to fp16_t
+                out_data[idx] = llaisys::utils::cast<llaisys::fp16_t>(a * cos_phi - b * sin_phi);
+                out_data[idx_b] = llaisys::utils::cast<llaisys::fp16_t>(b * cos_phi + a * sin_phi);
+            }
+        }
+    }
+}
+
+// BF16 specialization: same rotation as the generic template, using utils::cast to convert between bf16_t and float.
+template <>
+void rope_impl<llaisys::bf16_t>(std::byte *out, const std::byte *in, const std::byte *pos_ids, float theta, size_t seqlen, size_t nhead, size_t d) {
+    const llaisys::bf16_t* in_data = reinterpret_cast<const llaisys::bf16_t*>(in);
+    const int64_t* pos_ids_data = reinterpret_cast<const int64_t*>(pos_ids);
+    llaisys::bf16_t* out_data = reinterpret_cast<llaisys::bf16_t*>(out);
+    
+    size_t d_half = d / 2; // integer division: for odd d the last element of each head is never written
+    
+    for (size_t i = 0; i < seqlen; i++) {
+        int64_t p = pos_ids_data[i];
+        
+        for (size_t j = 0; j < nhead; j++) {
+            for (size_t k = 0; k < d_half; k++) {
+                // Rotation angle: phi = p / theta^(2k/d)
+                // Same order of operations as PyTorch, for numerical consistency (per the original author's note)
+                float exp_value = 2.0f * static_cast<float>(k) / static_cast<float>(d);
+                float theta_exp = std::pow(theta, exp_value);
+                float phi = static_cast<float>(p) / theta_exp;
+                float cos_phi = std::cos(phi);
+                float sin_phi = std::sin(phi);
+                
+                // Load the pair to rotate: element k and element k + d/2 of head j at sequence index i
+                size_t idx = i * nhead * d + j * d + k;
+                size_t idx_b = idx + d_half;
+                float a = llaisys::utils::cast<float>(in_data[idx]);
+                float b = llaisys::utils::cast<float>(in_data[idx_b]);
+                
+                // Write the rotated pair, converted back to bf16_t
+                out_data[idx] = llaisys::utils::cast<llaisys::bf16_t>(a * cos_phi - b * sin_phi);
+                out_data[idx_b] = llaisys::utils::cast<llaisys::bf16_t>(b * cos_phi + a * sin_phi);
+            }
+        }
+    }
+}
+
+void rope(std::byte *out, const std::byte *in, const std::byte *pos_ids, float theta, llaisysDataType_t type, size_t seqlen, size_t nhead, size_t d) {
+    switch (type) { // route the runtime dtype tag to the matching rope_impl instantiation (floating-point dtypes only)
+    case LLAISYS_DTYPE_F32:
+        return rope_impl<float>(out, in, pos_ids, theta, seqlen, nhead, d);
+    case LLAISYS_DTYPE_F64:
+        return rope_impl<double>(out, in, pos_ids, theta, seqlen, nhead, d);
+    case LLAISYS_DTYPE_F16:
+        return rope_impl<llaisys::fp16_t>(out, in, pos_ids, theta, seqlen, nhead, d);
+    case LLAISYS_DTYPE_BF16:
+        return rope_impl<llaisys::bf16_t>(out, in, pos_ids, theta, seqlen, nhead, d);
+    default: // any dtype not listed above goes through the unsupported-datatype macro
+        EXCEPTION_UNSUPPORTED_DATATYPE(type);
+    }
+}
+
+} // namespace llaisys::ops::cpu
\ No newline at end of file
diff --git a/src/ops/rope/cpu/rope_cpu.hpp b/src/ops/rope/cpu/rope_cpu.hpp
new file mode 100644
index 00000000..e8262244
--- /dev/null
+++ b/src/ops/rope/cpu/rope_cpu.hpp
@@ -0,0 +1,8 @@
+#pragma once
+#include "llaisys.h"
+
+#include <cstddef>
+
+namespace llaisys::ops::cpu {
+// CPU rotary position embedding (RoPE).
+//
+// Dispatches on `type` (F32, F64, F16 or BF16) to a typed kernel; any other
+// dtype is reported as unsupported.
+//
+//   out, in   : raw buffers indexed as [seq_len, num_heads, head_dim],
+//               row-major, both interpreted with dtype `type`.
+//   pos_ids   : one position id per sequence row.
+//               NOTE(review): element type is not visible from this header;
+//               presumably int64 - confirm against the kernel.
+//   theta     : base of the rotation frequencies; the kernel comments give
+//               the angle as phi = pos / theta^(2k / head_dim).
+//
+// Within each head, element k is rotated together with element k + d_half
+// (presumably head_dim / 2 - TODO confirm).
+void rope(std::byte *out, const std::byte *in, const std::byte *pos_ids, float theta, llaisysDataType_t type, size_t seq_len, size_t num_heads, size_t head_dim);
+}
\ No newline at end of file
diff --git a/src/ops/rope/op.cpp b/src/ops/rope/op.cpp
index d60dbe64..2f834ef0 100644
--- a/src/ops/rope/op.cpp
+++ b/src/ops/rope/op.cpp
@@ -1,7 +1,62 @@
 #include "op.hpp"
 
+#include "../../core/llaisys_core.hpp"
+#include "../../utils.hpp"
+
+#include "cpu/rope_cpu.hpp"
+
 namespace llaisys::ops {
 void rope(tensor_t out, tensor_t in, tensor_t pos_ids, float theta) {
-    TO_BE_IMPLEMENTED();
+    CHECK_ARGUMENT(out->ndim() == 3, "out must be a 3D tensor");
+    CHECK_ARGUMENT(in->ndim() == 3, "in must be a 3D tensor");
+    CHECK_ARGUMENT(pos_ids->ndim() == 1, "pos_ids must be a 1D tensor");
+    CHECK_ARGUMENT(out->shape()[0] == in->shape()[0], "out and in must have the same sequence length");
+    CHECK_ARGUMENT(out->shape()[1] == in->shape()[1], "out and in must have the same number of heads");
+    CHECK_ARGUMENT(out->shape()[2] == in->shape()[2], "out and in must have the same head dimension");
+    CHECK_ARGUMENT(pos_ids->shape()[0] == in->shape()[0], "pos_ids length must match sequence length");
+    CHECK_ARGUMENT(out->isContiguous(), "rope: out tensor must be contiguous.");
+    CHECK_ARGUMENT(in->isContiguous(), "rope: in tensor must be contiguous.");
+    CHECK_ARGUMENT(pos_ids->isContiguous(), "rope: pos_ids tensor must be contiguous.");
+
+    size_t seq_len = in->shape()[0];
+    size_t num_heads = in->shape()[1];
+    size_t head_dim = in->shape()[2];
+
+    // 总是支持CPU计算
+    if (out->deviceType() == LLAISYS_DEVICE_CPU) {
+        return cpu::rope(
+            out->data(), 
+            in->data(), 
+            pos_ids->data(), 
+            theta, 
+            out->dtype(), 
+            seq_len, 
+            num_heads, 
+            head_dim
+        );
+    }
+
+    llaisys::core::context().setDevice(out->deviceType(), out->deviceId());
+
+    switch (out->deviceType()) {
+    case LLAISYS_DEVICE_CPU:
+        return cpu::rope(
+            out->data(), 
+            in->data(), 
+            pos_ids->data(), 
+            theta, 
+            out->dtype(), 
+            seq_len, 
+            num_heads, 
+            head_dim
+        );
+#ifdef ENABLE_NVIDIA_API
+    case LLAISYS_DEVICE_NVIDIA:
+        TO_BE_IMPLEMENTED();
+        return;
+#endif
+    default:
+        EXCEPTION_UNSUPPORTED_DEVICE;
+    }
 }
 } // namespace llaisys::ops
diff --git a/src/ops/self_attention/cpu/self_attention_cpu.cpp b/src/ops/self_attention/cpu/self_attention_cpu.cpp
new file mode 100644
index 00000000..15c7a4f6
--- /dev/null
+++ b/src/ops/self_attention/cpu/self_attention_cpu.cpp
@@ -0,0 +1,296 @@
+#include "self_attention_cpu.hpp"
+
+#include "../../../utils.hpp"
+
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <vector>
+
+namespace llaisys::ops::cpu {
+
+// In-place, numerically stable softmax over `size` contiguous elements.
+//
+// The maximum is subtracted before exponentiation so that the largest
+// exponent is exp(0) = 1. All arithmetic is carried out in T itself, so a
+// double input keeps double precision instead of being routed through float.
+// An empty range is a no-op (and never dereferences `data`).
+template <typename T>
+void softmax(T* data, size_t size) {
+    if (size == 0) {
+        return;
+    }
+
+    T max_val = data[0];
+    for (size_t i = 1; i < size; i++) {
+        if (data[i] > max_val) {
+            max_val = data[i];
+        }
+    }
+
+    T sum = 0;
+    for (size_t i = 0; i < size; i++) {
+        // std::exp picks the overload matching T (float or double).
+        data[i] = static_cast<T>(std::exp(data[i] - max_val));
+        sum += data[i];
+    }
+
+    for (size_t i = 0; i < size; i++) {
+        data[i] /= sum;
+    }
+}
+
+// fp16 specialisation of the in-place softmax. The arithmetic runs in float,
+// but every intermediate is written back as fp16, so the normalising sum is
+// taken over the already-rounded exponentials.
+template <>
+void softmax<llaisys::fp16_t>(llaisys::fp16_t* data, size_t size) {
+    const auto as_float = [](llaisys::fp16_t h) { return llaisys::utils::cast<float>(h); };
+
+    // Largest element, used to shift the exponents.
+    float peak = as_float(data[0]);
+    for (size_t idx = 1; idx < size; ++idx) {
+        peak = std::max(peak, as_float(data[idx]));
+    }
+
+    // Exponentiate in place and accumulate the (rounded) results.
+    float total = 0;
+    for (size_t idx = 0; idx < size; ++idx) {
+        data[idx] = llaisys::utils::cast<llaisys::fp16_t>(std::exp(as_float(data[idx]) - peak));
+        total += as_float(data[idx]);
+    }
+
+    // Normalise.
+    for (size_t idx = 0; idx < size; ++idx) {
+        data[idx] = llaisys::utils::cast<llaisys::fp16_t>(as_float(data[idx]) / total);
+    }
+}
+
+// bf16 specialisation of the in-place softmax. The arithmetic runs in float,
+// but every intermediate is written back as bf16, so the normalising sum is
+// taken over the already-rounded exponentials.
+template <>
+void softmax<llaisys::bf16_t>(llaisys::bf16_t* data, size_t size) {
+    const auto as_float = [](llaisys::bf16_t b) { return llaisys::utils::cast<float>(b); };
+
+    // Largest element, used to shift the exponents.
+    float peak = as_float(data[0]);
+    for (size_t idx = 1; idx < size; ++idx) {
+        peak = std::max(peak, as_float(data[idx]));
+    }
+
+    // Exponentiate in place and accumulate the (rounded) results.
+    float total = 0;
+    for (size_t idx = 0; idx < size; ++idx) {
+        data[idx] = llaisys::utils::cast<llaisys::bf16_t>(std::exp(as_float(data[idx]) - peak));
+        total += as_float(data[idx]);
+    }
+
+    // Normalise.
+    for (size_t idx = 0; idx < size; ++idx) {
+        data[idx] = llaisys::utils::cast<llaisys::bf16_t>(as_float(data[idx]) / total);
+    }
+}
+
+// Causal scaled dot-product attention on CPU with grouped-query (GQA) support.
+//
+// Layouts (row-major, contiguous):
+//   q, attn_val : [seqlen, nhead, d]
+//   k, v        : [total_len, nkhead, d]
+//
+// Query head j reads key/value head j / (nhead / nkhead). Query row i may
+// attend to key positions t <= i + max(total_len - seqlen, 0); later
+// positions get a score of -1e9 before the softmax. Dot products and the
+// weighted sum are accumulated in float; scores are rounded to T before the
+// softmax and outputs are rounded to T on store.
+//
+// K/V are indexed in place (no per-head expanded copies) and a single score
+// row of total_len elements is reused for every (head, query) pair.
+//
+// Preconditions (not checked here): nkhead > 0 and divides nhead,
+// total_len >= 1, and attn_val does not overlap k or v.
+template <typename T>
+void self_attention_impl(std::byte *attn_val, const std::byte *q, const std::byte *k, const std::byte *v, float scale, size_t seqlen, size_t nhead, size_t nkhead, size_t d, size_t total_len) {
+    const T* q_data = reinterpret_cast<const T*>(q);
+    const T* k_data = reinterpret_cast<const T*>(k);
+    const T* v_data = reinterpret_cast<const T*>(v);
+    T* attn_val_data = reinterpret_cast<T*>(attn_val);
+
+    // Number of query heads sharing one key/value head.
+    const size_t repeats = nhead / nkhead;
+    // Number of cached positions that precede the first query row.
+    const size_t past = (total_len > seqlen) ? (total_len - seqlen) : 0;
+
+    std::vector<T> scores(total_len);
+
+    for (size_t j = 0; j < nhead; j++) {
+        const size_t kv_head = j / repeats;
+        for (size_t i = 0; i < seqlen; i++) {
+            const T* q_row = q_data + (i * nhead + j) * d;
+            const size_t mask_threshold = i + past;
+
+            for (size_t t = 0; t < total_len; t++) {
+                if (t > mask_threshold) {
+                    // Future position: masked out, no dot product needed.
+                    scores[t] = llaisys::utils::cast<T>(-1e9f);
+                    continue;
+                }
+                const T* k_row = k_data + (t * nkhead + kv_head) * d;
+                float score = 0.0f;
+                for (size_t c = 0; c < d; c++) {
+                    score += llaisys::utils::cast<float>(q_row[c]) * llaisys::utils::cast<float>(k_row[c]);
+                }
+                score *= scale;
+                scores[t] = llaisys::utils::cast<T>(score);
+            }
+            softmax(scores.data(), total_len);
+
+            // Weighted sum of the value rows for this (query, head) pair.
+            T* out_row = attn_val_data + (i * nhead + j) * d;
+            for (size_t c = 0; c < d; c++) {
+                float val = 0.0f;
+                for (size_t t = 0; t < total_len; t++) {
+                    val += llaisys::utils::cast<float>(scores[t]) * llaisys::utils::cast<float>(v_data[(t * nkhead + kv_head) * d + c]);
+                }
+                out_row[c] = llaisys::utils::cast<T>(val);
+            }
+        }
+    }
+}
+
+// fp16 specialisation of the causal GQA attention kernel.
+//
+// Layouts (row-major, contiguous):
+//   q, attn_val : [seqlen, nhead, d]
+//   k, v        : [total_len, nkhead, d]
+//
+// Query head j reads key/value head j / (nhead / nkhead). Query row i may
+// attend to key positions t <= i + max(total_len - seqlen, 0); later
+// positions get a score of -1e9 before the softmax. Accumulation is done in
+// float; scores are rounded to fp16 before the softmax and outputs are
+// rounded to fp16 on store.
+//
+// K/V are indexed in place (no per-head expanded copies) and a single score
+// row of total_len elements is reused for every (head, query) pair.
+//
+// Preconditions (not checked here): nkhead > 0 and divides nhead,
+// total_len >= 1, and attn_val does not overlap k or v.
+template <>
+void self_attention_impl<llaisys::fp16_t>(std::byte *attn_val, const std::byte *q, const std::byte *k, const std::byte *v, float scale, size_t seqlen, size_t nhead, size_t nkhead, size_t d, size_t total_len) {
+    const llaisys::fp16_t* q_data = reinterpret_cast<const llaisys::fp16_t*>(q);
+    const llaisys::fp16_t* k_data = reinterpret_cast<const llaisys::fp16_t*>(k);
+    const llaisys::fp16_t* v_data = reinterpret_cast<const llaisys::fp16_t*>(v);
+    llaisys::fp16_t* attn_val_data = reinterpret_cast<llaisys::fp16_t*>(attn_val);
+
+    // Number of query heads sharing one key/value head.
+    const size_t repeats = nhead / nkhead;
+    // Number of cached positions that precede the first query row.
+    const size_t past = (total_len > seqlen) ? (total_len - seqlen) : 0;
+
+    std::vector<llaisys::fp16_t> scores(total_len);
+
+    for (size_t j = 0; j < nhead; j++) {
+        const size_t kv_head = j / repeats;
+        for (size_t i = 0; i < seqlen; i++) {
+            const llaisys::fp16_t* q_row = q_data + (i * nhead + j) * d;
+            const size_t mask_threshold = i + past;
+
+            for (size_t t = 0; t < total_len; t++) {
+                if (t > mask_threshold) {
+                    // Future position: masked out, no dot product needed.
+                    scores[t] = llaisys::utils::cast<llaisys::fp16_t>(-1e9f);
+                    continue;
+                }
+                const llaisys::fp16_t* k_row = k_data + (t * nkhead + kv_head) * d;
+                float score = 0.0f;
+                for (size_t c = 0; c < d; c++) {
+                    score += llaisys::utils::cast<float>(q_row[c]) * llaisys::utils::cast<float>(k_row[c]);
+                }
+                score *= scale;
+                scores[t] = llaisys::utils::cast<llaisys::fp16_t>(score);
+            }
+            softmax(scores.data(), total_len);
+
+            // Weighted sum of the value rows for this (query, head) pair.
+            llaisys::fp16_t* out_row = attn_val_data + (i * nhead + j) * d;
+            for (size_t c = 0; c < d; c++) {
+                float val = 0.0f;
+                for (size_t t = 0; t < total_len; t++) {
+                    val += llaisys::utils::cast<float>(scores[t]) * llaisys::utils::cast<float>(v_data[(t * nkhead + kv_head) * d + c]);
+                }
+                out_row[c] = llaisys::utils::cast<llaisys::fp16_t>(val);
+            }
+        }
+    }
+}
+
+// bf16 specialisation of the causal GQA attention kernel.
+//
+// Layouts (row-major, contiguous):
+//   q, attn_val : [seqlen, nhead, d]
+//   k, v        : [total_len, nkhead, d]
+//
+// Query head j reads key/value head j / (nhead / nkhead). Query row i may
+// attend to key positions t <= i + max(total_len - seqlen, 0); later
+// positions get a score of -1e9 before the softmax. Accumulation is done in
+// float; scores are rounded to bf16 before the softmax and outputs are
+// rounded to bf16 on store.
+//
+// K/V are indexed in place (no per-head expanded copies) and a single score
+// row of total_len elements is reused for every (head, query) pair.
+//
+// Preconditions (not checked here): nkhead > 0 and divides nhead,
+// total_len >= 1, and attn_val does not overlap k or v.
+template <>
+void self_attention_impl<llaisys::bf16_t>(std::byte *attn_val, const std::byte *q, const std::byte *k, const std::byte *v, float scale, size_t seqlen, size_t nhead, size_t nkhead, size_t d, size_t total_len) {
+    const llaisys::bf16_t* q_data = reinterpret_cast<const llaisys::bf16_t*>(q);
+    const llaisys::bf16_t* k_data = reinterpret_cast<const llaisys::bf16_t*>(k);
+    const llaisys::bf16_t* v_data = reinterpret_cast<const llaisys::bf16_t*>(v);
+    llaisys::bf16_t* attn_val_data = reinterpret_cast<llaisys::bf16_t*>(attn_val);
+
+    // Number of query heads sharing one key/value head.
+    const size_t repeats = nhead / nkhead;
+    // Number of cached positions that precede the first query row.
+    const size_t past = (total_len > seqlen) ? (total_len - seqlen) : 0;
+
+    std::vector<llaisys::bf16_t> scores(total_len);
+
+    for (size_t j = 0; j < nhead; j++) {
+        const size_t kv_head = j / repeats;
+        for (size_t i = 0; i < seqlen; i++) {
+            const llaisys::bf16_t* q_row = q_data + (i * nhead + j) * d;
+            const size_t mask_threshold = i + past;
+
+            for (size_t t = 0; t < total_len; t++) {
+                if (t > mask_threshold) {
+                    // Future position: masked out, no dot product needed.
+                    scores[t] = llaisys::utils::cast<llaisys::bf16_t>(-1e9f);
+                    continue;
+                }
+                const llaisys::bf16_t* k_row = k_data + (t * nkhead + kv_head) * d;
+                float score = 0.0f;
+                for (size_t c = 0; c < d; c++) {
+                    score += llaisys::utils::cast<float>(q_row[c]) * llaisys::utils::cast<float>(k_row[c]);
+                }
+                score *= scale;
+                scores[t] = llaisys::utils::cast<llaisys::bf16_t>(score);
+            }
+            softmax(scores.data(), total_len);
+
+            // Weighted sum of the value rows for this (query, head) pair.
+            llaisys::bf16_t* out_row = attn_val_data + (i * nhead + j) * d;
+            for (size_t c = 0; c < d; c++) {
+                float val = 0.0f;
+                for (size_t t = 0; t < total_len; t++) {
+                    val += llaisys::utils::cast<float>(scores[t]) * llaisys::utils::cast<float>(v_data[(t * nkhead + kv_head) * d + c]);
+                }
+                out_row[c] = llaisys::utils::cast<llaisys::bf16_t>(val);
+            }
+        }
+    }
+}
+
+// Type-dispatching entry point of the CPU attention kernel: selects the
+// self_attention_impl instantiation that matches `type` and forwards every
+// argument unchanged. Any other dtype is reported via
+// EXCEPTION_UNSUPPORTED_DATATYPE.
+void self_attention(std::byte *attn_val, const std::byte *q, const std::byte *k, const std::byte *v, float scale, llaisysDataType_t type, size_t seqlen, size_t nhead, size_t num_kv_heads, size_t d, size_t total_len) {
+    if (type == LLAISYS_DTYPE_F32) {
+        self_attention_impl<float>(attn_val, q, k, v, scale, seqlen, nhead, num_kv_heads, d, total_len);
+        return;
+    }
+    if (type == LLAISYS_DTYPE_F64) {
+        self_attention_impl<double>(attn_val, q, k, v, scale, seqlen, nhead, num_kv_heads, d, total_len);
+        return;
+    }
+    if (type == LLAISYS_DTYPE_F16) {
+        self_attention_impl<llaisys::fp16_t>(attn_val, q, k, v, scale, seqlen, nhead, num_kv_heads, d, total_len);
+        return;
+    }
+    if (type == LLAISYS_DTYPE_BF16) {
+        self_attention_impl<llaisys::bf16_t>(attn_val, q, k, v, scale, seqlen, nhead, num_kv_heads, d, total_len);
+        return;
+    }
+    EXCEPTION_UNSUPPORTED_DATATYPE(type);
+}
+
+} // namespace llaisys::ops::cpu
diff --git a/src/ops/self_attention/cpu/self_attention_cpu.hpp b/src/ops/self_attention/cpu/self_attention_cpu.hpp
new file mode 100644
index 00000000..6f7ddb0c
--- /dev/null
+++ b/src/ops/self_attention/cpu/self_attention_cpu.hpp
@@ -0,0 +1,8 @@
+#pragma once
+#include "llaisys.h"
+
+#include <cstddef>
+
+namespace llaisys::ops::cpu {
+// CPU causal self-attention with grouped-query support.
+//
+// Dispatches on `type` (F32, F64, F16 or BF16) to a typed kernel; any other
+// dtype is reported as unsupported.
+//
+//   attn_val, q : raw buffers indexed as [seq_len, num_heads, head_dim].
+//   k, v        : raw buffers indexed as [total_len, num_kv_heads, head_dim].
+//   scale       : factor applied to every q.k dot product before the softmax.
+//
+// Query head j uses key/value head j / (num_heads / num_kv_heads). Query row
+// i attends to key positions t <= i + (total_len - seq_len) when
+// total_len > seq_len, otherwise to t <= i.
+void self_attention(std::byte *attn_val, const std::byte *q, const std::byte *k, const std::byte *v, float scale, llaisysDataType_t type, size_t seq_len, size_t num_heads, size_t num_kv_heads, size_t head_dim, size_t total_len);
+}
\ No newline at end of file
diff --git a/src/ops/self_attention/op.cpp b/src/ops/self_attention/op.cpp
index 43d62014..d8161d2e 100644
--- a/src/ops/self_attention/op.cpp
+++ b/src/ops/self_attention/op.cpp
@@ -1,7 +1,75 @@
 #include "op.hpp"
 
+#include "../../core/llaisys_core.hpp"
+#include "../../utils.hpp"
+
+#include "cpu/self_attention_cpu.hpp"
+
 namespace llaisys::ops {
 void self_attention(tensor_t attn_val, tensor_t q, tensor_t k, tensor_t v, float scale) {
-    TO_BE_IMPLEMENTED();
+    CHECK_ARGUMENT(attn_val->ndim() == 3, "attn_val must be a 3D tensor");
+    CHECK_ARGUMENT(q->ndim() == 3, "q must be a 3D tensor");
+    CHECK_ARGUMENT(k->ndim() == 3, "k must be a 3D tensor");
+    CHECK_ARGUMENT(v->ndim() == 3, "v must be a 3D tensor");
+    CHECK_ARGUMENT(attn_val->shape()[0] == q->shape()[0], "attn_val and q must have the same sequence length");
+    CHECK_ARGUMENT(attn_val->shape()[1] == q->shape()[1], "attn_val and q must have the same number of heads");
+    CHECK_ARGUMENT(attn_val->shape()[2] == v->shape()[2], "attn_val and v must have the same head dimension");
+    CHECK_ARGUMENT(q->shape()[2] == k->shape()[2], "q and k must have the same head dimension");
+    CHECK_ARGUMENT(k->shape()[1] == v->shape()[1], "k and v must have the same number of heads");
+    CHECK_ARGUMENT(k->shape()[2] == v->shape()[2], "k and v must have the same head dimension");
+    CHECK_ARGUMENT(q->shape()[1] % k->shape()[1] == 0, "q's number of heads must be a multiple of k's number of heads");
+    CHECK_ARGUMENT(attn_val->isContiguous(), "self_attention: attn_val tensor must be contiguous.");
+    CHECK_ARGUMENT(q->isContiguous(), "self_attention: q tensor must be contiguous.");
+    CHECK_ARGUMENT(k->isContiguous(), "self_attention: k tensor must be contiguous.");
+    CHECK_ARGUMENT(v->isContiguous(), "self_attention: v tensor must be contiguous.");
+
+    size_t seq_len = q->shape()[0];
+    size_t num_heads = q->shape()[1];
+    size_t num_kv_heads = k->shape()[1];
+    size_t head_dim = q->shape()[2];
+    size_t total_len = k->shape()[0];
+
+    // 总是支持CPU计算
+    if (attn_val->deviceType() == LLAISYS_DEVICE_CPU) {
+        return cpu::self_attention(
+            attn_val->data(), 
+            q->data(), 
+            k->data(), 
+            v->data(), 
+            scale, 
+            attn_val->dtype(), 
+            seq_len, 
+            num_heads, 
+            num_kv_heads, 
+            head_dim, 
+            total_len
+        );
+    }
+
+    llaisys::core::context().setDevice(attn_val->deviceType(), attn_val->deviceId());
+
+    switch (attn_val->deviceType()) {
+    case LLAISYS_DEVICE_CPU:
+        return cpu::self_attention(
+            attn_val->data(), 
+            q->data(), 
+            k->data(), 
+            v->data(), 
+            scale, 
+            attn_val->dtype(), 
+            seq_len, 
+            num_heads, 
+            num_kv_heads, 
+            head_dim, 
+            total_len
+        );
+#ifdef ENABLE_NVIDIA_API
+    case LLAISYS_DEVICE_NVIDIA:
+        TO_BE_IMPLEMENTED();
+        return;
+#endif
+    default:
+        EXCEPTION_UNSUPPORTED_DEVICE;
+    }
 }
 } // namespace llaisys::ops
diff --git a/src/ops/swiglu/cpu/swiglu_cpu.cpp b/src/ops/swiglu/cpu/swiglu_cpu.cpp
new file mode 100644
index 00000000..763821de
--- /dev/null
+++ b/src/ops/swiglu/cpu/swiglu_cpu.cpp
@@ -0,0 +1,97 @@
+#include "swiglu_cpu.hpp"
+
+#include "../../../utils.hpp"
+
+#include <cmath>
+#include <cstddef>
+
+namespace llaisys::ops::cpu {
+
+// Logistic sigmoid 1 / (1 + e^-x) for native floating-point types.
+// Evaluated in T's own precision, so a double argument is no longer
+// truncated to float on the way through std::exp.
+template <typename T>
+T sigmoid(T x) {
+    return static_cast<T>(T(1) / (T(1) + std::exp(-x)));
+}
+
+// fp16 specialisation of sigmoid: widen to float, evaluate, round back.
+template <>
+llaisys::fp16_t sigmoid<llaisys::fp16_t>(llaisys::fp16_t x) {
+    const float decay = std::exp(-llaisys::utils::cast<float>(x));
+    return llaisys::utils::cast<llaisys::fp16_t>(1.0f / (1.0f + decay));
+}
+
+// bf16 specialisation of sigmoid: widen to float, evaluate, round back.
+template <>
+llaisys::bf16_t sigmoid<llaisys::bf16_t>(llaisys::bf16_t x) {
+    const float decay = std::exp(-llaisys::utils::cast<float>(x));
+    return llaisys::utils::cast<llaisys::bf16_t>(1.0f / (1.0f + decay));
+}
+
+// Element-wise SwiGLU for native floating-point types:
+//   out[i] = up[i] * gate[i] * sigmoid(gate[i])   for i in [0, size).
+// All three buffers are interpreted as arrays of T.
+template <typename T>
+void swiglu_impl(std::byte *out, const std::byte *gate, const std::byte *up, size_t size) {
+    T* dst = reinterpret_cast<T*>(out);
+    const T* gate_in = reinterpret_cast<const T*>(gate);
+    const T* up_in = reinterpret_cast<const T*>(up);
+
+    for (size_t idx = 0; idx != size; ++idx) {
+        const T g = gate_in[idx];
+        const T u = up_in[idx];
+        const T activation = sigmoid(g);
+        // Same left-to-right product as (up * gate) * sigmoid(gate).
+        dst[idx] = u * g * activation;
+    }
+}
+
+// fp16 specialisation of element-wise SwiGLU. The sigmoid is produced as an
+// fp16 value first (i.e. rounded), then the three factors are multiplied in
+// float and the product is rounded back to fp16.
+template <>
+void swiglu_impl<llaisys::fp16_t>(std::byte *out, const std::byte *gate, const std::byte *up, size_t size) {
+    llaisys::fp16_t* dst = reinterpret_cast<llaisys::fp16_t*>(out);
+    const llaisys::fp16_t* gate_in = reinterpret_cast<const llaisys::fp16_t*>(gate);
+    const llaisys::fp16_t* up_in = reinterpret_cast<const llaisys::fp16_t*>(up);
+
+    for (size_t idx = 0; idx != size; ++idx) {
+        const llaisys::fp16_t g_half = gate_in[idx];
+        const float activation = llaisys::utils::cast<float>(sigmoid(g_half));
+        const float u = llaisys::utils::cast<float>(up_in[idx]);
+        const float g = llaisys::utils::cast<float>(g_half);
+        dst[idx] = llaisys::utils::cast<llaisys::fp16_t>(u * g * activation);
+    }
+}
+
+// bf16 specialisation of element-wise SwiGLU. The sigmoid is produced as a
+// bf16 value first (i.e. rounded), then the three factors are multiplied in
+// float and the product is rounded back to bf16.
+template <>
+void swiglu_impl<llaisys::bf16_t>(std::byte *out, const std::byte *gate, const std::byte *up, size_t size) {
+    llaisys::bf16_t* dst = reinterpret_cast<llaisys::bf16_t*>(out);
+    const llaisys::bf16_t* gate_in = reinterpret_cast<const llaisys::bf16_t*>(gate);
+    const llaisys::bf16_t* up_in = reinterpret_cast<const llaisys::bf16_t*>(up);
+
+    for (size_t idx = 0; idx != size; ++idx) {
+        const llaisys::bf16_t g_half = gate_in[idx];
+        const float activation = llaisys::utils::cast<float>(sigmoid(g_half));
+        const float u = llaisys::utils::cast<float>(up_in[idx]);
+        const float g = llaisys::utils::cast<float>(g_half);
+        dst[idx] = llaisys::utils::cast<llaisys::bf16_t>(u * g * activation);
+    }
+}
+
+// Type-dispatching entry point of the CPU SwiGLU kernel: selects the
+// swiglu_impl instantiation that matches `type` and forwards every argument
+// unchanged. Any other dtype is reported via EXCEPTION_UNSUPPORTED_DATATYPE.
+void swiglu(std::byte *out, const std::byte *gate, const std::byte *up, llaisysDataType_t type, size_t size) {
+    if (type == LLAISYS_DTYPE_F32) {
+        swiglu_impl<float>(out, gate, up, size);
+        return;
+    }
+    if (type == LLAISYS_DTYPE_F64) {
+        swiglu_impl<double>(out, gate, up, size);
+        return;
+    }
+    if (type == LLAISYS_DTYPE_F16) {
+        swiglu_impl<llaisys::fp16_t>(out, gate, up, size);
+        return;
+    }
+    if (type == LLAISYS_DTYPE_BF16) {
+        swiglu_impl<llaisys::bf16_t>(out, gate, up, size);
+        return;
+    }
+    EXCEPTION_UNSUPPORTED_DATATYPE(type);
+}
+
+} // namespace llaisys::ops::cpu
\ No newline at end of file
diff --git a/src/ops/swiglu/cpu/swiglu_cpu.hpp b/src/ops/swiglu/cpu/swiglu_cpu.hpp
new file mode 100644
index 00000000..ed570a75
--- /dev/null
+++ b/src/ops/swiglu/cpu/swiglu_cpu.hpp
@@ -0,0 +1,8 @@
+#pragma once
+#include "llaisys.h"
+
+#include <cstddef>
+
+namespace llaisys::ops::cpu {
+// CPU element-wise SwiGLU:
+//   out[i] = up[i] * gate[i] * sigmoid(gate[i])   for i in [0, size).
+//
+// `out`, `gate` and `up` are raw buffers of `size` elements, all interpreted
+// with dtype `type` (F32, F64, F16 or BF16); any other dtype is reported as
+// unsupported.
+void swiglu(std::byte *out, const std::byte *gate, const std::byte *up, llaisysDataType_t type, size_t size);
+}
\ No newline at end of file
diff --git a/src/ops/swiglu/op.cpp b/src/ops/swiglu/op.cpp
index 47edbcc9..37752f2c 100644
--- a/src/ops/swiglu/op.cpp
+++ b/src/ops/swiglu/op.cpp
@@ -1,7 +1,40 @@
 #include "op.hpp"
 
+#include "../../core/llaisys_core.hpp"
+#include "../../utils.hpp"
+
+#include "cpu/swiglu_cpu.hpp"
+
 namespace llaisys::ops {
 void swiglu(tensor_t out, tensor_t gate, tensor_t up) {
-    TO_BE_IMPLEMENTED();
+    CHECK_ARGUMENT(out->ndim() == 2, "out must be a 2D tensor");
+    CHECK_ARGUMENT(gate->ndim() == 2, "gate must be a 2D tensor");
+    CHECK_ARGUMENT(up->ndim() == 2, "up must be a 2D tensor");
+    CHECK_ARGUMENT(out->shape()[0] == gate->shape()[0], "out and gate must have the same shape");
+    CHECK_ARGUMENT(out->shape()[1] == gate->shape()[1], "out and gate must have the same shape");
+    CHECK_ARGUMENT(out->shape()[0] == up->shape()[0], "out and up must have the same shape");
+    CHECK_ARGUMENT(out->shape()[1] == up->shape()[1], "out and up must have the same shape");
+    CHECK_ARGUMENT(out->isContiguous(), "swiglu: out tensor must be contiguous.");
+    CHECK_ARGUMENT(gate->isContiguous(), "swiglu: gate tensor must be contiguous.");
+    CHECK_ARGUMENT(up->isContiguous(), "swiglu: up tensor must be contiguous.");
+
+    // 总是支持CPU计算
+    if (out->deviceType() == LLAISYS_DEVICE_CPU) {
+        return cpu::swiglu(out->data(), gate->data(), up->data(), out->dtype(), out->numel());
+    }
+
+    llaisys::core::context().setDevice(out->deviceType(), out->deviceId());
+
+    switch (out->deviceType()) {
+    case LLAISYS_DEVICE_CPU:
+        return cpu::swiglu(out->data(), gate->data(), up->data(), out->dtype(), out->numel());
+#ifdef ENABLE_NVIDIA_API
+    case LLAISYS_DEVICE_NVIDIA:
+        TO_BE_IMPLEMENTED();
+        return;
+#endif
+    default:
+        EXCEPTION_UNSUPPORTED_DEVICE;
+    }
 }
 } // namespace llaisys::ops
diff --git a/src/tensor/tensor.cpp b/src/tensor/tensor.cpp
index 2f594bb6..43cfedb2 100644
--- a/src/tensor/tensor.cpp
+++ b/src/tensor/tensor.cpp
@@ -2,6 +2,7 @@
 
 #include "../utils.hpp"
 
+#include <cstddef>
 #include <cstring>
 #include <numeric>
 #include <sstream>
@@ -164,27 +165,122 @@ void Tensor::debug() const {
 }
 
 bool Tensor::isContiguous() const {
-    TO_BE_IMPLEMENTED();
+    size_t ndim_ = this->ndim();
+    ptrdiff_t stride = 1;
+    for (size_t i = 1; i <= ndim_; i++) {
+        if (this->_meta.strides[ndim_ - i] != stride) {
+            return false;
+        }
+        stride *= static_cast<ptrdiff_t>(this->_meta.shape[ndim_ - i]);
+    }
     return true;
 }
 
 tensor_t Tensor::permute(const std::vector<size_t> &order) const {
-    TO_BE_IMPLEMENTED();
-    return std::shared_ptr<Tensor>(new Tensor(_meta, _storage));
+    size_t ndim_ = this->ndim();
+    CHECK_ARGUMENT(order.size() == ndim_, "order size must be equal to ndim");
+    // 检查order是否包含所有维度
+    std::vector<bool> used(ndim_, false);
+    for (auto idx : order) {
+        if (idx >= ndim_ || used[idx]) {
+            CHECK_ARGUMENT(false, "Invalid permutation order");
+        }
+        used[idx] = true;
+    }
+
+    std::vector<size_t> shape(ndim_);
+    std::vector<ptrdiff_t> strides(ndim_);
+    for (size_t i = 0; i < ndim_; i++) {
+        shape[i] = this->_meta.shape[order[i]];
+        strides[i] = this->_meta.strides[order[i]];
+    }
+    TensorMeta meta{this->_meta.dtype, shape, strides};
+    return std::shared_ptr<Tensor>(new Tensor(meta, _storage, _offset));
 }
 
 tensor_t Tensor::view(const std::vector<size_t> &shape) const {
-    TO_BE_IMPLEMENTED();
-    return std::shared_ptr<Tensor>(new Tensor(_meta, _storage));
+    // 计算新形状的元素总数
+    size_t new_numel = std::accumulate(shape.begin(), shape.end(), size_t(1), std::multiplies<size_t>());
+    // 检查元素总数是否与原张量相同
+    if (new_numel != this->numel()) {
+        CHECK_ARGUMENT(false, "New shape has different number of elements");
+    }
+    
+    // 对于连续张量，直接计算新的步长
+    if(isContiguous()) {
+        size_t ndim_ = shape.size();
+        std::vector<ptrdiff_t> strides(ndim_);
+        size_t stride = 1;
+        for (size_t i = 1; i <= ndim_; i++) {
+            strides[ndim_ - i] = stride;
+            stride *= shape[ndim_ - i];
+        }
+        TensorMeta meta{this->_meta.dtype, shape, strides};
+        return std::shared_ptr<Tensor>(new Tensor(meta, _storage, _offset));
+    } else {
+        // 对于非连续张量，检查是否可以进行视图操作
+        // 这里简化实现，实际中可能需要更复杂的检查
+        CHECK_ARGUMENT(false, "Cannot view non-contiguous tensor");
+    }
 }
 
 tensor_t Tensor::slice(size_t dim, size_t start, size_t end) const {
-    TO_BE_IMPLEMENTED();
-    return std::shared_ptr<Tensor>(new Tensor(_meta, _storage));
+    // 检查dim是否有效
+    if (dim >= this->ndim()) {
+        CHECK_ARGUMENT(false, "Dimension out of range");
+    }
+    
+    // 检查start和end是否有效
+    if (start >= end || end > this->_meta.shape[dim]) {
+        CHECK_ARGUMENT(false, "Invalid start or end indices");
+    }
+    
+    // 创建新的形状
+    std::vector<size_t> new_shape = this->_meta.shape;
+    new_shape[dim] = end - start;
+    
+    // 计算新的偏移量
+    size_t new_offset = this->_offset;
+    new_offset += start * this->_meta.strides[dim] * this->elementSize();
+    
+    // 创建新的步长（保持不变）
+    std::vector<ptrdiff_t> new_strides = this->_meta.strides;
+    
+    // 创建新的meta
+    TensorMeta meta{this->_meta.dtype, new_shape, new_strides};
+    
+    // 创建并返回新的张量
+    return std::shared_ptr<Tensor>(new Tensor(meta, _storage, new_offset));
 }
 
 void Tensor::load(const void *src_) {
-    TO_BE_IMPLEMENTED();
+    // 计算需要复制的字节数
+    size_t bytes_to_copy = this->numel() * this->elementSize();
+    
+    // 检查存储大小是否足够
+    if (this->_storage->size() < this->_offset + bytes_to_copy) {
+        CHECK_ARGUMENT(false, "Storage size is not sufficient");
+    }
+    
+    // 获取目标内存地址（考虑偏移量）
+    std::byte *dst = this->data();
+    
+    // 根据设备类型选择合适的内存复制方式
+    if (this->deviceType() == LLAISYS_DEVICE_CPU) {
+        // CPU到CPU的复制
+        std::memcpy(dst, src_, bytes_to_copy);
+    } else {
+        // 主机到设备的复制
+        core::context().setDevice(this->deviceType(), this->deviceId());
+        core::context().runtime().api()->memcpy_sync(
+            dst,
+            src_,
+            bytes_to_copy,
+            LLAISYS_MEMCPY_H2D
+        );
+        // 同步设备确保复制完成
+        core::context().runtime().api()->device_synchronize();
+    }
 }
 
 tensor_t Tensor::contiguous() const {
diff --git a/xmake.lua b/xmake.lua
index 1f65f7a9..f18fa292 100644
--- a/xmake.lua
+++ b/xmake.lua
@@ -95,6 +95,22 @@ target("llaisys-ops")
     on_install(function (target) end)
 target_end()
 
+-- Static library holding the model implementations.
+-- NOTE(review): the shared `llaisys` target also lists
+-- src/llaisys/models/*.cpp directly - confirm whether both are intended.
+target("llaisys-models")
+    set_kind("static")
+    -- Built on top of the tensor and operator libraries.
+    add_deps("llaisys-tensor")
+    add_deps("llaisys-ops")
+
+    set_languages("cxx17")
+    -- All warnings enabled and treated as errors.
+    set_warnings("all", "error")
+    if not is_plat("windows") then
+        -- Position-independent code on non-Windows platforms; unknown
+        -- pragmas are not reported there.
+        add_cxflags("-fPIC", "-Wno-unknown-pragmas")
+    end
+
+    add_files("src/llaisys/models/*.cpp")
+
+    -- Empty install hook: nothing is installed for this target.
+    on_install(function (target) end)
+target_end()
+
 target("llaisys")
     set_kind("shared")
     add_deps("llaisys-utils")
@@ -106,6 +122,7 @@ target("llaisys")
     set_languages("cxx17")
     set_warnings("all", "error")
     add_files("src/llaisys/*.cc")
+    add_files("src/llaisys/models/*.cpp")
     set_installdir(".")