diff --git a/.envrc b/.envrc new file mode 100644 index 00000000..86241311 --- /dev/null +++ b/.envrc @@ -0,0 +1 @@ +source .venv/bin/activate diff --git a/.gitignore b/.gitignore index e38cf574..8fb09640 100644 --- a/.gitignore +++ b/.gitignore @@ -87,4 +87,7 @@ htmlcov/ # Windows Thumbs.db ehthumbs.db -desktop.ini \ No newline at end of file +desktop.ini + +# model +DeepSeek-R1-Distill-Qwen-1.5B/ diff --git a/include/llaisys/models/qwen2.h b/include/llaisys/models/qwen2.h index 7054626d..6cd98dd7 100644 --- a/include/llaisys/models/qwen2.h +++ b/include/llaisys/models/qwen2.h @@ -4,14 +4,14 @@ #include "../tensor.h" __C { - struct LlaisysQwen2Meta { + typedef struct LlaisysQwen2Meta_ { llaisysDataType_t dtype; size_t nlayer, hs, nh, nkvh, dh, di, maxseq, voc; float epsilon, theta; int64_t end_token; - }; + }LlaisysQwen2Meta; - struct LlaisysQwen2Weights { + typedef struct LlaisysQwen2Weights_ { llaisysTensor_t in_embed; llaisysTensor_t out_embed; llaisysTensor_t out_norm_w; // a.k.a. model.norm.weight @@ -27,16 +27,24 @@ __C { llaisysTensor_t *mlp_gate_w; llaisysTensor_t *mlp_up_w; llaisysTensor_t *mlp_down_w; - }; + }LlaisysQwen2Weights; - struct LlaisysQwen2Model; + typedef struct LlaisysQwen2Model_ { + LlaisysQwen2Meta* meta; + LlaisysQwen2Weights* weights = nullptr; + void *impl = nullptr; // Opaque pointer to the actual model implementation (e.g., a C++ class instance). 
+ }LlaisysQwen2Model; - __export struct LlaisysQwen2Model *llaisysQwen2ModelCreate(const LlaisysQwen2Meta *meta, llaisysDeviceType_t device, int *device_ids, int ndevice); - __export void llaisysQwen2ModelDestroy(struct LlaisysQwen2Model * model); - __export struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model * model); + __export LlaisysQwen2Model *llaisysQwen2ModelCreate(const LlaisysQwen2Meta *meta, llaisysDeviceType_t device, int *device_ids, int ndevice); - __export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken); + __export void llaisysQwen2ModelDestroy(LlaisysQwen2Model * model); + + __export void llaisysQwen2modelLoadWeight(LlaisysQwen2Model * model, const void *weight_data, const char *weight_name); + + __export LlaisysQwen2Weights *llaisysQwen2ModelWeights(LlaisysQwen2Model * model); + + __export int64_t llaisysQwen2ModelInfer(LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken); } #endif // LLAISYS_MODELS_QWEN2_H diff --git a/make.sh b/make.sh new file mode 100755 index 00000000..89daef43 --- /dev/null +++ b/make.sh @@ -0,0 +1,4 @@ +xmake +xmake install +pip install ./python + diff --git a/python/llaisys/libllaisys/__init__.py b/python/llaisys/libllaisys/__init__.py index f536fb52..d3691ce4 100644 --- a/python/llaisys/libllaisys/__init__.py +++ b/python/llaisys/libllaisys/__init__.py @@ -10,9 +10,10 @@ from .llaisys_types import llaisysMemcpyKind_t, MemcpyKind from .llaisys_types import llaisysStream_t from .tensor import llaisysTensor_t +from .models.qwen2 import Qwen2Meta, LlaisysQwen2Meta_t, LlaisysQwen2Model_t, LlaisysQwen2Weights_t from .tensor import load_tensor from .ops import load_ops - +from .models.qwen2 import load_qwen2 def load_shared_library(): lib_dir = Path(__file__).parent @@ -38,6 +39,7 @@ def load_shared_library(): load_runtime(LIB_LLAISYS) load_tensor(LIB_LLAISYS) load_ops(LIB_LLAISYS) +load_qwen2(LIB_LLAISYS) __all__ = [ @@ -46,6 +48,10 @@ def 
load_shared_library(): "llaisysStream_t", "llaisysTensor_t", "llaisysDataType_t", + "Qwen2Meta", + "LlaisysQwen2Meta_t", + "LlaisysQwen2Model_t", + "LlaisysQwen2Weights_t", "DataType", "llaisysDeviceType_t", "DeviceType", diff --git a/python/llaisys/libllaisys/llaisys_types.py b/python/llaisys/libllaisys/llaisys_types.py index c5a0b467..3da82aa6 100644 --- a/python/llaisys/libllaisys/llaisys_types.py +++ b/python/llaisys/libllaisys/llaisys_types.py @@ -49,6 +49,16 @@ class MemcpyKind(IntEnum): llaisysMemcpyKind_t = ctypes.c_int +''' +struct LlaisysQwen2Meta_ { + llaisysDataType_t dtype; + size_t nlayer, hs, nh, nkvh, dh, di, maxseq, voc; + float epsilon, theta; + int64_t end_token; +} +''' + + # Stream type (opaque pointer) llaisysStream_t = ctypes.c_void_p diff --git a/python/llaisys/libllaisys/models/__init__.py b/python/llaisys/libllaisys/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/python/llaisys/libllaisys/models/qwen2.py b/python/llaisys/libllaisys/models/qwen2.py new file mode 100644 index 00000000..98beb969 --- /dev/null +++ b/python/llaisys/libllaisys/models/qwen2.py @@ -0,0 +1,59 @@ +from ctypes import POINTER, c_void_p, c_size_t, c_int, c_char_p, Structure, c_float, c_int64 +from ..llaisys_types import * +from ..tensor import llaisysTensor_t + +class Qwen2Meta(Structure): + _fields_ = [ + ("dtype", llaisysDataType_t), + ("nlayer", c_size_t), + ("hs", c_size_t), + ("nh", c_size_t), + ("nkvh", c_size_t), + ("dh", c_size_t), + ("di", c_size_t), + ("maxseq", c_size_t), + ("voc", c_size_t), + ("epsilon", c_float), + ("theta", c_float), + ("end_token", c_int64), + ] + +class LlaisysQwen2Weights(Structure): + _fields_ = [ + ("in_embed", llaisysTensor_t), + ("out_embed", llaisysTensor_t), + ("out_norm_w", llaisysTensor_t), + ("attn_norm_w", POINTER(llaisysTensor_t)), + ("attn_q_w", POINTER(llaisysTensor_t)), + ("attn_q_b", POINTER(llaisysTensor_t)), + ("attn_k_w", POINTER(llaisysTensor_t)), + ("attn_k_b", 
POINTER(llaisysTensor_t)), + ("attn_v_w", POINTER(llaisysTensor_t)), + ("attn_v_b", POINTER(llaisysTensor_t)), + ("attn_o_w", POINTER(llaisysTensor_t)), + ("mlp_norm_w", POINTER(llaisysTensor_t)), + ("mlp_gate_w", POINTER(llaisysTensor_t)), + ("mlp_up_w", POINTER(llaisysTensor_t)), + ("mlp_down_w", POINTER(llaisysTensor_t)), + ] + +LlaisysQwen2Meta_t = POINTER(Qwen2Meta) +LlaisysQwen2Model_t = c_void_p +LlaisysQwen2Weights_t = POINTER(LlaisysQwen2Weights) + + +def load_qwen2(lib): + lib.llaisysQwen2ModelCreate.argtypes = [LlaisysQwen2Meta_t, llaisysDeviceType_t, POINTER(c_int), c_int] + lib.llaisysQwen2ModelCreate.restype = LlaisysQwen2Model_t + + lib.llaisysQwen2ModelDestroy.argtypes = [LlaisysQwen2Model_t] + lib.llaisysQwen2ModelDestroy.restype = None + + lib.llaisysQwen2modelLoadWeight.argtypes = [LlaisysQwen2Model_t, c_void_p, c_char_p] + lib.llaisysQwen2modelLoadWeight.restype = None + + lib.llaisysQwen2ModelInfer.argtypes = [LlaisysQwen2Model_t, POINTER(c_int64), c_size_t] + lib.llaisysQwen2ModelInfer.restype = c_int64 + + lib.llaisysQwen2ModelWeights.argtypes = [LlaisysQwen2Model_t] + lib.llaisysQwen2ModelWeights.restype = LlaisysQwen2Weights_t diff --git a/python/llaisys/models/__init__.py b/python/llaisys/models/__init__.py index af9918b0..c8129885 100644 --- a/python/llaisys/models/__init__.py +++ b/python/llaisys/models/__init__.py @@ -1 +1 @@ -from .qwen2 import Qwen2 +from .qwen2 import Qwen2 \ No newline at end of file diff --git a/python/llaisys/models/qwen2.py b/python/llaisys/models/qwen2.py index 0d07b0b2..28eaa6e0 100644 --- a/python/llaisys/models/qwen2.py +++ b/python/llaisys/models/qwen2.py @@ -1,23 +1,28 @@ +import ctypes +import numpy as np +import gc +from enum import IntEnum from typing import Sequence from ..libllaisys import LIB_LLAISYS -from ..libllaisys import DeviceType +from ..libllaisys import * +from ..tensor import Tensor +import torch + from pathlib import Path import safetensors +import json class Qwen2: - def __init__(self, 
model_path, device: DeviceType = DeviceType.CPU): - # TODO: Implement model constructor - - model_path = Path(model_path) - - for file in sorted(model_path.glob("*.safetensors")): - data_ = safetensors.safe_open(file, framework="numpy", device="cpu") - for name_ in data_.keys(): - ## TODO: load the model weights - pass + self.model_path = Path(model_path) + self.device = device + self._load_config() + self._load_weights() + + def __del__(self): + LIB_LLAISYS.llaisysQwen2ModelDestroy(self.model) def generate( self, @@ -28,6 +33,45 @@ def generate( temperature: float = 0.8, ): - # TODO: Implement generate function + ptr = np.array(inputs, dtype=np.int64).ctypes.data_as(ctypes.POINTER(ctypes.c_int64)) + l = len(inputs) + ret = list(inputs) + id = 0 + while id != self.config["eos_token_id"]: + id = int(LIB_LLAISYS.llaisysQwen2ModelInfer(self.model, ptr, ctypes.c_size_t(l))) + ret.append(id) + ptr = ctypes.byref(ctypes.c_int64(id)) + l = 1 + return ret + + def _load_config(self): + config_file = self.model_path / "config.json" + with open(config_file, "r") as f: + self.config = json.load(f) + meta = Qwen2Meta() + meta.dtype = ctypes.c_int(DataType.BF16) + meta.nlayer = ctypes.c_size_t(self.config["num_hidden_layers"]) + meta.hs = ctypes.c_size_t(self.config["hidden_size"]) + meta.nh = ctypes.c_size_t(self.config["num_attention_heads"]) + meta.nkvh = ctypes.c_size_t(self.config["num_key_value_heads"]) + meta.dh = ctypes.c_size_t(self.config["hidden_size"] // self.config["num_attention_heads"]) + meta.di = ctypes.c_size_t(self.config["intermediate_size"]) + meta.maxseq = ctypes.c_size_t(self.config["max_position_embeddings"]) + meta.voc = ctypes.c_size_t(self.config["vocab_size"]) + meta.epsilon = ctypes.c_float(self.config["rms_norm_eps"]) + meta.theta = ctypes.c_float(self.config["rope_theta"]) + meta.end_token = ctypes.c_int64(self.config["eos_token_id"]) + - return [] + id = ctypes.c_int(0) + self.model = LIB_LLAISYS.llaisysQwen2ModelCreate(ctypes.byref(meta), 
self.device, ctypes.byref(id), 1) + + def _load_weights(self): + for file in sorted(self.model_path.glob("*.safetensors")): + data_ = safetensors.safe_open(file, framework="torch", device="cpu") + for name_ in data_.keys(): + tensor = data_.get_tensor(name_) + name_c = ctypes.c_char_p(name_.encode('utf-8')) + LIB_LLAISYS.llaisysQwen2modelLoadWeight(self.model, ctypes.c_void_p(tensor.data_ptr()), name_c) + del tensor + gc.collect() \ No newline at end of file diff --git a/python/llaisys/tensor.py b/python/llaisys/tensor.py index 1466d851..7963dddb 100644 --- a/python/llaisys/tensor.py +++ b/python/llaisys/tensor.py @@ -9,6 +9,7 @@ DataType, ) from ctypes import c_size_t, c_int, c_ssize_t, c_void_p +import torch class Tensor: @@ -95,3 +96,26 @@ def slice(self, dim: int, start: int, end: int): self._tensor, c_size_t(dim), c_size_t(start), c_size_t(end) ) ) + + @staticmethod + def from_torch(torch_tensor: torch.Tensor): + assert torch_tensor.is_contiguous(), "Only contiguous tensors are supported" + assert torch_tensor.device.type in ["cpu", "cuda"], "Only CPU and CUDA devices are supported" + + device_type = DeviceType.CPU if torch_tensor.device.type == "cpu" else DeviceType.NVIDIA + dtype = DataType.F32 + if torch_tensor.dtype == torch.float16: + dtype = DataType.F16 + elif torch_tensor.dtype == torch.bfloat16: + dtype = DataType.BF16 + else: + raise ValueError(f"Unsupported data type: {torch_tensor.dtype}") + _tensor = Tensor( + shape=torch_tensor.shape, + dtype=dtype, + device=device_type, + device_id=torch_tensor.device.index if torch_tensor.device.type == "cuda" else 0, + ) + + _tensor.load(torch_tensor.data_ptr()) + return _tensor diff --git a/src/core/runtime/runtime.hpp b/src/core/runtime/runtime.hpp index 43235824..86f40a4e 100644 --- a/src/core/runtime/runtime.hpp +++ b/src/core/runtime/runtime.hpp @@ -37,7 +37,6 @@ class Runtime { const LlaisysRuntimeAPI *api() const; storage_t allocateDeviceStorage(size_t size); - ; storage_t allocateHostStorage(size_t 
size); void freeStorage(Storage *storage); diff --git a/src/llaisys/models/qwen2.cc b/src/llaisys/models/qwen2.cc new file mode 100644 index 00000000..61619f4e --- /dev/null +++ b/src/llaisys/models/qwen2.cc @@ -0,0 +1,436 @@ +#include "llaisys/models/qwen2.h" + +#include "../../tensor/tensor.hpp" +#include "../../ops/argmax/op.hpp" +#include "../../ops/embedding/op.hpp" +#include "../../ops/linear/op.hpp" +#include "../../ops/rms_norm/op.hpp" +#include "../../ops/add/op.hpp" +#include "../../ops/rope/op.hpp" +#include "../../ops/self_attention/op.hpp" +#include "../../ops/swiglu/op.hpp" +#include "../../utils.hpp" +#include "../../core/llaisys_core.hpp" +#include +#include +#include +#include + + +using namespace llaisys; + +struct Qwen2Weights { + tensor_t in_embed; + tensor_t out_embed; + tensor_t out_norm_w; // a.k.a. model.norm.weight + std::vector attn_norm_w; // a.k.a. input_layernorm.weight + std::vector attn_q_w; + std::vector attn_q_b; + std::vector attn_k_w; + std::vector attn_k_b; + std::vector attn_v_w; + std::vector attn_v_b; + std::vector attn_o_w; + std::vector mlp_norm_w; // a.k.a. post_attention_layernorm.weight + std::vector mlp_gate_w; + std::vector mlp_up_w; + std::vector mlp_down_w; +}; + +class debug { +public: + debug& get() { + static debug instance; + return instance; + } + + static void print_shape(tensor_t tensor, const std::string& tensor_name) { + auto shape = tensor->shape(); + std::cout << tensor_name << " shape: ["; + for (size_t i = 0; i < shape.size(); i++) { + std::cout << shape[i]; + if (i != shape.size() - 1) { + std::cout << ", "; + } + } + std::cout << "]" << std::endl; + } + + template + static void print(T... 
args) { + ((std::cout << args << '\t'), ...); + std::cout << std::endl; + } + + debug(const debug&) = delete; + debug& operator=(const debug&) = delete; +private: + debug() {} +}; + +class Kv_cache { +public: + Kv_cache(size_t nlayers, size_t dh, size_t nkvh, llaisysDataType_t dtype, llaisysDeviceType_t device_type) + : nlayer_(nlayers), dh_(dh), nkvh_(nkvh), dtype_(dtype), device_type_(device_type) { + k_cache_.resize(nlayer_); + v_cache_.resize(nlayer_); + total_len_.assign(nlayer_, 0); + buf_size_.assign(nlayer_, 5); + for(size_t i = 0; i < nlayer_; ++i) { + k_cache_[i] = Tensor::create({buf_size_[i], nkvh_, dh_}, dtype_, device_type_); + v_cache_[i] = Tensor::create({buf_size_[i], nkvh_, dh_}, dtype_, device_type_); + } + } + + void add(size_t layer_id, const tensor_t& k, const tensor_t& v, size_t seq_len) {//only support cpu for now + if (layer_id >= nlayer_) { + throw std::runtime_error("Layer id exceeds the number of layers in the model."); + } + auto& k_cache = k_cache_[layer_id]; + auto& v_cache = v_cache_[layer_id]; + if (total_len_[layer_id] + seq_len > buf_size_[layer_id]) { + // If the total length exceeds the buffer size, we need to reallocate larger buffers and copy the existing data + size_t new_buf_size = std::max(buf_size_[layer_id] * 2, total_len_[layer_id] + seq_len); + auto new_k = Tensor::create({new_buf_size, nkvh_, dh_}, dtype_, device_type_); + auto new_v = Tensor::create({new_buf_size, nkvh_, dh_}, dtype_, device_type_); + llaisys::core::context().runtime().api()->memcpy_sync( + new_k->data(), + k_cache->data(), + total_len_[layer_id] * nkvh_ * dh_ * k_cache->elementSize(), + LLAISYS_MEMCPY_H2H + ); + llaisys::core::context().runtime().api()->memcpy_sync( + new_v->data(), + v_cache->data(), + total_len_[layer_id] * nkvh_ * dh_ * v_cache->elementSize(), + LLAISYS_MEMCPY_H2H + ); + k_cache_[layer_id] = new_k; + v_cache_[layer_id] = new_v; + buf_size_[layer_id] = new_buf_size; + } + // Copy the new k and v to the cache at the correct 
position. + llaisys::core::context().runtime().api()->memcpy_sync( + k_cache_[layer_id]->data() + total_len_[layer_id] * nkvh_ * dh_ * k->elementSize(), + k->data(), + k->numel() * k->elementSize(), + LLAISYS_MEMCPY_H2H + ); + llaisys::core::context().runtime().api()->memcpy_sync( + v_cache_[layer_id]->data() + total_len_[layer_id] * nkvh_ * dh_ * v->elementSize(), + v->data(), + v->numel() * v->elementSize(), + LLAISYS_MEMCPY_H2H + ); + + total_len_[layer_id] += seq_len; + } + + tensor_t k(size_t layer_id) { + if (layer_id >= nlayer_) { + throw std::runtime_error("Layer id exceeds the number of layers in the model."); + } + return k_cache_[layer_id]->slice(0, 0, total_len_[layer_id]); + } + + tensor_t v(size_t layer_id) { + if (layer_id >= nlayer_) { + throw std::runtime_error("Layer id exceeds the number of layers in the model."); + } + return v_cache_[layer_id]->slice(0, 0, total_len_[layer_id]); + } + +private: + std::vector k_cache_; + std::vector v_cache_; + size_t nlayer_; + std::vector total_len_; + std::vector buf_size_; + size_t dh_; + size_t nkvh_; + llaisysDataType_t dtype_; + llaisysDeviceType_t device_type_; +}; + +class Qwen2ModelImpl { +public: + Qwen2ModelImpl(const LlaisysQwen2Meta &meta, llaisysDeviceType_t device, const std::vector &device_ids) + : meta_(meta), device_(device), device_ids_(device_ids), kv_cache_(meta.nlayer, meta.dh, meta.nkvh, meta.dtype, device) + { + auto hs = meta_.hs; + auto nh = meta_.nh; + auto nkvh = meta_.nkvh; + auto dh = meta_.dh; + auto di = meta_.di; + + weights_.in_embed = Tensor::create({meta_.voc, hs}, meta_.dtype, device_, device_ids_[0]); + weights_.out_embed = Tensor::create({meta_.voc, hs}, meta_.dtype, device_, device_ids_[0]); + weights_.out_norm_w = Tensor::create({hs}, meta_.dtype, device_, device_ids_[0]); + for (size_t i = 0; i < meta_.nlayer; ++i) { + weights_.attn_norm_w.push_back(Tensor::create({hs}, meta_.dtype, device_, device_ids_[0])); + weights_.attn_q_w.push_back(Tensor::create({nh * dh, hs}, 
meta_.dtype, device_, device_ids_[0])); + weights_.attn_q_b.push_back(Tensor::create({hs}, meta_.dtype, device_, device_ids_[0])); + weights_.attn_k_w.push_back(Tensor::create({nkvh * dh, hs}, meta_.dtype, device_, device_ids_[0])); + weights_.attn_k_b.push_back(Tensor::create({nkvh * dh}, meta_.dtype, device_, device_ids_[0])); + weights_.attn_v_w.push_back(Tensor::create({nkvh * dh, hs}, meta_.dtype, device_, device_ids_[0])); + weights_.attn_v_b.push_back(Tensor::create({nkvh * dh}, meta_.dtype, device_, device_ids_[0])); + weights_.attn_o_w.push_back(Tensor::create({hs, hs}, meta_.dtype, device_, device_ids_[0])); + weights_.mlp_norm_w.push_back(Tensor::create({hs}, meta_.dtype, device_, device_ids_[0])); + weights_.mlp_gate_w.push_back(Tensor::create({di, hs}, meta_.dtype, device_, device_ids_[0])); + weights_.mlp_up_w.push_back(Tensor::create({di, hs}, meta_.dtype, device_, device_ids_[0])); + weights_.mlp_down_w.push_back(Tensor::create({hs, di}, meta_.dtype, device_, device_ids_[0])); + } + } + ~Qwen2ModelImpl() = default; + + void loadWeight(const void* src, std::string name) { + if (name == "lm_head.weight") { + weights_.out_embed->load(src); + } else if (name.find("embed_tokens.weight") != std::string::npos) { + weights_.in_embed->load(src); + }else if (name == "model.norm.weight") { + weights_.out_norm_w->load(src); + } else { + constexpr size_t prefix_len = 13; // "model.layers." 
+ name = name.substr(prefix_len); + auto pos = name.find('.'); + auto layer_id = std::stoi(name.substr(0, pos)); + auto param_name = name.substr(pos + 1); + if (param_name == "input_layernorm.weight") { + weights_.attn_norm_w[layer_id]->load(src); + }else if (param_name == "self_attn.q_proj.weight") { + weights_.attn_q_w[layer_id]->load(src); + }else if (param_name == "self_attn.q_proj.bias") { + weights_.attn_q_b[layer_id]->load(src); + }else if (param_name == "self_attn.k_proj.weight") { + weights_.attn_k_w[layer_id]->load(src); + }else if (param_name == "self_attn.k_proj.bias") { + weights_.attn_k_b[layer_id]->load(src); + }else if (param_name == "self_attn.v_proj.weight") { + weights_.attn_v_w[layer_id]->load(src); + }else if (param_name == "self_attn.v_proj.bias") { + weights_.attn_v_b[layer_id]->load(src); + }else if (param_name == "self_attn.o_proj.weight") { + weights_.attn_o_w[layer_id]->load(src); + }else if (param_name == "post_attention_layernorm.weight") { + weights_.mlp_norm_w[layer_id]->load(src); + }else if (param_name == "mlp.gate_proj.weight") { + weights_.mlp_gate_w[layer_id]->load(src); + }else if (param_name == "mlp.up_proj.weight") { + weights_.mlp_up_w[layer_id]->load(src); + }else if (param_name == "mlp.down_proj.weight") { + weights_.mlp_down_w[layer_id]->load(src); + }else { + throw std::runtime_error("Unknown weight name: " + name); + } + } + } + + size_t forward_with_cache(const std::vector &input_ids) { + using namespace ops; + size_t seq_len = input_ids.size(); + //Allocate tensor + auto cache = get_(seq_len); + auto& tensor_input_ids = cache.tensor_input_ids; + auto& x = cache.x; + auto& x_norm = cache.x_norm; + auto& q = cache.q; + auto& k_ = cache.k_; + auto& v_ = cache.v_; + auto& q_rope = cache.q_rope; + auto& k_rope = cache.k_rope; + auto& pos = cache.pos; + auto& attn_val = cache.attn_val; + auto& attn_out = cache.attn_out; + auto& swiglu_out = cache.swiglu_out; + auto& gate_out = cache.gate_out; + auto& up_out = cache.up_out; + 
auto& norm_out = cache.norm_out; + auto& logits = cache.logits; + auto& next_token_id = cache.next_token_id; + auto& next_token_possibility = cache.next_token_possibility; + tensor_t k; + tensor_t v; + // input embedding + tensor_input_ids->load(input_ids.data()); + embedding(x, tensor_input_ids, weights_.in_embed); + + for (size_t i = 0; i < meta_.nlayer; i++) { + rms_norm(x_norm, x, weights_.attn_norm_w[i], meta_.epsilon); + //compute q, k, v + linear(q, x_norm, weights_.attn_q_w[i], weights_.attn_q_b[i]); + linear(k_, x_norm, weights_.attn_k_w[i], weights_.attn_k_b[i]); + linear(v_, x_norm, weights_.attn_v_w[i], weights_.attn_v_b[i]); + //rope + rope(q_rope, q, pos, meta_.theta); + rope(k_rope, k_, pos, meta_.theta); + //cache and load k, v + kv_cache_.add(i, k_rope, v_, seq_len); + k = kv_cache_.k(i); + v = kv_cache_.v(i); + //attention + self_attention( + attn_val, + q_rope, k, v, + 1.0f / std::sqrt(utils::cast(meta_.dh)) + ); + linear(attn_out, attn_val, weights_.attn_o_w[i]); + add(x, x, attn_out); + rms_norm(x_norm, x, weights_.mlp_norm_w[i], meta_.epsilon); + //FFN + linear(gate_out, x_norm, weights_.mlp_gate_w[i]); + linear(up_out, x_norm, weights_.mlp_up_w[i]); + swiglu(swiglu_out, gate_out, up_out); + linear(x_norm, swiglu_out, weights_.mlp_down_w[i]); + add(x, x, x_norm); + } + rms_norm(norm_out, x->slice(0, seq_len - 1, seq_len), weights_.out_norm_w, meta_.epsilon); + linear(logits, norm_out, weights_.out_embed); + argmax(next_token_id, next_token_possibility, logits); + auto ret = *reinterpret_cast(next_token_id->data()); + + //debug + debug::print("Next token id: ", ret, " possibility: ", utils::cast(*reinterpret_cast(next_token_possibility->data()))); + + return ret; + } + + + const Qwen2Weights& weights() const { + return weights_; + } +private: + struct Infer_tensors_buf { + tensor_t tensor_input_ids; + tensor_t x; + tensor_t x_norm; + tensor_t q; + tensor_t k_; + tensor_t v_; + tensor_t q_rope; + tensor_t k_rope; + tensor_t pos; + tensor_t 
attn_val; + tensor_t attn_out; + tensor_t swiglu_out; + tensor_t gate_out; + tensor_t up_out; + tensor_t norm_out; + tensor_t logits; + tensor_t next_token_id; + tensor_t next_token_possibility; + size_t seq_len = 0; + size_t total_len = 0; + }; + Infer_tensors_buf get_(size_t seq_len) { + infer_buf_.total_len += seq_len; + if (!infer_buf_.norm_out) { + infer_buf_.norm_out = Tensor::create({1, meta_.hs}, meta_.dtype, device_, device_ids_[0]); + infer_buf_.logits = Tensor::create({meta_.voc}, meta_.dtype, device_, device_ids_[0]); + infer_buf_.next_token_id = Tensor::create({1}, LLAISYS_DTYPE_I64, device_, device_ids_[0]); + infer_buf_.next_token_possibility = Tensor::create({1}, meta_.dtype, device_, device_ids_[0]); + } + + if (!infer_buf_.pos || infer_buf_.total_len > infer_buf_.pos->shape()[0]) { + infer_buf_.pos = Tensor::create({infer_buf_.total_len * 2}, LLAISYS_DTYPE_I64, device_, device_ids_[0]); + auto p = reinterpret_cast(infer_buf_.pos->data()); + for (size_t i = 0; i < infer_buf_.pos->numel(); i++) { + *p++ = i; + } + } + Infer_tensors_buf cache; + if (infer_buf_.seq_len < seq_len) { + infer_buf_.tensor_input_ids = Tensor::create({seq_len}, LLAISYS_DTYPE_I64, device_, device_ids_[0]); + infer_buf_.x = Tensor::create({seq_len, meta_.hs}, meta_.dtype, device_, device_ids_[0]); + infer_buf_.x_norm = Tensor::create({seq_len, meta_.hs}, meta_.dtype, device_, device_ids_[0]); + infer_buf_.q = Tensor::create({seq_len, meta_.nh, meta_.dh}, meta_.dtype, device_, device_ids_[0]); + infer_buf_.k_ = Tensor::create({seq_len, meta_.nkvh, meta_.dh}, meta_.dtype, device_, device_ids_[0]); + infer_buf_.v_ = Tensor::create({seq_len, meta_.nkvh, meta_.dh}, meta_.dtype, device_, device_ids_[0]); + infer_buf_.q_rope = Tensor::create({seq_len, meta_.nh, meta_.dh}, meta_.dtype, device_, device_ids_[0]); + infer_buf_.k_rope = Tensor::create({seq_len, meta_.nkvh, meta_.dh}, meta_.dtype, device_, device_ids_[0]); + infer_buf_.attn_val = Tensor::create({seq_len, meta_.hs}, 
meta_.dtype, device_, device_ids_[0]); + infer_buf_.attn_out = Tensor::create({seq_len, meta_.hs}, meta_.dtype, device_, device_ids_[0]); + infer_buf_.swiglu_out = Tensor::create({seq_len, meta_.di}, meta_.dtype, device_, device_ids_[0]); + infer_buf_.gate_out = Tensor::create({seq_len, meta_.di}, meta_.dtype, device_, device_ids_[0]); + infer_buf_.up_out = Tensor::create({seq_len, meta_.di}, meta_.dtype, device_, device_ids_[0]); + infer_buf_.seq_len = seq_len; + cache = infer_buf_; + }else { + cache.tensor_input_ids = infer_buf_.tensor_input_ids->slice(0, 0, seq_len); + cache.x = infer_buf_.x->slice(0, 0, seq_len); + cache.x_norm = infer_buf_.x_norm->slice(0, 0, seq_len); + cache.q = infer_buf_.q->slice(0, 0, seq_len); + cache.k_ = infer_buf_.k_->slice(0, 0, seq_len); + cache.v_ = infer_buf_.v_->slice(0, 0, seq_len); + cache.q_rope = infer_buf_.q_rope->slice(0, 0, seq_len); + cache.k_rope = infer_buf_.k_rope->slice(0, 0, seq_len); + cache.attn_val = infer_buf_.attn_val->slice(0, 0, seq_len); + cache.attn_out = infer_buf_.attn_out->slice(0, 0, seq_len); + cache.swiglu_out = infer_buf_.swiglu_out->slice(0, 0, seq_len); + cache.gate_out = infer_buf_.gate_out->slice(0, 0, seq_len); + cache.up_out = infer_buf_.up_out->slice(0, 0, seq_len); + } + cache.pos = infer_buf_.pos->slice(0, infer_buf_.total_len - seq_len, infer_buf_.total_len); + cache.norm_out = infer_buf_.norm_out; + cache.logits = infer_buf_.logits; + cache.next_token_id = infer_buf_.next_token_id; + cache.next_token_possibility = infer_buf_.next_token_possibility; + return cache; + } +private: + LlaisysQwen2Meta meta_; + Qwen2Weights weights_; + llaisysDeviceType_t device_; + std::vector device_ids_; + Kv_cache kv_cache_; + Infer_tensors_buf infer_buf_; +}; + +__C { + LlaisysQwen2Model *llaisysQwen2ModelCreate(const LlaisysQwen2Meta *meta, llaisysDeviceType_t device, int *device_ids, int ndevice) { + auto impl = new Qwen2ModelImpl(*meta, device, std::vector(device_ids, device_ids + ndevice)); + auto model 
= new LlaisysQwen2Model(); + model->impl = impl; + return model; + } + + void llaisysQwen2ModelDestroy(LlaisysQwen2Model * model) { + delete model->meta; + delete model->weights; + delete static_cast(model->impl); + delete model; + } + + void llaisysQwen2modelLoadWeight(LlaisysQwen2Model * model, const void *weight_data, const char *weight_name) { + auto impl = static_cast(model->impl); + impl->loadWeight(weight_data, weight_name); + } + + LlaisysQwen2Weights *llaisysQwen2ModelWeights(LlaisysQwen2Model * model) { + delete model->weights; // Free previously allocated weights if any + const auto &weights = reinterpret_cast(model->impl)->weights(); + model->weights = new LlaisysQwen2Weights{ + reinterpret_cast(weights.in_embed.get()), + reinterpret_cast(weights.out_embed.get()), + reinterpret_cast(weights.out_norm_w.get()), + reinterpret_cast(weights.attn_norm_w.data()), + reinterpret_cast(weights.attn_q_w.data()), + reinterpret_cast(weights.attn_q_b.data()), + reinterpret_cast(weights.attn_k_w.data()), + reinterpret_cast(weights.attn_k_b.data()), + reinterpret_cast(weights.attn_v_w.data()), + reinterpret_cast(weights.attn_v_b.data()), + reinterpret_cast(weights.attn_o_w.data()), + reinterpret_cast(weights.mlp_norm_w.data()), + reinterpret_cast(weights.mlp_gate_w.data()), + reinterpret_cast(weights.mlp_up_w.data()), + reinterpret_cast(weights.mlp_down_w.data()) + }; + return model->weights; + } + + int64_t llaisysQwen2ModelInfer(LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken) { + auto impl = static_cast(model->impl); + return impl->forward_with_cache(std::vector(token_ids,token_ids + ntoken)); + } +} \ No newline at end 
of file +} \ No newline at end of file diff --git a/src/ops/argmax/cpu/argmax.cpp b/src/ops/argmax/cpu/argmax.cpp new file mode 100644 index 00000000..fbe884ca --- /dev/null +++ b/src/ops/argmax/cpu/argmax.cpp @@ -0,0 +1,38 @@ +#include "argmax.hpp" + +#include "../../../utils.hpp" + +#include + +template +static void argmax_(size_t *max_idx, T *max_val, const T *input, size_t numel) { + if (numel == 0) { + return; + } + *max_idx = static_cast(0); + *max_val = input[0]; + for(size_t i = 1; i < numel; ++i) { + if (input[i] > *max_val) { + *max_val = input[i]; + *max_idx = i; + } + } +} + +namespace llaisys::ops::cpu { +void argmax(size_t *max_idx, std::byte *max_val, const std::byte *input, llaisysDataType_t val_type, size_t numel) { + switch (val_type) { + case LLAISYS_DTYPE_F32: + argmax_(max_idx, reinterpret_cast(max_val), reinterpret_cast(input), numel); + break; + case LLAISYS_DTYPE_BF16: + argmax_(max_idx, reinterpret_cast(max_val), reinterpret_cast(input), numel); + break; + case LLAISYS_DTYPE_F16: + argmax_(max_idx, reinterpret_cast(max_val), reinterpret_cast(input), numel); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(val_type); + } +} +} // namespace llaisys::ops::cpu diff --git a/src/ops/argmax/cpu/argmax.hpp b/src/ops/argmax/cpu/argmax.hpp new file mode 100644 index 00000000..db92c915 --- /dev/null +++ b/src/ops/argmax/cpu/argmax.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void argmax(size_t *max_idx, std::byte *max_val, const std::byte *input, llaisysDataType_t type, size_t size); +} \ No newline at end of file diff --git a/src/ops/argmax/op.cpp b/src/ops/argmax/op.cpp index 6dc37d42..7597d993 100644 --- a/src/ops/argmax/op.cpp +++ b/src/ops/argmax/op.cpp @@ -1,7 +1,29 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/argmax.hpp" + namespace llaisys::ops { void argmax(tensor_t max_idx, tensor_t max_val, tensor_t vals) { - 
TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(max_idx, max_val, vals); + CHECK_SAME_DTYPE(max_val->dtype(), vals->dtype()); + //Only support 1D tensor for now. + CHECK_ARGUMENT(vals->ndim() == 1, "Argmax: only 1D tensor is supported for now."); + + // Only support contiguous inputs for now. + ASSERT(max_idx->isContiguous() && max_val->isContiguous() && vals->isContiguous(), "Argmax: all tensors must be contiguous."); + + llaisys::core::context().setDevice(vals->deviceType(), vals->deviceId()); + switch (vals->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::argmax(reinterpret_cast(max_idx->data()), max_val->data(), vals->data(), vals->dtype(), vals->numel()); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/embedding/cpu/embedding.cpp b/src/ops/embedding/cpu/embedding.cpp new file mode 100644 index 00000000..6c2d386b --- /dev/null +++ b/src/ops/embedding/cpu/embedding.cpp @@ -0,0 +1,65 @@ +#include "embedding.hpp" + +#include "../../../utils.hpp" +#include "../../../core/llaisys_core.hpp" + +template +static void embedding_( + T* out, + const int64_t* index, + const T* weight, + std::size_t embedding_dim, + std::size_t index_size +) { + for (size_t i = 0; i < index_size; ++i) { + llaisys::core::context().runtime().api()->memcpy_sync( + out + i * embedding_dim, + weight + index[i] * embedding_dim, + embedding_dim * sizeof(T), + LLAISYS_MEMCPY_H2H + ); + } +} + +namespace llaisys::ops::cpu { +void embedding( + void* out, + const void* index, + const void* weight, + size_t index_size, + size_t embedding_dim, + llaisysDataType_t data_type +) { + switch (data_type) { + case LLAISYS_DTYPE_F32: + embedding_( + reinterpret_cast(out), + reinterpret_cast(index), + reinterpret_cast(weight), + embedding_dim, + index_size + ); + break; + case LLAISYS_DTYPE_F16: + embedding_( + reinterpret_cast(out), + static_cast(index), + reinterpret_cast(weight), + 
embedding_dim, + index_size + ); + break; + case LLAISYS_DTYPE_BF16: + embedding_( + reinterpret_cast(out), + static_cast(index), + reinterpret_cast(weight), + embedding_dim, + index_size + ); + break; + default: + throw std::runtime_error("Unsupported data type in embedding operation."); + } +} +} // namespace llaisys::ops::cpu \ No newline at end of file diff --git a/src/ops/embedding/cpu/embedding.hpp b/src/ops/embedding/cpu/embedding.hpp new file mode 100644 index 00000000..ed5ca94a --- /dev/null +++ b/src/ops/embedding/cpu/embedding.hpp @@ -0,0 +1,15 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void embedding( + void* out, + const void* index, + const void* weight, + size_t index_size, + size_t embedding_dim, + llaisysDataType_t data_type +); +} \ No newline at end of file diff --git a/src/ops/embedding/op.cpp b/src/ops/embedding/op.cpp index 84b9a5d0..baafb72f 100644 --- a/src/ops/embedding/op.cpp +++ b/src/ops/embedding/op.cpp @@ -1,7 +1,38 @@ #include "op.hpp" +#include "../../utils.hpp" + +#include "cpu/embedding.hpp" + namespace llaisys::ops { void embedding(tensor_t out, tensor_t index, tensor_t weight) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, index, weight); + CHECK_SAME_DTYPE(out->dtype(), weight->dtype()); + CHECK_ARGUMENT( + index->dtype() == LLAISYS_DTYPE_I64, + "Index tensor must be of type INT64." 
+ ); + + if(out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::embedding( + out->data(), + index->data(), + weight->data(), + index->numel(), + weight->shape().back(), + out->dtype() + ); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + switch (out->deviceType()) { +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + TO_BE_IMPLEMENTED(); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/linear/cpu/linear.cpp b/src/ops/linear/cpu/linear.cpp new file mode 100644 index 00000000..410d9c4c --- /dev/null +++ b/src/ops/linear/cpu/linear.cpp @@ -0,0 +1,79 @@ +#include "linear.hpp" + +#include "../../../utils.hpp" + +template +static void linear_( + T* out, + const T* in, + const T* weight, + const T* bias, + size_t batch_size, + size_t in_features, + size_t out_features +) { + using namespace llaisys::utils; + for (size_t b = 0; b < batch_size; b++) { + const T* in_batch = in + b * in_features; + T* out_batch = out + b * out_features; + + for (size_t o = 0; o < out_features; o++) { + float sum = bias ? 
cast(bias[o]) : 0.0f; + const T* weight_ = weight + o * in_features; + + for (size_t i = 0; i < in_features; i++) { + sum += cast(in_batch[i]) * cast(weight_[i]); + } + out_batch[o] = cast(sum); + } + } +} + +namespace llaisys::ops::cpu { +void linear( + void* out, + const void* in, + const void* weight, + const void* bias, + size_t batch_size, + size_t in_features, + size_t out_features, + llaisysDataType_t data_type +) { + switch (data_type) { + case LLAISYS_DTYPE_F32: + return linear_( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(weight), + reinterpret_cast(bias), + batch_size, + in_features, + out_features + ); + case LLAISYS_DTYPE_F16: + return linear_( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(weight), + reinterpret_cast(bias), + batch_size, + in_features, + out_features + ); + case LLAISYS_DTYPE_BF16: + return linear_( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(weight), + reinterpret_cast(bias), + batch_size, + in_features, + out_features + ); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(data_type); + } + +} +} \ No newline at end of file diff --git a/src/ops/linear/cpu/linear.hpp b/src/ops/linear/cpu/linear.hpp new file mode 100644 index 00000000..795cc827 --- /dev/null +++ b/src/ops/linear/cpu/linear.hpp @@ -0,0 +1,17 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void linear( + void* out, + const void* in, + const void* weight, + const void* bias, + size_t batch_size, + size_t in_features, + size_t out_features, + llaisysDataType_t data_type +); +} \ No newline at end of file diff --git a/src/ops/linear/op.cpp b/src/ops/linear/op.cpp index 97d1f865..3b4cf84a 100644 --- a/src/ops/linear/op.cpp +++ b/src/ops/linear/op.cpp @@ -1,7 +1,79 @@ #include "op.hpp" +#include "../../utils.hpp" + +#include "cpu/linear.hpp" + namespace llaisys::ops { void linear(tensor_t out, tensor_t in, tensor_t weight, tensor_t bias) { - TO_BE_IMPLEMENTED(); + 
CHECK_SAME_DEVICE(out, in, weight, bias); + CHECK_ARGUMENT( + in->shape().back() == weight->shape().back(), + "Input dimension does not match weight dimension." + ); + + size_t batch_size = 1; + for (size_t i = 0; i < in->ndim() - 1; ++i) { + batch_size *= in->shape()[i]; + } + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::linear( + out->data(), + in->data(), + weight->data(), + bias->data(), + batch_size, + in->shape().back(), + weight->shape()[weight->ndim() - 2], + in->dtype() + ); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + switch (out->deviceType()) { +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + TO_BE_IMPLEMENTED(); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } +} + +void linear(tensor_t out, tensor_t in, tensor_t weight) { + CHECK_SAME_DEVICE(out, in, weight); + CHECK_ARGUMENT( + in->shape().back() == weight->shape().back(), + "Input dimension does not match weight dimension." + ); + + size_t batch_size = 1; + for (size_t i = 0; i < in->ndim() - 1; ++i) { + batch_size *= in->shape()[i]; + } + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::linear( + out->data(), + in->data(), + weight->data(), + nullptr, + batch_size, + in->shape().back(), + weight->shape()[weight->ndim() - 2], + in->dtype() + ); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + switch (out->deviceType()) { +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + TO_BE_IMPLEMENTED(); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/linear/op.hpp b/src/ops/linear/op.hpp index 7bf06f01..b8b41d1b 100644 --- a/src/ops/linear/op.hpp +++ b/src/ops/linear/op.hpp @@ -4,4 +4,5 @@ namespace llaisys::ops { void linear(tensor_t out, tensor_t in, tensor_t weight, tensor_t bias); +void linear(tensor_t out, tensor_t in, tensor_t weight); } diff --git a/src/ops/rms_norm/cpu/rms_norm.cpp 
b/src/ops/rms_norm/cpu/rms_norm.cpp new file mode 100644 index 00000000..9fd4f987 --- /dev/null +++ b/src/ops/rms_norm/cpu/rms_norm.cpp @@ -0,0 +1,93 @@ +#include "rms_norm.hpp" + +#include "../../../utils.hpp" + +#include +#include + +template +void rms_norm_( + T *output, + const T *input, + const T *weight, + const T *bias, + size_t batch_size, + size_t feature_size, + float epsilon +) { + for (size_t b = 0; b < batch_size; b++) { + const T *input_batch = input + b * feature_size; + T *output_batch = output + b * feature_size; + + // Compute mean square + float mean_square = 0.0f; + for (size_t i = 0; i < feature_size; i++) { + float val = llaisys::utils::cast(input_batch[i]); + mean_square += val * val; + } + mean_square /= static_cast(feature_size); + + // Compute RMS + float rms = std::sqrt(mean_square + epsilon); + + // Normalize and apply weight and bias + for (size_t i = 0; i < feature_size; i++) { + float normalized = llaisys::utils::cast(input_batch[i]) / rms; + if (weight) { + normalized *= llaisys::utils::cast(weight[i]); + } + if (bias) { + normalized += llaisys::utils::cast(bias[i]); + } + output_batch[i] = llaisys::utils::cast(normalized); + } + } +} + +namespace llaisys::ops::cpu { +void rms_norm( + std::byte *output, + const std::byte *input, + const std::byte *weight, + const std::byte *bias, + llaisysDataType_t type, + size_t batch_size, + size_t feature_size, + float epsilon +) { + switch (type) { + case LLAISYS_DTYPE_F32: + return rms_norm_( + reinterpret_cast(output), + reinterpret_cast(input), + reinterpret_cast(weight), + reinterpret_cast(bias), + batch_size, + feature_size, + epsilon + ); + case LLAISYS_DTYPE_F16: + return rms_norm_( + reinterpret_cast(output), + reinterpret_cast(input), + reinterpret_cast(weight), + reinterpret_cast(bias), + batch_size, + feature_size, + epsilon + ); + case LLAISYS_DTYPE_BF16: + return rms_norm_( + reinterpret_cast(output), + reinterpret_cast(input), + reinterpret_cast(weight), + reinterpret_cast(bias), + 
batch_size, + feature_size, + epsilon + ); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} diff --git a/src/ops/rms_norm/cpu/rms_norm.hpp b/src/ops/rms_norm/cpu/rms_norm.hpp new file mode 100644 index 00000000..11e796f7 --- /dev/null +++ b/src/ops/rms_norm/cpu/rms_norm.hpp @@ -0,0 +1,17 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void rms_norm( + std::byte *output, + const std::byte *input, + const std::byte *weight, + const std::byte *bias, + llaisysDataType_t type, + size_t batch_size, + size_t feature_size, + float epsilon +); +} // namespace llaisys::ops::cpu \ No newline at end of file diff --git a/src/ops/rms_norm/op.cpp b/src/ops/rms_norm/op.cpp index 529553d9..f1b789b2 100644 --- a/src/ops/rms_norm/op.cpp +++ b/src/ops/rms_norm/op.cpp @@ -1,7 +1,27 @@ #include "op.hpp" +#include "../../utils.hpp" + +#include "cpu/rms_norm.hpp" + namespace llaisys::ops { void rms_norm(tensor_t out, tensor_t in, tensor_t weight, float eps) { - TO_BE_IMPLEMENTED(); + size_t feature_size = in->shape().back(); + size_t batch_size = in->numel() / feature_size; + + // Call CPU implementation + if (in->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::rms_norm( + out->data(), + in->data(), + weight->data(), + nullptr, + in->dtype(), + batch_size, + feature_size, + eps); + } + //TODO: Add more device implementations here + EXCEPTION_UNSUPPORTED_DEVICE; } } // namespace llaisys::ops diff --git a/src/ops/rope/cpu/rope.cpp b/src/ops/rope/cpu/rope.cpp new file mode 100644 index 00000000..dc331ac4 --- /dev/null +++ b/src/ops/rope/cpu/rope.cpp @@ -0,0 +1,66 @@ +#include "rope.hpp" + +#include "../../../utils.hpp" + +#include +#include + +template +static void rope_( + T* out, const T* in, const int64_t* pos_id, float theta, + size_t seqlen, size_t nhead, size_t d +) { + using namespace llaisys::utils; + size_t half = d / 2; + + std::vector inv_freq(half); + float log_theta = std::log(theta); + for (size_t j = 0; j < half; j++) { + 
inv_freq[j] = std::exp(-log_theta * (2.0f * j / d)); + } + + for (size_t i = 0; i < seqlen; i++) { + float p = static_cast(pos_id[i]); + for (size_t h = 0; h < nhead; h++) { + size_t base = (i * nhead + h) * d; + for (size_t j = 0; j < half; j++) { + float angle = p * inv_freq[j]; + float cos_val = std::cos(angle); + float sin_val = std::sin(angle); + + float a = cast(in[base + j]); + float b = cast(in[base + j + half]); + + out[base + j] = cast(a * cos_val - b * sin_val); + out[base + j + half] = cast(b * cos_val + a * sin_val); + } + } + } +} + +namespace llaisys::ops::cpu { + void rope( + void* out, + const void* in, + const void* pos_id, + float theta, + size_t seqlen, + size_t nhead, + size_t d, + llaisysDataType_t dtype + ) { + switch (dtype) { + case LLAISYS_DTYPE_F32: + return rope_(reinterpret_cast(out), reinterpret_cast(in), + reinterpret_cast(pos_id), theta, seqlen, nhead, d); + case LLAISYS_DTYPE_BF16: + return rope_(reinterpret_cast(out), reinterpret_cast(in), + reinterpret_cast(pos_id), theta, seqlen, nhead, d); + case LLAISYS_DTYPE_F16: + return rope_(reinterpret_cast(out), reinterpret_cast(in), + reinterpret_cast(pos_id), theta, seqlen, nhead, d); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(dtype); + } + } +} // namespace llaisys::ops::cpu \ No newline at end of file diff --git a/src/ops/rope/cpu/rope.hpp b/src/ops/rope/cpu/rope.hpp new file mode 100644 index 00000000..8a332258 --- /dev/null +++ b/src/ops/rope/cpu/rope.hpp @@ -0,0 +1,17 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { + void rope( + void* out, + const void* in, + const void* pos_id, + float theta, + size_t seqlen, + size_t nhead, + size_t d, + llaisysDataType_t dtype + ); +} \ No newline at end of file diff --git a/src/ops/rope/op.cpp b/src/ops/rope/op.cpp index d60dbe64..ad5cdbe3 100644 --- a/src/ops/rope/op.cpp +++ b/src/ops/rope/op.cpp @@ -1,7 +1,26 @@ #include "op.hpp" +#include "../../utils.hpp" + +#include "cpu/rope.hpp" + namespace 
llaisys::ops { void rope(tensor_t out, tensor_t in, tensor_t pos_ids, float theta) { + CHECK_SAME_DEVICE(out, in, pos_ids); + CHECK_SAME_SHAPE(out->shape(), in->shape()); + CHECK_ARGUMENT(pos_ids->ndim() == 1, "pos_ids must be 1D"); + CHECK_ARGUMENT(pos_ids->dtype() == LLAISYS_DTYPE_I64, "pos_ids must be of type int64"); + CHECK_ARGUMENT(out->dtype() == in->dtype(), "out and in must have the same dtype"); + + size_t seqlen = in->shape()[0]; + size_t nhead = in->shape()[1]; + size_t d = in->shape()[2]; + if(out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::rope( + out->data(), in->data(), pos_ids->data(), + theta, seqlen, nhead, d, in->dtype() + ); + } TO_BE_IMPLEMENTED(); } } // namespace llaisys::ops diff --git a/src/ops/self_attention/cpu/self_attention.cpp b/src/ops/self_attention/cpu/self_attention.cpp new file mode 100644 index 00000000..3d2dbdf1 --- /dev/null +++ b/src/ops/self_attention/cpu/self_attention.cpp @@ -0,0 +1,82 @@ +#include "self_attention.hpp" + +#include "../../../utils.hpp" + +#include +#include +#include + +template +void self_attention_( + T* out, const T* q, const T* k, const T* v, + size_t seqlen, size_t nhead, size_t d, + size_t total_len, size_t nkvhead, size_t dv, + float scale +) { + using namespace llaisys::utils; + for (size_t h = 0; h < nhead; h++) { + size_t q_base = h * d; + size_t k_base = h / (nhead / nkvhead) * d; + size_t v_base = h / (nhead / nkvhead) * dv; + + for (size_t i = 0; i < seqlen; i++) { + size_t len = total_len - seqlen + 1; + std::vector attn_weights(len + i); + for (size_t j = 0; j < i + len; j++) { + float sum = 0.f; + for (size_t n = 0; n < d; n++) { + sum += cast( + q[q_base + i * nhead * d + n] * k[k_base + j * nkvhead * d + n] + ); + } + sum *= scale; + attn_weights[j] = sum; + } + // softmax + float max_weight = *std::max_element(attn_weights.begin(), attn_weights.end()); + float sum_exp = 0.f; + for (float& w : attn_weights) { + w = std::exp(w - max_weight); + sum_exp += w; + } + for (float& w : 
attn_weights) { + w /= sum_exp; + } + // output + for (size_t j = 0; j < dv; j++) { + float sum = 0.f; + for (size_t n = 0; n < attn_weights.size(); n++) { + sum += attn_weights[n] * cast(v[v_base + n * nkvhead * dv + j]); + } + out[(h + i * nhead) * dv + j] = cast(sum); + } + } + } +} + +namespace llaisys::ops::cpu { +void self_attention( + void* out, const void* q, const void* k, const void* v, + size_t seqlen, size_t nhead, size_t d, + size_t total_len, size_t nkvhead, size_t dv, + float scale, + llaisysDataType_t dtype +) { + switch (dtype) { + case LLAISYS_DTYPE_F32: + return self_attention_(reinterpret_cast(out), reinterpret_cast(q), + reinterpret_cast(k), reinterpret_cast(v), + seqlen, nhead, d, total_len, nkvhead, dv, scale); + case LLAISYS_DTYPE_BF16: + return self_attention_(reinterpret_cast(out), reinterpret_cast(q), + reinterpret_cast(k), reinterpret_cast(v), + seqlen, nhead, d, total_len, nkvhead, dv, scale); + case LLAISYS_DTYPE_F16: + return self_attention_(reinterpret_cast(out), reinterpret_cast(q), + reinterpret_cast(k), reinterpret_cast(v), + seqlen, nhead, d, total_len, nkvhead, dv, scale); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(dtype); + } +} // namespace llaisys::ops::cpu +} \ No newline at end of file diff --git a/src/ops/self_attention/cpu/self_attention.hpp b/src/ops/self_attention/cpu/self_attention.hpp new file mode 100644 index 00000000..6aa374d0 --- /dev/null +++ b/src/ops/self_attention/cpu/self_attention.hpp @@ -0,0 +1,14 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void self_attention( + void* out, const void* q, const void* k, const void* v, + size_t seqlen, size_t nhead, size_t d, + size_t total_len, size_t nkvhead, size_t dv, + float scale, + llaisysDataType_t dtype +); +} \ No newline at end of file diff --git a/src/ops/self_attention/op.cpp b/src/ops/self_attention/op.cpp index 43d62014..dd6eceee 100644 --- a/src/ops/self_attention/op.cpp +++ b/src/ops/self_attention/op.cpp @@ -1,7 
+1,29 @@ #include "op.hpp" +#include "../../utils.hpp" + +#include "cpu/self_attention.hpp" + namespace llaisys::ops { void self_attention(tensor_t attn_val, tensor_t q, tensor_t k, tensor_t v, float scale) { + CHECK_SAME_DEVICE(attn_val, q, k, v); + + size_t seqlen = q->shape()[0]; + size_t nhead = q->shape()[1]; + size_t d = q->shape()[2]; + size_t total_len = k->shape()[0]; + size_t nkvhead = k->shape()[1]; + size_t dv = v->shape()[2]; + + if(attn_val->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::self_attention( + attn_val->data(), q->data(), k->data(), v->data(), + seqlen, nhead, d, + total_len, nkvhead, dv, + scale, + q->dtype() + ); + } TO_BE_IMPLEMENTED(); } } // namespace llaisys::ops diff --git a/src/ops/swiglu/cpu/swiglu.cpp b/src/ops/swiglu/cpu/swiglu.cpp new file mode 100644 index 00000000..c0552ca0 --- /dev/null +++ b/src/ops/swiglu/cpu/swiglu.cpp @@ -0,0 +1,46 @@ +#include "swiglu.hpp" + +#include "../../../utils.hpp" + +#include +#include + +template +void swiglu_( + T *output, + const T *input, + const T *gate, + size_t total_size +) { + for(size_t i = 0; i < total_size; i++) { + float gate_val = llaisys::utils::cast(gate[i]); + float silu = gate_val / (1.0f + std::exp(-gate_val)); + float input_val = llaisys::utils::cast(input[i]); + output[i] = llaisys::utils::cast(input_val * silu); + } +} + +namespace llaisys::ops::cpu { +void swiglu( + std::byte *output, + const std::byte *input, + const std::byte *gate, + llaisysDataType_t type, + size_t total_size +) { + switch (type) { + case LLAISYS_DTYPE_F32: + return swiglu_(reinterpret_cast(output), reinterpret_cast(input), + reinterpret_cast(gate), total_size); + case LLAISYS_DTYPE_F16: + return swiglu_(reinterpret_cast(output), reinterpret_cast(input), + reinterpret_cast(gate), total_size); + case LLAISYS_DTYPE_BF16: + return swiglu_(reinterpret_cast(output), reinterpret_cast(input), + reinterpret_cast(gate), total_size); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace 
llaisys::ops::cpu + diff --git a/src/ops/swiglu/cpu/swiglu.hpp b/src/ops/swiglu/cpu/swiglu.hpp new file mode 100644 index 00000000..1b0b91f5 --- /dev/null +++ b/src/ops/swiglu/cpu/swiglu.hpp @@ -0,0 +1,14 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void swiglu( + std::byte *output, + const std::byte *input, + const std::byte *gate, + llaisysDataType_t type, + size_t tatal_size +); +} \ No newline at end of file diff --git a/src/ops/swiglu/op.cpp b/src/ops/swiglu/op.cpp index 47edbcc9..ac4fa5bd 100644 --- a/src/ops/swiglu/op.cpp +++ b/src/ops/swiglu/op.cpp @@ -1,7 +1,21 @@ #include "op.hpp" +#include "cpu/swiglu.hpp" namespace llaisys::ops { void swiglu(tensor_t out, tensor_t gate, tensor_t up) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, gate, up); + CHECK_SAME_SHAPE(out->shape(), gate->shape(), up->shape()); + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::swiglu( + out->data(), + up->data(), + gate->data(), + out->dtype(), + out->numel() + ); + } else { + TO_BE_IMPLEMENTED(); + } } } // namespace llaisys::ops diff --git a/src/tensor/tensor.cpp b/src/tensor/tensor.cpp index 2f594bb6..5e9d8730 100644 --- a/src/tensor/tensor.cpp +++ b/src/tensor/tensor.cpp @@ -26,14 +26,17 @@ tensor_t Tensor::create(const std::vector &shape, size_t total_elems = stride; size_t dtype_size = utils::dsize(dtype); + tensor_t ret; if (device_type == LLAISYS_DEVICE_CPU && core::context().runtime().deviceType() != LLAISYS_DEVICE_CPU) { auto storage = core::context().runtime().allocateHostStorage(total_elems * dtype_size); - return std::shared_ptr(new Tensor(meta, storage)); + ret = std::shared_ptr(new Tensor(meta, storage, 0)); } else { core::context().setDevice(device_type, device); auto storage = core::context().runtime().allocateDeviceStorage(total_elems * dtype_size); - return std::shared_ptr(new Tensor(meta, storage)); + ret = std::shared_ptr(new Tensor(meta, storage, 0)); } + ASSERT(ret->data() != nullptr, "Failed to 
allocate memory for tensor."); + return ret; } std::byte *Tensor::data() { @@ -164,32 +167,100 @@ void Tensor::debug() const { } bool Tensor::isContiguous() const { - TO_BE_IMPLEMENTED(); - return true; + auto& shape = this->shape(); + auto& strides = this->strides(); + size_t ndim = shape.size(); + + bool ret = true; + if(ndim != 0) { + ptrdiff_t expected_stride = 1; + for(int i = static_cast(ndim - 1); i >= 0; i--) { + if(strides[i] != expected_stride) { + ret = false; + break; + } + expected_stride *= static_cast(shape[i]); + } + } + return ret; } tensor_t Tensor::permute(const std::vector &order) const { - TO_BE_IMPLEMENTED(); - return std::shared_ptr(new Tensor(_meta, _storage)); + auto meta = _meta; + for(size_t i = 0; i < order.size(); i++) { + CHECK_ARGUMENT(order[i] < order.size(), "Permute order out of range"); + meta.shape[i] = _meta.shape[order[i]]; + meta.strides[i] = _meta.strides[order[i]]; + } + return std::shared_ptr(new Tensor(meta, _storage, _offset)); } tensor_t Tensor::view(const std::vector &shape) const { - TO_BE_IMPLEMENTED(); - return std::shared_ptr(new Tensor(_meta, _storage)); + size_t new_numel = 1; + for (auto s : shape) { + new_numel *= s; + } + if (new_numel != this->numel()) { + EXCEPTION_SHAPE_MISMATCH; + } + if (!this->isContiguous()) { + std::cerr << "[ERROR] View error" << EXCEPTION_LOCATION_MSG << std::endl; + throw std::runtime_error("View on non-contiguous tensor, call contiguous() first."); + } + + std::vector new_strides(shape.size()); + size_t stride = 1; + for (size_t i = 1; i <= shape.size(); i++) { + new_strides[shape.size() - i] = stride; + stride *= shape[shape.size() - i]; + } + auto new_meta = TensorMeta{ + this->dtype(), + shape, + new_strides, + }; + return tensor_t(new Tensor(new_meta, _storage, _offset)); } tensor_t Tensor::slice(size_t dim, size_t start, size_t end) const { - TO_BE_IMPLEMENTED(); - return std::shared_ptr(new Tensor(_meta, _storage)); + CHECK_ARGUMENT(dim < this->ndim(), "Slice dimension out of 
range"); + CHECK_ARGUMENT(start <= end && end <= this->shape()[dim], "Slice indices out of range"); + + TensorMeta meta = _meta; + meta.shape[dim] = end - start; + size_t offset = _offset + start * _meta.strides[dim] * this->elementSize(); + return tensor_t(new Tensor(meta, _storage, offset)); } void Tensor::load(const void *src_) { - TO_BE_IMPLEMENTED(); + size_t total_bytes = this->numel() * this->elementSize(); + if (this->deviceType() == LLAISYS_DEVICE_CPU) { + core::context().runtime().api()->memcpy_sync( + this->data(), + src_, + total_bytes, + LLAISYS_MEMCPY_H2H + ); + } + else { + core::context().runtime().api()->memcpy_sync( + this->data(), + src_, + total_bytes, + LLAISYS_MEMCPY_H2D + ); + } } tensor_t Tensor::contiguous() const { + if(this->isContiguous()) { + return std::shared_ptr(new Tensor(_meta, _storage, _offset)); + } + auto new_tensor = Tensor::create(shape(), dtype(), deviceType(), deviceId()); + //size_t total_bytes = this->numel() * this->elementSize(); + TO_BE_IMPLEMENTED(); - return std::shared_ptr(new Tensor(_meta, _storage)); + return new_tensor; } tensor_t Tensor::reshape(const std::vector &shape) const { diff --git a/src/utils/types.cpp b/src/utils/types.cpp index 4163c214..958d632f 100644 --- a/src/utils/types.cpp +++ b/src/utils/types.cpp @@ -2,6 +2,58 @@ #include +namespace llaisys { + bool CustomFloat16::operator<(const CustomFloat16 &other) const { + return utils::cast(*this) < utils::cast(other); + } + bool CustomFloat16::operator>(const CustomFloat16 &other) const { + return utils::cast(*this) > utils::cast(other); + } + bool CustomFloat16::operator<=(const CustomFloat16 &other) const { + return utils::cast(*this) <= utils::cast(other); + } + bool CustomFloat16::operator>=(const CustomFloat16 &other) const { + return utils::cast(*this) >= utils::cast(other); + } + CustomFloat16 CustomFloat16::operator+(const CustomFloat16 &other) const{ + return utils::cast(utils::cast(*this) + utils::cast(other)); + } + CustomFloat16 
CustomFloat16::operator-(const CustomFloat16 &other) const { + return utils::cast(utils::cast(*this) - utils::cast(other)); + } + CustomFloat16 CustomFloat16::operator*(const CustomFloat16 &other) const { + return utils::cast(utils::cast(*this) * utils::cast(other)); + } + CustomFloat16 CustomFloat16::operator/(const CustomFloat16 &other) const { + return utils::cast(utils::cast(*this) / utils::cast(other)); + } + + bool CustomBFloat16::operator<(const CustomBFloat16 &other) const { + return utils::cast(*this) < utils::cast(other); + } + bool CustomBFloat16::operator>(const CustomBFloat16 &other) const { + return utils::cast(*this) > utils::cast(other); + } + bool CustomBFloat16::operator<=(const CustomBFloat16 &other) const { + return utils::cast(*this) <= utils::cast(other); + } + bool CustomBFloat16::operator>=(const CustomBFloat16 &other) const { + return utils::cast(*this) >= utils::cast(other); + } + CustomBFloat16 CustomBFloat16::operator+(const CustomBFloat16 &other) const { + return utils::cast(utils::cast(*this) + utils::cast(other)); + } + CustomBFloat16 CustomBFloat16::operator-(const CustomBFloat16 &other) const { + return utils::cast(utils::cast(*this) - utils::cast(other)); + } + CustomBFloat16 CustomBFloat16::operator*(const CustomBFloat16 &other) const { + return utils::cast(utils::cast(*this) * utils::cast(other)); + } + CustomBFloat16 CustomBFloat16::operator/(const CustomBFloat16 &other) const { + return utils::cast(utils::cast(*this) / utils::cast(other)); + } +}//llaisys + namespace llaisys::utils { float _f16_to_f32(fp16_t val) { uint16_t h = val._v; @@ -63,14 +115,6 @@ fp16_t _f32_to_f16(float val) { } } -float _bf16_to_f32(bf16_t val) { - uint32_t bits32 = static_cast(val._v) << 16; - - float out; - std::memcpy(&out, &bits32, sizeof(out)); - return out; -} - bf16_t _f32_to_bf16(float val) { uint32_t bits32; std::memcpy(&bits32, &val, sizeof(bits32)); diff --git a/src/utils/types.hpp b/src/utils/types.hpp index e09619db..a95986a7 100644 --- 
a/src/utils/types.hpp +++ b/src/utils/types.hpp @@ -2,15 +2,44 @@ #include #include +#include namespace llaisys { struct CustomFloat16 { uint16_t _v; + bool operator==(const CustomFloat16 &other) const { + return _v == other._v; + } + bool operator!=(const CustomFloat16 &other) const { + return _v != other._v; + } + bool operator<(const CustomFloat16 &other) const; + bool operator>(const CustomFloat16 &other) const; + bool operator<=(const CustomFloat16 &other) const; + bool operator>=(const CustomFloat16 &other) const; + CustomFloat16 operator+(const CustomFloat16 &other) const; + CustomFloat16 operator-(const CustomFloat16 &other) const; + CustomFloat16 operator*(const CustomFloat16 &other) const; + CustomFloat16 operator/(const CustomFloat16 &other) const; }; typedef struct CustomFloat16 fp16_t; struct CustomBFloat16 { uint16_t _v; + bool operator==(const CustomBFloat16 &other) const { + return _v == other._v; + } + bool operator!=(const CustomBFloat16 &other) const { + return _v != other._v; + } + bool operator<(const CustomBFloat16 &other) const; + bool operator>(const CustomBFloat16 &other) const; + bool operator<=(const CustomBFloat16 &other) const; + bool operator>=(const CustomBFloat16 &other) const; + CustomBFloat16 operator+(const CustomBFloat16 &other) const; + CustomBFloat16 operator-(const CustomBFloat16 &other) const; + CustomBFloat16 operator*(const CustomBFloat16 &other) const; + CustomBFloat16 operator/(const CustomBFloat16 &other) const; }; typedef struct CustomBFloat16 bf16_t; @@ -110,7 +139,13 @@ inline const char *dtype_to_str(llaisysDataType_t dtype) { float _f16_to_f32(fp16_t val); fp16_t _f32_to_f16(float val); -float _bf16_to_f32(bf16_t val); +inline float _bf16_to_f32(bf16_t val) { + uint32_t bits32 = static_cast(val._v) << 16; + + float out; + std::memcpy(&out, &bits32, sizeof(out)); + return out; +} bf16_t _f32_to_bf16(float val); template diff --git a/test/my_test.py b/test/my_test.py new file mode 100644 index 00000000..1797f31b --- 
/dev/null +++ b/test/my_test.py @@ -0,0 +1,4 @@ +from llaisys.models.qwen2 import * + +path = "/home/jie/learning/ai_camp_2026/DeepSeek-R1-Distill-Qwen-1.5B" +model = Qwen2(path) \ No newline at end of file diff --git a/xmake.lua b/xmake.lua index 1f65f7a9..43f00c57 100644 --- a/xmake.lua +++ b/xmake.lua @@ -106,6 +106,7 @@ target("llaisys") set_languages("cxx17") set_warnings("all", "error") add_files("src/llaisys/*.cc") + add_files("src/llaisys/models/*.cc") set_installdir(".")