From b3f96ac17d1dec7662358d4365276498ebbea8cb Mon Sep 17 00:00:00 2001 From: leninist1 <282516536@qq.com> Date: Mon, 19 Jan 2026 22:30:45 +0800 Subject: [PATCH 01/14] tensor task --- src/tensor/tensor.cpp | 182 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 174 insertions(+), 8 deletions(-) diff --git a/src/tensor/tensor.cpp b/src/tensor/tensor.cpp index 2f594bb6..6a7f3012 100644 --- a/src/tensor/tensor.cpp +++ b/src/tensor/tensor.cpp @@ -164,27 +164,193 @@ void Tensor::debug() const { } bool Tensor::isContiguous() const { - TO_BE_IMPLEMENTED(); + //first initial expected stride(from lastdim to 0) + ptrdiff_t expected_stride = 1; + for(int i = (int)ndim()-1; i >= 0; i--) + { + size_t current_shape = shape()[i]; + if(current_shape == 0) return true; + if(current_shape > 1) + { + if(strides()[i] != expected_stride) + return false; + expected_stride *= current_shape; + } + } return true; } tensor_t Tensor::permute(const std::vector &order) const { - TO_BE_IMPLEMENTED(); - return std::shared_ptr(new Tensor(_meta, _storage)); + // valid order.size() == this->ndim() + CHECK_ARGUMENT(order.size() == this->ndim(), "Permute: order size mismatch."); + std::vector seen(order.size(), false); + std::vector new_shape(order.size()); + std::vector new_strides(order.size()); + for(int i = 0; i < (int)order.size(); ++i) + { + size_t dim = order[i]; + CHECK_ARGUMENT(dim < this->ndim(), "Permute: order index out of range."); + CHECK_ARGUMENT(!seen[dim], "Permute: order index duplicated."); + seen[dim] = true; + new_shape[i] = this->shape()[dim]; + new_strides[i] = this->strides()[dim]; + } + TensorMeta meta{this->dtype(), new_shape, new_strides}; + return std::shared_ptr(new Tensor(meta, _storage, _offset)); } tensor_t Tensor::view(const std::vector &shape) const { - TO_BE_IMPLEMENTED(); - return std::shared_ptr(new Tensor(_meta, _storage)); + + // calculate new_shape's numel, must be the same as the old + size_t new_numel = 1; + for(auto s : shape) + new_numel *= s; + 
CHECK_ARGUMENT(new_numel == this->numel(), "View: total elements mismatch."); + std::vector new_strides(shape.size(),0); + + //filter size = 1 shape in old_shape + std::vector old_shape; + std::vector old_strides; + old_shape.reserve(this->shape().size()); + old_strides.reserve(this->strides().size()); + for(size_t i = 0; i < this->shape().size(); ++i) + { + if(this->shape()[i] > 1) + { + old_shape.push_back(this->shape()[i]); + old_strides.push_back(this->strides()[i]); + } + } + + // filter size = 1 shape in new_shape + std::vector new_shape; + std::vector new_indices; + new_shape.reserve(shape.size()); + new_indices.reserve(shape.size()); + for(size_t i = 0; i < shape.size(); ++i) + { + if(shape[i] > 1) + { + new_shape.push_back(shape[i]); + new_indices.push_back(i); + } + } + + // if no size > 1 in old_shape or new_shape + // no need to spilt blocks, generate continual new_strides + if(old_shape.empty() || new_shape.empty()) + { + for(int i = (int)shape.size()-1; i>=0; --i) + { + if(i+1 < (int)shape.size()) + { + new_strides[i] = new_strides[i+1]*shape[i+1]; + } + else + { + new_strides[i] = 1; + } + } + TensorMeta meta{this->dtype(), shape, new_strides}; + return std::shared_ptr(new Tensor(meta, _storage)); + } + // spilt blocks + struct Block + { + size_t numel; + ptrdiff_t inner_stride; + }; + + std::vector blocks; + for(int i = (int)old_shape.size() - 1; i >= 0; --i) + { + if(blocks.empty()) + { + blocks.push_back({old_shape[i], old_strides[i]}); + } + else + { + //continual add to current block + if(old_strides[i] == (ptrdiff_t)old_shape[i+1] * old_strides[i+1]) + { + blocks.back().numel *= old_shape[i]; + } + //break, spilt new block + else + { + blocks.push_back({old_shape[i], old_strides[i]}); + } + } + } + // for every block, judge which dims can cover it + size_t dim_pos = 0; + for(int b = (int)blocks.size() - 1; b >= 0; --b) + { + const auto &block = blocks[b]; + size_t start = dim_pos; + size_t prod = 1; + while(dim_pos < new_shape.size() && prod < 
block.numel) + { + prod *= new_shape[dim_pos]; + dim_pos++; + } + CHECK_ARGUMENT(prod == block.numel, "View: shape is not compatible with storage."); + // set new_strides for this block + ptrdiff_t stride = block.inner_stride; + for(int j = (int)dim_pos - 1; j >= (int)start; --j) + { + new_strides[new_indices[j]] = stride; + stride *= (ptrdiff_t)new_shape[j]; + } + } + CHECK_ARGUMENT(dim_pos == new_shape.size(), "View: shape is not compatible with storage."); + // for size = 1 dims, set new_strides[i] = new_strides[i+1] * shape[i+1] + for(int i = (int)shape.size() - 1; i >= 0; --i) + { + if(shape[i] == 1) + { + if(i + 1 < (int)shape.size()) + { + new_strides[i] = new_strides[i+1] * shape[i+1]; + } + else + { + new_strides[i] = 1; + } + } + } + TensorMeta meta{this->dtype(), shape, new_strides}; + return std::shared_ptr(new Tensor(meta, _storage, _offset)); } tensor_t Tensor::slice(size_t dim, size_t start, size_t end) const { - TO_BE_IMPLEMENTED(); - return std::shared_ptr(new Tensor(_meta, _storage)); + CHECK_ARGUMENT(dim < this->ndim(), "Slice: dim out of range."); + CHECK_ARGUMENT(start <= end, "Slice: start must be less than or equal to end."); + CHECK_ARGUMENT(end <= this->shape()[dim], "Slice: end out of range."); + std::vector new_shape = this->shape(); + new_shape[dim] = end - start; + std::vector new_strides = this->strides(); + size_t byte_offset = _offset + start * (size_t)new_strides[dim] * this->elementSize(); + TensorMeta meta{this->dtype(), new_shape, new_strides}; + return std::shared_ptr(new Tensor(meta, _storage, byte_offset)); } void Tensor::load(const void *src_) { - TO_BE_IMPLEMENTED(); + CHECK_ARGUMENT(src_ != nullptr, "Load: src is null."); + size_t size = this->numel() * this->elementSize(); + core::context().setDevice(this->deviceType(), this->deviceId()); + if (!_storage || _offset + size > _storage->size()) { + if (this->deviceType() == LLAISYS_DEVICE_CPU) { + _storage = core::context().runtime().allocateHostStorage(size); + } else { + 
_storage = core::context().runtime().allocateDeviceStorage(size); + } + _offset = 0; + } + ASSERT(this->isContiguous(), "Load: tensor must be contiguous."); + core::context().runtime().api()->memcpy_sync( + this->data(), src_, size, + this->deviceType() == LLAISYS_DEVICE_CPU ? LLAISYS_MEMCPY_H2H : LLAISYS_MEMCPY_H2D); } tensor_t Tensor::contiguous() const { From ad3ead9d72a7bc7919d6496e178b64b5f38fe05a Mon Sep 17 00:00:00 2001 From: leninist1 <282516536@qq.com> Date: Mon, 19 Jan 2026 22:47:26 +0800 Subject: [PATCH 02/14] tensor test --- src/tensor/tensor.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tensor/tensor.cpp b/src/tensor/tensor.cpp index 6a7f3012..69fea78a 100644 --- a/src/tensor/tensor.cpp +++ b/src/tensor/tensor.cpp @@ -165,6 +165,7 @@ void Tensor::debug() const { bool Tensor::isContiguous() const { //first initial expected stride(from lastdim to 0) + //test ptrdiff_t expected_stride = 1; for(int i = (int)ndim()-1; i >= 0; i--) { From 226444adc88b47fb710538bed8f1563e6697e0e0 Mon Sep 17 00:00:00 2001 From: leninist1 <282516536@qq.com> Date: Sat, 31 Jan 2026 17:10:29 +0800 Subject: [PATCH 03/14] complete ops: argmax, linear, embedding. 
--- .gitignore | 4 +- =3.1.0 | 2 + src/ops/argmax/cpu/argmax_cpu.cpp | 47 ++++++++++++++++ src/ops/argmax/cpu/argmax_cpu.hpp | 8 +++ src/ops/argmax/op.cpp | 31 ++++++++++- src/ops/embedding/cpu/embedding_cpu.cpp | 33 ++++++++++++ src/ops/embedding/cpu/embedding_cpu.hpp | 8 +++ src/ops/embedding/op.cpp | 34 +++++++++++- src/ops/linear/cpu/linear_cpu.cpp | 71 +++++++++++++++++++++++++ src/ops/linear/cpu/linear_cpu.hpp | 10 ++++ src/ops/linear/op.cpp | 47 +++++++++++++++- src/ops/rms_norm/cpu/rms_norm_cpu.cpp | 0 src/ops/rms_norm/cpu/rms_norm_cpu.hpp | 0 13 files changed, 290 insertions(+), 5 deletions(-) create mode 100644 =3.1.0 create mode 100644 src/ops/argmax/cpu/argmax_cpu.cpp create mode 100644 src/ops/argmax/cpu/argmax_cpu.hpp create mode 100644 src/ops/embedding/cpu/embedding_cpu.cpp create mode 100644 src/ops/embedding/cpu/embedding_cpu.hpp create mode 100644 src/ops/linear/cpu/linear_cpu.cpp create mode 100644 src/ops/linear/cpu/linear_cpu.hpp create mode 100644 src/ops/rms_norm/cpu/rms_norm_cpu.cpp create mode 100644 src/ops/rms_norm/cpu/rms_norm_cpu.hpp diff --git a/.gitignore b/.gitignore index e38cf574..e92328ca 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +#models +/models/ # Xmake cache .xmake/ build/ @@ -87,4 +89,4 @@ htmlcov/ # Windows Thumbs.db ehthumbs.db -desktop.ini \ No newline at end of file +desktop.ini diff --git a/=3.1.0 b/=3.1.0 new file mode 100644 index 00000000..8151c7c6 --- /dev/null +++ b/=3.1.0 @@ -0,0 +1,2 @@ +Defaulting to user installation because normal site-packages is not writeable +Requirement already satisfied: jinja2 in /usr/lib/python3/dist-packages (3.0.3) diff --git a/src/ops/argmax/cpu/argmax_cpu.cpp b/src/ops/argmax/cpu/argmax_cpu.cpp new file mode 100644 index 00000000..ca0eb53d --- /dev/null +++ b/src/ops/argmax/cpu/argmax_cpu.cpp @@ -0,0 +1,47 @@ +#include "argmax_cpu.hpp" + +#include "../../../utils.hpp" + +#include + +template +static void argmax_(std::byte *max_idx, std::byte *max_val, const std::byte 
*vals, size_t numel) { + size_t max_i = 0; + const T *v = reinterpret_cast(vals); + if constexpr (std::is_same_v || std::is_same_v) { + float max_f = llaisys::utils::cast(v[0]); + for (size_t i = 1; i < numel; ++i) { + float cur = llaisys::utils::cast(v[i]); + if (cur > max_f) { + max_i = i; + max_f = cur; + } + } + *reinterpret_cast(max_val) = llaisys::utils::cast(max_f); + } else { + T max_v = v[0]; + for (size_t i = 1; i < numel; ++i) { + if (v[i] > max_v) { + max_i = i; + max_v = v[i]; + } + } + *reinterpret_cast(max_val) = max_v; + } + *reinterpret_cast(max_idx) = static_cast(max_i); +} + +namespace llaisys::ops::cpu { +void argmax(std::byte *max_idx, std::byte *max_val, const std::byte *vals, llaisysDataType_t type, size_t numel) { + switch (type) { + case LLAISYS_DTYPE_F32: + return argmax_(max_idx, max_val, vals, numel); + case LLAISYS_DTYPE_BF16: + return argmax_(max_idx, max_val, vals, numel); + case LLAISYS_DTYPE_F16: + return argmax_(max_idx, max_val, vals, numel); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} \ No newline at end of file diff --git a/src/ops/argmax/cpu/argmax_cpu.hpp b/src/ops/argmax/cpu/argmax_cpu.hpp new file mode 100644 index 00000000..1f3224cb --- /dev/null +++ b/src/ops/argmax/cpu/argmax_cpu.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void argmax(std::byte *max_idx, std::byte *max_val, const std::byte *vals, llaisysDataType_t type, size_t numel); +} \ No newline at end of file diff --git a/src/ops/argmax/op.cpp b/src/ops/argmax/op.cpp index 6dc37d42..034bc431 100644 --- a/src/ops/argmax/op.cpp +++ b/src/ops/argmax/op.cpp @@ -1,7 +1,36 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/argmax_cpu.hpp" + namespace llaisys::ops { void argmax(tensor_t max_idx, tensor_t max_val, tensor_t vals) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(max_idx, max_val, vals); + ASSERT(vals->ndim() == 1, "Argmax: vals must be 
// Gather rows from the weight table: for each of the `index_len` int64 ids in
// `index`, copy the corresponding row of `weight` (row_len elements of T) into
// the next slot of `out`. Ids are not bounds-checked here; callers validate.
template <typename T>
static void embedding_(std::byte *out, const std::byte *index, const std::byte *weight, size_t index_len, size_t row_len) {
    const int64_t *ids = reinterpret_cast<const int64_t *>(index);
    const T *table = reinterpret_cast<const T *>(weight);
    T *dst = reinterpret_cast<T *>(out);
    for (size_t r = 0; r < index_len; ++r) {
        const T *src = table + static_cast<size_t>(ids[r]) * row_len;
        const T *src_end = src + row_len;
        while (src != src_end) {
            *dst++ = *src++;
        }
    }
}
LLAISYS_DTYPE_F32: + return embedding_(out, index, weight, index_len, row_len); + case LLAISYS_DTYPE_BF16: + return embedding_(out, index, weight, index_len, row_len); + case LLAISYS_DTYPE_F16: + return embedding_(out, index, weight, index_len, row_len); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} \ No newline at end of file diff --git a/src/ops/embedding/cpu/embedding_cpu.hpp b/src/ops/embedding/cpu/embedding_cpu.hpp new file mode 100644 index 00000000..9aec5020 --- /dev/null +++ b/src/ops/embedding/cpu/embedding_cpu.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void embedding(std::byte *out, const std::byte *index, const std::byte *weight, llaisysDataType_t type, size_t index_len, size_t row_len); +} \ No newline at end of file diff --git a/src/ops/embedding/op.cpp b/src/ops/embedding/op.cpp index 84b9a5d0..3c33c5d5 100644 --- a/src/ops/embedding/op.cpp +++ b/src/ops/embedding/op.cpp @@ -1,7 +1,39 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/embedding_cpu.hpp" namespace llaisys::ops { void embedding(tensor_t out, tensor_t index, tensor_t weight) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, index, weight); + ASSERT(index->ndim() == 1, "Embedding: index must be 1D."); + ASSERT(weight->ndim() == 2, "Embedding: weight must be 2D."); + ASSERT(out->ndim() == 2, "Embedding: out must be 2D."); + ASSERT(index->dtype() == LLAISYS_DTYPE_I64, "Embedding: index must be I64."); + CHECK_SAME_DTYPE(out->dtype(), weight->dtype()); + ASSERT(out->shape()[0] == index->shape()[0], "Embedding: out.shape[0] must equal index.shape[0]."); + ASSERT(out->shape()[1] == weight->shape()[1], "Embedding: out.shape[1] must equal weight.shape[1]."); + ASSERT(out->isContiguous() && index->isContiguous() && weight->isContiguous(), "Embedding: all tensors must be contiguous."); + + size_t index_len = index->shape()[0]; + size_t row_len = weight->shape()[1]; + + if 
(weight->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::embedding(out->data(), index->data(), weight->data(), out->dtype(), index_len, row_len); + } + + llaisys::core::context().setDevice(weight->deviceType(), weight->deviceId()); + + switch (weight->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::embedding(out->data(), index->data(), weight->data(), out->dtype(), index_len, row_len); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + TO_BE_IMPLEMENTED(); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/linear/cpu/linear_cpu.cpp b/src/ops/linear/cpu/linear_cpu.cpp new file mode 100644 index 00000000..520d8b0e --- /dev/null +++ b/src/ops/linear/cpu/linear_cpu.cpp @@ -0,0 +1,71 @@ +#include "linear_cpu.hpp" + +#include "../../../utils.hpp" + +template +static void linear_(T *out, const T *in, const T *W, const T *bias, + size_t batch, size_t in_dim, size_t out_dim) +{ + for(size_t i = 0; i < batch; ++i) + { + const T *in_ = in + i*in_dim; + T *out_ = out + i*out_dim; + for(size_t j = 0; j < out_dim; ++j) + { + const T *weight_ = W + j*in_dim; + if constexpr (std::is_same_v + || std::is_same_v) + { + float acc = 0.0f; + for(size_t k = 0; k < in_dim; ++k) + { + acc += llaisys::utils::cast(in_[k]) * + llaisys::utils::cast(weight_[k]); + } + if(bias) + acc += llaisys::utils::cast(bias[j]); + out_[j] = llaisys::utils::cast(acc); + } + else + { + T acc = static_cast(0); + for(size_t k = 0; k < in_dim; ++k) + { + acc += in_[k] * weight_[k]; + } + if(bias) + acc += bias[j]; + out_[j] = acc; + } + } + } + +} +namespace llaisys::ops::cpu { +void linear(std::byte *out, const std::byte *in, const std::byte *W, + const std::byte *bias, llaisysDataType_t type, size_t batch, + size_t in_dim, size_t out_dim) { + switch (type) { + case LLAISYS_DTYPE_F32: + return linear_(reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(W), + reinterpret_cast(bias), + batch, in_dim, out_dim); + 
case LLAISYS_DTYPE_BF16: + return linear_(reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(W), + reinterpret_cast(bias), + batch, in_dim, out_dim); + case LLAISYS_DTYPE_F16: + return linear_(reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(W), + reinterpret_cast(bias), + batch, in_dim, out_dim); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} diff --git a/src/ops/linear/cpu/linear_cpu.hpp b/src/ops/linear/cpu/linear_cpu.hpp new file mode 100644 index 00000000..a6f4e279 --- /dev/null +++ b/src/ops/linear/cpu/linear_cpu.hpp @@ -0,0 +1,10 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void linear(std::byte *out, const std::byte *in, const std::byte *W, + const std::byte *bias, llaisysDataType_t type, size_t batch, + size_t in_dim, size_t out_dim); +} diff --git a/src/ops/linear/op.cpp b/src/ops/linear/op.cpp index 97d1f865..9585068a 100644 --- a/src/ops/linear/op.cpp +++ b/src/ops/linear/op.cpp @@ -1,7 +1,50 @@ #include "op.hpp" - +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/linear_cpu.hpp" namespace llaisys::ops { void linear(tensor_t out, tensor_t in, tensor_t weight, tensor_t bias) { - TO_BE_IMPLEMENTED(); + ASSERT(out->isContiguous() && in->isContiguous() + && weight->isContiguous(), "out, in, weight must be contiguous tensors"); + + ASSERT(out->ndim() == 2 && in->ndim() == 2 && + weight->ndim() == 2, "out, in, weight must be 2D tensors"); + ASSERT(in->shape()[1] == weight->shape()[1],"in.shape[1] must be equal to weight.shape[1]"); + ASSERT(out->shape()[1] == weight->shape()[0], "out.shape[1] must be equal to weight.shape[0]"); + CHECK_SAME_DTYPE(out->dtype(), in->dtype(), weight->dtype()); + if(bias) + { + CHECK_SAME_DEVICE(out, bias, weight); + ASSERT(bias->ndim() == 1, "bias must be 1D tensor"); + ASSERT(bias->isContiguous(), "bias must be contiguous tensor"); + ASSERT(bias->shape()[0] == out->shape()[1], "bias.shape[0] must be equal to 
out.shape[1]"); + CHECK_SAME_DTYPE(out->dtype(), bias->dtype()); + } + + size_t batch = out->shape()[0]; + size_t in_dim = in->shape()[1]; + size_t out_dim = out->shape()[1]; + + const std::byte *bias_data = bias ? bias->data():nullptr; + + if (weight->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::linear(out->data(), in->data(), + weight->data(), bias_data, bias->dtype(), batch, in_dim, out_dim); + } + + llaisys::core::context().setDevice(weight->deviceType(), weight->deviceId()); + + switch (weight->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::linear(out->data(), in->data(), + weight->data(), bias_data, bias->dtype(), batch, in_dim, out_dim); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + TO_BE_IMPLEMENTED(); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/rms_norm/cpu/rms_norm_cpu.cpp b/src/ops/rms_norm/cpu/rms_norm_cpu.cpp new file mode 100644 index 00000000..e69de29b diff --git a/src/ops/rms_norm/cpu/rms_norm_cpu.hpp b/src/ops/rms_norm/cpu/rms_norm_cpu.hpp new file mode 100644 index 00000000..e69de29b From 58c37d8f40225de3064382802bab4b89379491e4 Mon Sep 17 00:00:00 2001 From: leninist1 <282516536@qq.com> Date: Sat, 31 Jan 2026 19:56:48 +0800 Subject: [PATCH 04/14] complete ops: rms_norm and rope --- src/ops/rms_norm/cpu/rms_norm_cpu.cpp | 77 +++++++++++++++++++++++++++ src/ops/rms_norm/cpu/rms_norm_cpu.hpp | 10 ++++ src/ops/rms_norm/op.cpp | 34 +++++++++++- src/ops/rope/cpu/rope_cpu.cpp | 77 +++++++++++++++++++++++++++ src/ops/rope/cpu/rope_cpu.hpp | 10 ++++ src/ops/rope/op.cpp | 35 +++++++++++- 6 files changed, 239 insertions(+), 4 deletions(-) create mode 100644 src/ops/rope/cpu/rope_cpu.cpp create mode 100644 src/ops/rope/cpu/rope_cpu.hpp diff --git a/src/ops/rms_norm/cpu/rms_norm_cpu.cpp b/src/ops/rms_norm/cpu/rms_norm_cpu.cpp index e69de29b..cd828d6c 100644 --- a/src/ops/rms_norm/cpu/rms_norm_cpu.cpp +++ b/src/ops/rms_norm/cpu/rms_norm_cpu.cpp @@ -0,0 +1,77 @@ 
+#include "rms_norm_cpu.hpp" +#include +#include "../../../utils.hpp" + +template +static void rms_norm_(T *out, const T *in, const T *W, + const float eps, size_t batch, size_t in_dim) +{ + //for each row in in_tensor + for(size_t i = 0; i < batch; ++i) + { + const T *in_ = in + i*in_dim; + T *out_ = out + i*in_dim; + float std_dev = 0.0f; + //first calculate the standard deviation of each row + if constexpr (std::is_same_v + || std::is_same_v) + { + for(size_t j = 0; j < in_dim; ++j) + { + std_dev += llaisys::utils::cast(in_[j]) * + llaisys::utils::cast(in_[j]); + } + std_dev = std::sqrt(std_dev / (int)in_dim + eps); + } + else + { + for(size_t j = 0; j < in_dim; ++j) + { + std_dev += in_[j] * in_[j]; + } + std_dev = std::sqrt(std_dev / in_dim + eps); + } + //then normalize each element in the row + for(size_t j = 0; j < in_dim; ++j) + { + if constexpr (std::is_same_v + || std::is_same_v) + { + out_[j] = llaisys::utils::cast( + llaisys::utils::cast(in_[j]) * + llaisys::utils::cast(W[j]) / std_dev); + } + else + { + out_[j] = in_[j] * W[j] / std_dev; + } + } + } + +} +namespace llaisys::ops::cpu { +void rms_norm(std::byte *out, const std::byte *in, + const std::byte *W, const float eps, + llaisysDataType_t type, + size_t batch, size_t in_dim) { + switch (type) { + case LLAISYS_DTYPE_F32: + return rms_norm_(reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(W), + eps, batch, in_dim); + case LLAISYS_DTYPE_BF16: + return rms_norm_(reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(W), + eps, batch, in_dim); + case LLAISYS_DTYPE_F16: + return rms_norm_(reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(W), + eps, batch, in_dim); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} diff --git a/src/ops/rms_norm/cpu/rms_norm_cpu.hpp b/src/ops/rms_norm/cpu/rms_norm_cpu.hpp index e69de29b..dfe43083 100644 --- a/src/ops/rms_norm/cpu/rms_norm_cpu.hpp +++ b/src/ops/rms_norm/cpu/rms_norm_cpu.hpp @@ -0,0 +1,10 @@ +#pragma 
once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void rms_norm(std::byte *out, const std::byte *in, + const std::byte *W, const float eps, + llaisysDataType_t type, size_t batch, size_t in_dim); +} diff --git a/src/ops/rms_norm/op.cpp b/src/ops/rms_norm/op.cpp index 529553d9..d57c96cf 100644 --- a/src/ops/rms_norm/op.cpp +++ b/src/ops/rms_norm/op.cpp @@ -1,7 +1,37 @@ #include "op.hpp" - +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/rms_norm_cpu.hpp" namespace llaisys::ops { void rms_norm(tensor_t out, tensor_t in, tensor_t weight, float eps) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, in, weight); + ASSERT(out->ndim() == 2 && in->ndim() == 2 && + weight->ndim() == 1, "out, in must be 2D tensors, while weight must be 1D tensor"); + ASSERT(out->isContiguous() && in->isContiguous() + && weight->isContiguous(), "All tensors must be contiguous"); + ASSERT(out->shape() == in->shape(), "Output shape must be equal to input shape"); + ASSERT(weight->shape()[0] == in->shape()[1], "Weight shape must be equal to input shape"); + size_t batch = out->shape()[0]; + size_t in_dim = in->shape()[1]; + + if (weight->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::rms_norm(out->data(), in->data(), + weight->data(), eps, out->dtype(), batch, in_dim); + } + + llaisys::core::context().setDevice(weight->deviceType(), weight->deviceId()); + + switch (weight->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::rms_norm(out->data(), in->data(), + weight->data(), eps, out->dtype(), batch, in_dim); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + TO_BE_IMPLEMENTED(); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/rope/cpu/rope_cpu.cpp b/src/ops/rope/cpu/rope_cpu.cpp new file mode 100644 index 00000000..e25c3e74 --- /dev/null +++ b/src/ops/rope/cpu/rope_cpu.cpp @@ -0,0 +1,77 @@ +#include "rope_cpu.hpp" +#include +#include "../../../utils.hpp" 
+ +template +void rope_(T *out, const T *in, const int64_t *pos_ids, const float theta, + size_t seq_len, size_t n_heads, size_t head_dim) +{ + //for each s in seq_len + for(size_t s = 0; s < seq_len; ++s) + { + //for each h in n_heads + for(size_t h = 0; h < n_heads; ++h) + { + //for each tensor in head_dim + for(size_t j = 0; j < head_dim/2; ++j) + { + size_t p = pos_ids[s]; + // cal angle + float angle = static_cast(p) / std::pow(theta, 2.0f*j / head_dim); + // cal the base + const T *in_ = in + (s*n_heads + h) * head_dim; + T *out_ = out + (s*n_heads + h) * head_dim; + //cal sin & cos val + float cosine = std::cos(angle); + float sine = std::sin(angle); + //find a and b + T a = in_[j]; + T b = in_[j + head_dim/2]; + //rope + if constexpr(std::is_same_v + || std::is_same_v) + { + out_[j] = llaisys::utils::cast( + llaisys::utils::cast(a) * cosine - + llaisys::utils::cast(b) * sine + ); + out_[j + head_dim/2] = llaisys::utils::cast( + llaisys::utils::cast(a) * sine + + llaisys::utils::cast(b) * cosine + ); + } + else + { + out_[j] = a * cosine - b * sine; + out_[j + head_dim/2] = a * sine + b * cosine; + } + } + } + } +} +namespace llaisys::ops::cpu { +void rope(std::byte *out, const std::byte *in, + const std::byte *pos_ids, const float theta, + llaisysDataType_t type, + size_t seq_len, size_t n_heads, size_t head_dim) { + switch (type) { + case LLAISYS_DTYPE_F32: + return rope_(reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(pos_ids), theta, + seq_len, n_heads, head_dim); + case LLAISYS_DTYPE_BF16: + return rope_(reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(pos_ids), theta, + seq_len, n_heads, head_dim); + case LLAISYS_DTYPE_F16: + return rope_(reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(pos_ids), theta, + seq_len, n_heads, head_dim); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} diff --git a/src/ops/rope/cpu/rope_cpu.hpp b/src/ops/rope/cpu/rope_cpu.hpp new file mode 100644 index 
00000000..77fb59db --- /dev/null +++ b/src/ops/rope/cpu/rope_cpu.hpp @@ -0,0 +1,10 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void rope(std::byte *out, const std::byte *in, + const std::byte *pos_ids,const float theta, llaisysDataType_t type, + size_t seq_len, size_t n_heads, size_t head_dim); +} \ No newline at end of file diff --git a/src/ops/rope/op.cpp b/src/ops/rope/op.cpp index d60dbe64..d3cab559 100644 --- a/src/ops/rope/op.cpp +++ b/src/ops/rope/op.cpp @@ -1,7 +1,38 @@ #include "op.hpp" - +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/rope_cpu.hpp" namespace llaisys::ops { void rope(tensor_t out, tensor_t in, tensor_t pos_ids, float theta) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, in, pos_ids); + ASSERT(out->isContiguous() && in->isContiguous() + && pos_ids->isContiguous(), "All tensors must be Contiguous"); + ASSERT(out->shape() == in->shape(), + "Output shape must be equal to Input shape"); + ASSERT(pos_ids->shape()[0] == out->shape()[0], + "Position IDs shape[0] must be equal to Output shape[0]"); + size_t seq_len = out->shape()[0]; + size_t n_heads = out->shape()[1]; + size_t head_dim = out->shape()[2]; + + if (pos_ids->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::rope(out->data(), in->data(), + pos_ids->data(), theta, out->dtype(), seq_len, n_heads, head_dim); + } + + llaisys::core::context().setDevice(pos_ids->deviceType(), pos_ids->deviceId()); + + switch (pos_ids->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::rope(out->data(), in->data(), + pos_ids->data(), theta, out->dtype(), seq_len, n_heads, head_dim); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + TO_BE_IMPLEMENTED(); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops From 0aa909507ac294e6b2df55c2148b21ab00e87bea Mon Sep 17 00:00:00 2001 From: leninist1 <282516536@qq.com> Date: Sun, 1 Feb 2026 20:47:30 +0800 Subject: [PATCH 05/14] 
// Copy every element of an ndim-dimensional tensor from `in` to `out`, where
// the two buffers may use different element-unit strides (e.g. materializing a
// permuted view). Each flat element id is decoded into per-dimension
// coordinates — row-major, innermost dimension varying fastest — and mapped
// through both stride arrays.
template <typename T>
static void rearrange_(T *out, const T *in, const size_t *shape, const ptrdiff_t *out_strides,
                       const ptrdiff_t *in_strides, const size_t numel, size_t ndim) {
    for (size_t flat = 0; flat < numel; ++flat) {
        ptrdiff_t dst_off = 0;
        ptrdiff_t src_off = 0;
        size_t rest = flat;
        for (size_t d = ndim; d-- > 0;) {
            const ptrdiff_t coord = static_cast<ptrdiff_t>(rest % shape[d]);
            rest /= shape[d];
            src_off += coord * in_strides[d];
            dst_off += coord * out_strides[d];
        }
        out[dst_off] = in[src_off];
    }
}
LLAISYS_DTYPE_F32: + return rearrange_(reinterpret_cast(out), + reinterpret_cast(in), + shape, out_strides, in_strides, numel, ndim); + case LLAISYS_DTYPE_BF16: + return rearrange_(reinterpret_cast(out), + reinterpret_cast(in), + shape, out_strides, in_strides, numel, ndim); + case LLAISYS_DTYPE_F16: + return rearrange_(reinterpret_cast(out), + reinterpret_cast(in), + shape, out_strides, in_strides, numel, ndim); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace llaisys::ops::cpu diff --git a/src/ops/rearrange/cpu/rearrange_cpu.hpp b/src/ops/rearrange/cpu/rearrange_cpu.hpp new file mode 100644 index 00000000..e8d4d5a0 --- /dev/null +++ b/src/ops/rearrange/cpu/rearrange_cpu.hpp @@ -0,0 +1,9 @@ +#pragma once +#include "llaisys.h" +#include + +namespace llaisys::ops::cpu { +void rearrange(std::byte *out, const std::byte *in, llaisysDataType_t type, + const size_t *shape, const ptrdiff_t *out_strides, + const ptrdiff_t *in_strides, size_t numel, size_t ndim); +} \ No newline at end of file diff --git a/src/ops/rearrange/op.cpp b/src/ops/rearrange/op.cpp index 017a6ae5..ae1523fc 100644 --- a/src/ops/rearrange/op.cpp +++ b/src/ops/rearrange/op.cpp @@ -1,7 +1,40 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/rearrange_cpu.hpp" + namespace llaisys::ops { void rearrange(tensor_t out, tensor_t in) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, in); + CHECK_SAME_DTYPE(out->dtype(), in->dtype()); + CHECK_SAME_SHAPE(out->shape(), in->shape()); + + size_t numel = in->numel(); + size_t ndim = out->ndim(); + const auto &shape = out->shape(); + const auto &out_strides = out->strides(); + const auto &in_strides = in->strides(); + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::rearrange(out->data(), in->data(), out->dtype(), shape.data(), out_strides.data(), + in_strides.data(), numel, ndim); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch 
(out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::rearrange(out->data(), in->data(), out->dtype(), shape.data(), out_strides.data(), + in_strides.data(), numel, ndim); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + TO_BE_IMPLEMENTED(); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/self_attention/cpu/self_attention_cpu.cpp b/src/ops/self_attention/cpu/self_attention_cpu.cpp new file mode 100644 index 00000000..cdf5f391 --- /dev/null +++ b/src/ops/self_attention/cpu/self_attention_cpu.cpp @@ -0,0 +1,128 @@ +#include "self_attention_cpu.hpp" +#include +#include +#include +#include +#include "../../../utils.hpp" + +template +static void self_attention_(T *out, const T *q, const T *k, const T *v, + float scale,size_t qlen, size_t kvlen, + size_t nh, size_t nkvh, size_t hd, size_t dv) { + //cal group size of kv head + size_t kv_group = nh / nkvh; + // attn scores + std::vector scores(kvlen); + + // for each q head + for (size_t h = 0; h < nh; ++h) { + size_t kvh = h / kv_group; + for (size_t i = 0; i < qlen; ++i) { + const T *q_ = q + (i * nh + h) * hd; + float max_score = -std::numeric_limits::infinity(); + // causal mask + int64_t limit = static_cast(i) + + static_cast(kvlen) - static_cast(qlen); + + for (size_t j = 0; j < kvlen; ++j) { + // for j > i, set score to -inf, because it can't be seen + if (static_cast(j) > limit) { + scores[j] = -std::numeric_limits::infinity(); + continue; + } + // start to cal attn scores + // QK^T + float acc = 0.0f; + const T *k_ = k + (j * nkvh + kvh) * hd; + if constexpr (std::is_same_v + || std::is_same_v) { + for (size_t d = 0; d < hd; ++d) { + acc += llaisys::utils::cast(q_[d]) + * llaisys::utils::cast(k_[d]); + } + } else { + for (size_t d = 0; d < hd; ++d) { + acc += q_[d] * k_[d]; + } + } + //QK^T * scale + acc *= scale; + scores[j] = acc; + if (acc > max_score) { + max_score = acc; + } + } + + float sum = 0.0f; + for (size_t j = 0; j < 
kvlen; ++j) { + if (scores[j] == -std::numeric_limits::infinity()) { + scores[j] = 0.0f; + continue; + } + // softmax exp(j - max_score) + float w = std::exp(scores[j] - max_score); + scores[j] = w; + sum += w; + } + // softmax sum(exp(j - max_score)) + float inv_sum = sum > 0.0f ? 1.0f / sum : 0.0f; + + // start to cal causalsoftmax(A)*V + T *out_ = out + (i * nh + h) * dv; + for (size_t d = 0; d < dv; ++d) { + float acc = 0.0f; + for (size_t j = 0; j < kvlen; ++j) { + if (scores[j] == 0.0f) { + continue; + } + const T *v_ = v + (j * nkvh + kvh) * dv; + if constexpr (std::is_same_v + || std::is_same_v) { + acc += scores[j] * llaisys::utils::cast(v_[d]); + } else { + acc += scores[j] * v_[d]; + } + } + if constexpr (std::is_same_v + || std::is_same_v) { + out_[d] = llaisys::utils::cast(acc * inv_sum); + } else { + out_[d] = static_cast(acc * inv_sum); + } + } + } + } +} + +namespace llaisys::ops::cpu { +void self_attention(std::byte *out, const std::byte *q, const std::byte *k, + const std::byte *v, float scale, + llaisysDataType_t type, size_t qlen, + size_t kvlen, size_t nh, size_t nkvh, size_t hd, size_t dv) { + switch (type) { + case LLAISYS_DTYPE_F32: + return self_attention_( + reinterpret_cast(out), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + scale, qlen, kvlen, nh, nkvh, hd, dv); + case LLAISYS_DTYPE_BF16: + return self_attention_( + reinterpret_cast(out), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + scale, qlen, kvlen, nh, nkvh, hd, dv); + case LLAISYS_DTYPE_F16: + return self_attention_( + reinterpret_cast(out), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + scale, qlen, kvlen, nh, nkvh, hd, dv); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} \ No newline at end of file diff --git a/src/ops/self_attention/cpu/self_attention_cpu.hpp b/src/ops/self_attention/cpu/self_attention_cpu.hpp new file mode 100644 index 00000000..99c1b658 --- /dev/null +++ 
b/src/ops/self_attention/cpu/self_attention_cpu.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" +#include + +namespace llaisys::ops::cpu { +void self_attention(std::byte *out, const std::byte *q, const std::byte *k, const std::byte *v, float scale, + llaisysDataType_t type, size_t qlen, size_t kvlen, size_t nh, size_t nkvh, size_t hd, size_t dv); +} \ No newline at end of file diff --git a/src/ops/self_attention/op.cpp b/src/ops/self_attention/op.cpp index 43d62014..abfe1964 100644 --- a/src/ops/self_attention/op.cpp +++ b/src/ops/self_attention/op.cpp @@ -1,7 +1,49 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/self_attention_cpu.hpp" namespace llaisys::ops { void self_attention(tensor_t attn_val, tensor_t q, tensor_t k, tensor_t v, float scale) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(attn_val, q, k, v); + CHECK_SAME_DTYPE(attn_val->dtype(), q->dtype(), k->dtype(), v->dtype()); + ASSERT(attn_val->ndim() == 3 && q->ndim() == 3 && k->ndim() == 3 && v->ndim() == 3, + "self_attention: all tensors must be 3D."); + ASSERT(attn_val->isContiguous() && q->isContiguous() && k->isContiguous() && v->isContiguous(), + "self_attention: all tensors must be contiguous."); + ASSERT(q->shape()[0] == attn_val->shape()[0], "self_attention: qlen mismatch."); + ASSERT(q->shape()[1] == attn_val->shape()[1], "self_attention: nhead mismatch."); + ASSERT(v->shape()[2] == attn_val->shape()[2], "self_attention: dv mismatch."); + ASSERT(q->shape()[2] == k->shape()[2], "self_attention: q/k head dim mismatch."); + ASSERT(k->shape()[0] == v->shape()[0], "self_attention: k/v length mismatch."); + ASSERT(k->shape()[1] == v->shape()[1], "self_attention: k/v head mismatch."); + ASSERT(q->shape()[1] % k->shape()[1] == 0, "self_attention: nhead must be multiple of nkvhead."); + + size_t qlen = q->shape()[0]; + size_t kvlen = k->shape()[0]; + size_t nh = q->shape()[1]; + size_t nkvh = k->shape()[1]; + size_t hd = q->shape()[2]; + 
size_t dv = v->shape()[2]; + + if (attn_val->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::self_attention(attn_val->data(), q->data(), k->data(), v->data(), scale, + attn_val->dtype(), qlen, kvlen, nh, nkvh, hd, dv); + } + + llaisys::core::context().setDevice(attn_val->deviceType(), attn_val->deviceId()); + + switch (attn_val->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::self_attention(attn_val->data(), q->data(), k->data(), v->data(), scale, + attn_val->dtype(), qlen, kvlen, nh, nkvh, hd, dv); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + TO_BE_IMPLEMENTED(); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/swiglu/cpu/swiglu_cpu.cpp b/src/ops/swiglu/cpu/swiglu_cpu.cpp new file mode 100644 index 00000000..f5fabb03 --- /dev/null +++ b/src/ops/swiglu/cpu/swiglu_cpu.cpp @@ -0,0 +1,55 @@ +#include "swiglu_cpu.hpp" +#include +#include +#include "../../../utils.hpp" + +template +void swiglu_(T *out, const T *gate, const T *up, size_t seq_len, size_t inter_size) +{ + // for each line in up + for(size_t i = 0; i < seq_len; i++) + { + // cal i line base + T *out_ = out + i * inter_size; + const T* up_ = up + i * inter_size; + const T* gate_ = gate + i * inter_size; + // for each element in the line + for(size_t j = 0; j < inter_size; j++) + { + if constexpr (std::is_same_v + || std::is_same_v) { + out_[j] = llaisys::utils::cast( + llaisys::utils::cast(up_[j]) * + llaisys::utils::cast(gate_[j]) / + (1.0f + std::exp(-llaisys::utils::cast(gate_[j])))); + } + else + out_[j] = up_[j] * gate_[j] / (1.0f + std::exp(-gate_[j])); + } + + } +} + +namespace llaisys::ops::cpu { +void swiglu(std::byte *out, const std::byte *gate, const std::byte *up, llaisysDataType_t type, size_t seq_len, size_t inter_size) { + switch (type) { + case LLAISYS_DTYPE_F32: + return swiglu_(reinterpret_cast(out), + reinterpret_cast(gate), + reinterpret_cast(up), + seq_len, inter_size); + case LLAISYS_DTYPE_BF16: 
+ return swiglu_(reinterpret_cast(out), + reinterpret_cast(gate), + reinterpret_cast(up), + seq_len, inter_size); + case LLAISYS_DTYPE_F16: + return swiglu_(reinterpret_cast(out), + reinterpret_cast(gate), + reinterpret_cast(up), + seq_len, inter_size); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} \ No newline at end of file diff --git a/src/ops/swiglu/cpu/swiglu_cpu.hpp b/src/ops/swiglu/cpu/swiglu_cpu.hpp new file mode 100644 index 00000000..e796b041 --- /dev/null +++ b/src/ops/swiglu/cpu/swiglu_cpu.hpp @@ -0,0 +1,7 @@ +#pragma once +#include "llaisys.h" +#include + +namespace llaisys::ops::cpu { +void swiglu(std::byte *out, const std::byte *gate, const std::byte *up, llaisysDataType_t type, size_t seq_len, size_t inter_size); +} \ No newline at end of file diff --git a/src/ops/swiglu/op.cpp b/src/ops/swiglu/op.cpp index 47edbcc9..5ce3c4bf 100644 --- a/src/ops/swiglu/op.cpp +++ b/src/ops/swiglu/op.cpp @@ -1,7 +1,36 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/swiglu_cpu.hpp" namespace llaisys::ops { void swiglu(tensor_t out, tensor_t gate, tensor_t up) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, gate, up); + CHECK_SAME_DTYPE(out->dtype(), gate->dtype(), up->dtype()); + CHECK_SAME_SHAPE(out->shape(), gate->shape(), up->shape()); + ASSERT(out->isContiguous() && gate->isContiguous() && up->isContiguous(), + "swiglu: all tensors must be contiguous."); + size_t seq_len = out->shape()[0]; + size_t inter_size = out->shape()[1]; + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::swiglu(out->data(), gate->data(), up->data(), + out->dtype(), seq_len, inter_size); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::swiglu(out->data(), gate->data(), up->data(), + out->dtype(), seq_len, inter_size); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + TO_BE_IMPLEMENTED(); + 
return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops From 94eafa51a6c3c73d15c9ac7e75b4c10a69426da5 Mon Sep 17 00:00:00 2001 From: leninist1 <282516536@qq.com> Date: Tue, 3 Feb 2026 16:33:41 +0800 Subject: [PATCH 06/14] inplement inference sys --- python/llaisys/libllaisys/__init__.py | 9 + python/llaisys/libllaisys/models.py | 65 +++++ python/llaisys/models/qwen2.py | 152 +++++++++- src/llaisys/qwen2.cc | 392 ++++++++++++++++++++++++++ 4 files changed, 608 insertions(+), 10 deletions(-) create mode 100644 python/llaisys/libllaisys/models.py create mode 100644 src/llaisys/qwen2.cc diff --git a/python/llaisys/libllaisys/__init__.py b/python/llaisys/libllaisys/__init__.py index f536fb52..fa872903 100644 --- a/python/llaisys/libllaisys/__init__.py +++ b/python/llaisys/libllaisys/__init__.py @@ -12,6 +12,10 @@ from .tensor import llaisysTensor_t from .tensor import load_tensor from .ops import load_ops +# Llaisys infer +from .models import load_models +from .models import LlaisysQwen2Meta +from .models import LlaisysQwen2Weights def load_shared_library(): @@ -38,6 +42,8 @@ def load_shared_library(): load_runtime(LIB_LLAISYS) load_tensor(LIB_LLAISYS) load_ops(LIB_LLAISYS) +# Llaisys load_models +load_models(LIB_LLAISYS) __all__ = [ @@ -52,4 +58,7 @@ def load_shared_library(): "llaisysMemcpyKind_t", "MemcpyKind", "llaisysStream_t", + # Llaisys c side + "LlaisysQwen2Meta", + "LlaisysQwen2Weights", ] diff --git a/python/llaisys/libllaisys/models.py b/python/llaisys/libllaisys/models.py new file mode 100644 index 00000000..ea313487 --- /dev/null +++ b/python/llaisys/libllaisys/models.py @@ -0,0 +1,65 @@ +import ctypes +from ctypes import c_size_t, c_int64, c_int, c_float +from .llaisys_types import llaisysDeviceType_t, llaisysDataType_t +from .tensor import llaisysTensor_t + +# c side wrap + +class LlaisysQwen2Meta(ctypes.Structure): + _fields_ = [ + ("dtype", llaisysDataType_t), + ("nlayer", c_size_t), + ("hs", c_size_t), + ("nh", 
c_size_t), + ("nkvh", c_size_t), + ("dh", c_size_t), + ("di", c_size_t), + ("maxseq", c_size_t), + ("voc", c_size_t), + ("epsilon", c_float), + ("theta", c_float), + ("end_token", c_int64), + ] + + +class LlaisysQwen2Weights(ctypes.Structure): + _fields_ = [ + ("in_embed", llaisysTensor_t), + ("out_embed", llaisysTensor_t), + ("out_norm_w", llaisysTensor_t), + ("attn_norm_w", ctypes.POINTER(llaisysTensor_t)), + ("attn_q_w", ctypes.POINTER(llaisysTensor_t)), + ("attn_q_b", ctypes.POINTER(llaisysTensor_t)), + ("attn_k_w", ctypes.POINTER(llaisysTensor_t)), + ("attn_k_b", ctypes.POINTER(llaisysTensor_t)), + ("attn_v_w", ctypes.POINTER(llaisysTensor_t)), + ("attn_v_b", ctypes.POINTER(llaisysTensor_t)), + ("attn_o_w", ctypes.POINTER(llaisysTensor_t)), + ("mlp_norm_w", ctypes.POINTER(llaisysTensor_t)), + ("mlp_gate_w", ctypes.POINTER(llaisysTensor_t)), + ("mlp_up_w", ctypes.POINTER(llaisysTensor_t)), + ("mlp_down_w", ctypes.POINTER(llaisysTensor_t)), + ] + + +def load_models(lib): + lib.llaisysQwen2ModelCreate.argtypes = [ + ctypes.POINTER(LlaisysQwen2Meta), + llaisysDeviceType_t, + ctypes.POINTER(c_int), + c_int, + ] + lib.llaisysQwen2ModelCreate.restype = ctypes.c_void_p + + lib.llaisysQwen2ModelDestroy.argtypes = [ctypes.c_void_p] + lib.llaisysQwen2ModelDestroy.restype = None + + lib.llaisysQwen2ModelWeights.argtypes = [ctypes.c_void_p] + lib.llaisysQwen2ModelWeights.restype = ctypes.POINTER(LlaisysQwen2Weights) + + lib.llaisysQwen2ModelInfer.argtypes = [ + ctypes.c_void_p, + ctypes.POINTER(c_int64), + c_size_t, + ] + lib.llaisysQwen2ModelInfer.restype = c_int64 diff --git a/python/llaisys/models/qwen2.py b/python/llaisys/models/qwen2.py index 0d07b0b2..98efbe28 100644 --- a/python/llaisys/models/qwen2.py +++ b/python/llaisys/models/qwen2.py @@ -1,23 +1,143 @@ from typing import Sequence from ..libllaisys import LIB_LLAISYS from ..libllaisys import DeviceType +from ..libllaisys import DataType +from ..libllaisys import llaisysDeviceType_t +from ..libllaisys.models 
import LlaisysQwen2Meta from pathlib import Path import safetensors +import json +import ctypes class Qwen2: - def __init__(self, model_path, device: DeviceType = DeviceType.CPU): - # TODO: Implement model constructor - + self._device = device model_path = Path(model_path) + config_path = model_path / "config.json" + # read model config from config.json + with open(config_path, "r", encoding="utf-8") as f: + config = json.load(f) + + # parse dim + hs = int(config["hidden_size"]) + nlayer = int(config["num_hidden_layers"]) + nh = int(config["num_attention_heads"]) + nkvh = int(config.get("num_key_value_heads", nh)) + di = int(config["intermediate_size"]) + dh = int(hs // nh) + + # parse key params + maxseq = int(config["max_position_embeddings"]) + voc = int(config["vocab_size"]) + epsilon = float(config["rms_norm_eps"]) + theta = float(config["rope_theta"]) + end_token = int(config["eos_token_id"]) + + + # construct C struct LlaisysQwen2Meta + meta = LlaisysQwen2Meta( + dtype=DataType.BF16, + nlayer=nlayer, + hs=hs, + nh=nh, + nkvh=nkvh, + dh=dh, + di=di, + maxseq=maxseq, + voc=voc, + epsilon=epsilon, + theta=theta, + end_token=end_token, + ) + + # only use cpu + device_ids = (ctypes.c_int * 1)(0) + # create model instance + self._model = LIB_LLAISYS.llaisysQwen2ModelCreate( + ctypes.byref(meta), llaisysDeviceType_t(device), device_ids, 1 + ) + + # get model weights + self._weights = LIB_LLAISYS.llaisysQwen2ModelWeights(self._model).contents + self._end_token = end_token + # traverse all safetensors files, in fact only one file in qwen2 for file in sorted(model_path.glob("*.safetensors")): - data_ = safetensors.safe_open(file, framework="numpy", device="cpu") + # load on cpu, I use pt framework to load bfloat16 weights here + data_ = safetensors.safe_open(file, framework="pt", device="cpu") for name_ in data_.keys(): - ## TODO: load the model weights - pass + weight = self._match_weight(name_) + if weight is None: + continue + # load weight to c side + arr = 
data_.get_tensor(name_).contiguous() #c-contiguous + + LIB_LLAISYS.tensorLoad(weight, ctypes.c_void_p(arr.data_ptr())) + + + def _match_weight(self, name: str): + # match weight name to c struct field + + w = self._weights + # input embedding + if name == "model.embed_tokens.weight": + return w.in_embed + # output embedding + if name in ("lm_head.weight", "model.lm_head.weight"): + return w.out_embed + # final LayerNorm + if name == "model.norm.weight": + return w.out_norm_w + # only processtransformer layer weights + if not name.startswith("model.layers."): + return None + parts = name.split(".") + if len(parts) < 5: + return None + layer = int(parts[2]) # 提取层索引 + tail = ".".join(parts[3:]) # 剩余后缀 + # Attention Layer + if tail == "input_layernorm.weight": + return w.attn_norm_w[layer] + if tail == "self_attn.q_proj.weight": + return w.attn_q_w[layer] + if tail == "self_attn.q_proj.bias": + return w.attn_q_b[layer] + if tail == "self_attn.k_proj.weight": + return w.attn_k_w[layer] + if tail == "self_attn.k_proj.bias": + return w.attn_k_b[layer] + if tail == "self_attn.v_proj.weight": + return w.attn_v_w[layer] + if tail == "self_attn.v_proj.bias": + return w.attn_v_b[layer] + if tail == "self_attn.o_proj.weight": + return w.attn_o_w[layer] + # FFN layer + if tail == "post_attention_layernorm.weight": + return w.mlp_norm_w[layer] + if tail == "mlp.gate_proj.weight": + return w.mlp_gate_w[layer] + if tail == "mlp.up_proj.weight": + return w.mlp_up_w[layer] + if tail == "mlp.down_proj.weight": + return w.mlp_down_w[layer] + return None + + def _infer(self, tokens: Sequence[int]) -> int: + # step forward infer + + if len(tokens) == 0: + return self._end_token + # convert python list to c int64 array + arr = (ctypes.c_int64 * len(tokens))(*tokens) + return int( + LIB_LLAISYS.llaisysQwen2ModelInfer( + self._model, arr, ctypes.c_size_t(len(tokens)) + ) + ) def generate( self, @@ -27,7 +147,19 @@ def generate( top_p: float = 0.8, temperature: float = 0.8, ): - - # TODO: 
Implement generate function - - return [] + # max new tokens default value:32 + if max_new_tokens is None: + max_new_tokens = 32 + tokens = list(inputs) + if max_new_tokens == 0: + return tokens + # prefill + next_token = self._infer(tokens) + tokens.append(next_token) + # decode + for _ in range(max_new_tokens - 1): + if tokens[-1] == self._end_token: + break + next_token = self._infer([tokens[-1]]) + tokens.append(next_token) + return tokens diff --git a/src/llaisys/qwen2.cc b/src/llaisys/qwen2.cc new file mode 100644 index 00000000..b1c1f41b --- /dev/null +++ b/src/llaisys/qwen2.cc @@ -0,0 +1,392 @@ +#include "llaisys/models/qwen2.h" +#include "llaisys_tensor.hpp" + +#include "../tensor/tensor.hpp" +#include "../ops/add/op.hpp" +#include "../ops/argmax/op.hpp" +#include "../ops/embedding/op.hpp" +#include "../ops/linear/op.hpp" +#include "../ops/rearrange/op.hpp" +#include "../ops/rms_norm/op.hpp" +#include "../ops/rope/op.hpp" +#include "../ops/self_attention/op.hpp" +#include "../ops/swiglu/op.hpp" +#include "../utils.hpp" + +#include +#include +#include + +// wrap c++ tensor to external handle +namespace { + +// based on modelMetaInfo create tensor +llaisys::tensor_t make_tensor( + const LlaisysQwen2Meta &meta, + llaisysDeviceType_t device, + int device_id, + const std::vector &shape) { + return llaisys::Tensor::create(shape, meta.dtype, device, device_id); +} + +// based on dtype create tensor +llaisys::tensor_t make_tensor_dtype( + llaisysDataType_t dtype, + llaisysDeviceType_t device, + int device_id, + const std::vector &shape) { + return llaisys::Tensor::create(shape, dtype, device, device_id); +} + +// set tensor data to zero +void zero_tensor(const llaisys::tensor_t &t) { + CHECK_ARGUMENT(t->deviceType() == LLAISYS_DEVICE_CPU, "Zero: only CPU is supported."); + std::memset(t->data(), 0, t->numel() * t->elementSize()); +} + +// wrap c++ tensor to external handle and record to handles list +llaisysTensor_t wrap_tensor( + const llaisys::tensor_t &t, + 
std::vector &handles) { + auto *h = new LlaisysTensor{t}; + handles.push_back(h); + return h; +} + +} + +// Qwen2 Model Instance Structure +struct LlaisysQwen2Model { + LlaisysQwen2Meta meta; // model meta info + llaisysDeviceType_t device; // device type + int device_id; // device id + + LlaisysQwen2Weights weights; // all weight tensors handle + std::vector handles; // handles list for unified release + + // attn out bias + std::vector attn_o_bias; + // MLP gate bias + std::vector mlp_gate_bias; + // MLP up bias + std::vector mlp_up_bias; + // MLP down bias + std::vector mlp_down_bias; + // out bias + llaisys::tensor_t out_bias; + + // KV cache + std::vector k_cache; + std::vector v_cache; + + size_t cache_len; +}; + +// init model weights, allocate memory and zero bias +static void init_weights(LlaisysQwen2Model *model) { + const auto &m = model->meta; + // input embedding + model->weights.in_embed = wrap_tensor( + make_tensor(m, model->device, model->device_id, {m.voc, m.hs}), + model->handles); + // output embedding + model->weights.out_embed = wrap_tensor( + make_tensor(m, model->device, model->device_id, {m.voc, m.hs}), + model->handles); + // output layer norm + model->weights.out_norm_w = wrap_tensor( + make_tensor(m, model->device, model->device_id, {m.hs}), + model->handles); + + // allocate ptr array for each layer + model->weights.attn_norm_w = new llaisysTensor_t[m.nlayer]; + model->weights.attn_q_w = new llaisysTensor_t[m.nlayer]; + model->weights.attn_q_b = new llaisysTensor_t[m.nlayer]; + model->weights.attn_k_w = new llaisysTensor_t[m.nlayer]; + model->weights.attn_k_b = new llaisysTensor_t[m.nlayer]; + model->weights.attn_v_w = new llaisysTensor_t[m.nlayer]; + model->weights.attn_v_b = new llaisysTensor_t[m.nlayer]; + model->weights.attn_o_w = new llaisysTensor_t[m.nlayer]; + model->weights.mlp_norm_w = new llaisysTensor_t[m.nlayer]; + model->weights.mlp_gate_w = new llaisysTensor_t[m.nlayer]; + model->weights.mlp_up_w = new 
llaisysTensor_t[m.nlayer]; + model->weights.mlp_down_w = new llaisysTensor_t[m.nlayer]; + + // bias + model->attn_o_bias.resize(m.nlayer); + model->mlp_gate_bias.resize(m.nlayer); + model->mlp_up_bias.resize(m.nlayer); + model->mlp_down_bias.resize(m.nlayer); + + // for each layer, create its weight and bias tensor + // first weight tensor + for (size_t i = 0; i < m.nlayer; ++i) { + // attn layer norm + model->weights.attn_norm_w[i] = wrap_tensor( + make_tensor(m, model->device, model->device_id, {m.hs}), + model->handles); + // Q/K/V/O proj weight and bias + model->weights.attn_q_w[i] = wrap_tensor( + make_tensor(m, model->device, model->device_id, {m.nh * m.dh, m.hs}), + model->handles); + model->weights.attn_q_b[i] = wrap_tensor( + make_tensor(m, model->device, model->device_id, {m.nh * m.dh}), + model->handles); + model->weights.attn_k_w[i] = wrap_tensor( + make_tensor(m, model->device, model->device_id, {m.nkvh * m.dh, m.hs}), + model->handles); + model->weights.attn_k_b[i] = wrap_tensor( + make_tensor(m, model->device, model->device_id, {m.nkvh * m.dh}), + model->handles); + model->weights.attn_v_w[i] = wrap_tensor( + make_tensor(m, model->device, model->device_id, {m.nkvh * m.dh, m.hs}), + model->handles); + model->weights.attn_v_b[i] = wrap_tensor( + make_tensor(m, model->device, model->device_id, {m.nkvh * m.dh}), + model->handles); + model->weights.attn_o_w[i] = wrap_tensor( + make_tensor(m, model->device, model->device_id, {m.hs, m.nh * m.dh}), + model->handles); + + // MLP layer norm and gate/up/down weight + model->weights.mlp_norm_w[i] = wrap_tensor( + make_tensor(m, model->device, model->device_id, {m.hs}), + model->handles); + model->weights.mlp_gate_w[i] = wrap_tensor( + make_tensor(m, model->device, model->device_id, {m.di, m.hs}), + model->handles); + model->weights.mlp_up_w[i] = wrap_tensor( + make_tensor(m, model->device, model->device_id, {m.di, m.hs}), + model->handles); + model->weights.mlp_down_w[i] = wrap_tensor( + make_tensor(m, 
model->device, model->device_id, {m.hs, m.di}), + model->handles); + + // bias tensor + model->attn_o_bias[i] = make_tensor(m, model->device, model->device_id, {m.hs}); + model->mlp_gate_bias[i] = make_tensor(m, model->device, model->device_id, {m.di}); + model->mlp_up_bias[i] = make_tensor(m, model->device, model->device_id, {m.di}); + model->mlp_down_bias[i] = make_tensor(m, model->device, model->device_id, {m.hs}); + + // initialize bias tensor to 0 + zero_tensor(model->weights.attn_q_b[i]->tensor); + zero_tensor(model->weights.attn_k_b[i]->tensor); + zero_tensor(model->weights.attn_v_b[i]->tensor); + zero_tensor(model->attn_o_bias[i]); + zero_tensor(model->mlp_gate_bias[i]); + zero_tensor(model->mlp_up_bias[i]); + zero_tensor(model->mlp_down_bias[i]); + } + + // output layerbias + model->out_bias = make_tensor(m, model->device, model->device_id, {m.voc}); + zero_tensor(model->out_bias); +} + +// initialize kv cache tensor +static void init_cache(LlaisysQwen2Model *model) { + const auto &m = model->meta; + model->k_cache.resize(m.nlayer); + model->v_cache.resize(m.nlayer); + for (size_t i = 0; i < m.nlayer; ++i) { + // cache shape: [maxseq, num_heads, head_dim] + model->k_cache[i] = make_tensor(m, model->device, model->device_id, {m.maxseq, m.nkvh, m.dh}); + model->v_cache[i] = make_tensor(m, model->device, model->device_id, {m.maxseq, m.nkvh, m.dh}); + } + model->cache_len = 0; +} + +// inference implementation +static int64_t infer_impl(LlaisysQwen2Model *model, const int64_t *token_ids, size_t ntoken) { + CHECK_ARGUMENT(model != nullptr, "model is null"); + CHECK_ARGUMENT(token_ids != nullptr || ntoken == 0, "token_ids is null"); + CHECK_ARGUMENT(model->device == LLAISYS_DEVICE_CPU, "Only CPU device is supported."); + + if (ntoken == 0) { + return model->meta.end_token; + } + + // prefill + if (ntoken > 1 || model->cache_len == 0) { + model->cache_len = 0; + } + + size_t seqlen = ntoken; + size_t pos_offset = model->cache_len; + + // position ID [pos_offset, 
pos_offset + seqlen) + std::vector pos_ids(seqlen); + for (size_t i = 0; i < seqlen; ++i) { + pos_ids[i] = static_cast(pos_offset + i); + } + + // input token and position ID tensors + auto input_ids = make_tensor_dtype(LLAISYS_DTYPE_I64, model->device, model->device_id, {seqlen}); + input_ids->load(token_ids); + + auto pos_tensor = make_tensor_dtype(LLAISYS_DTYPE_I64, model->device, model->device_id, {seqlen}); + pos_tensor->load(pos_ids.data()); + + // embedding lookup + auto x = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.hs}); + llaisys::ops::embedding(x, input_ids, model->weights.in_embed->tensor); + + // attn scale factor + float scale = 1.0f / std::sqrt(static_cast(model->meta.dh)); + + // layer forward + for (size_t i = 0; i < model->meta.nlayer; ++i) { + // attn input norm + auto x_norm = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.hs}); + llaisys::ops::rms_norm(x_norm, x, model->weights.attn_norm_w[i]->tensor, model->meta.epsilon); + + // Q/K/V linear proj + auto q = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.nh * model->meta.dh}); + auto k = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.nkvh * model->meta.dh}); + auto v = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.nkvh * model->meta.dh}); + + llaisys::ops::linear(q, x_norm, model->weights.attn_q_w[i]->tensor, model->weights.attn_q_b[i]->tensor); + llaisys::ops::linear(k, x_norm, model->weights.attn_k_w[i]->tensor, model->weights.attn_k_b[i]->tensor); + llaisys::ops::linear(v, x_norm, model->weights.attn_v_w[i]->tensor, model->weights.attn_v_b[i]->tensor); + + // transform to multi-head dim + auto q_view = q->view({seqlen, model->meta.nh, model->meta.dh}); + auto k_view = k->view({seqlen, model->meta.nkvh, model->meta.dh}); + auto v_view = v->view({seqlen, model->meta.nkvh, model->meta.dh}); + + // RoPE + auto q_rope = 
make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.nh, model->meta.dh}); + auto k_rope = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.nkvh, model->meta.dh}); + + llaisys::ops::rope(q_rope, q_view, pos_tensor, model->meta.theta); + llaisys::ops::rope(k_rope, k_view, pos_tensor, model->meta.theta); + + // write new k/v to cache + auto k_cache_slice = model->k_cache[i]->slice(0, model->cache_len, model->cache_len + seqlen); + auto v_cache_slice = model->v_cache[i]->slice(0, model->cache_len, model->cache_len + seqlen); + + llaisys::ops::rearrange(k_cache_slice, k_rope); + llaisys::ops::rearrange(v_cache_slice, v_view); + + // get all history k/v + size_t total_len = model->cache_len + seqlen; + auto k_total = model->k_cache[i]->slice(0, 0, total_len); + auto v_total = model->v_cache[i]->slice(0, 0, total_len); + + // self attn + auto attn = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.nh, model->meta.dh}); + llaisys::ops::self_attention(attn, q_rope, k_total, v_total, scale); + + // attn out proj + auto attn_flat = attn->view({seqlen, model->meta.nh * model->meta.dh}); + auto attn_proj = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.hs}); + llaisys::ops::linear(attn_proj, attn_flat, model->weights.attn_o_w[i]->tensor, model->attn_o_bias[i]); + + // first residual conn + auto res1 = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.hs}); + llaisys::ops::add(res1, x, attn_proj); + + // MLP input norm + auto x_norm2 = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.hs}); + llaisys::ops::rms_norm(x_norm2, res1, model->weights.mlp_norm_w[i]->tensor, model->meta.epsilon); + + // MLP gate and up proj + auto gate = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.di}); + auto up = make_tensor(model->meta, model->device, model->device_id, {seqlen, 
model->meta.di}); + + llaisys::ops::linear(gate, x_norm2, model->weights.mlp_gate_w[i]->tensor, model->mlp_gate_bias[i]); + llaisys::ops::linear(up, x_norm2, model->weights.mlp_up_w[i]->tensor, model->mlp_up_bias[i]); + + // SwiGLU activate + auto swiglu_out = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.di}); + llaisys::ops::swiglu(swiglu_out, gate, up); + + // MLP down proj + auto down = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.hs}); + llaisys::ops::linear(down, swiglu_out, model->weights.mlp_down_w[i]->tensor, model->mlp_down_bias[i]); + + // second residual conn + auto res2 = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.hs}); + llaisys::ops::add(res2, res1, down); + + x = res2; + } + + // update cache len + model->cache_len += seqlen; + + // final norm + auto x_norm = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.hs}); + llaisys::ops::rms_norm(x_norm, x, model->weights.out_norm_w->tensor, model->meta.epsilon); + + // output proj to vocab + auto logits = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.voc}); + llaisys::ops::linear(logits, x_norm, model->weights.out_embed->tensor, model->out_bias); + + // get last token logits and argmax + auto last = logits->slice(0, seqlen - 1, seqlen)->view({model->meta.voc}); + auto max_idx = make_tensor_dtype(LLAISYS_DTYPE_I64, model->device, model->device_id, {1}); + auto max_val = make_tensor(model->meta, model->device, model->device_id, {1}); + llaisys::ops::argmax(max_idx, max_val, last); + + return reinterpret_cast(max_idx->data())[0]; +} + +// C API wrapper +__C { + +// ModelCreate: Check args, initialize weights and cache +struct LlaisysQwen2Model *llaisysQwen2ModelCreate( + const LlaisysQwen2Meta *meta, + llaisysDeviceType_t device, + int *device_ids, + int ndevice) { + CHECK_ARGUMENT(meta != nullptr, "meta is null"); + CHECK_ARGUMENT(device == 
LLAISYS_DEVICE_CPU, "Only CPU device is supported."); + CHECK_ARGUMENT(ndevice >= 1, "Invalid device count."); + + auto *model = new LlaisysQwen2Model(); + model->meta = *meta; + model->device = device; + model->device_id = device_ids ? device_ids[0] : 0; + init_weights(model); + init_cache(model); + return model; +} + +// ModelDestroy: Free weights and cache, all handles +void llaisysQwen2ModelDestroy(struct LlaisysQwen2Model *model) { + if (!model) { + return; + } + delete[] model->weights.attn_norm_w; + delete[] model->weights.attn_q_w; + delete[] model->weights.attn_q_b; + delete[] model->weights.attn_k_w; + delete[] model->weights.attn_k_b; + delete[] model->weights.attn_v_w; + delete[] model->weights.attn_v_b; + delete[] model->weights.attn_o_w; + delete[] model->weights.mlp_norm_w; + delete[] model->weights.mlp_gate_w; + delete[] model->weights.mlp_up_w; + delete[] model->weights.mlp_down_w; + + for (auto *h : model->handles) { + delete h; + } + delete model; +} + +// ModelWeights: Get weights pointer for loading pretrained params +struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model *model) { + CHECK_ARGUMENT(model != nullptr, "model is null"); + return &model->weights; +} + +// ModelInfer: Single interface for single token prediction +int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model *model, int64_t *token_ids, size_t ntoken) { + return infer_impl(model, token_ids, ntoken); +} + +} From 2e900c70d6643cddef040f96d622a57fc6b52135 Mon Sep 17 00:00:00 2001 From: leninist1 <282516536@qq.com> Date: Tue, 3 Feb 2026 16:55:41 +0800 Subject: [PATCH 07/14] erase exception throw in qwen2.cc's extern C --- src/llaisys/qwen2.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/llaisys/qwen2.cc b/src/llaisys/qwen2.cc index b1c1f41b..bd070c7d 100644 --- a/src/llaisys/qwen2.cc +++ b/src/llaisys/qwen2.cc @@ -341,9 +341,6 @@ struct LlaisysQwen2Model *llaisysQwen2ModelCreate( llaisysDeviceType_t device, int *device_ids, int ndevice) { - 
CHECK_ARGUMENT(meta != nullptr, "meta is null"); - CHECK_ARGUMENT(device == LLAISYS_DEVICE_CPU, "Only CPU device is supported."); - CHECK_ARGUMENT(ndevice >= 1, "Invalid device count."); auto *model = new LlaisysQwen2Model(); model->meta = *meta; @@ -380,7 +377,6 @@ void llaisysQwen2ModelDestroy(struct LlaisysQwen2Model *model) { // ModelWeights: Get weights pointer for loading pretrained params struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model *model) { - CHECK_ARGUMENT(model != nullptr, "model is null"); return &model->weights; } From e6c10aa0dee7f44410bd2b3cc19982d103acc625 Mon Sep 17 00:00:00 2001 From: leninist1 <282516536@qq.com> Date: Wed, 25 Feb 2026 17:45:03 +0800 Subject: [PATCH 08/14] implement random_sample op to replace argmax && test passed. --- include/llaisys/models/qwen2.h | 3 +- include/llaisys/ops.h | 2 + python/llaisys/libllaisys/models.py | 4 + python/llaisys/libllaisys/ops.py | 13 +- python/llaisys/models/qwen2.py | 16 ++- python/llaisys/ops.py | 22 +++- src/llaisys/ops.cc | 5 + src/llaisys/qwen2.cc | 20 +-- src/ops/rand_sample/cpu/rand_sample_cpu.cpp | 129 ++++++++++++++++++++ src/ops/rand_sample/cpu/rand_sample_cpu.hpp | 9 ++ src/ops/rand_sample/op.cpp | 53 ++++++++ src/ops/rand_sample/op.hpp | 8 ++ 12 files changed, 268 insertions(+), 16 deletions(-) create mode 100644 src/ops/rand_sample/cpu/rand_sample_cpu.cpp create mode 100644 src/ops/rand_sample/cpu/rand_sample_cpu.hpp create mode 100644 src/ops/rand_sample/op.cpp create mode 100644 src/ops/rand_sample/op.hpp diff --git a/include/llaisys/models/qwen2.h b/include/llaisys/models/qwen2.h index 7054626d..8840660f 100644 --- a/include/llaisys/models/qwen2.h +++ b/include/llaisys/models/qwen2.h @@ -37,6 +37,7 @@ __C { __export struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model * model); - __export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken); + __export int64_t 
llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken, + float temperature, size_t topK, float topP, int64_t seed); } #endif // LLAISYS_MODELS_QWEN2_H diff --git a/include/llaisys/ops.h b/include/llaisys/ops.h index ddb3be24..15b70067 100644 --- a/include/llaisys/ops.h +++ b/include/llaisys/ops.h @@ -6,6 +6,8 @@ __C { __export void llaisysAdd(llaisysTensor_t c, llaisysTensor_t a, llaisysTensor_t b); __export void llaisysArgmax(llaisysTensor_t max_idx, llaisysTensor_t max_val, llaisysTensor_t vals); + __export void llaisysRandSample(llaisysTensor_t sample_idx, llaisysTensor_t sample_val, llaisysTensor_t vals, + float temperature, size_t topK, float topP, int64_t seed); __export void llaisysEmbedding(llaisysTensor_t out, llaisysTensor_t index, llaisysTensor_t weight); __export void llaisysLinear(llaisysTensor_t out, llaisysTensor_t in, llaisysTensor_t weight, llaisysTensor_t bias); __export void llaisysRearrange(llaisysTensor_t out, llaisysTensor_t in); diff --git a/python/llaisys/libllaisys/models.py b/python/llaisys/libllaisys/models.py index ea313487..a3807fae 100644 --- a/python/llaisys/libllaisys/models.py +++ b/python/llaisys/libllaisys/models.py @@ -61,5 +61,9 @@ def load_models(lib): ctypes.c_void_p, ctypes.POINTER(c_int64), c_size_t, + c_float, + c_size_t, + c_float, + c_int64, ] lib.llaisysQwen2ModelInfer.restype = c_int64 diff --git a/python/llaisys/libllaisys/ops.py b/python/llaisys/libllaisys/ops.py index 5be095ef..e6ecfaa6 100644 --- a/python/llaisys/libllaisys/ops.py +++ b/python/llaisys/libllaisys/ops.py @@ -1,5 +1,5 @@ from .tensor import llaisysTensor_t -from ctypes import c_float +from ctypes import c_float, c_int64, c_size_t def load_ops(lib): lib.llaisysAdd.argtypes = [llaisysTensor_t, llaisysTensor_t, llaisysTensor_t] @@ -8,6 +8,17 @@ def load_ops(lib): lib.llaisysArgmax.argtypes = [llaisysTensor_t, llaisysTensor_t, llaisysTensor_t] lib.llaisysArgmax.restype = None + lib.llaisysRandSample.argtypes = [ + 
llaisysTensor_t, + llaisysTensor_t, + llaisysTensor_t, + c_float, + c_size_t, + c_float, + c_int64, + ] + lib.llaisysRandSample.restype = None + lib.llaisysEmbedding.argtypes = [llaisysTensor_t, llaisysTensor_t, llaisysTensor_t] lib.llaisysEmbedding.restype = None diff --git a/python/llaisys/models/qwen2.py b/python/llaisys/models/qwen2.py index 98efbe28..dcd1263b 100644 --- a/python/llaisys/models/qwen2.py +++ b/python/llaisys/models/qwen2.py @@ -126,7 +126,7 @@ def _match_weight(self, name: str): return w.mlp_down_w[layer] return None - def _infer(self, tokens: Sequence[int]) -> int: + def _infer(self, tokens: Sequence[int], temperature: float, top_k: int, top_p: float, seed: int) -> int: # step forward infer if len(tokens) == 0: @@ -135,7 +135,13 @@ def _infer(self, tokens: Sequence[int]) -> int: arr = (ctypes.c_int64 * len(tokens))(*tokens) return int( LIB_LLAISYS.llaisysQwen2ModelInfer( - self._model, arr, ctypes.c_size_t(len(tokens)) + self._model, + arr, + ctypes.c_size_t(len(tokens)), + ctypes.c_float(temperature), + ctypes.c_size_t(top_k), + ctypes.c_float(top_p), + ctypes.c_int64(seed), ) ) @@ -146,6 +152,7 @@ def generate( top_k: int = 1, top_p: float = 0.8, temperature: float = 0.8, + seed: int = 0, ): # max new tokens default value:32 if max_new_tokens is None: @@ -154,12 +161,13 @@ def generate( if max_new_tokens == 0: return tokens # prefill - next_token = self._infer(tokens) + next_token = self._infer(tokens, temperature, top_k, top_p, seed) tokens.append(next_token) # decode for _ in range(max_new_tokens - 1): if tokens[-1] == self._end_token: break - next_token = self._infer([tokens[-1]]) + seed += 1 + next_token = self._infer([tokens[-1]], temperature, top_k, top_p, seed) tokens.append(next_token) return tokens diff --git a/python/llaisys/ops.py b/python/llaisys/ops.py index ed0180bc..8c835eeb 100644 --- a/python/llaisys/ops.py +++ b/python/llaisys/ops.py @@ -1,6 +1,6 @@ from .libllaisys import LIB_LLAISYS from .tensor import Tensor -from ctypes 
import c_float, c_int +from ctypes import c_float, c_int, c_int64, c_size_t class Ops: @@ -12,6 +12,26 @@ def add(c: Tensor, a: Tensor, b: Tensor): def argmax(max_idx: Tensor, max_val: Tensor, vals: Tensor): LIB_LLAISYS.llaisysArgmax(max_idx.lib_tensor(), max_val.lib_tensor(), vals.lib_tensor()) + @staticmethod + def rand_sample( + sample_idx: Tensor, + sample_val: Tensor, + vals: Tensor, + temperature: float, + top_k: int, + top_p: float, + seed: int, + ): + LIB_LLAISYS.llaisysRandSample( + sample_idx.lib_tensor(), + sample_val.lib_tensor(), + vals.lib_tensor(), + c_float(temperature), + c_size_t(top_k), + c_float(top_p), + c_int64(seed), + ) + @staticmethod def embedding(out: Tensor, index: Tensor, weight: Tensor): LIB_LLAISYS.llaisysEmbedding( diff --git a/src/llaisys/ops.cc b/src/llaisys/ops.cc index c99fbc32..50186759 100644 --- a/src/llaisys/ops.cc +++ b/src/llaisys/ops.cc @@ -6,6 +6,7 @@ #include "../ops/argmax/op.hpp" #include "../ops/embedding/op.hpp" #include "../ops/linear/op.hpp" +#include "../ops/rand_sample/op.hpp" #include "../ops/rearrange/op.hpp" #include "../ops/rms_norm/op.hpp" #include "../ops/rope/op.hpp" @@ -19,6 +20,10 @@ __C { void llaisysArgmax(llaisysTensor_t max_idx, llaisysTensor_t max_val, llaisysTensor_t vals) { llaisys::ops::argmax(max_idx->tensor, max_val->tensor, vals->tensor); } + void llaisysRandSample(llaisysTensor_t sample_idx, llaisysTensor_t sample_val, llaisysTensor_t vals, + float temperature, size_t topK, float topP, int64_t seed) { + llaisys::ops::rand_sample(sample_idx->tensor, sample_val->tensor, vals->tensor, temperature, topK, topP, seed); + } void llaisysEmbedding(llaisysTensor_t out, llaisysTensor_t index, llaisysTensor_t weight) { llaisys::ops::embedding(out->tensor, index->tensor, weight->tensor); } diff --git a/src/llaisys/qwen2.cc b/src/llaisys/qwen2.cc index bd070c7d..e9d123b1 100644 --- a/src/llaisys/qwen2.cc +++ b/src/llaisys/qwen2.cc @@ -3,7 +3,7 @@ #include "../tensor/tensor.hpp" #include "../ops/add/op.hpp" 
-#include "../ops/argmax/op.hpp" +#include "../ops/rand_sample/op.hpp" #include "../ops/embedding/op.hpp" #include "../ops/linear/op.hpp" #include "../ops/rearrange/op.hpp" @@ -197,7 +197,8 @@ static void init_cache(LlaisysQwen2Model *model) { } // inference implementation -static int64_t infer_impl(LlaisysQwen2Model *model, const int64_t *token_ids, size_t ntoken) { +static int64_t infer_impl(LlaisysQwen2Model *model, const int64_t *token_ids, size_t ntoken, float temperature, + size_t topK, float topP, int64_t seed) { CHECK_ARGUMENT(model != nullptr, "model is null"); CHECK_ARGUMENT(token_ids != nullptr || ntoken == 0, "token_ids is null"); CHECK_ARGUMENT(model->device == LLAISYS_DEVICE_CPU, "Only CPU device is supported."); @@ -323,13 +324,13 @@ static int64_t infer_impl(LlaisysQwen2Model *model, const int64_t *token_ids, si auto logits = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.voc}); llaisys::ops::linear(logits, x_norm, model->weights.out_embed->tensor, model->out_bias); - // get last token logits and argmax + // get last token logits and sample auto last = logits->slice(0, seqlen - 1, seqlen)->view({model->meta.voc}); - auto max_idx = make_tensor_dtype(LLAISYS_DTYPE_I64, model->device, model->device_id, {1}); - auto max_val = make_tensor(model->meta, model->device, model->device_id, {1}); - llaisys::ops::argmax(max_idx, max_val, last); + auto sample_idx = make_tensor_dtype(LLAISYS_DTYPE_I64, model->device, model->device_id, {1}); + auto sample_val = make_tensor(model->meta, model->device, model->device_id, {1}); + llaisys::ops::rand_sample(sample_idx, sample_val, last, temperature, topK, topP, seed); - return reinterpret_cast(max_idx->data())[0]; + return reinterpret_cast(sample_idx->data())[0]; } // C API wrapper @@ -381,8 +382,9 @@ struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model *m } // ModelInfer: Single interface for single token prediction -int64_t llaisysQwen2ModelInfer(struct 
LlaisysQwen2Model *model, int64_t *token_ids, size_t ntoken) { - return infer_impl(model, token_ids, ntoken); +int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model *model, int64_t *token_ids, size_t ntoken, float temperature, + size_t topK, float topP, int64_t seed) { + return infer_impl(model, token_ids, ntoken, temperature, topK, topP, seed); } } diff --git a/src/ops/rand_sample/cpu/rand_sample_cpu.cpp b/src/ops/rand_sample/cpu/rand_sample_cpu.cpp new file mode 100644 index 00000000..aa8ffb5e --- /dev/null +++ b/src/ops/rand_sample/cpu/rand_sample_cpu.cpp @@ -0,0 +1,129 @@ +#include "rand_sample_cpu.hpp" + +#include "../../../utils.hpp" + +#include +#include +#include +#include + +template +static void rand_sample_(std::byte *sample_idx, std::byte *sample_val, const std::byte *vals, const float temperature, + const size_t topK, const float topP, size_t numel, const int64_t batch_size, const int64_t seed) { + const T *v_all = reinterpret_cast(vals); + auto *out_idx = reinterpret_cast(sample_idx); + auto *out_val = reinterpret_cast(sample_val); + float temp = temperature; + if (temp <= 1e-6f) { + temp = 1e-6f; + } + + std::vector scores(numel); + std::vector> sorted_scores(numel); + std::vector> candidates; + candidates.reserve(numel); + + std::mt19937_64 rng(static_cast(seed)); + std::uniform_real_distribution dist(0.0f, 1.0f); + + for (int64_t b = 0; b < batch_size; ++b) { + const T *v = v_all + b * numel; + size_t max_i = 0; + float max_v = llaisys::utils::cast(v[0]); + for (size_t i = 1; i < numel; ++i) { + float cur = llaisys::utils::cast(v[i]); + if (cur > max_v) { + max_v = cur; + max_i = i; + } + } + + for (size_t i = 0; i < numel; ++i) { + float cur = llaisys::utils::cast(v[i]); + scores[i] = std::exp((cur - max_v) / temp); + } + float sum = 0.0f; + for (size_t i = 0; i < numel; ++i) { + sum += scores[i]; + } + if (sum <= 0.0f) { + for (size_t i = 0; i < numel; ++i) { + scores[i] = 0.0f; + } + scores[max_i] = 1.0f; + sum = 1.0f; + } else { + for (size_t 
i = 0; i < numel; ++i) { + scores[i] /= sum; + } + } + + for (size_t i = 0; i < numel; ++i) { + sorted_scores[i] = {scores[i], i}; + } + std::sort(sorted_scores.begin(), sorted_scores.end(), [](const auto &a, const auto &b) { return a.first > b.first; }); + + size_t k = numel; + if (topK > 0 && topK < numel) { + k = topK; + } + + candidates.clear(); + if (topP > 0.0f && topP < 1.0f) { + float cum_score = 0.0f; + for (size_t i = 0; i < k; ++i) { + candidates.push_back(sorted_scores[i]); + cum_score += sorted_scores[i].first; + if (cum_score >= topP) { + break; + } + } + } else { + for (size_t i = 0; i < k; ++i) { + candidates.push_back(sorted_scores[i]); + } + } + if (candidates.empty()) { + candidates.push_back(sorted_scores[0]); + } + + float cand_sum = 0.0f; + for (const auto &item : candidates) { + cand_sum += item.first; + } + + size_t chosen = candidates[0].second; + if (cand_sum > 0.0f) { + float r = dist(rng); + for (const auto &item : candidates) { + r -= item.first / cand_sum; + if (r <= 0.0f) { + chosen = item.second; + break; + } + } + if (r > 0.0f) { + chosen = candidates.back().second; + } + } + + out_idx[b] = static_cast(chosen); + out_val[b] = v[chosen]; + } +} + +namespace llaisys::ops::cpu { +void rand_sample(std::byte *sample_idx, std::byte *sample_val, const std::byte *vals, llaisysDataType_t type, size_t numel, + const int64_t batch_size, const float temperature, const size_t topK, const float topP, const int64_t seed) { + switch (type) { + case LLAISYS_DTYPE_F32: + return rand_sample_(sample_idx, sample_val, vals, temperature, topK, topP, numel, batch_size, seed); + case LLAISYS_DTYPE_BF16: + return rand_sample_(sample_idx, sample_val, vals, temperature, topK, topP, numel, batch_size, seed); + case LLAISYS_DTYPE_F16: + return rand_sample_(sample_idx, sample_val, vals, temperature, topK, topP, numel, batch_size, seed); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} diff --git a/src/ops/rand_sample/cpu/rand_sample_cpu.hpp 
b/src/ops/rand_sample/cpu/rand_sample_cpu.hpp new file mode 100644 index 00000000..59ad95e7 --- /dev/null +++ b/src/ops/rand_sample/cpu/rand_sample_cpu.hpp @@ -0,0 +1,9 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void rand_sample(std::byte *sample_idx, std::byte *sample_val, const std::byte *vals, llaisysDataType_t type, size_t numel, + const int64_t batch_size, const float temperature, const size_t topK, const float topP, const int64_t seed); +} diff --git a/src/ops/rand_sample/op.cpp b/src/ops/rand_sample/op.cpp new file mode 100644 index 00000000..891aa2dc --- /dev/null +++ b/src/ops/rand_sample/op.cpp @@ -0,0 +1,53 @@ +#include "op.hpp" + +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/rand_sample_cpu.hpp" + +namespace llaisys::ops { +void rand_sample(tensor_t sample_idx, tensor_t sample_val, tensor_t vals, float temperature, size_t topK, float topP, + int64_t seed) { + CHECK_SAME_DEVICE(sample_idx, sample_val, vals); + CHECK_SAME_DTYPE(sample_val->dtype(), vals->dtype()); + ASSERT(sample_idx->dtype() == LLAISYS_DTYPE_I64, "rand_sample: sample_idx must be I64."); + ASSERT(sample_idx->isContiguous() && sample_val->isContiguous() && vals->isContiguous(), + "rand_sample: all tensors must be contiguous."); + + size_t batch_size = 1; + size_t numel = 0; + if (vals->ndim() == 1) { + numel = vals->shape()[0]; + ASSERT(sample_idx->ndim() == 1 && sample_idx->shape()[0] == 1, "rand_sample: sample_idx shape must be (1, )."); + ASSERT(sample_val->ndim() == 1 && sample_val->shape()[0] == 1, "rand_sample: sample_val shape must be (1, )."); + } else { + ASSERT(vals->ndim() == 2, "rand_sample: vals must be 1D or 2D."); + batch_size = vals->shape()[0]; + numel = vals->shape()[1]; + ASSERT(sample_idx->ndim() == 1 && sample_idx->shape()[0] == batch_size, + "rand_sample: sample_idx shape must be (batch, )."); + ASSERT(sample_val->ndim() == 1 && sample_val->shape()[0] == batch_size, + "rand_sample: 
sample_val shape must be (batch, )."); + } + + if (vals->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::rand_sample(sample_idx->data(), sample_val->data(), vals->data(), vals->dtype(), numel, + static_cast(batch_size), temperature, topK, topP, seed); + } + + llaisys::core::context().setDevice(vals->deviceType(), vals->deviceId()); + + switch (vals->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::rand_sample(sample_idx->data(), sample_val->data(), vals->data(), vals->dtype(), numel, + static_cast(batch_size), temperature, topK, topP, seed); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + TO_BE_IMPLEMENTED(); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } +} +} // namespace llaisys::ops diff --git a/src/ops/rand_sample/op.hpp b/src/ops/rand_sample/op.hpp new file mode 100644 index 00000000..8172f04d --- /dev/null +++ b/src/ops/rand_sample/op.hpp @@ -0,0 +1,8 @@ +#pragma once + +#include "../../tensor/tensor.hpp" + +namespace llaisys::ops { +void rand_sample(tensor_t sample_idx, tensor_t sample_val, tensor_t vals, float temperature, size_t topK, float topP, + int64_t seed); +} From bd737c7020c561d8b8520465d529b9abe79acc6c Mon Sep 17 00:00:00 2001 From: leninist1 <282516536@qq.com> Date: Wed, 25 Feb 2026 22:06:47 +0800 Subject: [PATCH 09/14] complete chat server && test passed. 
--- python/llaisys/server/chat_server.py | 242 ++++++++++++++++++++ python/setup.cfg | 3 + src/ops/rand_sample/cpu/rand_sample_cpu.cpp | 12 +- 3 files changed, 253 insertions(+), 4 deletions(-) create mode 100644 python/llaisys/server/chat_server.py diff --git a/python/llaisys/server/chat_server.py b/python/llaisys/server/chat_server.py new file mode 100644 index 00000000..cbbbc5d2 --- /dev/null +++ b/python/llaisys/server/chat_server.py @@ -0,0 +1,242 @@ +import os +import time +import uuid +import json +from typing import List, Optional, Iterable + +from fastapi import FastAPI, HTTPException +from pydantic import BaseModel +from starlette.responses import StreamingResponse, JSONResponse +from transformers import AutoTokenizer + +import llaisys + +# define Message format +class ChatMessage(BaseModel): + role: str + content: str + +# define Request format +class ChatRequest(BaseModel): + model: Optional[str] = None + messages: List[ChatMessage] + max_tokens: Optional[int] = 128 + temperature: float = 1.0 + top_p: float = 0.8 + top_k: int = 50 + stream: bool = False + seed: int = 0 + + +class AppState: + tokenizer = None + model = None + model_path = None + model_name = None + device = None + + +state = AppState() +app = FastAPI() + +# map device name to llaisys device type +def llaisys_device(device_name: str): + if device_name == "cpu": + return llaisys.DeviceType.CPU + if device_name == "nvidia": + return llaisys.DeviceType.NVIDIA + raise ValueError(f"Unsupported device name: {device_name}") + +# set default values for model path, device, and model name +# if not provided +def ensure_state_loaded(): + if state.model is not None and state.tokenizer is not None: + return + model_path = os.environ.get("LLAISYS_MODEL_PATH", "./models") + if not model_path: + raise RuntimeError("LLAISYS_MODEL_PATH is required") + device = os.environ.get("LLAISYS_DEVICE", "cpu") + model_name = os.environ.get("LLAISYS_MODEL_NAME", "llaisys-qwen2") + state.model_path = model_path + 
state.device = device + state.model_name = model_name + state.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + state.model = llaisys.models.Qwen2(model_path, llaisys_device(device)) + +# build prompt from messages +def build_prompt(messages: List[ChatMessage]) -> str: + conversation = [{"role": m.role, "content": m.content} for m in messages] + return state.tokenizer.apply_chat_template( + conversation=conversation, + add_generation_prompt=True, + tokenize=False, + ) + +# decode output token id and mask prompt tokens +def decode_completion(input_ids: List[int], output_ids: List[int]) -> str: + prompt_text = state.tokenizer.decode(input_ids, skip_special_tokens=True) + full_text = state.tokenizer.decode(output_ids, skip_special_tokens=True) + return full_text[len(prompt_text):] + +# generate full completion +def generate_full( + input_ids: List[int], + max_tokens: int, + temperature: float, + top_k: int, + top_p: float, + seed: int, +): + output_ids = state.model.generate( + input_ids, + max_new_tokens=max_tokens, + top_k=top_k, + top_p=top_p, + temperature=temperature, + seed=seed, + ) + completion_text = decode_completion(input_ids, output_ids) + completion_tokens = max(0, len(output_ids) - len(input_ids)) + return completion_text, completion_tokens + +# stream mode +def generate_stream( + input_ids: List[int], + max_tokens: int, + temperature: float, + top_k: int, + top_p: float, + seed: int, +) -> Iterable[str]: + tokens = list(input_ids) + prompt_text = state.tokenizer.decode(tokens, skip_special_tokens=True) + prev_text = prompt_text + if max_tokens <= 0: + return + # generate first output token, then append it to prompt + next_token = state.model._infer(tokens, temperature, top_k, top_p, seed) + tokens.append(next_token) + text = state.tokenizer.decode(tokens, skip_special_tokens=True) + delta = text[len(prev_text):] + prev_text = text + # if we decode a non-empty delta, yield it, that's the first token + if delta: + yield delta 
+ # continue generate subsequent tokens, until max_tokens or end_token + for _ in range(max_tokens - 1): + if tokens[-1] == state.model._end_token: + break + seed += 1 + # because of kv-cache, we only need to infer the last token + next_token = state.model._infer([tokens[-1]], temperature, top_k, top_p, seed) + tokens.append(next_token) + text = state.tokenizer.decode(tokens, skip_special_tokens=True) + delta = text[len(prev_text):] + prev_text = text + if delta: + yield delta + + +def sse_chunk(payload: dict) -> str: + return "data: " + json.dumps(payload, ensure_ascii=False) + "\n\n" + + +@app.post("/v1/chat/completions") +def chat_completions(req: ChatRequest): + ensure_state_loaded() + if not req.messages: + raise HTTPException(status_code=400, detail="messages is required") + prompt = build_prompt(req.messages) + input_ids = state.tokenizer.encode(prompt) + max_tokens = req.max_tokens if req.max_tokens is not None else 128 + created = int(time.time()) + request_id = "chatcmpl-" + uuid.uuid4().hex + + if not req.stream: + completion_text, completion_tokens = generate_full( + input_ids=input_ids, + max_tokens=max_tokens, + temperature=req.temperature, + top_k=req.top_k, + top_p=req.top_p, + seed=req.seed, + ) + response = { + "id": request_id, + "object": "chat.completion", + "created": created, + "model": req.model or state.model_name, + "choices": [ + { + "index": 0, + "message": {"role": "assistant", "content": completion_text}, + "finish_reason": "stop", + } + ], + "usage": { + "prompt_tokens": len(input_ids), + "completion_tokens": completion_tokens, + "total_tokens": len(input_ids) + completion_tokens, + }, + } + return JSONResponse(response) + + def event_stream(): + yield sse_chunk( + { + "id": request_id, + "object": "chat.completion.chunk", + "created": created, + "model": req.model or state.model_name, + "choices": [{"index": 0, "delta": {"role": "assistant"}, "finish_reason": None}], + } + ) + for delta in generate_stream( + input_ids=input_ids, + 
max_tokens=max_tokens, + temperature=req.temperature, + top_k=req.top_k, + top_p=req.top_p, + seed=req.seed, + ): + yield sse_chunk( + { + "id": request_id, + "object": "chat.completion.chunk", + "created": created, + "model": req.model or state.model_name, + "choices": [{"index": 0, "delta": {"content": delta}, "finish_reason": None}], + } + ) + yield sse_chunk( + { + "id": request_id, + "object": "chat.completion.chunk", + "created": created, + "model": req.model or state.model_name, + "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}], + } + ) + yield "data: [DONE]\n\n" + + return StreamingResponse(event_stream(), media_type="text/event-stream") + + +if __name__ == "__main__": + import argparse + import uvicorn + + parser = argparse.ArgumentParser() + parser.add_argument("--host", default="0.0.0.0") + parser.add_argument("--port", default=8000, type=int) + parser.add_argument("--model_path", default=None) + parser.add_argument("--device", default="cpu") + parser.add_argument("--model_name", default="llaisys-qwen2") + args = parser.parse_args() + + if args.model_path: + os.environ["LLAISYS_MODEL_PATH"] = args.model_path + os.environ["LLAISYS_DEVICE"] = args.device + os.environ["LLAISYS_MODEL_NAME"] = args.model_name + + uvicorn.run(app, host=args.host, port=args.port) \ No newline at end of file diff --git a/python/setup.cfg b/python/setup.cfg index b35fc65f..88ff8f6e 100644 --- a/python/setup.cfg +++ b/python/setup.cfg @@ -13,6 +13,9 @@ install_requires = torch>=2.4.0 transformers accelerate + fastapi + uvicorn + sse-starlette [options.package_data] llaisys = diff --git a/src/ops/rand_sample/cpu/rand_sample_cpu.cpp b/src/ops/rand_sample/cpu/rand_sample_cpu.cpp index aa8ffb5e..9ca5e0e5 100644 --- a/src/ops/rand_sample/cpu/rand_sample_cpu.cpp +++ b/src/ops/rand_sample/cpu/rand_sample_cpu.cpp @@ -61,7 +61,8 @@ static void rand_sample_(std::byte *sample_idx, std::byte *sample_val, const std for (size_t i = 0; i < numel; ++i) { sorted_scores[i] = 
{scores[i], i}; } - std::sort(sorted_scores.begin(), sorted_scores.end(), [](const auto &a, const auto &b) { return a.first > b.first; }); + std::sort(sorted_scores.begin(), sorted_scores.end(), + [](const auto &a, const auto &b) { return a.first > b.first; }); size_t k = numel; if (topK > 0 && topK < numel) { @@ -117,11 +118,14 @@ void rand_sample(std::byte *sample_idx, std::byte *sample_val, const std::byte * const int64_t batch_size, const float temperature, const size_t topK, const float topP, const int64_t seed) { switch (type) { case LLAISYS_DTYPE_F32: - return rand_sample_(sample_idx, sample_val, vals, temperature, topK, topP, numel, batch_size, seed); + return rand_sample_(sample_idx, sample_val, vals, + temperature, topK, topP, numel, batch_size, seed); case LLAISYS_DTYPE_BF16: - return rand_sample_(sample_idx, sample_val, vals, temperature, topK, topP, numel, batch_size, seed); + return rand_sample_(sample_idx, sample_val, vals, + temperature, topK, topP, numel, batch_size, seed); case LLAISYS_DTYPE_F16: - return rand_sample_(sample_idx, sample_val, vals, temperature, topK, topP, numel, batch_size, seed); + return rand_sample_(sample_idx, sample_val, vals, + temperature, topK, topP, numel, batch_size, seed); default: EXCEPTION_UNSUPPORTED_DATATYPE(type); } From e20e680ae0b4ae8c1906355ce7086c1eb7afae8c Mon Sep 17 00:00:00 2001 From: leninist1 <282516536@qq.com> Date: Thu, 26 Feb 2026 14:27:02 +0800 Subject: [PATCH 10/14] complete webUI for chatServer && test passed. 
--- python/llaisys/server/chat_server.py | 98 +++++--- python/llaisys/webUI/index.html | 342 +++++++++++++++++++++++++++ 2 files changed, 402 insertions(+), 38 deletions(-) create mode 100644 python/llaisys/webUI/index.html diff --git a/python/llaisys/server/chat_server.py b/python/llaisys/server/chat_server.py index cbbbc5d2..4682a0f6 100644 --- a/python/llaisys/server/chat_server.py +++ b/python/llaisys/server/chat_server.py @@ -2,11 +2,14 @@ import time import uuid import json +import threading +from pathlib import Path from typing import List, Optional, Iterable from fastapi import FastAPI, HTTPException from pydantic import BaseModel -from starlette.responses import StreamingResponse, JSONResponse +from starlette.responses import StreamingResponse, JSONResponse, FileResponse +from starlette.staticfiles import StaticFiles from transformers import AutoTokenizer import llaisys @@ -38,6 +41,13 @@ class AppState: state = AppState() app = FastAPI() +model_lock = threading.Lock() + +WEBUI_INDEX = Path(__file__).resolve().parent.parent / "webUI" / "index.html" +WEBUI_DIR = WEBUI_INDEX.parent.parent +WEBUI_ENABLED = WEBUI_INDEX.is_file() +if WEBUI_ENABLED: + app.mount("/webUI", StaticFiles(directory=WEBUI_DIR), name="webUI") # map device name to llaisys device type def llaisys_device(device_name: str): @@ -153,14 +163,15 @@ def chat_completions(req: ChatRequest): request_id = "chatcmpl-" + uuid.uuid4().hex if not req.stream: - completion_text, completion_tokens = generate_full( - input_ids=input_ids, - max_tokens=max_tokens, - temperature=req.temperature, - top_k=req.top_k, - top_p=req.top_p, - seed=req.seed, - ) + with model_lock: + completion_text, completion_tokens = generate_full( + input_ids=input_ids, + max_tokens=max_tokens, + temperature=req.temperature, + top_k=req.top_k, + top_p=req.top_p, + seed=req.seed, + ) response = { "id": request_id, "object": "chat.completion", @@ -182,46 +193,57 @@ def chat_completions(req: ChatRequest): return JSONResponse(response) 
def event_stream(): - yield sse_chunk( - { - "id": request_id, - "object": "chat.completion.chunk", - "created": created, - "model": req.model or state.model_name, - "choices": [{"index": 0, "delta": {"role": "assistant"}, "finish_reason": None}], - } - ) - for delta in generate_stream( - input_ids=input_ids, - max_tokens=max_tokens, - temperature=req.temperature, - top_k=req.top_k, - top_p=req.top_p, - seed=req.seed, - ): + model_lock.acquire() + try: + yield sse_chunk( + { + "id": request_id, + "object": "chat.completion.chunk", + "created": created, + "model": req.model or state.model_name, + "choices": [{"index": 0, "delta": {"role": "assistant"}, "finish_reason": None}], + } + ) + for delta in generate_stream( + input_ids=input_ids, + max_tokens=max_tokens, + temperature=req.temperature, + top_k=req.top_k, + top_p=req.top_p, + seed=req.seed, + ): + yield sse_chunk( + { + "id": request_id, + "object": "chat.completion.chunk", + "created": created, + "model": req.model or state.model_name, + "choices": [{"index": 0, "delta": {"content": delta}, "finish_reason": None}], + } + ) yield sse_chunk( { "id": request_id, "object": "chat.completion.chunk", "created": created, "model": req.model or state.model_name, - "choices": [{"index": 0, "delta": {"content": delta}, "finish_reason": None}], + "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}], } ) - yield sse_chunk( - { - "id": request_id, - "object": "chat.completion.chunk", - "created": created, - "model": req.model or state.model_name, - "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}], - } - ) - yield "data: [DONE]\n\n" + yield "data: [DONE]\n\n" + finally: + model_lock.release() return StreamingResponse(event_stream(), media_type="text/event-stream") +@app.get("/") +def chat_ui(): + if WEBUI_ENABLED: + return FileResponse(WEBUI_INDEX) + raise HTTPException(status_code=404, detail="Web UI not found") + + if __name__ == "__main__": import argparse import uvicorn @@ -239,4 +261,4 @@ def 
event_stream(): os.environ["LLAISYS_DEVICE"] = args.device os.environ["LLAISYS_MODEL_NAME"] = args.model_name - uvicorn.run(app, host=args.host, port=args.port) \ No newline at end of file + uvicorn.run(app, host=args.host, port=args.port) diff --git a/python/llaisys/webUI/index.html b/python/llaisys/webUI/index.html new file mode 100644 index 00000000..b8ac1a3d --- /dev/null +++ b/python/llaisys/webUI/index.html @@ -0,0 +1,342 @@ + + + + + + LLAISYS Chat UI + + + +
+ +
+
+

新对话

+
+ + + + + +
+
+
+
+ + +
+
+
+ + + From c2727b364dabe36240e5b55b0def92646e69e755 Mon Sep 17 00:00:00 2001 From: leninist1 <282516536@qq.com> Date: Fri, 27 Feb 2026 00:26:34 +0800 Subject: [PATCH 11/14] complete cuda ops. --- .gitignore | 3 +- src/device/nvidia/nvidia_runtime_api.cu | 69 +++++-- src/ops/add/nvidia/add_nvidia.cu | 54 +++++ src/ops/add/nvidia/add_nvidia.hpp | 9 + src/ops/add/op.cpp | 4 +- src/ops/argmax/nvidia/argmax_nvidia.cu | 66 ++++++ src/ops/argmax/nvidia/argmax_nvidia.hpp | 9 + src/ops/argmax/op.cpp | 4 +- src/ops/embedding/nvidia/embedding_nvidia.cu | 56 ++++++ src/ops/embedding/nvidia/embedding_nvidia.hpp | 9 + src/ops/embedding/op.cpp | 4 +- src/ops/linear/nvidia/linear_nvidia.cu | 75 +++++++ src/ops/linear/nvidia/linear_nvidia.hpp | 9 + src/ops/linear/op.cpp | 4 +- src/ops/nvidia_utils.cuh | 123 ++++++++++++ .../rand_sample/nvidia/rand_sample_nvidia.cu | 29 +++ .../rand_sample/nvidia/rand_sample_nvidia.hpp | 11 + src/ops/rand_sample/op.cpp | 5 +- src/ops/rearrange/nvidia/rearrange_nvidia.cu | 88 ++++++++ src/ops/rearrange/nvidia/rearrange_nvidia.hpp | 11 + src/ops/rearrange/op.cpp | 5 +- src/ops/rms_norm/nvidia/rms_norm_nvidia.cu | 179 +++++++++++++++++ src/ops/rms_norm/nvidia/rms_norm_nvidia.hpp | 10 + src/ops/rms_norm/op.cpp | 4 +- src/ops/rope/nvidia/rope_nvidia.cu | 188 ++++++++++++++++++ src/ops/rope/nvidia/rope_nvidia.hpp | 10 + src/ops/rope/op.cpp | 4 +- .../nvidia/self_attention_nvidia.cu | 34 ++++ .../nvidia/self_attention_nvidia.hpp | 10 + src/ops/self_attention/op.cpp | 5 +- src/ops/swiglu/nvidia/swiglu_nvidia.cu | 172 ++++++++++++++++ src/ops/swiglu/nvidia/swiglu_nvidia.hpp | 9 + src/ops/swiglu/op.cpp | 4 +- xmake.lua | 25 ++- xmake/nvidia.lua | 32 +++ 35 files changed, 1292 insertions(+), 41 deletions(-) create mode 100644 src/ops/add/nvidia/add_nvidia.cu create mode 100644 src/ops/add/nvidia/add_nvidia.hpp create mode 100644 src/ops/argmax/nvidia/argmax_nvidia.cu create mode 100644 src/ops/argmax/nvidia/argmax_nvidia.hpp create mode 100644 
src/ops/embedding/nvidia/embedding_nvidia.cu create mode 100644 src/ops/embedding/nvidia/embedding_nvidia.hpp create mode 100644 src/ops/linear/nvidia/linear_nvidia.cu create mode 100644 src/ops/linear/nvidia/linear_nvidia.hpp create mode 100644 src/ops/nvidia_utils.cuh create mode 100644 src/ops/rand_sample/nvidia/rand_sample_nvidia.cu create mode 100644 src/ops/rand_sample/nvidia/rand_sample_nvidia.hpp create mode 100644 src/ops/rearrange/nvidia/rearrange_nvidia.cu create mode 100644 src/ops/rearrange/nvidia/rearrange_nvidia.hpp create mode 100644 src/ops/rms_norm/nvidia/rms_norm_nvidia.cu create mode 100644 src/ops/rms_norm/nvidia/rms_norm_nvidia.hpp create mode 100644 src/ops/rope/nvidia/rope_nvidia.cu create mode 100644 src/ops/rope/nvidia/rope_nvidia.hpp create mode 100644 src/ops/self_attention/nvidia/self_attention_nvidia.cu create mode 100644 src/ops/self_attention/nvidia/self_attention_nvidia.hpp create mode 100644 src/ops/swiglu/nvidia/swiglu_nvidia.cu create mode 100644 src/ops/swiglu/nvidia/swiglu_nvidia.hpp create mode 100644 xmake/nvidia.lua diff --git a/.gitignore b/.gitignore index e92328ca..e9960a4d 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,8 @@ # Xmake cache .xmake/ build/ - +# md docs +*.md # Binaries bin/ lib/ diff --git a/src/device/nvidia/nvidia_runtime_api.cu b/src/device/nvidia/nvidia_runtime_api.cu index cab92826..adc39208 100644 --- a/src/device/nvidia/nvidia_runtime_api.cu +++ b/src/device/nvidia/nvidia_runtime_api.cu @@ -1,56 +1,97 @@ #include "../runtime_api.hpp" +#include #include #include +#include +#include namespace llaisys::device::nvidia { namespace runtime_api { +static void checkCuda(cudaError_t err) { + if (err != cudaSuccess) { + throw std::runtime_error(std::string("CUDA error: ") + cudaGetErrorString(err)); + } +} + +static cudaMemcpyKind toCudaMemcpyKind(llaisysMemcpyKind_t kind) { + switch (kind) { + case LLAISYS_MEMCPY_H2H: + return cudaMemcpyHostToHost; + case LLAISYS_MEMCPY_H2D: + return 
cudaMemcpyHostToDevice; + case LLAISYS_MEMCPY_D2H: + return cudaMemcpyDeviceToHost; + case LLAISYS_MEMCPY_D2D: + return cudaMemcpyDeviceToDevice; + default: + throw std::runtime_error("Unsupported memcpy kind"); + } +} + int getDeviceCount() { - TO_BE_IMPLEMENTED(); + int count = 0; + checkCuda(cudaGetDeviceCount(&count)); + return count; } -void setDevice(int) { - TO_BE_IMPLEMENTED(); +void setDevice(int device_id) { + checkCuda(cudaSetDevice(device_id)); } void deviceSynchronize() { - TO_BE_IMPLEMENTED(); + checkCuda(cudaDeviceSynchronize()); } llaisysStream_t createStream() { - TO_BE_IMPLEMENTED(); + cudaStream_t stream = nullptr; + checkCuda(cudaStreamCreate(&stream)); + return reinterpret_cast(stream); } void destroyStream(llaisysStream_t stream) { - TO_BE_IMPLEMENTED(); + if (stream == nullptr) { + return; + } + checkCuda(cudaStreamDestroy(reinterpret_cast(stream))); } void streamSynchronize(llaisysStream_t stream) { - TO_BE_IMPLEMENTED(); + checkCuda(cudaStreamSynchronize(reinterpret_cast(stream))); } void *mallocDevice(size_t size) { - TO_BE_IMPLEMENTED(); + void *ptr = nullptr; + checkCuda(cudaMalloc(&ptr, size)); + return ptr; } void freeDevice(void *ptr) { - TO_BE_IMPLEMENTED(); + if (ptr == nullptr) { + return; + } + checkCuda(cudaFree(ptr)); } void *mallocHost(size_t size) { - TO_BE_IMPLEMENTED(); + void *ptr = nullptr; + checkCuda(cudaMallocHost(&ptr, size)); + return ptr; } void freeHost(void *ptr) { - TO_BE_IMPLEMENTED(); + if (ptr == nullptr) { + return; + } + checkCuda(cudaFreeHost(ptr)); } void memcpySync(void *dst, const void *src, size_t size, llaisysMemcpyKind_t kind) { - TO_BE_IMPLEMENTED(); + checkCuda(cudaMemcpy(dst, src, size, toCudaMemcpyKind(kind))); } -void memcpyAsync(void *dst, const void *src, size_t size, llaisysMemcpyKind_t kind) { - TO_BE_IMPLEMENTED(); +void memcpyAsync(void *dst, const void *src, size_t size, llaisysMemcpyKind_t kind, llaisysStream_t stream) { + checkCuda(cudaMemcpyAsync(dst, src, size, toCudaMemcpyKind(kind), 
reinterpret_cast(stream))); } static const LlaisysRuntimeAPI RUNTIME_API = { diff --git a/src/ops/add/nvidia/add_nvidia.cu b/src/ops/add/nvidia/add_nvidia.cu new file mode 100644 index 00000000..e7307d64 --- /dev/null +++ b/src/ops/add/nvidia/add_nvidia.cu @@ -0,0 +1,54 @@ +#include "add_nvidia.hpp" + +#include "../../nvidia_utils.cuh" +#include "../../../core/llaisys_core.hpp" + +namespace llaisys::ops::nvidia { +template +__global__ void add_kernel(T *c, const T *a, const T *b, size_t numel) { + size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx >= numel) { + return; + } + if constexpr (std::is_same_v || std::is_same_v) { + float av = detail::to_float(a[idx]); + float bv = detail::to_float(b[idx]); + c[idx] = detail::from_float(av + bv); + } else { + c[idx] = a[idx] + b[idx]; + } +} + +void add(std::byte *c, const std::byte *a, const std::byte *b, llaisysDataType_t type, size_t numel) { + int threads = 256; + int blocks = static_cast((numel + threads - 1) / threads); + auto stream = reinterpret_cast(llaisys::core::context().runtime().stream()); + switch (type) { + case LLAISYS_DTYPE_F32: + add_kernel<<>>( + reinterpret_cast(c), + reinterpret_cast(a), + reinterpret_cast(b), + numel); + break; + case LLAISYS_DTYPE_BF16: + add_kernel<<>>( + reinterpret_cast(c), + reinterpret_cast(a), + reinterpret_cast(b), + numel); + break; + case LLAISYS_DTYPE_F16: + add_kernel<<>>( + reinterpret_cast(c), + reinterpret_cast(a), + reinterpret_cast(b), + numel); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + detail::checkCuda(cudaGetLastError()); + detail::checkCuda(cudaStreamSynchronize(stream)); +} +} diff --git a/src/ops/add/nvidia/add_nvidia.hpp b/src/ops/add/nvidia/add_nvidia.hpp new file mode 100644 index 00000000..2e67b497 --- /dev/null +++ b/src/ops/add/nvidia/add_nvidia.hpp @@ -0,0 +1,9 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void add(std::byte *c, const std::byte *a, const 
std::byte *b, llaisysDataType_t type, size_t numel); +} diff --git a/src/ops/add/op.cpp b/src/ops/add/op.cpp index a057330d..9b36ba8e 100644 --- a/src/ops/add/op.cpp +++ b/src/ops/add/op.cpp @@ -4,6 +4,7 @@ #include "../../utils.hpp" #include "cpu/add_cpu.hpp" +#include "nvidia/add_nvidia.hpp" namespace llaisys::ops { void add(tensor_t c, tensor_t a, tensor_t b) { @@ -25,8 +26,7 @@ void add(tensor_t c, tensor_t a, tensor_t b) { return cpu::add(c->data(), a->data(), b->data(), c->dtype(), c->numel()); #ifdef ENABLE_NVIDIA_API case LLAISYS_DEVICE_NVIDIA: - TO_BE_IMPLEMENTED(); - return; + return nvidia::add(c->data(), a->data(), b->data(), c->dtype(), c->numel()); #endif default: EXCEPTION_UNSUPPORTED_DEVICE; diff --git a/src/ops/argmax/nvidia/argmax_nvidia.cu b/src/ops/argmax/nvidia/argmax_nvidia.cu new file mode 100644 index 00000000..79a51059 --- /dev/null +++ b/src/ops/argmax/nvidia/argmax_nvidia.cu @@ -0,0 +1,66 @@ +#include "argmax_nvidia.hpp" + +#include "../../nvidia_utils.cuh" +#include "../../../core/llaisys_core.hpp" + +namespace llaisys::ops::nvidia { +template +__global__ void argmax_kernel(int64_t *max_idx, T *max_val, const T *vals, size_t numel) { + if (blockIdx.x != 0 || threadIdx.x != 0) { + return; + } + size_t max_i = 0; + if constexpr (std::is_same_v || std::is_same_v) { + float max_f = detail::to_float(vals[0]); + for (size_t i = 1; i < numel; ++i) { + float cur = detail::to_float(vals[i]); + if (cur > max_f) { + max_f = cur; + max_i = i; + } + } + *max_val = detail::from_float(max_f); + } else { + T max_v = vals[0]; + for (size_t i = 1; i < numel; ++i) { + if (vals[i] > max_v) { + max_v = vals[i]; + max_i = i; + } + } + *max_val = max_v; + } + *max_idx = static_cast(max_i); +} + +void argmax(std::byte *max_idx, std::byte *max_val, const std::byte *vals, llaisysDataType_t type, size_t numel) { + auto stream = reinterpret_cast(llaisys::core::context().runtime().stream()); + switch (type) { + case LLAISYS_DTYPE_F32: + argmax_kernel<<<1, 1, 0, 
stream>>>( + reinterpret_cast(max_idx), + reinterpret_cast(max_val), + reinterpret_cast(vals), + numel); + break; + case LLAISYS_DTYPE_BF16: + argmax_kernel<<<1, 1, 0, stream>>>( + reinterpret_cast(max_idx), + reinterpret_cast(max_val), + reinterpret_cast(vals), + numel); + break; + case LLAISYS_DTYPE_F16: + argmax_kernel<<<1, 1, 0, stream>>>( + reinterpret_cast(max_idx), + reinterpret_cast(max_val), + reinterpret_cast(vals), + numel); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + detail::checkCuda(cudaGetLastError()); + detail::checkCuda(cudaStreamSynchronize(stream)); +} +} diff --git a/src/ops/argmax/nvidia/argmax_nvidia.hpp b/src/ops/argmax/nvidia/argmax_nvidia.hpp new file mode 100644 index 00000000..054fa353 --- /dev/null +++ b/src/ops/argmax/nvidia/argmax_nvidia.hpp @@ -0,0 +1,9 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void argmax(std::byte *max_idx, std::byte *max_val, const std::byte *vals, llaisysDataType_t type, size_t numel); +} diff --git a/src/ops/argmax/op.cpp b/src/ops/argmax/op.cpp index 034bc431..3482db29 100644 --- a/src/ops/argmax/op.cpp +++ b/src/ops/argmax/op.cpp @@ -4,6 +4,7 @@ #include "../../utils.hpp" #include "cpu/argmax_cpu.hpp" +#include "nvidia/argmax_nvidia.hpp" namespace llaisys::ops { void argmax(tensor_t max_idx, tensor_t max_val, tensor_t vals) { @@ -26,8 +27,7 @@ void argmax(tensor_t max_idx, tensor_t max_val, tensor_t vals) { return cpu::argmax(max_idx->data(), max_val->data(), vals->data(), vals->dtype(), vals->numel()); #ifdef ENABLE_NVIDIA_API case LLAISYS_DEVICE_NVIDIA: - TO_BE_IMPLEMENTED(); - return; + return nvidia::argmax(max_idx->data(), max_val->data(), vals->data(), vals->dtype(), vals->numel()); #endif default: EXCEPTION_UNSUPPORTED_DEVICE; diff --git a/src/ops/embedding/nvidia/embedding_nvidia.cu b/src/ops/embedding/nvidia/embedding_nvidia.cu new file mode 100644 index 00000000..c8832472 --- /dev/null +++ 
b/src/ops/embedding/nvidia/embedding_nvidia.cu @@ -0,0 +1,56 @@ +#include "embedding_nvidia.hpp" + +#include "../../nvidia_utils.cuh" +#include "../../../core/llaisys_core.hpp" + +namespace llaisys::ops::nvidia { +template +__global__ void embedding_kernel(T *out, const int64_t *index, const T *weight, size_t index_len, size_t row_len) { + size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + size_t total = index_len * row_len; + if (idx >= total) { + return; + } + size_t i = idx / row_len; + size_t j = idx - i * row_len; + int64_t row = index[i]; + out[idx] = weight[static_cast(row) * row_len + j]; +} + +void embedding(std::byte *out, const std::byte *index, const std::byte *weight, llaisysDataType_t type, size_t index_len, size_t row_len) { + size_t total = index_len * row_len; + int threads = 256; + int blocks = static_cast((total + threads - 1) / threads); + auto stream = reinterpret_cast(llaisys::core::context().runtime().stream()); + switch (type) { + case LLAISYS_DTYPE_F32: + embedding_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(index), + reinterpret_cast(weight), + index_len, + row_len); + break; + case LLAISYS_DTYPE_BF16: + embedding_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(index), + reinterpret_cast(weight), + index_len, + row_len); + break; + case LLAISYS_DTYPE_F16: + embedding_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(index), + reinterpret_cast(weight), + index_len, + row_len); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + detail::checkCuda(cudaGetLastError()); + detail::checkCuda(cudaStreamSynchronize(stream)); +} +} diff --git a/src/ops/embedding/nvidia/embedding_nvidia.hpp b/src/ops/embedding/nvidia/embedding_nvidia.hpp new file mode 100644 index 00000000..29ce3a28 --- /dev/null +++ b/src/ops/embedding/nvidia/embedding_nvidia.hpp @@ -0,0 +1,9 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void embedding(std::byte *out, const std::byte 
*index, const std::byte *weight, llaisysDataType_t type, size_t index_len, size_t row_len); +} diff --git a/src/ops/embedding/op.cpp b/src/ops/embedding/op.cpp index 3c33c5d5..01abaac7 100644 --- a/src/ops/embedding/op.cpp +++ b/src/ops/embedding/op.cpp @@ -2,6 +2,7 @@ #include "../../core/llaisys_core.hpp" #include "../../utils.hpp" #include "cpu/embedding_cpu.hpp" +#include "nvidia/embedding_nvidia.hpp" namespace llaisys::ops { void embedding(tensor_t out, tensor_t index, tensor_t weight) { @@ -29,8 +30,7 @@ void embedding(tensor_t out, tensor_t index, tensor_t weight) { return cpu::embedding(out->data(), index->data(), weight->data(), out->dtype(), index_len, row_len); #ifdef ENABLE_NVIDIA_API case LLAISYS_DEVICE_NVIDIA: - TO_BE_IMPLEMENTED(); - return; + return nvidia::embedding(out->data(), index->data(), weight->data(), out->dtype(), index_len, row_len); #endif default: EXCEPTION_UNSUPPORTED_DEVICE; diff --git a/src/ops/linear/nvidia/linear_nvidia.cu b/src/ops/linear/nvidia/linear_nvidia.cu new file mode 100644 index 00000000..fc7066f3 --- /dev/null +++ b/src/ops/linear/nvidia/linear_nvidia.cu @@ -0,0 +1,75 @@ +#include "linear_nvidia.hpp" + +#include "../../nvidia_utils.cuh" +#include "../../../core/llaisys_core.hpp" + +namespace llaisys::ops::nvidia { +template +__global__ void linear_kernel(T *out, const T *in, const T *W, const T *bias, size_t batch, size_t in_dim, size_t out_dim) { + size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + size_t total = batch * out_dim; + if (idx >= total) { + return; + } + size_t i = idx / out_dim; + size_t j = idx - i * out_dim; + const T *in_row = in + i * in_dim; + const T *w_row = W + j * in_dim; + if constexpr (std::is_same_v || std::is_same_v) { + float acc = 0.0f; + for (size_t k = 0; k < in_dim; ++k) { + acc += detail::to_float(in_row[k]) * detail::to_float(w_row[k]); + } + if (bias) { + acc += detail::to_float(bias[j]); + } + out[idx] = detail::from_float(acc); + } else { + T acc = static_cast(0); + 
for (size_t k = 0; k < in_dim; ++k) { + acc += in_row[k] * w_row[k]; + } + if (bias) { + acc += bias[j]; + } + out[idx] = acc; + } +} + +void linear(std::byte *out, const std::byte *in, const std::byte *W, const std::byte *bias, llaisysDataType_t type, size_t batch, size_t in_dim, size_t out_dim) { + size_t total = batch * out_dim; + int threads = 256; + int blocks = static_cast((total + threads - 1) / threads); + auto stream = reinterpret_cast(llaisys::core::context().runtime().stream()); + switch (type) { + case LLAISYS_DTYPE_F32: + linear_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(W), + reinterpret_cast(bias), + batch, in_dim, out_dim); + break; + case LLAISYS_DTYPE_BF16: + linear_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(W), + reinterpret_cast(bias), + batch, in_dim, out_dim); + break; + case LLAISYS_DTYPE_F16: + linear_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(W), + reinterpret_cast(bias), + batch, in_dim, out_dim); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + detail::checkCuda(cudaGetLastError()); + detail::checkCuda(cudaStreamSynchronize(stream)); +} +} diff --git a/src/ops/linear/nvidia/linear_nvidia.hpp b/src/ops/linear/nvidia/linear_nvidia.hpp new file mode 100644 index 00000000..f31ea3fb --- /dev/null +++ b/src/ops/linear/nvidia/linear_nvidia.hpp @@ -0,0 +1,9 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void linear(std::byte *out, const std::byte *in, const std::byte *W, const std::byte *bias, llaisysDataType_t type, size_t batch, size_t in_dim, size_t out_dim); +} diff --git a/src/ops/linear/op.cpp b/src/ops/linear/op.cpp index 9585068a..0804ca50 100644 --- a/src/ops/linear/op.cpp +++ b/src/ops/linear/op.cpp @@ -2,6 +2,7 @@ #include "../../core/llaisys_core.hpp" #include "../../utils.hpp" #include "cpu/linear_cpu.hpp" +#include "nvidia/linear_nvidia.hpp" namespace llaisys::ops 
{ void linear(tensor_t out, tensor_t in, tensor_t weight, tensor_t bias) { ASSERT(out->isContiguous() && in->isContiguous() @@ -40,8 +41,7 @@ void linear(tensor_t out, tensor_t in, tensor_t weight, tensor_t bias) { weight->data(), bias_data, bias->dtype(), batch, in_dim, out_dim); #ifdef ENABLE_NVIDIA_API case LLAISYS_DEVICE_NVIDIA: - TO_BE_IMPLEMENTED(); - return; + return nvidia::linear(out->data(), in->data(), weight->data(), bias_data, out->dtype(), batch, in_dim, out_dim); #endif default: EXCEPTION_UNSUPPORTED_DEVICE; diff --git a/src/ops/nvidia_utils.cuh b/src/ops/nvidia_utils.cuh new file mode 100644 index 00000000..04ed31f2 --- /dev/null +++ b/src/ops/nvidia_utils.cuh @@ -0,0 +1,123 @@ +#pragma once + +#include "llaisys.h" +#include "../utils.hpp" +#include +#include +#include +#include +#include + +namespace llaisys::ops::nvidia::detail { + +inline void checkCuda(cudaError_t err) { + if (err != cudaSuccess) { + throw std::runtime_error(std::string("CUDA error: ") + cudaGetErrorString(err)); + } +} + +__device__ inline float f16_to_f32(llaisys::fp16_t val) { + uint16_t h = val._v; + uint32_t sign = (h & 0x8000) << 16; + int32_t exponent = (h >> 10) & 0x1F; + uint32_t mantissa = h & 0x3FF; + uint32_t f32; + if (exponent == 31) { + f32 = mantissa != 0 ? 
(sign | 0x7F800000 | (mantissa << 13)) : (sign | 0x7F800000); + } else if (exponent == 0) { + if (mantissa == 0) { + f32 = sign; + } else { + exponent = -14; + while ((mantissa & 0x400) == 0) { + mantissa <<= 1; + exponent--; + } + mantissa &= 0x3FF; + f32 = sign | ((exponent + 127) << 23) | (mantissa << 13); + } + } else { + f32 = sign | ((exponent + 127 - 15) << 23) | (mantissa << 13); + } + union { + uint32_t u; + float f; + } tmp; + tmp.u = f32; + return tmp.f; +} + +__device__ inline llaisys::fp16_t f32_to_f16(float val) { + union { + uint32_t u; + float f; + } tmp; + tmp.f = val; + uint32_t f32 = tmp.u; + uint16_t sign = (f32 >> 16) & 0x8000; + int32_t exponent = ((f32 >> 23) & 0xFF) - 127; + uint32_t mantissa = f32 & 0x7FFFFF; + if (exponent >= 16) { + if (exponent == 128 && mantissa != 0) { + return llaisys::fp16_t{static_cast(sign | 0x7E00)}; + } + return llaisys::fp16_t{static_cast(sign | 0x7C00)}; + } else if (exponent >= -14) { + return llaisys::fp16_t{static_cast(sign | ((exponent + 15) << 10) | (mantissa >> 13))}; + } else if (exponent >= -24) { + mantissa |= 0x800000; + mantissa >>= (-14 - exponent); + return llaisys::fp16_t{static_cast(sign | (mantissa >> 13))}; + } + return llaisys::fp16_t{static_cast(sign)}; +} + +__device__ inline float bf16_to_f32(llaisys::bf16_t val) { + uint32_t bits32 = static_cast(val._v) << 16; + union { + uint32_t u; + float f; + } tmp; + tmp.u = bits32; + return tmp.f; +} + +__device__ inline llaisys::bf16_t f32_to_bf16(float val) { + union { + uint32_t u; + float f; + } tmp; + tmp.f = val; + uint32_t bits32 = tmp.u; + const uint32_t rounding_bias = 0x00007FFF + ((bits32 >> 16) & 1); + uint16_t bf16_bits = static_cast((bits32 + rounding_bias) >> 16); + return llaisys::bf16_t{bf16_bits}; +} + +template +__device__ inline float to_float(T v) { + if constexpr (std::is_same_v) { + return v; + } else if constexpr (std::is_same_v) { + return f16_to_f32(v); + } else if constexpr (std::is_same_v) { + return bf16_to_f32(v); + } 
else { + return static_cast(v); + } +} + +template +__device__ inline T from_float(float v) { + if constexpr (std::is_same_v) { + return v; + } else if constexpr (std::is_same_v) { + return f32_to_f16(v); + } else if constexpr (std::is_same_v) { + return f32_to_bf16(v); + } else { + return static_cast(v); + } +} + +} diff --git a/src/ops/rand_sample/nvidia/rand_sample_nvidia.cu b/src/ops/rand_sample/nvidia/rand_sample_nvidia.cu new file mode 100644 index 00000000..b0326878 --- /dev/null +++ b/src/ops/rand_sample/nvidia/rand_sample_nvidia.cu @@ -0,0 +1,29 @@ +#include "rand_sample_nvidia.hpp" + +#include "../../../core/llaisys_core.hpp" +#include "../../../utils.hpp" +#include "../../nvidia_utils.cuh" +#include "../cpu/rand_sample_cpu.hpp" + +namespace llaisys::ops::nvidia { +void rand_sample(std::byte *sample_idx, std::byte *sample_val, const std::byte *vals, llaisysDataType_t type, size_t numel, + int64_t batch_size, float temperature, size_t topK, float topP, int64_t seed) { + auto &runtime = llaisys::core::context().runtime(); + size_t vals_bytes = numel * static_cast(batch_size) * llaisys::utils::dsize(type); + size_t idx_bytes = static_cast(batch_size) * sizeof(int64_t); + size_t val_bytes = static_cast(batch_size) * llaisys::utils::dsize(type); + + auto host_vals = static_cast(runtime.api()->malloc_host(vals_bytes)); + auto host_idx = static_cast(runtime.api()->malloc_host(idx_bytes)); + auto host_val = static_cast(runtime.api()->malloc_host(val_bytes)); + + runtime.api()->memcpy_sync(host_vals, vals, vals_bytes, LLAISYS_MEMCPY_D2H); + cpu::rand_sample(host_idx, host_val, host_vals, type, numel, batch_size, temperature, topK, topP, seed); + runtime.api()->memcpy_sync(sample_idx, host_idx, idx_bytes, LLAISYS_MEMCPY_H2D); + runtime.api()->memcpy_sync(sample_val, host_val, val_bytes, LLAISYS_MEMCPY_H2D); + + runtime.api()->free_host(host_vals); + runtime.api()->free_host(host_idx); + runtime.api()->free_host(host_val); +} +} diff --git 
a/src/ops/rand_sample/nvidia/rand_sample_nvidia.hpp b/src/ops/rand_sample/nvidia/rand_sample_nvidia.hpp new file mode 100644 index 00000000..e9c9bcab --- /dev/null +++ b/src/ops/rand_sample/nvidia/rand_sample_nvidia.hpp @@ -0,0 +1,11 @@ +#pragma once + +#include "llaisys.h" + +#include +#include + +namespace llaisys::ops::nvidia { +void rand_sample(std::byte *sample_idx, std::byte *sample_val, const std::byte *vals, llaisysDataType_t type, size_t numel, + int64_t batch_size, float temperature, size_t topK, float topP, int64_t seed); +} diff --git a/src/ops/rand_sample/op.cpp b/src/ops/rand_sample/op.cpp index 891aa2dc..8cf7bae6 100644 --- a/src/ops/rand_sample/op.cpp +++ b/src/ops/rand_sample/op.cpp @@ -4,6 +4,7 @@ #include "../../utils.hpp" #include "cpu/rand_sample_cpu.hpp" +#include "nvidia/rand_sample_nvidia.hpp" namespace llaisys::ops { void rand_sample(tensor_t sample_idx, tensor_t sample_val, tensor_t vals, float temperature, size_t topK, float topP, @@ -43,8 +44,8 @@ void rand_sample(tensor_t sample_idx, tensor_t sample_val, tensor_t vals, float static_cast(batch_size), temperature, topK, topP, seed); #ifdef ENABLE_NVIDIA_API case LLAISYS_DEVICE_NVIDIA: - TO_BE_IMPLEMENTED(); - return; + return nvidia::rand_sample(sample_idx->data(), sample_val->data(), vals->data(), vals->dtype(), numel, + static_cast(batch_size), temperature, topK, topP, seed); #endif default: EXCEPTION_UNSUPPORTED_DEVICE; diff --git a/src/ops/rearrange/nvidia/rearrange_nvidia.cu b/src/ops/rearrange/nvidia/rearrange_nvidia.cu new file mode 100644 index 00000000..2cea4da8 --- /dev/null +++ b/src/ops/rearrange/nvidia/rearrange_nvidia.cu @@ -0,0 +1,88 @@ +#include "rearrange_nvidia.hpp" + +#include "../../../core/llaisys_core.hpp" +#include "../../../utils.hpp" + +#include +#include +#include +#include +#include + +namespace llaisys::ops::nvidia { +namespace detail { +inline void checkCuda(cudaError_t err) { + if (err != cudaSuccess) { + throw std::runtime_error(std::string("CUDA error: ") + 
cudaGetErrorString(err)); + } +} +} + +template +__global__ void rearrange_kernel(T *out, const T *in, const size_t *shape, const ptrdiff_t *out_strides, const ptrdiff_t *in_strides, + size_t numel, size_t ndim) { + size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx >= numel) { + return; + } + size_t tmp = idx; + ptrdiff_t in_offset = 0; + ptrdiff_t out_offset = 0; + for (size_t d = ndim; d-- > 0;) { + size_t id = tmp % shape[d]; + tmp /= shape[d]; + in_offset += static_cast(id) * in_strides[d]; + out_offset += static_cast(id) * out_strides[d]; + } + out[out_offset] = in[in_offset]; +} + +void rearrange(std::byte *out, const std::byte *in, llaisysDataType_t type, const size_t *shape, const ptrdiff_t *out_strides, + const ptrdiff_t *in_strides, size_t numel, size_t ndim) { + auto &runtime = llaisys::core::context().runtime(); + size_t shape_bytes = ndim * sizeof(size_t); + size_t stride_bytes = ndim * sizeof(ptrdiff_t); + auto d_shape = static_cast(runtime.api()->malloc_device(shape_bytes)); + auto d_out_strides = static_cast(runtime.api()->malloc_device(stride_bytes)); + auto d_in_strides = static_cast(runtime.api()->malloc_device(stride_bytes)); + + runtime.api()->memcpy_sync(d_shape, shape, shape_bytes, LLAISYS_MEMCPY_H2D); + runtime.api()->memcpy_sync(d_out_strides, out_strides, stride_bytes, LLAISYS_MEMCPY_H2D); + runtime.api()->memcpy_sync(d_in_strides, in_strides, stride_bytes, LLAISYS_MEMCPY_H2D); + + int threads = 256; + int blocks = static_cast((numel + threads - 1) / threads); + auto stream = reinterpret_cast(runtime.stream()); + switch (type) { + case LLAISYS_DTYPE_F32: + rearrange_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + d_shape, d_out_strides, d_in_strides, numel, ndim); + break; + case LLAISYS_DTYPE_BF16: + rearrange_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + d_shape, d_out_strides, d_in_strides, numel, ndim); + break; + case LLAISYS_DTYPE_F16: + rearrange_kernel<<>>( + 
reinterpret_cast(out), + reinterpret_cast(in), + d_shape, d_out_strides, d_in_strides, numel, ndim); + break; + default: + runtime.api()->free_device(d_shape); + runtime.api()->free_device(d_out_strides); + runtime.api()->free_device(d_in_strides); + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + detail::checkCuda(cudaGetLastError()); + detail::checkCuda(cudaStreamSynchronize(stream)); + + runtime.api()->free_device(d_shape); + runtime.api()->free_device(d_out_strides); + runtime.api()->free_device(d_in_strides); +} +} diff --git a/src/ops/rearrange/nvidia/rearrange_nvidia.hpp b/src/ops/rearrange/nvidia/rearrange_nvidia.hpp new file mode 100644 index 00000000..19bdbc2d --- /dev/null +++ b/src/ops/rearrange/nvidia/rearrange_nvidia.hpp @@ -0,0 +1,11 @@ +#pragma once + +#include "llaisys.h" + +#include +#include + +namespace llaisys::ops::nvidia { +void rearrange(std::byte *out, const std::byte *in, llaisysDataType_t type, const size_t *shape, const ptrdiff_t *out_strides, + const ptrdiff_t *in_strides, size_t numel, size_t ndim); +} diff --git a/src/ops/rearrange/op.cpp b/src/ops/rearrange/op.cpp index ae1523fc..74a89df5 100644 --- a/src/ops/rearrange/op.cpp +++ b/src/ops/rearrange/op.cpp @@ -4,6 +4,7 @@ #include "../../utils.hpp" #include "cpu/rearrange_cpu.hpp" +#include "nvidia/rearrange_nvidia.hpp" namespace llaisys::ops { void rearrange(tensor_t out, tensor_t in) { @@ -30,8 +31,8 @@ void rearrange(tensor_t out, tensor_t in) { in_strides.data(), numel, ndim); #ifdef ENABLE_NVIDIA_API case LLAISYS_DEVICE_NVIDIA: - TO_BE_IMPLEMENTED(); - return; + return nvidia::rearrange(out->data(), in->data(), out->dtype(), shape.data(), out_strides.data(), + in_strides.data(), numel, ndim); #endif default: EXCEPTION_UNSUPPORTED_DEVICE; diff --git a/src/ops/rms_norm/nvidia/rms_norm_nvidia.cu b/src/ops/rms_norm/nvidia/rms_norm_nvidia.cu new file mode 100644 index 00000000..623d6aba --- /dev/null +++ b/src/ops/rms_norm/nvidia/rms_norm_nvidia.cu @@ -0,0 +1,179 @@ +#include 
"rms_norm_nvidia.hpp" + +#include "../../../core/llaisys_core.hpp" +#include "../../../utils.hpp" + +#include +#include +#include +#include +#include +#include + +namespace llaisys::ops::nvidia { +namespace detail { +inline void checkCuda(cudaError_t err) { + if (err != cudaSuccess) { + throw std::runtime_error(std::string("CUDA error: ") + cudaGetErrorString(err)); + } +} + +__device__ inline float f16_to_f32(llaisys::fp16_t val) { + uint16_t h = val._v; + uint32_t sign = (h & 0x8000) << 16; + int32_t exponent = (h >> 10) & 0x1F; + uint32_t mantissa = h & 0x3FF; + uint32_t f32; + if (exponent == 31) { + f32 = mantissa != 0 ? (sign | 0x7F800000 | (mantissa << 13)) : (sign | 0x7F800000); + } else if (exponent == 0) { + if (mantissa == 0) { + f32 = sign; + } else { + exponent = -14; + while ((mantissa & 0x400) == 0) { + mantissa <<= 1; + exponent--; + } + mantissa &= 0x3FF; + f32 = sign | ((exponent + 127) << 23) | (mantissa << 13); + } + } else { + f32 = sign | ((exponent + 127 - 15) << 23) | (mantissa << 13); + } + union { + uint32_t u; + float f; + } tmp; + tmp.u = f32; + return tmp.f; +} + +__device__ inline llaisys::fp16_t f32_to_f16(float val) { + union { + uint32_t u; + float f; + } tmp; + tmp.f = val; + uint32_t f32 = tmp.u; + uint16_t sign = (f32 >> 16) & 0x8000; + int32_t exponent = ((f32 >> 23) & 0xFF) - 127; + uint32_t mantissa = f32 & 0x7FFFFF; + if (exponent >= 16) { + if (exponent == 128 && mantissa != 0) { + return llaisys::fp16_t{static_cast(sign | 0x7E00)}; + } + return llaisys::fp16_t{static_cast(sign | 0x7C00)}; + } else if (exponent >= -14) { + return llaisys::fp16_t{static_cast(sign | ((exponent + 15) << 10) | (mantissa >> 13))}; + } else if (exponent >= -24) { + mantissa |= 0x800000; + mantissa >>= (-14 - exponent); + return llaisys::fp16_t{static_cast(sign | (mantissa >> 13))}; + } + return llaisys::fp16_t{static_cast(sign)}; +} + +__device__ inline float bf16_to_f32(llaisys::bf16_t val) { + uint32_t bits32 = static_cast(val._v) << 16; + union 
{ + uint32_t u; + float f; + } tmp; + tmp.u = bits32; + return tmp.f; +} + +__device__ inline llaisys::bf16_t f32_to_bf16(float val) { + union { + uint32_t u; + float f; + } tmp; + tmp.f = val; + uint32_t bits32 = tmp.u; + const uint32_t rounding_bias = 0x00007FFF + ((bits32 >> 16) & 1); + uint16_t bf16_bits = static_cast((bits32 + rounding_bias) >> 16); + return llaisys::bf16_t{bf16_bits}; +} + +template +__device__ inline float to_float(T v) { + if constexpr (std::is_same_v) { + return v; + } else if constexpr (std::is_same_v) { + return f16_to_f32(v); + } else if constexpr (std::is_same_v) { + return bf16_to_f32(v); + } else { + return static_cast(v); + } +} + +template +__device__ inline T from_float(float v) { + if constexpr (std::is_same_v) { + return v; + } else if constexpr (std::is_same_v) { + return f32_to_f16(v); + } else if constexpr (std::is_same_v) { + return f32_to_bf16(v); + } else { + return static_cast(v); + } +} +} + +template +__global__ void rms_norm_kernel(T *out, const T *in, const T *weight, float eps, size_t batch, size_t in_dim) { + size_t row = blockIdx.x; + if (row >= batch) { + return; + } + const T *in_row = in + row * in_dim; + T *out_row = out + row * in_dim; + float acc = 0.0f; + for (size_t j = 0; j < in_dim; ++j) { + float v = detail::to_float(in_row[j]); + acc += v * v; + } + float denom = sqrtf(acc / static_cast(in_dim) + eps); + for (size_t j = 0; j < in_dim; ++j) { + float v = detail::to_float(in_row[j]) * detail::to_float(weight[j]) / denom; + out_row[j] = detail::from_float(v); + } +} + +void rms_norm(std::byte *out, const std::byte *in, const std::byte *weight, float eps, llaisysDataType_t type, size_t batch, + size_t in_dim) { + int threads = 1; + int blocks = static_cast(batch); + auto stream = reinterpret_cast(llaisys::core::context().runtime().stream()); + switch (type) { + case LLAISYS_DTYPE_F32: + rms_norm_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(weight), + eps, batch, in_dim); + 
break; + case LLAISYS_DTYPE_BF16: + rms_norm_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(weight), + eps, batch, in_dim); + break; + case LLAISYS_DTYPE_F16: + rms_norm_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(weight), + eps, batch, in_dim); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + detail::checkCuda(cudaGetLastError()); + detail::checkCuda(cudaStreamSynchronize(stream)); +} +} diff --git a/src/ops/rms_norm/nvidia/rms_norm_nvidia.hpp b/src/ops/rms_norm/nvidia/rms_norm_nvidia.hpp new file mode 100644 index 00000000..15922384 --- /dev/null +++ b/src/ops/rms_norm/nvidia/rms_norm_nvidia.hpp @@ -0,0 +1,10 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void rms_norm(std::byte *out, const std::byte *in, const std::byte *weight, float eps, llaisysDataType_t type, size_t batch, + size_t in_dim); +} diff --git a/src/ops/rms_norm/op.cpp b/src/ops/rms_norm/op.cpp index d57c96cf..fb8cbc12 100644 --- a/src/ops/rms_norm/op.cpp +++ b/src/ops/rms_norm/op.cpp @@ -2,6 +2,7 @@ #include "../../core/llaisys_core.hpp" #include "../../utils.hpp" #include "cpu/rms_norm_cpu.hpp" +#include "nvidia/rms_norm_nvidia.hpp" namespace llaisys::ops { void rms_norm(tensor_t out, tensor_t in, tensor_t weight, float eps) { CHECK_SAME_DEVICE(out, in, weight); @@ -27,8 +28,7 @@ void rms_norm(tensor_t out, tensor_t in, tensor_t weight, float eps) { weight->data(), eps, out->dtype(), batch, in_dim); #ifdef ENABLE_NVIDIA_API case LLAISYS_DEVICE_NVIDIA: - TO_BE_IMPLEMENTED(); - return; + return nvidia::rms_norm(out->data(), in->data(), weight->data(), eps, out->dtype(), batch, in_dim); #endif default: EXCEPTION_UNSUPPORTED_DEVICE; diff --git a/src/ops/rope/nvidia/rope_nvidia.cu b/src/ops/rope/nvidia/rope_nvidia.cu new file mode 100644 index 00000000..f098db48 --- /dev/null +++ b/src/ops/rope/nvidia/rope_nvidia.cu @@ -0,0 +1,188 @@ +#include "rope_nvidia.hpp" + +#include 
"../../../core/llaisys_core.hpp" +#include "../../../utils.hpp" + +#include +#include +#include +#include +#include +#include + +namespace llaisys::ops::nvidia { +namespace detail { +inline void checkCuda(cudaError_t err) { + if (err != cudaSuccess) { + throw std::runtime_error(std::string("CUDA error: ") + cudaGetErrorString(err)); + } +} + +__device__ inline float f16_to_f32(llaisys::fp16_t val) { + uint16_t h = val._v; + uint32_t sign = (h & 0x8000) << 16; + int32_t exponent = (h >> 10) & 0x1F; + uint32_t mantissa = h & 0x3FF; + uint32_t f32; + if (exponent == 31) { + f32 = mantissa != 0 ? (sign | 0x7F800000 | (mantissa << 13)) : (sign | 0x7F800000); + } else if (exponent == 0) { + if (mantissa == 0) { + f32 = sign; + } else { + exponent = -14; + while ((mantissa & 0x400) == 0) { + mantissa <<= 1; + exponent--; + } + mantissa &= 0x3FF; + f32 = sign | ((exponent + 127) << 23) | (mantissa << 13); + } + } else { + f32 = sign | ((exponent + 127 - 15) << 23) | (mantissa << 13); + } + union { + uint32_t u; + float f; + } tmp; + tmp.u = f32; + return tmp.f; +} + +__device__ inline llaisys::fp16_t f32_to_f16(float val) { + union { + uint32_t u; + float f; + } tmp; + tmp.f = val; + uint32_t f32 = tmp.u; + uint16_t sign = (f32 >> 16) & 0x8000; + int32_t exponent = ((f32 >> 23) & 0xFF) - 127; + uint32_t mantissa = f32 & 0x7FFFFF; + if (exponent >= 16) { + if (exponent == 128 && mantissa != 0) { + return llaisys::fp16_t{static_cast(sign | 0x7E00)}; + } + return llaisys::fp16_t{static_cast(sign | 0x7C00)}; + } else if (exponent >= -14) { + return llaisys::fp16_t{static_cast(sign | ((exponent + 15) << 10) | (mantissa >> 13))}; + } else if (exponent >= -24) { + mantissa |= 0x800000; + mantissa >>= (-14 - exponent); + return llaisys::fp16_t{static_cast(sign | (mantissa >> 13))}; + } + return llaisys::fp16_t{static_cast(sign)}; +} + +__device__ inline float bf16_to_f32(llaisys::bf16_t val) { + uint32_t bits32 = static_cast(val._v) << 16; + union { + uint32_t u; + float f; + } 
tmp; + tmp.u = bits32; + return tmp.f; +} + +__device__ inline llaisys::bf16_t f32_to_bf16(float val) { + union { + uint32_t u; + float f; + } tmp; + tmp.f = val; + uint32_t bits32 = tmp.u; + const uint32_t rounding_bias = 0x00007FFF + ((bits32 >> 16) & 1); + uint16_t bf16_bits = static_cast((bits32 + rounding_bias) >> 16); + return llaisys::bf16_t{bf16_bits}; +} + +template +__device__ inline float to_float(T v) { + if constexpr (std::is_same_v) { + return v; + } else if constexpr (std::is_same_v) { + return f16_to_f32(v); + } else if constexpr (std::is_same_v) { + return bf16_to_f32(v); + } else { + return static_cast(v); + } +} + +template +__device__ inline T from_float(float v) { + if constexpr (std::is_same_v) { + return v; + } else if constexpr (std::is_same_v) { + return f32_to_f16(v); + } else if constexpr (std::is_same_v) { + return f32_to_bf16(v); + } else { + return static_cast(v); + } +} +} + +template +__global__ void rope_kernel(T *out, const T *in, const int64_t *pos_ids, float theta, size_t seq_len, size_t n_heads, + size_t head_dim) { + size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + size_t half = head_dim / 2; + size_t total = seq_len * n_heads * half; + if (idx >= total) { + return; + } + size_t tmp = idx; + size_t j = tmp % half; + tmp /= half; + size_t h = tmp % n_heads; + size_t s = tmp / n_heads; + int64_t p = pos_ids[s]; + float angle = static_cast(p) / powf(theta, 2.0f * static_cast(j) / static_cast(head_dim)); + float cosine = cosf(angle); + float sine = sinf(angle); + size_t base = (s * n_heads + h) * head_dim; + T a = in[base + j]; + T b = in[base + j + half]; + float af = detail::to_float(a); + float bf = detail::to_float(b); + out[base + j] = detail::from_float(af * cosine - bf * sine); + out[base + j + half] = detail::from_float(af * sine + bf * cosine); +} + +void rope(std::byte *out, const std::byte *in, const std::byte *pos_ids, float theta, llaisysDataType_t type, size_t seq_len, + size_t n_heads, size_t 
head_dim) { + size_t half = head_dim / 2; + size_t total = seq_len * n_heads * half; + int threads = 256; + int blocks = static_cast((total + threads - 1) / threads); + auto stream = reinterpret_cast(llaisys::core::context().runtime().stream()); + switch (type) { + case LLAISYS_DTYPE_F32: + rope_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(pos_ids), + theta, seq_len, n_heads, head_dim); + break; + case LLAISYS_DTYPE_BF16: + rope_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(pos_ids), + theta, seq_len, n_heads, head_dim); + break; + case LLAISYS_DTYPE_F16: + rope_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(pos_ids), + theta, seq_len, n_heads, head_dim); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + detail::checkCuda(cudaGetLastError()); + detail::checkCuda(cudaStreamSynchronize(stream)); +} +} diff --git a/src/ops/rope/nvidia/rope_nvidia.hpp b/src/ops/rope/nvidia/rope_nvidia.hpp new file mode 100644 index 00000000..ea72276b --- /dev/null +++ b/src/ops/rope/nvidia/rope_nvidia.hpp @@ -0,0 +1,10 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void rope(std::byte *out, const std::byte *in, const std::byte *pos_ids, float theta, llaisysDataType_t type, size_t seq_len, + size_t n_heads, size_t head_dim); +} diff --git a/src/ops/rope/op.cpp b/src/ops/rope/op.cpp index d3cab559..64250fa1 100644 --- a/src/ops/rope/op.cpp +++ b/src/ops/rope/op.cpp @@ -2,6 +2,7 @@ #include "../../core/llaisys_core.hpp" #include "../../utils.hpp" #include "cpu/rope_cpu.hpp" +#include "nvidia/rope_nvidia.hpp" namespace llaisys::ops { void rope(tensor_t out, tensor_t in, tensor_t pos_ids, float theta) { CHECK_SAME_DEVICE(out, in, pos_ids); @@ -28,8 +29,7 @@ void rope(tensor_t out, tensor_t in, tensor_t pos_ids, float theta) { pos_ids->data(), theta, out->dtype(), seq_len, n_heads, head_dim); #ifdef ENABLE_NVIDIA_API case 
LLAISYS_DEVICE_NVIDIA: - TO_BE_IMPLEMENTED(); - return; + return nvidia::rope(out->data(), in->data(), pos_ids->data(), theta, out->dtype(), seq_len, n_heads, head_dim); #endif default: EXCEPTION_UNSUPPORTED_DEVICE; diff --git a/src/ops/self_attention/nvidia/self_attention_nvidia.cu b/src/ops/self_attention/nvidia/self_attention_nvidia.cu new file mode 100644 index 00000000..aae4d79e --- /dev/null +++ b/src/ops/self_attention/nvidia/self_attention_nvidia.cu @@ -0,0 +1,34 @@ +#include "self_attention_nvidia.hpp" + +#include "../../../core/llaisys_core.hpp" +#include "../../../utils.hpp" +#include "../cpu/self_attention_cpu.hpp" + +namespace llaisys::ops::nvidia { +void self_attention(std::byte *out, const std::byte *q, const std::byte *k, const std::byte *v, float scale, + llaisysDataType_t type, size_t qlen, size_t kvlen, size_t nh, size_t nkvh, size_t hd, size_t dv) { + auto &runtime = llaisys::core::context().runtime(); + size_t elem_size = llaisys::utils::dsize(type); + size_t q_bytes = qlen * nh * hd * elem_size; + size_t k_bytes = kvlen * nkvh * hd * elem_size; + size_t v_bytes = kvlen * nkvh * dv * elem_size; + size_t out_bytes = qlen * nh * dv * elem_size; + + auto host_q = static_cast(runtime.api()->malloc_host(q_bytes)); + auto host_k = static_cast(runtime.api()->malloc_host(k_bytes)); + auto host_v = static_cast(runtime.api()->malloc_host(v_bytes)); + auto host_out = static_cast(runtime.api()->malloc_host(out_bytes)); + + runtime.api()->memcpy_sync(host_q, q, q_bytes, LLAISYS_MEMCPY_D2H); + runtime.api()->memcpy_sync(host_k, k, k_bytes, LLAISYS_MEMCPY_D2H); + runtime.api()->memcpy_sync(host_v, v, v_bytes, LLAISYS_MEMCPY_D2H); + + cpu::self_attention(host_out, host_q, host_k, host_v, scale, type, qlen, kvlen, nh, nkvh, hd, dv); + runtime.api()->memcpy_sync(out, host_out, out_bytes, LLAISYS_MEMCPY_H2D); + + runtime.api()->free_host(host_q); + runtime.api()->free_host(host_k); + runtime.api()->free_host(host_v); + runtime.api()->free_host(host_out); +} +} 
diff --git a/src/ops/self_attention/nvidia/self_attention_nvidia.hpp b/src/ops/self_attention/nvidia/self_attention_nvidia.hpp new file mode 100644 index 00000000..fe25c256 --- /dev/null +++ b/src/ops/self_attention/nvidia/self_attention_nvidia.hpp @@ -0,0 +1,10 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void self_attention(std::byte *out, const std::byte *q, const std::byte *k, const std::byte *v, float scale, + llaisysDataType_t type, size_t qlen, size_t kvlen, size_t nh, size_t nkvh, size_t hd, size_t dv); +} diff --git a/src/ops/self_attention/op.cpp b/src/ops/self_attention/op.cpp index abfe1964..cd8220d7 100644 --- a/src/ops/self_attention/op.cpp +++ b/src/ops/self_attention/op.cpp @@ -2,6 +2,7 @@ #include "../../core/llaisys_core.hpp" #include "../../utils.hpp" #include "cpu/self_attention_cpu.hpp" +#include "nvidia/self_attention_nvidia.hpp" namespace llaisys::ops { void self_attention(tensor_t attn_val, tensor_t q, tensor_t k, tensor_t v, float scale) { @@ -39,8 +40,8 @@ void self_attention(tensor_t attn_val, tensor_t q, tensor_t k, tensor_t v, float attn_val->dtype(), qlen, kvlen, nh, nkvh, hd, dv); #ifdef ENABLE_NVIDIA_API case LLAISYS_DEVICE_NVIDIA: - TO_BE_IMPLEMENTED(); - return; + return nvidia::self_attention(attn_val->data(), q->data(), k->data(), v->data(), scale, attn_val->dtype(), qlen, + kvlen, nh, nkvh, hd, dv); #endif default: EXCEPTION_UNSUPPORTED_DEVICE; diff --git a/src/ops/swiglu/nvidia/swiglu_nvidia.cu b/src/ops/swiglu/nvidia/swiglu_nvidia.cu new file mode 100644 index 00000000..e4d416e7 --- /dev/null +++ b/src/ops/swiglu/nvidia/swiglu_nvidia.cu @@ -0,0 +1,172 @@ +#include "swiglu_nvidia.hpp" + +#include "../../../core/llaisys_core.hpp" +#include "../../../utils.hpp" + +#include +#include +#include +#include +#include +#include + +namespace llaisys::ops::nvidia { +namespace detail { +inline void checkCuda(cudaError_t err) { + if (err != cudaSuccess) { + throw 
std::runtime_error(std::string("CUDA error: ") + cudaGetErrorString(err)); + } +} + +__device__ inline float f16_to_f32(llaisys::fp16_t val) { + uint16_t h = val._v; + uint32_t sign = (h & 0x8000) << 16; + int32_t exponent = (h >> 10) & 0x1F; + uint32_t mantissa = h & 0x3FF; + uint32_t f32; + if (exponent == 31) { + f32 = mantissa != 0 ? (sign | 0x7F800000 | (mantissa << 13)) : (sign | 0x7F800000); + } else if (exponent == 0) { + if (mantissa == 0) { + f32 = sign; + } else { + exponent = -14; + while ((mantissa & 0x400) == 0) { + mantissa <<= 1; + exponent--; + } + mantissa &= 0x3FF; + f32 = sign | ((exponent + 127) << 23) | (mantissa << 13); + } + } else { + f32 = sign | ((exponent + 127 - 15) << 23) | (mantissa << 13); + } + union { + uint32_t u; + float f; + } tmp; + tmp.u = f32; + return tmp.f; +} + +__device__ inline llaisys::fp16_t f32_to_f16(float val) { + union { + uint32_t u; + float f; + } tmp; + tmp.f = val; + uint32_t f32 = tmp.u; + uint16_t sign = (f32 >> 16) & 0x8000; + int32_t exponent = ((f32 >> 23) & 0xFF) - 127; + uint32_t mantissa = f32 & 0x7FFFFF; + if (exponent >= 16) { + if (exponent == 128 && mantissa != 0) { + return llaisys::fp16_t{static_cast(sign | 0x7E00)}; + } + return llaisys::fp16_t{static_cast(sign | 0x7C00)}; + } else if (exponent >= -14) { + return llaisys::fp16_t{static_cast(sign | ((exponent + 15) << 10) | (mantissa >> 13))}; + } else if (exponent >= -24) { + mantissa |= 0x800000; + mantissa >>= (-14 - exponent); + return llaisys::fp16_t{static_cast(sign | (mantissa >> 13))}; + } + return llaisys::fp16_t{static_cast(sign)}; +} + +__device__ inline float bf16_to_f32(llaisys::bf16_t val) { + uint32_t bits32 = static_cast(val._v) << 16; + union { + uint32_t u; + float f; + } tmp; + tmp.u = bits32; + return tmp.f; +} + +__device__ inline llaisys::bf16_t f32_to_bf16(float val) { + union { + uint32_t u; + float f; + } tmp; + tmp.f = val; + uint32_t bits32 = tmp.u; + const uint32_t rounding_bias = 0x00007FFF + ((bits32 >> 16) & 1); + 
uint16_t bf16_bits = static_cast((bits32 + rounding_bias) >> 16); + return llaisys::bf16_t{bf16_bits}; +} + +template +__device__ inline float to_float(T v) { + if constexpr (std::is_same_v) { + return v; + } else if constexpr (std::is_same_v) { + return f16_to_f32(v); + } else if constexpr (std::is_same_v) { + return bf16_to_f32(v); + } else { + return static_cast(v); + } +} + +template +__device__ inline T from_float(float v) { + if constexpr (std::is_same_v) { + return v; + } else if constexpr (std::is_same_v) { + return f32_to_f16(v); + } else if constexpr (std::is_same_v) { + return f32_to_bf16(v); + } else { + return static_cast(v); + } +} +} + +template +__global__ void swiglu_kernel(T *out, const T *gate, const T *up, size_t seq_len, size_t inter_size) { + size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + size_t total = seq_len * inter_size; + if (idx >= total) { + return; + } + float gate_v = detail::to_float(gate[idx]); + float up_v = detail::to_float(up[idx]); + float out_v = up_v * gate_v / (1.0f + expf(-gate_v)); + out[idx] = detail::from_float(out_v); +} + +void swiglu(std::byte *out, const std::byte *gate, const std::byte *up, llaisysDataType_t type, size_t seq_len, size_t inter_size) { + size_t total = seq_len * inter_size; + int threads = 256; + int blocks = static_cast((total + threads - 1) / threads); + auto stream = reinterpret_cast(llaisys::core::context().runtime().stream()); + switch (type) { + case LLAISYS_DTYPE_F32: + swiglu_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(gate), + reinterpret_cast(up), + seq_len, inter_size); + break; + case LLAISYS_DTYPE_BF16: + swiglu_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(gate), + reinterpret_cast(up), + seq_len, inter_size); + break; + case LLAISYS_DTYPE_F16: + swiglu_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(gate), + reinterpret_cast(up), + seq_len, inter_size); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + 
detail::checkCuda(cudaGetLastError()); + detail::checkCuda(cudaStreamSynchronize(stream)); +} +} diff --git a/src/ops/swiglu/nvidia/swiglu_nvidia.hpp b/src/ops/swiglu/nvidia/swiglu_nvidia.hpp new file mode 100644 index 00000000..e4b9b506 --- /dev/null +++ b/src/ops/swiglu/nvidia/swiglu_nvidia.hpp @@ -0,0 +1,9 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void swiglu(std::byte *out, const std::byte *gate, const std::byte *up, llaisysDataType_t type, size_t seq_len, size_t inter_size); +} diff --git a/src/ops/swiglu/op.cpp b/src/ops/swiglu/op.cpp index 5ce3c4bf..1390d779 100644 --- a/src/ops/swiglu/op.cpp +++ b/src/ops/swiglu/op.cpp @@ -2,6 +2,7 @@ #include "../../core/llaisys_core.hpp" #include "../../utils.hpp" #include "cpu/swiglu_cpu.hpp" +#include "nvidia/swiglu_nvidia.hpp" namespace llaisys::ops { void swiglu(tensor_t out, tensor_t gate, tensor_t up) { @@ -26,8 +27,7 @@ void swiglu(tensor_t out, tensor_t gate, tensor_t up) { out->dtype(), seq_len, inter_size); #ifdef ENABLE_NVIDIA_API case LLAISYS_DEVICE_NVIDIA: - TO_BE_IMPLEMENTED(); - return; + return nvidia::swiglu(out->data(), gate->data(), up->data(), out->dtype(), seq_len, inter_size); #endif default: EXCEPTION_UNSUPPORTED_DEVICE; diff --git a/xmake.lua b/xmake.lua index 1f65f7a9..3a376751 100644 --- a/xmake.lua +++ b/xmake.lua @@ -15,7 +15,6 @@ option_end() if has_config("nv-gpu") then add_defines("ENABLE_NVIDIA_API") - includes("xmake/nvidia.lua") end target("llaisys-utils") @@ -26,6 +25,10 @@ target("llaisys-utils") if not is_plat("windows") then add_cxflags("-fPIC", "-Wno-unknown-pragmas") end + if has_config("nv-gpu") then + add_cuflags("-Xcompiler=-fPIC") + add_cuflags("-rdc=true") + end add_files("src/utils/*.cpp") @@ -37,13 +40,11 @@ target("llaisys-device") set_kind("static") add_deps("llaisys-utils") add_deps("llaisys-device-cpu") - set_languages("cxx17") set_warnings("all", "error") if not is_plat("windows") then add_cxflags("-fPIC", 
"-Wno-unknown-pragmas") end - add_files("src/device/*.cpp") on_install(function (target) end) @@ -83,7 +84,6 @@ target_end() target("llaisys-ops") set_kind("static") add_deps("llaisys-ops-cpu") - set_languages("cxx17") set_warnings("all", "error") if not is_plat("windows") then @@ -103,7 +103,20 @@ target("llaisys") add_deps("llaisys-tensor") add_deps("llaisys-ops") - set_languages("cxx17") + if has_config("nv-gpu") then + set_languages("cxx17", "cuda") + add_syslinks("cudart") + add_links("cudadevrt") + set_policy("build.cuda.devlink", true) + add_cuflags("-rdc=true") + add_cuflags("-Xcompiler=-fPIC") + add_files("src/device/nvidia/*.cu") + add_files("src/device/nvidia/*.cpp") + add_files("src/ops/*/nvidia/*.cu") + add_files("src/ops/*/nvidia/*.cpp") + else + set_languages("cxx17") + end set_warnings("all", "error") add_files("src/llaisys/*.cc") set_installdir(".") @@ -119,4 +132,4 @@ target("llaisys") os.cp("lib/*.so", "python/llaisys/libllaisys/") end end) -target_end() \ No newline at end of file +target_end() diff --git a/xmake/nvidia.lua b/xmake/nvidia.lua new file mode 100644 index 00000000..cedaa090 --- /dev/null +++ b/xmake/nvidia.lua @@ -0,0 +1,32 @@ +target("llaisys-device-nvidia") + set_kind("static") + set_languages("cxx17", "cuda") + set_warnings("all", "error") + if not is_plat("windows") then + add_cxflags("-fPIC", "-Wno-unknown-pragmas") + add_cuflags("-Xcompiler=-fPIC") + end + add_cuflags("-rdc=true") + + add_files("../src/device/nvidia/*.cu") + add_files("../src/device/nvidia/*.cpp") + + on_install(function (target) end) +target_end() + +target("llaisys-ops-nvidia") + set_kind("static") + add_deps("llaisys-tensor") + set_languages("cxx17", "cuda") + set_warnings("all", "error") + if not is_plat("windows") then + add_cxflags("-fPIC", "-Wno-unknown-pragmas") + add_cuflags("-Xcompiler=-fPIC") + end + add_cuflags("-rdc=true") + + add_files("../src/ops/*/nvidia/*.cu") + add_files("../src/ops/*/nvidia/*.cpp") + + on_install(function (target) end) 
+target_end() From ca565603880d3d5dcade422c799d09294a977cd0 Mon Sep 17 00:00:00 2001 From: leninist1 <282516536@qq.com> Date: Fri, 27 Feb 2026 21:56:10 +0800 Subject: [PATCH 12/14] implement: inference using cuda. && test passed. --- python/llaisys/models/qwen2.py | 34 +- python/llaisys/server/chat_server.py | 2 +- python/llaisys/webUI/index.html | 29 +- src/llaisys/qwen2.cc | 487 +++++++++++++++++++++++++-- 4 files changed, 514 insertions(+), 38 deletions(-) diff --git a/python/llaisys/models/qwen2.py b/python/llaisys/models/qwen2.py index dcd1263b..c89cbb04 100644 --- a/python/llaisys/models/qwen2.py +++ b/python/llaisys/models/qwen2.py @@ -6,9 +6,11 @@ from ..libllaisys.models import LlaisysQwen2Meta from pathlib import Path +import os import safetensors import json import ctypes +import torch class Qwen2: @@ -36,9 +38,11 @@ def __init__(self, model_path, device: DeviceType = DeviceType.CPU): end_token = int(config["eos_token_id"]) + dtype = self._select_dtype(device) + self._dtype = dtype # construct C struct LlaisysQwen2Meta meta = LlaisysQwen2Meta( - dtype=DataType.BF16, + dtype=dtype, nlayer=nlayer, hs=hs, nh=nh, @@ -52,7 +56,6 @@ def __init__(self, model_path, device: DeviceType = DeviceType.CPU): end_token=end_token, ) - # only use cpu device_ids = (ctypes.c_int * 1)(0) # create model instance self._model = LIB_LLAISYS.llaisysQwen2ModelCreate( @@ -72,7 +75,11 @@ def __init__(self, model_path, device: DeviceType = DeviceType.CPU): if weight is None: continue # load weight to c side - arr = data_.get_tensor(name_).contiguous() #c-contiguous + arr = data_.get_tensor(name_) + torch_dtype = self._torch_dtype(self._dtype) + if arr.dtype != torch_dtype: + arr = arr.to(torch_dtype) + arr = arr.contiguous() LIB_LLAISYS.tensorLoad(weight, ctypes.c_void_p(arr.data_ptr())) @@ -126,6 +133,27 @@ def _match_weight(self, name: str): return w.mlp_down_w[layer] return None + def _select_dtype(self, device: DeviceType) -> DataType: + dtype_env = 
os.environ.get("LLAISYS_DTYPE", "").strip().lower() + if dtype_env in ("f16", "float16"): + return DataType.F16 + if dtype_env in ("f32", "float32"): + return DataType.F32 + if dtype_env in ("bf16", "bfloat16"): + return DataType.BF16 + if device == DeviceType.NVIDIA: + return DataType.F32 + return DataType.BF16 + + def _torch_dtype(self, dtype: DataType): + if dtype == DataType.F16: + return torch.float16 + if dtype == DataType.F32: + return torch.float32 + if dtype == DataType.BF16: + return torch.bfloat16 + return torch.float32 + def _infer(self, tokens: Sequence[int], temperature: float, top_k: int, top_p: float, seed: int) -> int: # step forward infer diff --git a/python/llaisys/server/chat_server.py b/python/llaisys/server/chat_server.py index 4682a0f6..72af35ff 100644 --- a/python/llaisys/server/chat_server.py +++ b/python/llaisys/server/chat_server.py @@ -252,7 +252,7 @@ def chat_ui(): parser.add_argument("--host", default="0.0.0.0") parser.add_argument("--port", default=8000, type=int) parser.add_argument("--model_path", default=None) - parser.add_argument("--device", default="cpu") + parser.add_argument("--device", default="nvidia") parser.add_argument("--model_name", default="llaisys-qwen2") args = parser.parse_args() diff --git a/python/llaisys/webUI/index.html b/python/llaisys/webUI/index.html index b8ac1a3d..6da35694 100644 --- a/python/llaisys/webUI/index.html +++ b/python/llaisys/webUI/index.html @@ -273,6 +273,23 @@

新对话

}; } + function safeParseJson(data) { + try { + return JSON.parse(data); + } catch (err) { + const first = data.indexOf("{"); + const last = data.lastIndexOf("}"); + if (first !== -1 && last !== -1 && last > first) { + try { + return JSON.parse(data.slice(first, last + 1)); + } catch (innerErr) { + return null; + } + } + return null; + } + } + async function requestCompletion(messages, assistantIndex) { const payload = buildRequest(messages); try { @@ -283,7 +300,9 @@

新对话

body: JSON.stringify(payload) }); if (!res.ok) throw new Error("请求失败: " + res.status); - const data = await res.json(); + const text = await res.text(); + const data = safeParseJson(text); + if (!data) throw new Error("响应不是合法JSON"); const content = data.choices?.[0]?.message?.content || ""; updateMessageContent(assistantIndex, content); return; @@ -302,10 +321,11 @@

新对话

const { value, done } = await reader.read(); if (done) break; buffer += decoder.decode(value, { stream: true }); - const parts = buffer.split("\\n\\n"); + buffer = buffer.replace(/\r\n/g, "\n").replace(/\r/g, "\n"); + const parts = buffer.split("\n\n"); buffer = parts.pop(); for (const part of parts) { - const lines = part.split("\\n"); + const lines = part.split("\n"); for (const line of lines) { if (!line.startsWith("data: ")) continue; const data = line.slice(6).trim(); @@ -313,7 +333,8 @@

新对话

updateMessageContent(assistantIndex, current); return; } - const payload = JSON.parse(data); + const payload = safeParseJson(data); + if (!payload) continue; const delta = payload.choices?.[0]?.delta?.content || ""; if (delta) { current += delta; diff --git a/src/llaisys/qwen2.cc b/src/llaisys/qwen2.cc index e9d123b1..52b25db9 100644 --- a/src/llaisys/qwen2.cc +++ b/src/llaisys/qwen2.cc @@ -2,6 +2,7 @@ #include "llaisys_tensor.hpp" #include "../tensor/tensor.hpp" +#include "../core/llaisys_core.hpp" #include "../ops/add/op.hpp" #include "../ops/rand_sample/op.hpp" #include "../ops/embedding/op.hpp" @@ -14,7 +15,12 @@ #include "../utils.hpp" #include +#include #include +#include +#include +#include +#include #include // wrap c++ tensor to external handle @@ -40,8 +46,13 @@ llaisys::tensor_t make_tensor_dtype( // set tensor data to zero void zero_tensor(const llaisys::tensor_t &t) { - CHECK_ARGUMENT(t->deviceType() == LLAISYS_DEVICE_CPU, "Zero: only CPU is supported."); - std::memset(t->data(), 0, t->numel() * t->elementSize()); + size_t size = t->numel() * t->elementSize(); + if (t->deviceType() == LLAISYS_DEVICE_CPU) { + std::memset(t->data(), 0, size); + return; + } + std::vector zeros(size); + t->load(zeros.data()); } // wrap c++ tensor to external handle and record to handles list @@ -53,6 +64,315 @@ llaisysTensor_t wrap_tensor( return h; } +size_t ceil_div(size_t a, size_t b) { + return (a + b - 1) / b; +} + +size_t read_env_size_t(const char *name, size_t default_value) { + const char *value = std::getenv(name); + if (!value) { + return default_value; + } + char *end = nullptr; + unsigned long long parsed = std::strtoull(value, &end, 10); + if (end == value || *end != '\0') { + return default_value; + } + return static_cast(parsed); +} + +uint64_t hash_mix(uint64_t h, int64_t v, uint64_t base) { + return h * base + (static_cast(v) + 0x9e3779b97f4a7c15ull); +} + +struct KVPagePoolLayer { + std::vector k_pages; + std::vector v_pages; + std::vector refcnt; + 
std::vector last_access; + std::vector free_ids; + std::vector is_free; +}; + +class KVCachePool { +public: + void init(const LlaisysQwen2Meta &meta, llaisysDeviceType_t device, int device_id, size_t page_len, size_t max_pages) { + meta_ = meta; + device_ = device; + device_id_ = device_id; + page_len_ = page_len; + max_pages_ = max_pages; + access_clock_ = 1; + layers_.assign(meta.nlayer, KVPagePoolLayer{}); + } + + size_t page_len() const { + return page_len_; + } + + size_t acquire_page(size_t layer) { + auto &pool = layers_[layer]; + while (!pool.free_ids.empty()) { + size_t id = pool.free_ids.back(); + pool.free_ids.pop_back(); + if (pool.is_free[id] && pool.refcnt[id] == 0) { + pool.is_free[id] = 0; + pool.last_access[id] = access_clock_++; + return id; + } + } + + if (pool.k_pages.size() < max_pages_) { + size_t id = pool.k_pages.size(); + pool.k_pages.push_back(make_tensor(meta_, device_, device_id_, {page_len_, meta_.nkvh, meta_.dh})); + pool.v_pages.push_back(make_tensor(meta_, device_, device_id_, {page_len_, meta_.nkvh, meta_.dh})); + pool.refcnt.push_back(0); + pool.last_access.push_back(access_clock_++); + pool.is_free.push_back(0); + return id; + } + + size_t selected = std::numeric_limits::max(); + uint64_t best_access = std::numeric_limits::max(); + for (size_t i = 0; i < pool.k_pages.size(); ++i) { + if (pool.refcnt[i] == 0 && pool.last_access[i] <= best_access) { + best_access = pool.last_access[i]; + selected = i; + } + } + CHECK_ARGUMENT(selected != std::numeric_limits::max(), "KV cache pool has no free page."); + pool.is_free[selected] = 0; + pool.last_access[selected] = access_clock_++; + return selected; + } + + void incref(size_t layer, size_t page_id) { + auto &pool = layers_[layer]; + if (pool.is_free[page_id]) { + pool.is_free[page_id] = 0; + } + pool.refcnt[page_id] += 1; + pool.last_access[page_id] = access_clock_++; + } + + void decref(size_t layer, size_t page_id) { + auto &pool = layers_[layer]; + if (pool.refcnt[page_id] > 0) { + 
pool.refcnt[page_id] -= 1; + if (pool.refcnt[page_id] == 0 && !pool.is_free[page_id]) { + pool.is_free[page_id] = 1; + pool.free_ids.push_back(page_id); + } + } + pool.last_access[page_id] = access_clock_++; + } + + llaisys::tensor_t k_page(size_t layer, size_t page_id) { + auto &pool = layers_[layer]; + pool.last_access[page_id] = access_clock_++; + return pool.k_pages[page_id]; + } + + llaisys::tensor_t v_page(size_t layer, size_t page_id) { + auto &pool = layers_[layer]; + pool.last_access[page_id] = access_clock_++; + return pool.v_pages[page_id]; + } + +private: + LlaisysQwen2Meta meta_; + llaisysDeviceType_t device_; + int device_id_; + size_t page_len_; + size_t max_pages_; + uint64_t access_clock_ = 1; + std::vector layers_; +}; + +struct KVHandle { + std::vector> layer_pages; + size_t token_count = 0; + uint64_t last_access = 0; + std::vector tokens; + std::vector hash_keys; +}; + +class PrefixCacheIndex { +public: + void init(size_t max_handles) { + max_handles_ = max_handles; + access_clock_ = 1; + handles_.clear(); + alive_.clear(); + key_to_handles_.clear(); + free_handle_ids_.clear(); + } + + size_t find_longest_prefix(const int64_t *tokens, size_t ntoken, const KVCachePool &pool, KVHandle &out_handle) { + if (ntoken == 0) { + return 0; + } + auto hashes = prefix_hashes(tokens, ntoken); + for (size_t len = ntoken; len > 0; --len) { + uint64_t key = make_key(hashes[len], len); + auto it = key_to_handles_.find(key); + if (it == key_to_handles_.end()) { + continue; + } + for (size_t handle_id : it->second) { + if (handle_id >= handles_.size() || !alive_[handle_id]) { + continue; + } + const auto &h = handles_[handle_id]; + if (h.tokens.size() < len) { + continue; + } + if (!std::equal(tokens, tokens + len, h.tokens.begin())) { + continue; + } + size_t pages_needed = ceil_div(len, pool.page_len()); + out_handle.layer_pages.assign(h.layer_pages.size(), {}); + for (size_t i = 0; i < h.layer_pages.size(); ++i) { + 
out_handle.layer_pages[i].assign(h.layer_pages[i].begin(), h.layer_pages[i].begin() + pages_needed); + } + out_handle.token_count = len; + out_handle.tokens.assign(tokens, tokens + len); + out_handle.last_access = access_clock_++; + handles_[handle_id].last_access = out_handle.last_access; + return len; + } + } + return 0; + } + + void insert_handle(const KVHandle &handle, const int64_t *tokens, size_t ntoken, KVCachePool &pool) { + if (ntoken == 0) { + return; + } + size_t handle_id = acquire_handle_id(); + if (handle_id >= handles_.size()) { + handles_.resize(handle_id + 1); + alive_.resize(handle_id + 1, 0); + } + if (alive_[handle_id]) { + release_handle(handle_id, pool); + } + KVHandle stored; + stored.token_count = ntoken; + stored.tokens.assign(tokens, tokens + ntoken); + stored.layer_pages = handle.layer_pages; + size_t pages_needed = ceil_div(ntoken, pool.page_len()); + for (size_t i = 0; i < stored.layer_pages.size(); ++i) { + if (stored.layer_pages[i].size() > pages_needed) { + stored.layer_pages[i].resize(pages_needed); + } + for (size_t page_id : stored.layer_pages[i]) { + pool.incref(i, page_id); + } + } + auto hashes = prefix_hashes(tokens, ntoken); + stored.hash_keys.reserve(ntoken); + for (size_t len = 1; len <= ntoken; ++len) { + uint64_t key = make_key(hashes[len], len); + key_to_handles_[key].push_back(handle_id); + stored.hash_keys.push_back(key); + } + stored.last_access = access_clock_++; + handles_[handle_id] = std::move(stored); + alive_[handle_id] = 1; + enforce_capacity(pool); + } + + void release_handle(size_t handle_id, KVCachePool &pool) { + if (handle_id >= handles_.size() || !alive_[handle_id]) { + return; + } + auto &h = handles_[handle_id]; + for (size_t i = 0; i < h.layer_pages.size(); ++i) { + for (size_t page_id : h.layer_pages[i]) { + pool.decref(i, page_id); + } + } + for (uint64_t key : h.hash_keys) { + auto it = key_to_handles_.find(key); + if (it == key_to_handles_.end()) { + continue; + } + auto &vec = it->second; + 
vec.erase(std::remove(vec.begin(), vec.end(), handle_id), vec.end()); + if (vec.empty()) { + key_to_handles_.erase(it); + } + } + h.layer_pages.clear(); + h.tokens.clear(); + h.hash_keys.clear(); + alive_[handle_id] = 0; + free_handle_ids_.push_back(handle_id); + } + +private: + uint64_t make_key(uint64_t hash, size_t len) const { + return hash ^ (salt_ * static_cast(len)); + } + + std::vector prefix_hashes(const int64_t *tokens, size_t ntoken) const { + std::vector hashes(ntoken + 1); + uint64_t h = 0; + for (size_t i = 0; i < ntoken; ++i) { + h = hash_mix(h, tokens[i], base_); + hashes[i + 1] = h; + } + return hashes; + } + + size_t acquire_handle_id() { + if (!free_handle_ids_.empty()) { + size_t id = free_handle_ids_.back(); + free_handle_ids_.pop_back(); + return id; + } + return handles_.size(); + } + + void enforce_capacity(KVCachePool &pool) { + if (max_handles_ == 0) { + return; + } + size_t alive_count = 0; + for (uint8_t v : alive_) { + alive_count += v; + } + while (alive_count > max_handles_) { + size_t oldest = std::numeric_limits::max(); + uint64_t oldest_access = std::numeric_limits::max(); + for (size_t i = 0; i < handles_.size(); ++i) { + if (!alive_[i]) { + continue; + } + if (handles_[i].last_access <= oldest_access) { + oldest_access = handles_[i].last_access; + oldest = i; + } + } + if (oldest == std::numeric_limits::max()) { + break; + } + release_handle(oldest, pool); + alive_count -= 1; + } + } + + uint64_t base_ = 1469598103934665603ull; + uint64_t salt_ = 1099511628211ull; + size_t max_handles_ = 64; + uint64_t access_clock_ = 1; + std::unordered_map> key_to_handles_; + std::vector handles_; + std::vector alive_; + std::vector free_handle_ids_; +}; + } // Qwen2 Model Instance Structure @@ -76,12 +396,28 @@ struct LlaisysQwen2Model { llaisys::tensor_t out_bias; // KV cache - std::vector k_cache; - std::vector v_cache; - + KVCachePool kv_pool; + PrefixCacheIndex prefix_index; + KVHandle active_handle; + bool active_valid; size_t cache_len; 
}; +static void release_active_handle(LlaisysQwen2Model *model) { + if (!model || !model->active_valid) { + return; + } + for (size_t i = 0; i < model->active_handle.layer_pages.size(); ++i) { + for (size_t page_id : model->active_handle.layer_pages[i]) { + model->kv_pool.decref(i, page_id); + } + } + model->active_handle.layer_pages.assign(model->meta.nlayer, {}); + model->active_handle.tokens.clear(); + model->active_handle.token_count = 0; + model->active_valid = false; +} + // init model weights, allocate memory and zero bias static void init_weights(LlaisysQwen2Model *model) { const auto &m = model->meta; @@ -186,13 +522,15 @@ static void init_weights(LlaisysQwen2Model *model) { // initialize kv cache tensor static void init_cache(LlaisysQwen2Model *model) { const auto &m = model->meta; - model->k_cache.resize(m.nlayer); - model->v_cache.resize(m.nlayer); - for (size_t i = 0; i < m.nlayer; ++i) { - // cache shape: [maxseq, num_heads, head_dim] - model->k_cache[i] = make_tensor(m, model->device, model->device_id, {m.maxseq, m.nkvh, m.dh}); - model->v_cache[i] = make_tensor(m, model->device, model->device_id, {m.maxseq, m.nkvh, m.dh}); - } + size_t page_len = read_env_size_t("LLAISYS_KV_PAGE_LEN", 128); + size_t max_pages = read_env_size_t("LLAISYS_KV_MAX_PAGES", ceil_div(m.maxseq, page_len)); + size_t max_handles = read_env_size_t("LLAISYS_KV_MAX_HANDLES", 64); + model->kv_pool.init(m, model->device, model->device_id, page_len, max_pages); + model->prefix_index.init(max_handles); + model->active_handle.layer_pages.assign(m.nlayer, {}); + model->active_handle.tokens.clear(); + model->active_handle.token_count = 0; + model->active_valid = false; model->cache_len = 0; } @@ -201,18 +539,44 @@ static int64_t infer_impl(LlaisysQwen2Model *model, const int64_t *token_ids, si size_t topK, float topP, int64_t seed) { CHECK_ARGUMENT(model != nullptr, "model is null"); CHECK_ARGUMENT(token_ids != nullptr || ntoken == 0, "token_ids is null"); - CHECK_ARGUMENT(model->device 
== LLAISYS_DEVICE_CPU, "Only CPU device is supported."); + CHECK_ARGUMENT(model->device == LLAISYS_DEVICE_CPU || model->device == LLAISYS_DEVICE_NVIDIA, + "Unsupported device type."); if (ntoken == 0) { return model->meta.end_token; } - // prefill - if (ntoken > 1 || model->cache_len == 0) { - model->cache_len = 0; + size_t reuse_len = 0; + if (ntoken > 1) { + release_active_handle(model); + reuse_len = model->prefix_index.find_longest_prefix(token_ids, ntoken, model->kv_pool, model->active_handle); + if (reuse_len > 0) { + model->active_valid = true; + for (size_t i = 0; i < model->active_handle.layer_pages.size(); ++i) { + for (size_t page_id : model->active_handle.layer_pages[i]) { + model->kv_pool.incref(i, page_id); + } + } + } + if (reuse_len >= ntoken) { + reuse_len = ntoken - 1; + } + if (model->active_valid) { + size_t max_pages = ceil_div(reuse_len, model->kv_pool.page_len()); + for (size_t i = 0; i < model->active_handle.layer_pages.size(); ++i) { + while (model->active_handle.layer_pages[i].size() > max_pages) { + size_t page_id = model->active_handle.layer_pages[i].back(); + model->active_handle.layer_pages[i].pop_back(); + model->kv_pool.decref(i, page_id); + } + } + model->active_handle.token_count = reuse_len; + model->active_handle.tokens.assign(token_ids, token_ids + reuse_len); + } + model->cache_len = reuse_len; } - size_t seqlen = ntoken; + size_t seqlen = ntoken - reuse_len; size_t pos_offset = model->cache_len; // position ID [pos_offset, pos_offset + seqlen) @@ -223,7 +587,7 @@ static int64_t infer_impl(LlaisysQwen2Model *model, const int64_t *token_ids, si // input token and position ID tensors auto input_ids = make_tensor_dtype(LLAISYS_DTYPE_I64, model->device, model->device_id, {seqlen}); - input_ids->load(token_ids); + input_ids->load(token_ids + reuse_len); auto pos_tensor = make_tensor_dtype(LLAISYS_DTYPE_I64, model->device, model->device_id, {seqlen}); pos_tensor->load(pos_ids.data()); @@ -235,6 +599,21 @@ static int64_t 
infer_impl(LlaisysQwen2Model *model, const int64_t *token_ids, si // attn scale factor float scale = 1.0f / std::sqrt(static_cast(model->meta.dh)); + size_t total_len = model->cache_len + seqlen; + size_t page_len = model->kv_pool.page_len(); + size_t pages_needed = ceil_div(total_len, page_len); + if (model->active_handle.layer_pages.size() != model->meta.nlayer) { + model->active_handle.layer_pages.assign(model->meta.nlayer, {}); + } + for (size_t i = 0; i < model->meta.nlayer; ++i) { + while (model->active_handle.layer_pages[i].size() < pages_needed) { + size_t page_id = model->kv_pool.acquire_page(i); + model->kv_pool.incref(i, page_id); + model->active_handle.layer_pages[i].push_back(page_id); + model->active_valid = true; + } + } + // layer forward for (size_t i = 0; i < model->meta.nlayer; ++i) { // attn input norm @@ -262,17 +641,43 @@ static int64_t infer_impl(LlaisysQwen2Model *model, const int64_t *token_ids, si llaisys::ops::rope(q_rope, q_view, pos_tensor, model->meta.theta); llaisys::ops::rope(k_rope, k_view, pos_tensor, model->meta.theta); - // write new k/v to cache - auto k_cache_slice = model->k_cache[i]->slice(0, model->cache_len, model->cache_len + seqlen); - auto v_cache_slice = model->v_cache[i]->slice(0, model->cache_len, model->cache_len + seqlen); - - llaisys::ops::rearrange(k_cache_slice, k_rope); - llaisys::ops::rearrange(v_cache_slice, v_view); - - // get all history k/v - size_t total_len = model->cache_len + seqlen; - auto k_total = model->k_cache[i]->slice(0, 0, total_len); - auto v_total = model->v_cache[i]->slice(0, 0, total_len); + size_t write_offset = model->cache_len; + size_t remaining = seqlen; + size_t src_offset = 0; + while (remaining > 0) { + size_t page_index = write_offset / page_len; + size_t page_offset = write_offset % page_len; + size_t chunk = std::min(remaining, page_len - page_offset); + size_t page_id = model->active_handle.layer_pages[i][page_index]; + auto k_page = model->kv_pool.k_page(i, page_id); + auto 
v_page = model->kv_pool.v_page(i, page_id); + auto k_page_slice = k_page->slice(0, page_offset, page_offset + chunk); + auto v_page_slice = v_page->slice(0, page_offset, page_offset + chunk); + auto k_chunk = k_rope->slice(0, src_offset, src_offset + chunk); + auto v_chunk = v_view->slice(0, src_offset, src_offset + chunk); + llaisys::ops::rearrange(k_page_slice, k_chunk); + llaisys::ops::rearrange(v_page_slice, v_chunk); + write_offset += chunk; + src_offset += chunk; + remaining -= chunk; + } + + auto k_total = make_tensor(model->meta, model->device, model->device_id, {total_len, model->meta.nkvh, model->meta.dh}); + auto v_total = make_tensor(model->meta, model->device, model->device_id, {total_len, model->meta.nkvh, model->meta.dh}); + size_t read_offset = 0; + for (size_t page_index = 0; page_index < pages_needed; ++page_index) { + size_t chunk = std::min(page_len, total_len - read_offset); + size_t page_id = model->active_handle.layer_pages[i][page_index]; + auto k_page = model->kv_pool.k_page(i, page_id); + auto v_page = model->kv_pool.v_page(i, page_id); + auto k_page_slice = k_page->slice(0, 0, chunk); + auto v_page_slice = v_page->slice(0, 0, chunk); + auto k_total_slice = k_total->slice(0, read_offset, read_offset + chunk); + auto v_total_slice = v_total->slice(0, read_offset, read_offset + chunk); + llaisys::ops::rearrange(k_total_slice, k_page_slice); + llaisys::ops::rearrange(v_total_slice, v_page_slice); + read_offset += chunk; + } // self attn auto attn = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.nh, model->meta.dh}); @@ -315,6 +720,18 @@ static int64_t infer_impl(LlaisysQwen2Model *model, const int64_t *token_ids, si // update cache len model->cache_len += seqlen; + model->active_handle.token_count = model->cache_len; + if (model->active_valid) { + if (model->active_handle.tokens.size() < model->cache_len) { + model->active_handle.tokens.insert( + model->active_handle.tokens.end(), + token_ids + reuse_len, + 
token_ids + reuse_len + seqlen); + } + } + if (ntoken > 1 && model->active_valid) { + model->prefix_index.insert_handle(model->active_handle, token_ids, ntoken, model->kv_pool); + } // final norm auto x_norm = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.hs}); @@ -330,7 +747,17 @@ static int64_t infer_impl(LlaisysQwen2Model *model, const int64_t *token_ids, si auto sample_val = make_tensor(model->meta, model->device, model->device_id, {1}); llaisys::ops::rand_sample(sample_idx, sample_val, last, temperature, topK, topP, seed); - return reinterpret_cast(sample_idx->data())[0]; + if (model->device == LLAISYS_DEVICE_CPU) { + return reinterpret_cast(sample_idx->data())[0]; + } + int64_t host_value = 0; + llaisys::core::context().setDevice(model->device, model->device_id); + llaisys::core::context().runtime().api()->memcpy_sync( + &host_value, + sample_idx->data(), + sizeof(int64_t), + LLAISYS_MEMCPY_D2H); + return host_value; } // C API wrapper From 101e733a11141b216da0ea76f948af14f120c8d1 Mon Sep 17 00:00:00 2001 From: leninist1 <282516536@qq.com> Date: Sun, 15 Mar 2026 11:06:05 +0800 Subject: [PATCH 13/14] implement cuda ops: rand_sample && self attention. 
--- .../rand_sample/nvidia/rand_sample_nvidia.cu | 179 ++++++++++++++++-- .../nvidia/self_attention_nvidia.cu | 151 ++++++++++++--- 2 files changed, 288 insertions(+), 42 deletions(-) diff --git a/src/ops/rand_sample/nvidia/rand_sample_nvidia.cu b/src/ops/rand_sample/nvidia/rand_sample_nvidia.cu index b0326878..cdb00f7b 100644 --- a/src/ops/rand_sample/nvidia/rand_sample_nvidia.cu +++ b/src/ops/rand_sample/nvidia/rand_sample_nvidia.cu @@ -1,29 +1,172 @@ #include "rand_sample_nvidia.hpp" #include "../../../core/llaisys_core.hpp" -#include "../../../utils.hpp" #include "../../nvidia_utils.cuh" -#include "../cpu/rand_sample_cpu.hpp" + +#include +#include +#include +#include namespace llaisys::ops::nvidia { -void rand_sample(std::byte *sample_idx, std::byte *sample_val, const std::byte *vals, llaisysDataType_t type, size_t numel, - int64_t batch_size, float temperature, size_t topK, float topP, int64_t seed) { - auto &runtime = llaisys::core::context().runtime(); - size_t vals_bytes = numel * static_cast(batch_size) * llaisys::utils::dsize(type); - size_t idx_bytes = static_cast(batch_size) * sizeof(int64_t); - size_t val_bytes = static_cast(batch_size) * llaisys::utils::dsize(type); +__device__ inline uint64_t lcg_next(uint64_t &state) { + state = state * 6364136223846793005ULL + 1ULL; + return state; +} - auto host_vals = static_cast(runtime.api()->malloc_host(vals_bytes)); - auto host_idx = static_cast(runtime.api()->malloc_host(idx_bytes)); - auto host_val = static_cast(runtime.api()->malloc_host(val_bytes)); +__device__ inline float rng_uniform(uint64_t &state) { + uint64_t x = lcg_next(state); + uint32_t mant = static_cast(x >> 40); + return static_cast(mant) * (1.0f / 16777216.0f); +} - runtime.api()->memcpy_sync(host_vals, vals, vals_bytes, LLAISYS_MEMCPY_D2H); - cpu::rand_sample(host_idx, host_val, host_vals, type, numel, batch_size, temperature, topK, topP, seed); - runtime.api()->memcpy_sync(sample_idx, host_idx, idx_bytes, LLAISYS_MEMCPY_H2D); - 
runtime.api()->memcpy_sync(sample_val, host_val, val_bytes, LLAISYS_MEMCPY_H2D); +template +__global__ void rand_sample_kernel(int64_t *out_idx, T *out_val, const T *vals, size_t numel, int64_t batch_size, + float temperature, size_t topK, float topP, int64_t seed, float *probs, int *idx) { + int64_t b = static_cast(blockIdx.x); + if (b >= batch_size) { + return; + } + if (threadIdx.x != 0) { + return; + } + const T *v = vals + static_cast(b) * numel; + float *p = probs + static_cast(b) * numel; + int *id = idx + static_cast(b) * numel; + float temp = temperature <= 1e-6f ? 1e-6f : temperature; + size_t max_i = 0; + float max_v = detail::to_float(v[0]); + for (size_t i = 1; i < numel; ++i) { + float cur = detail::to_float(v[i]); + if (cur > max_v) { + max_v = cur; + max_i = i; + } + } + float sum = 0.0f; + for (size_t i = 0; i < numel; ++i) { + float cur = detail::to_float(v[i]); + float val = expf((cur - max_v) / temp); + p[i] = val; + sum += val; + id[i] = static_cast(i); + } + if (sum <= 0.0f) { + for (size_t i = 0; i < numel; ++i) { + p[i] = 0.0f; + } + p[max_i] = 1.0f; + sum = 1.0f; + } else { + float inv = 1.0f / sum; + for (size_t i = 0; i < numel; ++i) { + p[i] *= inv; + } + } + size_t k = numel; + if (topK > 0 && topK < numel) { + k = topK; + } + if ((topK > 0 && topK < numel) || (topP > 0.0f && topP < 1.0f)) { + for (size_t i = 0; i < k; ++i) { + size_t max_pos = i; + float max_val = p[i]; + for (size_t j = i + 1; j < numel; ++j) { + float vj = p[j]; + if (vj > max_val) { + max_val = vj; + max_pos = j; + } + } + if (max_pos != i) { + float tmp = p[i]; + p[i] = p[max_pos]; + p[max_pos] = tmp; + int tmp_i = id[i]; + id[i] = id[max_pos]; + id[max_pos] = tmp_i; + } + } + } + size_t cand = k; + if (topP > 0.0f && topP < 1.0f) { + float cum = 0.0f; + cand = 0; + for (size_t i = 0; i < k; ++i) { + cum += p[i]; + cand = i + 1; + if (cum >= topP) { + break; + } + } + if (cand == 0) { + cand = 1; + } + } + float cand_sum = 0.0f; + for (size_t i = 0; i < cand; ++i) 
{ + cand_sum += p[i]; + } + size_t chosen = static_cast(id[0]); + if (cand_sum > 0.0f) { + uint64_t state = static_cast(seed) ^ (static_cast(b + 1) * 0x9e3779b97f4a7c15ULL); + float r = rng_uniform(state) * cand_sum; + for (size_t i = 0; i < cand; ++i) { + r -= p[i]; + if (r <= 0.0f) { + chosen = static_cast(id[i]); + break; + } + if (i == cand - 1) { + chosen = static_cast(id[i]); + } + } + } + out_idx[b] = static_cast(chosen); + out_val[b] = v[chosen]; +} - runtime.api()->free_host(host_vals); - runtime.api()->free_host(host_idx); - runtime.api()->free_host(host_val); +void rand_sample(std::byte *sample_idx, std::byte *sample_val, const std::byte *vals, llaisysDataType_t type, size_t numel, + int64_t batch_size, float temperature, size_t topK, float topP, int64_t seed) { + auto &runtime = llaisys::core::context().runtime(); + if (batch_size <= 0 || numel == 0) { + return; + } + auto d_probs = static_cast(runtime.api()->malloc_device(sizeof(float) * static_cast(batch_size) * numel)); + auto d_idx = static_cast(runtime.api()->malloc_device(sizeof(int) * static_cast(batch_size) * numel)); + dim3 grid(static_cast(batch_size), 1, 1); + int threads = 1; + auto stream = reinterpret_cast(runtime.stream()); + switch (type) { + case LLAISYS_DTYPE_F32: + rand_sample_kernel<<>>( + reinterpret_cast(sample_idx), + reinterpret_cast(sample_val), + reinterpret_cast(vals), + numel, batch_size, temperature, topK, topP, seed, d_probs, d_idx); + break; + case LLAISYS_DTYPE_BF16: + rand_sample_kernel<<>>( + reinterpret_cast(sample_idx), + reinterpret_cast(sample_val), + reinterpret_cast(vals), + numel, batch_size, temperature, topK, topP, seed, d_probs, d_idx); + break; + case LLAISYS_DTYPE_F16: + rand_sample_kernel<<>>( + reinterpret_cast(sample_idx), + reinterpret_cast(sample_val), + reinterpret_cast(vals), + numel, batch_size, temperature, topK, topP, seed, d_probs, d_idx); + break; + default: + runtime.api()->free_device(d_probs); + runtime.api()->free_device(d_idx); + 
EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + detail::checkCuda(cudaGetLastError()); + detail::checkCuda(cudaStreamSynchronize(stream)); + runtime.api()->free_device(d_probs); + runtime.api()->free_device(d_idx); } } diff --git a/src/ops/self_attention/nvidia/self_attention_nvidia.cu b/src/ops/self_attention/nvidia/self_attention_nvidia.cu index aae4d79e..110217ff 100644 --- a/src/ops/self_attention/nvidia/self_attention_nvidia.cu +++ b/src/ops/self_attention/nvidia/self_attention_nvidia.cu @@ -1,34 +1,137 @@ #include "self_attention_nvidia.hpp" #include "../../../core/llaisys_core.hpp" -#include "../../../utils.hpp" -#include "../cpu/self_attention_cpu.hpp" +#include "../../nvidia_utils.cuh" + +#include +#include +#include namespace llaisys::ops::nvidia { +template +__global__ void self_attention_kernel(T *out, const T *q, const T *k, const T *v, float scale, size_t qlen, size_t kvlen, + size_t nh, size_t nkvh, size_t hd, size_t dv) { + size_t i = static_cast(blockIdx.x); + size_t h = static_cast(blockIdx.y); + if (i >= qlen || h >= nh) { + return; + } + size_t kv_group = nh / nkvh; + size_t kvh = h / kv_group; + const T *q_ptr = q + (i * nh + h) * hd; + extern __shared__ float shared[]; + float *smax = shared; + float *ssum = shared + blockDim.x; + int tid = threadIdx.x; + float local_max = -INFINITY; + int64_t limit = static_cast(i) + static_cast(kvlen) - static_cast(qlen); + for (size_t j = static_cast(tid); j < kvlen; j += static_cast(blockDim.x)) { + if (static_cast(j) > limit) { + continue; + } + const T *k_ptr = k + (j * nkvh + kvh) * hd; + float acc = 0.0f; + for (size_t d = 0; d < hd; ++d) { + acc += detail::to_float(q_ptr[d]) * detail::to_float(k_ptr[d]); + } + acc *= scale; + if (acc > local_max) { + local_max = acc; + } + } + smax[tid] = local_max; + __syncthreads(); + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < static_cast(s)) { + float other = smax[tid + s]; + if (other > smax[tid]) { + smax[tid] = other; + } + } + 
__syncthreads(); + } + float max_score = smax[0]; + float local_sum = 0.0f; + for (size_t j = static_cast(tid); j < kvlen; j += static_cast(blockDim.x)) { + if (static_cast(j) > limit) { + continue; + } + const T *k_ptr = k + (j * nkvh + kvh) * hd; + float acc = 0.0f; + for (size_t d = 0; d < hd; ++d) { + acc += detail::to_float(q_ptr[d]) * detail::to_float(k_ptr[d]); + } + acc *= scale; + local_sum += expf(acc - max_score); + } + ssum[tid] = local_sum; + __syncthreads(); + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < static_cast(s)) { + ssum[tid] += ssum[tid + s]; + } + __syncthreads(); + } + float sum_score = ssum[0]; + if (sum_score <= 0.0f) { + sum_score = 1.0f; + } + for (size_t d = static_cast(tid); d < dv; d += static_cast(blockDim.x)) { + float acc = 0.0f; + for (size_t j = 0; j < kvlen; ++j) { + if (static_cast(j) > limit) { + continue; + } + const T *k_ptr = k + (j * nkvh + kvh) * hd; + float score = 0.0f; + for (size_t kd = 0; kd < hd; ++kd) { + score += detail::to_float(q_ptr[kd]) * detail::to_float(k_ptr[kd]); + } + score *= scale; + float w = expf(score - max_score) / sum_score; + const T *v_ptr = v + (j * nkvh + kvh) * dv; + acc += w * detail::to_float(v_ptr[d]); + } + out[(i * nh + h) * dv + d] = detail::from_float(acc); + } +} + void self_attention(std::byte *out, const std::byte *q, const std::byte *k, const std::byte *v, float scale, llaisysDataType_t type, size_t qlen, size_t kvlen, size_t nh, size_t nkvh, size_t hd, size_t dv) { auto &runtime = llaisys::core::context().runtime(); - size_t elem_size = llaisys::utils::dsize(type); - size_t q_bytes = qlen * nh * hd * elem_size; - size_t k_bytes = kvlen * nkvh * hd * elem_size; - size_t v_bytes = kvlen * nkvh * dv * elem_size; - size_t out_bytes = qlen * nh * dv * elem_size; - - auto host_q = static_cast(runtime.api()->malloc_host(q_bytes)); - auto host_k = static_cast(runtime.api()->malloc_host(k_bytes)); - auto host_v = static_cast(runtime.api()->malloc_host(v_bytes)); - 
auto host_out = static_cast(runtime.api()->malloc_host(out_bytes)); - - runtime.api()->memcpy_sync(host_q, q, q_bytes, LLAISYS_MEMCPY_D2H); - runtime.api()->memcpy_sync(host_k, k, k_bytes, LLAISYS_MEMCPY_D2H); - runtime.api()->memcpy_sync(host_v, v, v_bytes, LLAISYS_MEMCPY_D2H); - - cpu::self_attention(host_out, host_q, host_k, host_v, scale, type, qlen, kvlen, nh, nkvh, hd, dv); - runtime.api()->memcpy_sync(out, host_out, out_bytes, LLAISYS_MEMCPY_H2D); - - runtime.api()->free_host(host_q); - runtime.api()->free_host(host_k); - runtime.api()->free_host(host_v); - runtime.api()->free_host(host_out); + dim3 grid(static_cast(qlen), static_cast(nh), 1); + int threads = 256; + size_t shared_bytes = static_cast(threads) * sizeof(float) * 2; + auto stream = reinterpret_cast(runtime.stream()); + switch (type) { + case LLAISYS_DTYPE_F32: + self_attention_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + scale, qlen, kvlen, nh, nkvh, hd, dv); + break; + case LLAISYS_DTYPE_BF16: + self_attention_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + scale, qlen, kvlen, nh, nkvh, hd, dv); + break; + case LLAISYS_DTYPE_F16: + self_attention_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + scale, qlen, kvlen, nh, nkvh, hd, dv); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + detail::checkCuda(cudaGetLastError()); + detail::checkCuda(cudaStreamSynchronize(stream)); } } From 4643f0a62a127dd2db85e675ce7cea093cd06136 Mon Sep 17 00:00:00 2001 From: leninist1 <282516536@qq.com> Date: Sun, 15 Mar 2026 11:24:52 +0800 Subject: [PATCH 14/14] fix: build error on windows. 
--- src/llaisys/qwen2.cc | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/llaisys/qwen2.cc b/src/llaisys/qwen2.cc index 52b25db9..286ec4ea 100644 --- a/src/llaisys/qwen2.cc +++ b/src/llaisys/qwen2.cc @@ -69,6 +69,20 @@ size_t ceil_div(size_t a, size_t b) { } size_t read_env_size_t(const char *name, size_t default_value) { +#ifdef _WIN32 + char *value = nullptr; + size_t len = 0; + if (_dupenv_s(&value, &len, name) != 0 || value == nullptr) { + return default_value; + } + char *end = nullptr; + unsigned long long parsed = std::strtoull(value, &end, 10); + std::free(value); + if (end == value || *end != '\0') { + return default_value; + } + return static_cast(parsed); +#else const char *value = std::getenv(name); if (!value) { return default_value; @@ -79,6 +93,7 @@ size_t read_env_size_t(const char *name, size_t default_value) { return default_value; } return static_cast(parsed); +#endif } uint64_t hash_mix(uint64_t h, int64_t v, uint64_t base) {