From b3f96ac17d1dec7662358d4365276498ebbea8cb Mon Sep 17 00:00:00 2001 From: leninist1 <282516536@qq.com> Date: Mon, 19 Jan 2026 22:30:45 +0800 Subject: [PATCH 01/14] tensor task --- src/tensor/tensor.cpp | 182 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 174 insertions(+), 8 deletions(-) diff --git a/src/tensor/tensor.cpp b/src/tensor/tensor.cpp index 2f594bb6..6a7f3012 100644 --- a/src/tensor/tensor.cpp +++ b/src/tensor/tensor.cpp @@ -164,27 +164,193 @@ void Tensor::debug() const { } bool Tensor::isContiguous() const { - TO_BE_IMPLEMENTED(); + //first initial expected stride(from lastdim to 0) + ptrdiff_t expected_stride = 1; + for(int i = (int)ndim()-1; i >= 0; i--) + { + size_t current_shape = shape()[i]; + if(current_shape == 0) return true; + if(current_shape > 1) + { + if(strides()[i] != expected_stride) + return false; + expected_stride *= current_shape; + } + } return true; } tensor_t Tensor::permute(const std::vector &order) const { - TO_BE_IMPLEMENTED(); - return std::shared_ptr(new Tensor(_meta, _storage)); + // valid order.size() == this->ndim() + CHECK_ARGUMENT(order.size() == this->ndim(), "Permute: order size mismatch."); + std::vector seen(order.size(), false); + std::vector new_shape(order.size()); + std::vector new_strides(order.size()); + for(int i = 0; i < (int)order.size(); ++i) + { + size_t dim = order[i]; + CHECK_ARGUMENT(dim < this->ndim(), "Permute: order index out of range."); + CHECK_ARGUMENT(!seen[dim], "Permute: order index duplicated."); + seen[dim] = true; + new_shape[i] = this->shape()[dim]; + new_strides[i] = this->strides()[dim]; + } + TensorMeta meta{this->dtype(), new_shape, new_strides}; + return std::shared_ptr(new Tensor(meta, _storage, _offset)); } tensor_t Tensor::view(const std::vector &shape) const { - TO_BE_IMPLEMENTED(); - return std::shared_ptr(new Tensor(_meta, _storage)); + + // calculate new_shape's numel, must be the same as the old + size_t new_numel = 1; + for(auto s : shape) + new_numel *= s; + 
CHECK_ARGUMENT(new_numel == this->numel(), "View: total elements mismatch."); + std::vector new_strides(shape.size(),0); + + //filter size = 1 shape in old_shape + std::vector old_shape; + std::vector old_strides; + old_shape.reserve(this->shape().size()); + old_strides.reserve(this->strides().size()); + for(size_t i = 0; i < this->shape().size(); ++i) + { + if(this->shape()[i] > 1) + { + old_shape.push_back(this->shape()[i]); + old_strides.push_back(this->strides()[i]); + } + } + + // filter size = 1 shape in new_shape + std::vector new_shape; + std::vector new_indices; + new_shape.reserve(shape.size()); + new_indices.reserve(shape.size()); + for(size_t i = 0; i < shape.size(); ++i) + { + if(shape[i] > 1) + { + new_shape.push_back(shape[i]); + new_indices.push_back(i); + } + } + + // if no size > 1 in old_shape or new_shape + // no need to spilt blocks, generate continual new_strides + if(old_shape.empty() || new_shape.empty()) + { + for(int i = (int)shape.size()-1; i>=0; --i) + { + if(i+1 < (int)shape.size()) + { + new_strides[i] = new_strides[i+1]*shape[i+1]; + } + else + { + new_strides[i] = 1; + } + } + TensorMeta meta{this->dtype(), shape, new_strides}; + return std::shared_ptr(new Tensor(meta, _storage)); + } + // spilt blocks + struct Block + { + size_t numel; + ptrdiff_t inner_stride; + }; + + std::vector blocks; + for(int i = (int)old_shape.size() - 1; i >= 0; --i) + { + if(blocks.empty()) + { + blocks.push_back({old_shape[i], old_strides[i]}); + } + else + { + //continual add to current block + if(old_strides[i] == (ptrdiff_t)old_shape[i+1] * old_strides[i+1]) + { + blocks.back().numel *= old_shape[i]; + } + //break, spilt new block + else + { + blocks.push_back({old_shape[i], old_strides[i]}); + } + } + } + // for every block, judge which dims can cover it + size_t dim_pos = 0; + for(int b = (int)blocks.size() - 1; b >= 0; --b) + { + const auto &block = blocks[b]; + size_t start = dim_pos; + size_t prod = 1; + while(dim_pos < new_shape.size() && prod < 
block.numel) + { + prod *= new_shape[dim_pos]; + dim_pos++; + } + CHECK_ARGUMENT(prod == block.numel, "View: shape is not compatible with storage."); + // set new_strides for this block + ptrdiff_t stride = block.inner_stride; + for(int j = (int)dim_pos - 1; j >= (int)start; --j) + { + new_strides[new_indices[j]] = stride; + stride *= (ptrdiff_t)new_shape[j]; + } + } + CHECK_ARGUMENT(dim_pos == new_shape.size(), "View: shape is not compatible with storage."); + // for size = 1 dims, set new_strides[i] = new_strides[i+1] * shape[i+1] + for(int i = (int)shape.size() - 1; i >= 0; --i) + { + if(shape[i] == 1) + { + if(i + 1 < (int)shape.size()) + { + new_strides[i] = new_strides[i+1] * shape[i+1]; + } + else + { + new_strides[i] = 1; + } + } + } + TensorMeta meta{this->dtype(), shape, new_strides}; + return std::shared_ptr(new Tensor(meta, _storage, _offset)); } tensor_t Tensor::slice(size_t dim, size_t start, size_t end) const { - TO_BE_IMPLEMENTED(); - return std::shared_ptr(new Tensor(_meta, _storage)); + CHECK_ARGUMENT(dim < this->ndim(), "Slice: dim out of range."); + CHECK_ARGUMENT(start <= end, "Slice: start must be less than or equal to end."); + CHECK_ARGUMENT(end <= this->shape()[dim], "Slice: end out of range."); + std::vector new_shape = this->shape(); + new_shape[dim] = end - start; + std::vector new_strides = this->strides(); + size_t byte_offset = _offset + start * (size_t)new_strides[dim] * this->elementSize(); + TensorMeta meta{this->dtype(), new_shape, new_strides}; + return std::shared_ptr(new Tensor(meta, _storage, byte_offset)); } void Tensor::load(const void *src_) { - TO_BE_IMPLEMENTED(); + CHECK_ARGUMENT(src_ != nullptr, "Load: src is null."); + size_t size = this->numel() * this->elementSize(); + core::context().setDevice(this->deviceType(), this->deviceId()); + if (!_storage || _offset + size > _storage->size()) { + if (this->deviceType() == LLAISYS_DEVICE_CPU) { + _storage = core::context().runtime().allocateHostStorage(size); + } else { + 
_storage = core::context().runtime().allocateDeviceStorage(size); + } + _offset = 0; + } + ASSERT(this->isContiguous(), "Load: tensor must be contiguous."); + core::context().runtime().api()->memcpy_sync( + this->data(), src_, size, + this->deviceType() == LLAISYS_DEVICE_CPU ? LLAISYS_MEMCPY_H2H : LLAISYS_MEMCPY_H2D); } tensor_t Tensor::contiguous() const { From ad3ead9d72a7bc7919d6496e178b64b5f38fe05a Mon Sep 17 00:00:00 2001 From: leninist1 <282516536@qq.com> Date: Mon, 19 Jan 2026 22:47:26 +0800 Subject: [PATCH 02/14] tensor test --- src/tensor/tensor.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tensor/tensor.cpp b/src/tensor/tensor.cpp index 6a7f3012..69fea78a 100644 --- a/src/tensor/tensor.cpp +++ b/src/tensor/tensor.cpp @@ -165,6 +165,7 @@ void Tensor::debug() const { bool Tensor::isContiguous() const { //first initial expected stride(from lastdim to 0) + //test ptrdiff_t expected_stride = 1; for(int i = (int)ndim()-1; i >= 0; i--) { From 226444adc88b47fb710538bed8f1563e6697e0e0 Mon Sep 17 00:00:00 2001 From: leninist1 <282516536@qq.com> Date: Sat, 31 Jan 2026 17:10:29 +0800 Subject: [PATCH 03/14] complete ops: argmax, linear, embedding. 
--- .gitignore | 4 +- =3.1.0 | 2 + src/ops/argmax/cpu/argmax_cpu.cpp | 47 ++++++++++++++++ src/ops/argmax/cpu/argmax_cpu.hpp | 8 +++ src/ops/argmax/op.cpp | 31 ++++++++++- src/ops/embedding/cpu/embedding_cpu.cpp | 33 ++++++++++++ src/ops/embedding/cpu/embedding_cpu.hpp | 8 +++ src/ops/embedding/op.cpp | 34 +++++++++++- src/ops/linear/cpu/linear_cpu.cpp | 71 +++++++++++++++++++++++++ src/ops/linear/cpu/linear_cpu.hpp | 10 ++++ src/ops/linear/op.cpp | 47 +++++++++++++++- src/ops/rms_norm/cpu/rms_norm_cpu.cpp | 0 src/ops/rms_norm/cpu/rms_norm_cpu.hpp | 0 13 files changed, 290 insertions(+), 5 deletions(-) create mode 100644 =3.1.0 create mode 100644 src/ops/argmax/cpu/argmax_cpu.cpp create mode 100644 src/ops/argmax/cpu/argmax_cpu.hpp create mode 100644 src/ops/embedding/cpu/embedding_cpu.cpp create mode 100644 src/ops/embedding/cpu/embedding_cpu.hpp create mode 100644 src/ops/linear/cpu/linear_cpu.cpp create mode 100644 src/ops/linear/cpu/linear_cpu.hpp create mode 100644 src/ops/rms_norm/cpu/rms_norm_cpu.cpp create mode 100644 src/ops/rms_norm/cpu/rms_norm_cpu.hpp diff --git a/.gitignore b/.gitignore index e38cf574..e92328ca 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +#models +/models/ # Xmake cache .xmake/ build/ @@ -87,4 +89,4 @@ htmlcov/ # Windows Thumbs.db ehthumbs.db -desktop.ini \ No newline at end of file +desktop.ini diff --git a/=3.1.0 b/=3.1.0 new file mode 100644 index 00000000..8151c7c6 --- /dev/null +++ b/=3.1.0 @@ -0,0 +1,2 @@ +Defaulting to user installation because normal site-packages is not writeable +Requirement already satisfied: jinja2 in /usr/lib/python3/dist-packages (3.0.3) diff --git a/src/ops/argmax/cpu/argmax_cpu.cpp b/src/ops/argmax/cpu/argmax_cpu.cpp new file mode 100644 index 00000000..ca0eb53d --- /dev/null +++ b/src/ops/argmax/cpu/argmax_cpu.cpp @@ -0,0 +1,47 @@ +#include "argmax_cpu.hpp" + +#include "../../../utils.hpp" + +#include + +template +static void argmax_(std::byte *max_idx, std::byte *max_val, const std::byte 
*vals, size_t numel) { + size_t max_i = 0; + const T *v = reinterpret_cast(vals); + if constexpr (std::is_same_v || std::is_same_v) { + float max_f = llaisys::utils::cast(v[0]); + for (size_t i = 1; i < numel; ++i) { + float cur = llaisys::utils::cast(v[i]); + if (cur > max_f) { + max_i = i; + max_f = cur; + } + } + *reinterpret_cast(max_val) = llaisys::utils::cast(max_f); + } else { + T max_v = v[0]; + for (size_t i = 1; i < numel; ++i) { + if (v[i] > max_v) { + max_i = i; + max_v = v[i]; + } + } + *reinterpret_cast(max_val) = max_v; + } + *reinterpret_cast(max_idx) = static_cast(max_i); +} + +namespace llaisys::ops::cpu { +void argmax(std::byte *max_idx, std::byte *max_val, const std::byte *vals, llaisysDataType_t type, size_t numel) { + switch (type) { + case LLAISYS_DTYPE_F32: + return argmax_(max_idx, max_val, vals, numel); + case LLAISYS_DTYPE_BF16: + return argmax_(max_idx, max_val, vals, numel); + case LLAISYS_DTYPE_F16: + return argmax_(max_idx, max_val, vals, numel); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} \ No newline at end of file diff --git a/src/ops/argmax/cpu/argmax_cpu.hpp b/src/ops/argmax/cpu/argmax_cpu.hpp new file mode 100644 index 00000000..1f3224cb --- /dev/null +++ b/src/ops/argmax/cpu/argmax_cpu.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void argmax(std::byte *max_idx, std::byte *max_val, const std::byte *vals, llaisysDataType_t type, size_t numel); +} \ No newline at end of file diff --git a/src/ops/argmax/op.cpp b/src/ops/argmax/op.cpp index 6dc37d42..034bc431 100644 --- a/src/ops/argmax/op.cpp +++ b/src/ops/argmax/op.cpp @@ -1,7 +1,36 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/argmax_cpu.hpp" + namespace llaisys::ops { void argmax(tensor_t max_idx, tensor_t max_val, tensor_t vals) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(max_idx, max_val, vals); + ASSERT(vals->ndim() == 1, "Argmax: vals must be 
// Gather rows from the weight table: for each of the `index_len` int64 ids in
// `index`, copy the corresponding row of `weight` (row_len elements of T) into
// the next slot of `out`. Ids are not bounds-checked here; callers validate.
template <typename T>
static void embedding_(std::byte *out, const std::byte *index, const std::byte *weight, size_t index_len, size_t row_len) {
    const int64_t *ids = reinterpret_cast<const int64_t *>(index);
    const T *table = reinterpret_cast<const T *>(weight);
    T *dst = reinterpret_cast<T *>(out);
    for (size_t r = 0; r < index_len; ++r) {
        const T *src = table + static_cast<size_t>(ids[r]) * row_len;
        const T *src_end = src + row_len;
        while (src != src_end) {
            *dst++ = *src++;
        }
    }
}
LLAISYS_DTYPE_F32: + return embedding_(out, index, weight, index_len, row_len); + case LLAISYS_DTYPE_BF16: + return embedding_(out, index, weight, index_len, row_len); + case LLAISYS_DTYPE_F16: + return embedding_(out, index, weight, index_len, row_len); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} \ No newline at end of file diff --git a/src/ops/embedding/cpu/embedding_cpu.hpp b/src/ops/embedding/cpu/embedding_cpu.hpp new file mode 100644 index 00000000..9aec5020 --- /dev/null +++ b/src/ops/embedding/cpu/embedding_cpu.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void embedding(std::byte *out, const std::byte *index, const std::byte *weight, llaisysDataType_t type, size_t index_len, size_t row_len); +} \ No newline at end of file diff --git a/src/ops/embedding/op.cpp b/src/ops/embedding/op.cpp index 84b9a5d0..3c33c5d5 100644 --- a/src/ops/embedding/op.cpp +++ b/src/ops/embedding/op.cpp @@ -1,7 +1,39 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/embedding_cpu.hpp" namespace llaisys::ops { void embedding(tensor_t out, tensor_t index, tensor_t weight) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, index, weight); + ASSERT(index->ndim() == 1, "Embedding: index must be 1D."); + ASSERT(weight->ndim() == 2, "Embedding: weight must be 2D."); + ASSERT(out->ndim() == 2, "Embedding: out must be 2D."); + ASSERT(index->dtype() == LLAISYS_DTYPE_I64, "Embedding: index must be I64."); + CHECK_SAME_DTYPE(out->dtype(), weight->dtype()); + ASSERT(out->shape()[0] == index->shape()[0], "Embedding: out.shape[0] must equal index.shape[0]."); + ASSERT(out->shape()[1] == weight->shape()[1], "Embedding: out.shape[1] must equal weight.shape[1]."); + ASSERT(out->isContiguous() && index->isContiguous() && weight->isContiguous(), "Embedding: all tensors must be contiguous."); + + size_t index_len = index->shape()[0]; + size_t row_len = weight->shape()[1]; + + if 
(weight->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::embedding(out->data(), index->data(), weight->data(), out->dtype(), index_len, row_len); + } + + llaisys::core::context().setDevice(weight->deviceType(), weight->deviceId()); + + switch (weight->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::embedding(out->data(), index->data(), weight->data(), out->dtype(), index_len, row_len); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + TO_BE_IMPLEMENTED(); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/linear/cpu/linear_cpu.cpp b/src/ops/linear/cpu/linear_cpu.cpp new file mode 100644 index 00000000..520d8b0e --- /dev/null +++ b/src/ops/linear/cpu/linear_cpu.cpp @@ -0,0 +1,71 @@ +#include "linear_cpu.hpp" + +#include "../../../utils.hpp" + +template +static void linear_(T *out, const T *in, const T *W, const T *bias, + size_t batch, size_t in_dim, size_t out_dim) +{ + for(size_t i = 0; i < batch; ++i) + { + const T *in_ = in + i*in_dim; + T *out_ = out + i*out_dim; + for(size_t j = 0; j < out_dim; ++j) + { + const T *weight_ = W + j*in_dim; + if constexpr (std::is_same_v + || std::is_same_v) + { + float acc = 0.0f; + for(size_t k = 0; k < in_dim; ++k) + { + acc += llaisys::utils::cast(in_[k]) * + llaisys::utils::cast(weight_[k]); + } + if(bias) + acc += llaisys::utils::cast(bias[j]); + out_[j] = llaisys::utils::cast(acc); + } + else + { + T acc = static_cast(0); + for(size_t k = 0; k < in_dim; ++k) + { + acc += in_[k] * weight_[k]; + } + if(bias) + acc += bias[j]; + out_[j] = acc; + } + } + } + +} +namespace llaisys::ops::cpu { +void linear(std::byte *out, const std::byte *in, const std::byte *W, + const std::byte *bias, llaisysDataType_t type, size_t batch, + size_t in_dim, size_t out_dim) { + switch (type) { + case LLAISYS_DTYPE_F32: + return linear_(reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(W), + reinterpret_cast(bias), + batch, in_dim, out_dim); + 
case LLAISYS_DTYPE_BF16: + return linear_(reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(W), + reinterpret_cast(bias), + batch, in_dim, out_dim); + case LLAISYS_DTYPE_F16: + return linear_(reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(W), + reinterpret_cast(bias), + batch, in_dim, out_dim); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} diff --git a/src/ops/linear/cpu/linear_cpu.hpp b/src/ops/linear/cpu/linear_cpu.hpp new file mode 100644 index 00000000..a6f4e279 --- /dev/null +++ b/src/ops/linear/cpu/linear_cpu.hpp @@ -0,0 +1,10 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void linear(std::byte *out, const std::byte *in, const std::byte *W, + const std::byte *bias, llaisysDataType_t type, size_t batch, + size_t in_dim, size_t out_dim); +} diff --git a/src/ops/linear/op.cpp b/src/ops/linear/op.cpp index 97d1f865..9585068a 100644 --- a/src/ops/linear/op.cpp +++ b/src/ops/linear/op.cpp @@ -1,7 +1,50 @@ #include "op.hpp" - +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/linear_cpu.hpp" namespace llaisys::ops { void linear(tensor_t out, tensor_t in, tensor_t weight, tensor_t bias) { - TO_BE_IMPLEMENTED(); + ASSERT(out->isContiguous() && in->isContiguous() + && weight->isContiguous(), "out, in, weight must be contiguous tensors"); + + ASSERT(out->ndim() == 2 && in->ndim() == 2 && + weight->ndim() == 2, "out, in, weight must be 2D tensors"); + ASSERT(in->shape()[1] == weight->shape()[1],"in.shape[1] must be equal to weight.shape[1]"); + ASSERT(out->shape()[1] == weight->shape()[0], "out.shape[1] must be equal to weight.shape[0]"); + CHECK_SAME_DTYPE(out->dtype(), in->dtype(), weight->dtype()); + if(bias) + { + CHECK_SAME_DEVICE(out, bias, weight); + ASSERT(bias->ndim() == 1, "bias must be 1D tensor"); + ASSERT(bias->isContiguous(), "bias must be contiguous tensor"); + ASSERT(bias->shape()[0] == out->shape()[1], "bias.shape[0] must be equal to 
out.shape[1]"); + CHECK_SAME_DTYPE(out->dtype(), bias->dtype()); + } + + size_t batch = out->shape()[0]; + size_t in_dim = in->shape()[1]; + size_t out_dim = out->shape()[1]; + + const std::byte *bias_data = bias ? bias->data():nullptr; + + if (weight->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::linear(out->data(), in->data(), + weight->data(), bias_data, bias->dtype(), batch, in_dim, out_dim); + } + + llaisys::core::context().setDevice(weight->deviceType(), weight->deviceId()); + + switch (weight->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::linear(out->data(), in->data(), + weight->data(), bias_data, bias->dtype(), batch, in_dim, out_dim); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + TO_BE_IMPLEMENTED(); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/rms_norm/cpu/rms_norm_cpu.cpp b/src/ops/rms_norm/cpu/rms_norm_cpu.cpp new file mode 100644 index 00000000..e69de29b diff --git a/src/ops/rms_norm/cpu/rms_norm_cpu.hpp b/src/ops/rms_norm/cpu/rms_norm_cpu.hpp new file mode 100644 index 00000000..e69de29b From 58c37d8f40225de3064382802bab4b89379491e4 Mon Sep 17 00:00:00 2001 From: leninist1 <282516536@qq.com> Date: Sat, 31 Jan 2026 19:56:48 +0800 Subject: [PATCH 04/14] complete ops: rms_norm and rope --- src/ops/rms_norm/cpu/rms_norm_cpu.cpp | 77 +++++++++++++++++++++++++++ src/ops/rms_norm/cpu/rms_norm_cpu.hpp | 10 ++++ src/ops/rms_norm/op.cpp | 34 +++++++++++- src/ops/rope/cpu/rope_cpu.cpp | 77 +++++++++++++++++++++++++++ src/ops/rope/cpu/rope_cpu.hpp | 10 ++++ src/ops/rope/op.cpp | 35 +++++++++++- 6 files changed, 239 insertions(+), 4 deletions(-) create mode 100644 src/ops/rope/cpu/rope_cpu.cpp create mode 100644 src/ops/rope/cpu/rope_cpu.hpp diff --git a/src/ops/rms_norm/cpu/rms_norm_cpu.cpp b/src/ops/rms_norm/cpu/rms_norm_cpu.cpp index e69de29b..cd828d6c 100644 --- a/src/ops/rms_norm/cpu/rms_norm_cpu.cpp +++ b/src/ops/rms_norm/cpu/rms_norm_cpu.cpp @@ -0,0 +1,77 @@ 
+#include "rms_norm_cpu.hpp" +#include +#include "../../../utils.hpp" + +template +static void rms_norm_(T *out, const T *in, const T *W, + const float eps, size_t batch, size_t in_dim) +{ + //for each row in in_tensor + for(size_t i = 0; i < batch; ++i) + { + const T *in_ = in + i*in_dim; + T *out_ = out + i*in_dim; + float std_dev = 0.0f; + //first calculate the standard deviation of each row + if constexpr (std::is_same_v + || std::is_same_v) + { + for(size_t j = 0; j < in_dim; ++j) + { + std_dev += llaisys::utils::cast(in_[j]) * + llaisys::utils::cast(in_[j]); + } + std_dev = std::sqrt(std_dev / (int)in_dim + eps); + } + else + { + for(size_t j = 0; j < in_dim; ++j) + { + std_dev += in_[j] * in_[j]; + } + std_dev = std::sqrt(std_dev / in_dim + eps); + } + //then normalize each element in the row + for(size_t j = 0; j < in_dim; ++j) + { + if constexpr (std::is_same_v + || std::is_same_v) + { + out_[j] = llaisys::utils::cast( + llaisys::utils::cast(in_[j]) * + llaisys::utils::cast(W[j]) / std_dev); + } + else + { + out_[j] = in_[j] * W[j] / std_dev; + } + } + } + +} +namespace llaisys::ops::cpu { +void rms_norm(std::byte *out, const std::byte *in, + const std::byte *W, const float eps, + llaisysDataType_t type, + size_t batch, size_t in_dim) { + switch (type) { + case LLAISYS_DTYPE_F32: + return rms_norm_(reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(W), + eps, batch, in_dim); + case LLAISYS_DTYPE_BF16: + return rms_norm_(reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(W), + eps, batch, in_dim); + case LLAISYS_DTYPE_F16: + return rms_norm_(reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(W), + eps, batch, in_dim); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} diff --git a/src/ops/rms_norm/cpu/rms_norm_cpu.hpp b/src/ops/rms_norm/cpu/rms_norm_cpu.hpp index e69de29b..dfe43083 100644 --- a/src/ops/rms_norm/cpu/rms_norm_cpu.hpp +++ b/src/ops/rms_norm/cpu/rms_norm_cpu.hpp @@ -0,0 +1,10 @@ +#pragma 
once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void rms_norm(std::byte *out, const std::byte *in, + const std::byte *W, const float eps, + llaisysDataType_t type, size_t batch, size_t in_dim); +} diff --git a/src/ops/rms_norm/op.cpp b/src/ops/rms_norm/op.cpp index 529553d9..d57c96cf 100644 --- a/src/ops/rms_norm/op.cpp +++ b/src/ops/rms_norm/op.cpp @@ -1,7 +1,37 @@ #include "op.hpp" - +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/rms_norm_cpu.hpp" namespace llaisys::ops { void rms_norm(tensor_t out, tensor_t in, tensor_t weight, float eps) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, in, weight); + ASSERT(out->ndim() == 2 && in->ndim() == 2 && + weight->ndim() == 1, "out, in must be 2D tensors, while weight must be 1D tensor"); + ASSERT(out->isContiguous() && in->isContiguous() + && weight->isContiguous(), "All tensors must be contiguous"); + ASSERT(out->shape() == in->shape(), "Output shape must be equal to input shape"); + ASSERT(weight->shape()[0] == in->shape()[1], "Weight shape must be equal to input shape"); + size_t batch = out->shape()[0]; + size_t in_dim = in->shape()[1]; + + if (weight->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::rms_norm(out->data(), in->data(), + weight->data(), eps, out->dtype(), batch, in_dim); + } + + llaisys::core::context().setDevice(weight->deviceType(), weight->deviceId()); + + switch (weight->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::rms_norm(out->data(), in->data(), + weight->data(), eps, out->dtype(), batch, in_dim); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + TO_BE_IMPLEMENTED(); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/rope/cpu/rope_cpu.cpp b/src/ops/rope/cpu/rope_cpu.cpp new file mode 100644 index 00000000..e25c3e74 --- /dev/null +++ b/src/ops/rope/cpu/rope_cpu.cpp @@ -0,0 +1,77 @@ +#include "rope_cpu.hpp" +#include +#include "../../../utils.hpp" 
+ +template +void rope_(T *out, const T *in, const int64_t *pos_ids, const float theta, + size_t seq_len, size_t n_heads, size_t head_dim) +{ + //for each s in seq_len + for(size_t s = 0; s < seq_len; ++s) + { + //for each h in n_heads + for(size_t h = 0; h < n_heads; ++h) + { + //for each tensor in head_dim + for(size_t j = 0; j < head_dim/2; ++j) + { + size_t p = pos_ids[s]; + // cal angle + float angle = static_cast(p) / std::pow(theta, 2.0f*j / head_dim); + // cal the base + const T *in_ = in + (s*n_heads + h) * head_dim; + T *out_ = out + (s*n_heads + h) * head_dim; + //cal sin & cos val + float cosine = std::cos(angle); + float sine = std::sin(angle); + //find a and b + T a = in_[j]; + T b = in_[j + head_dim/2]; + //rope + if constexpr(std::is_same_v + || std::is_same_v) + { + out_[j] = llaisys::utils::cast( + llaisys::utils::cast(a) * cosine - + llaisys::utils::cast(b) * sine + ); + out_[j + head_dim/2] = llaisys::utils::cast( + llaisys::utils::cast(a) * sine + + llaisys::utils::cast(b) * cosine + ); + } + else + { + out_[j] = a * cosine - b * sine; + out_[j + head_dim/2] = a * sine + b * cosine; + } + } + } + } +} +namespace llaisys::ops::cpu { +void rope(std::byte *out, const std::byte *in, + const std::byte *pos_ids, const float theta, + llaisysDataType_t type, + size_t seq_len, size_t n_heads, size_t head_dim) { + switch (type) { + case LLAISYS_DTYPE_F32: + return rope_(reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(pos_ids), theta, + seq_len, n_heads, head_dim); + case LLAISYS_DTYPE_BF16: + return rope_(reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(pos_ids), theta, + seq_len, n_heads, head_dim); + case LLAISYS_DTYPE_F16: + return rope_(reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(pos_ids), theta, + seq_len, n_heads, head_dim); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} diff --git a/src/ops/rope/cpu/rope_cpu.hpp b/src/ops/rope/cpu/rope_cpu.hpp new file mode 100644 index 
00000000..77fb59db --- /dev/null +++ b/src/ops/rope/cpu/rope_cpu.hpp @@ -0,0 +1,10 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void rope(std::byte *out, const std::byte *in, + const std::byte *pos_ids,const float theta, llaisysDataType_t type, + size_t seq_len, size_t n_heads, size_t head_dim); +} \ No newline at end of file diff --git a/src/ops/rope/op.cpp b/src/ops/rope/op.cpp index d60dbe64..d3cab559 100644 --- a/src/ops/rope/op.cpp +++ b/src/ops/rope/op.cpp @@ -1,7 +1,38 @@ #include "op.hpp" - +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/rope_cpu.hpp" namespace llaisys::ops { void rope(tensor_t out, tensor_t in, tensor_t pos_ids, float theta) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, in, pos_ids); + ASSERT(out->isContiguous() && in->isContiguous() + && pos_ids->isContiguous(), "All tensors must be Contiguous"); + ASSERT(out->shape() == in->shape(), + "Output shape must be equal to Input shape"); + ASSERT(pos_ids->shape()[0] == out->shape()[0], + "Position IDs shape[0] must be equal to Output shape[0]"); + size_t seq_len = out->shape()[0]; + size_t n_heads = out->shape()[1]; + size_t head_dim = out->shape()[2]; + + if (pos_ids->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::rope(out->data(), in->data(), + pos_ids->data(), theta, out->dtype(), seq_len, n_heads, head_dim); + } + + llaisys::core::context().setDevice(pos_ids->deviceType(), pos_ids->deviceId()); + + switch (pos_ids->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::rope(out->data(), in->data(), + pos_ids->data(), theta, out->dtype(), seq_len, n_heads, head_dim); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + TO_BE_IMPLEMENTED(); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops From 0aa909507ac294e6b2df55c2148b21ab00e87bea Mon Sep 17 00:00:00 2001 From: leninist1 <282516536@qq.com> Date: Sun, 1 Feb 2026 20:47:30 +0800 Subject: [PATCH 05/14] 
// Copy every element of an ndim-dimensional tensor from `in` to `out`, where
// the two buffers may use different element-unit strides (e.g. materializing a
// permuted view). Each flat element id is decoded into per-dimension
// coordinates — row-major, innermost dimension varying fastest — and mapped
// through both stride arrays.
template <typename T>
static void rearrange_(T *out, const T *in, const size_t *shape, const ptrdiff_t *out_strides,
                       const ptrdiff_t *in_strides, const size_t numel, size_t ndim) {
    for (size_t flat = 0; flat < numel; ++flat) {
        ptrdiff_t dst_off = 0;
        ptrdiff_t src_off = 0;
        size_t rest = flat;
        for (size_t d = ndim; d-- > 0;) {
            const ptrdiff_t coord = static_cast<ptrdiff_t>(rest % shape[d]);
            rest /= shape[d];
            src_off += coord * in_strides[d];
            dst_off += coord * out_strides[d];
        }
        out[dst_off] = in[src_off];
    }
}
LLAISYS_DTYPE_F32: + return rearrange_(reinterpret_cast(out), + reinterpret_cast(in), + shape, out_strides, in_strides, numel, ndim); + case LLAISYS_DTYPE_BF16: + return rearrange_(reinterpret_cast(out), + reinterpret_cast(in), + shape, out_strides, in_strides, numel, ndim); + case LLAISYS_DTYPE_F16: + return rearrange_(reinterpret_cast(out), + reinterpret_cast(in), + shape, out_strides, in_strides, numel, ndim); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace llaisys::ops::cpu diff --git a/src/ops/rearrange/cpu/rearrange_cpu.hpp b/src/ops/rearrange/cpu/rearrange_cpu.hpp new file mode 100644 index 00000000..e8d4d5a0 --- /dev/null +++ b/src/ops/rearrange/cpu/rearrange_cpu.hpp @@ -0,0 +1,9 @@ +#pragma once +#include "llaisys.h" +#include + +namespace llaisys::ops::cpu { +void rearrange(std::byte *out, const std::byte *in, llaisysDataType_t type, + const size_t *shape, const ptrdiff_t *out_strides, + const ptrdiff_t *in_strides, size_t numel, size_t ndim); +} \ No newline at end of file diff --git a/src/ops/rearrange/op.cpp b/src/ops/rearrange/op.cpp index 017a6ae5..ae1523fc 100644 --- a/src/ops/rearrange/op.cpp +++ b/src/ops/rearrange/op.cpp @@ -1,7 +1,40 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/rearrange_cpu.hpp" + namespace llaisys::ops { void rearrange(tensor_t out, tensor_t in) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, in); + CHECK_SAME_DTYPE(out->dtype(), in->dtype()); + CHECK_SAME_SHAPE(out->shape(), in->shape()); + + size_t numel = in->numel(); + size_t ndim = out->ndim(); + const auto &shape = out->shape(); + const auto &out_strides = out->strides(); + const auto &in_strides = in->strides(); + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::rearrange(out->data(), in->data(), out->dtype(), shape.data(), out_strides.data(), + in_strides.data(), numel, ndim); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch 
(out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::rearrange(out->data(), in->data(), out->dtype(), shape.data(), out_strides.data(), + in_strides.data(), numel, ndim); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + TO_BE_IMPLEMENTED(); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/self_attention/cpu/self_attention_cpu.cpp b/src/ops/self_attention/cpu/self_attention_cpu.cpp new file mode 100644 index 00000000..cdf5f391 --- /dev/null +++ b/src/ops/self_attention/cpu/self_attention_cpu.cpp @@ -0,0 +1,128 @@ +#include "self_attention_cpu.hpp" +#include +#include +#include +#include +#include "../../../utils.hpp" + +template +static void self_attention_(T *out, const T *q, const T *k, const T *v, + float scale,size_t qlen, size_t kvlen, + size_t nh, size_t nkvh, size_t hd, size_t dv) { + //cal group size of kv head + size_t kv_group = nh / nkvh; + // attn scores + std::vector scores(kvlen); + + // for each q head + for (size_t h = 0; h < nh; ++h) { + size_t kvh = h / kv_group; + for (size_t i = 0; i < qlen; ++i) { + const T *q_ = q + (i * nh + h) * hd; + float max_score = -std::numeric_limits::infinity(); + // causal mask + int64_t limit = static_cast(i) + + static_cast(kvlen) - static_cast(qlen); + + for (size_t j = 0; j < kvlen; ++j) { + // for j > i, set score to -inf, because it can't be seen + if (static_cast(j) > limit) { + scores[j] = -std::numeric_limits::infinity(); + continue; + } + // start to cal attn scores + // QK^T + float acc = 0.0f; + const T *k_ = k + (j * nkvh + kvh) * hd; + if constexpr (std::is_same_v + || std::is_same_v) { + for (size_t d = 0; d < hd; ++d) { + acc += llaisys::utils::cast(q_[d]) + * llaisys::utils::cast(k_[d]); + } + } else { + for (size_t d = 0; d < hd; ++d) { + acc += q_[d] * k_[d]; + } + } + //QK^T * scale + acc *= scale; + scores[j] = acc; + if (acc > max_score) { + max_score = acc; + } + } + + float sum = 0.0f; + for (size_t j = 0; j < 
kvlen; ++j) { + if (scores[j] == -std::numeric_limits::infinity()) { + scores[j] = 0.0f; + continue; + } + // softmax exp(j - max_score) + float w = std::exp(scores[j] - max_score); + scores[j] = w; + sum += w; + } + // softmax sum(exp(j - max_score)) + float inv_sum = sum > 0.0f ? 1.0f / sum : 0.0f; + + // start to cal causalsoftmax(A)*V + T *out_ = out + (i * nh + h) * dv; + for (size_t d = 0; d < dv; ++d) { + float acc = 0.0f; + for (size_t j = 0; j < kvlen; ++j) { + if (scores[j] == 0.0f) { + continue; + } + const T *v_ = v + (j * nkvh + kvh) * dv; + if constexpr (std::is_same_v + || std::is_same_v) { + acc += scores[j] * llaisys::utils::cast(v_[d]); + } else { + acc += scores[j] * v_[d]; + } + } + if constexpr (std::is_same_v + || std::is_same_v) { + out_[d] = llaisys::utils::cast(acc * inv_sum); + } else { + out_[d] = static_cast(acc * inv_sum); + } + } + } + } +} + +namespace llaisys::ops::cpu { +void self_attention(std::byte *out, const std::byte *q, const std::byte *k, + const std::byte *v, float scale, + llaisysDataType_t type, size_t qlen, + size_t kvlen, size_t nh, size_t nkvh, size_t hd, size_t dv) { + switch (type) { + case LLAISYS_DTYPE_F32: + return self_attention_( + reinterpret_cast(out), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + scale, qlen, kvlen, nh, nkvh, hd, dv); + case LLAISYS_DTYPE_BF16: + return self_attention_( + reinterpret_cast(out), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + scale, qlen, kvlen, nh, nkvh, hd, dv); + case LLAISYS_DTYPE_F16: + return self_attention_( + reinterpret_cast(out), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + scale, qlen, kvlen, nh, nkvh, hd, dv); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} \ No newline at end of file diff --git a/src/ops/self_attention/cpu/self_attention_cpu.hpp b/src/ops/self_attention/cpu/self_attention_cpu.hpp new file mode 100644 index 00000000..99c1b658 --- /dev/null +++ 
b/src/ops/self_attention/cpu/self_attention_cpu.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" +#include + +namespace llaisys::ops::cpu { +void self_attention(std::byte *out, const std::byte *q, const std::byte *k, const std::byte *v, float scale, + llaisysDataType_t type, size_t qlen, size_t kvlen, size_t nh, size_t nkvh, size_t hd, size_t dv); +} \ No newline at end of file diff --git a/src/ops/self_attention/op.cpp b/src/ops/self_attention/op.cpp index 43d62014..abfe1964 100644 --- a/src/ops/self_attention/op.cpp +++ b/src/ops/self_attention/op.cpp @@ -1,7 +1,49 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/self_attention_cpu.hpp" namespace llaisys::ops { void self_attention(tensor_t attn_val, tensor_t q, tensor_t k, tensor_t v, float scale) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(attn_val, q, k, v); + CHECK_SAME_DTYPE(attn_val->dtype(), q->dtype(), k->dtype(), v->dtype()); + ASSERT(attn_val->ndim() == 3 && q->ndim() == 3 && k->ndim() == 3 && v->ndim() == 3, + "self_attention: all tensors must be 3D."); + ASSERT(attn_val->isContiguous() && q->isContiguous() && k->isContiguous() && v->isContiguous(), + "self_attention: all tensors must be contiguous."); + ASSERT(q->shape()[0] == attn_val->shape()[0], "self_attention: qlen mismatch."); + ASSERT(q->shape()[1] == attn_val->shape()[1], "self_attention: nhead mismatch."); + ASSERT(v->shape()[2] == attn_val->shape()[2], "self_attention: dv mismatch."); + ASSERT(q->shape()[2] == k->shape()[2], "self_attention: q/k head dim mismatch."); + ASSERT(k->shape()[0] == v->shape()[0], "self_attention: k/v length mismatch."); + ASSERT(k->shape()[1] == v->shape()[1], "self_attention: k/v head mismatch."); + ASSERT(q->shape()[1] % k->shape()[1] == 0, "self_attention: nhead must be multiple of nkvhead."); + + size_t qlen = q->shape()[0]; + size_t kvlen = k->shape()[0]; + size_t nh = q->shape()[1]; + size_t nkvh = k->shape()[1]; + size_t hd = q->shape()[2]; + 
size_t dv = v->shape()[2]; + + if (attn_val->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::self_attention(attn_val->data(), q->data(), k->data(), v->data(), scale, + attn_val->dtype(), qlen, kvlen, nh, nkvh, hd, dv); + } + + llaisys::core::context().setDevice(attn_val->deviceType(), attn_val->deviceId()); + + switch (attn_val->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::self_attention(attn_val->data(), q->data(), k->data(), v->data(), scale, + attn_val->dtype(), qlen, kvlen, nh, nkvh, hd, dv); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + TO_BE_IMPLEMENTED(); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/swiglu/cpu/swiglu_cpu.cpp b/src/ops/swiglu/cpu/swiglu_cpu.cpp new file mode 100644 index 00000000..f5fabb03 --- /dev/null +++ b/src/ops/swiglu/cpu/swiglu_cpu.cpp @@ -0,0 +1,55 @@ +#include "swiglu_cpu.hpp" +#include +#include +#include "../../../utils.hpp" + +template +void swiglu_(T *out, const T *gate, const T *up, size_t seq_len, size_t inter_size) +{ + // for each line in up + for(size_t i = 0; i < seq_len; i++) + { + // cal i line base + T *out_ = out + i * inter_size; + const T* up_ = up + i * inter_size; + const T* gate_ = gate + i * inter_size; + // for each element in the line + for(size_t j = 0; j < inter_size; j++) + { + if constexpr (std::is_same_v + || std::is_same_v) { + out_[j] = llaisys::utils::cast( + llaisys::utils::cast(up_[j]) * + llaisys::utils::cast(gate_[j]) / + (1.0f + std::exp(-llaisys::utils::cast(gate_[j])))); + } + else + out_[j] = up_[j] * gate_[j] / (1.0f + std::exp(-gate_[j])); + } + + } +} + +namespace llaisys::ops::cpu { +void swiglu(std::byte *out, const std::byte *gate, const std::byte *up, llaisysDataType_t type, size_t seq_len, size_t inter_size) { + switch (type) { + case LLAISYS_DTYPE_F32: + return swiglu_(reinterpret_cast(out), + reinterpret_cast(gate), + reinterpret_cast(up), + seq_len, inter_size); + case LLAISYS_DTYPE_BF16: 
+ return swiglu_(reinterpret_cast(out), + reinterpret_cast(gate), + reinterpret_cast(up), + seq_len, inter_size); + case LLAISYS_DTYPE_F16: + return swiglu_(reinterpret_cast(out), + reinterpret_cast(gate), + reinterpret_cast(up), + seq_len, inter_size); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} \ No newline at end of file diff --git a/src/ops/swiglu/cpu/swiglu_cpu.hpp b/src/ops/swiglu/cpu/swiglu_cpu.hpp new file mode 100644 index 00000000..e796b041 --- /dev/null +++ b/src/ops/swiglu/cpu/swiglu_cpu.hpp @@ -0,0 +1,7 @@ +#pragma once +#include "llaisys.h" +#include + +namespace llaisys::ops::cpu { +void swiglu(std::byte *out, const std::byte *gate, const std::byte *up, llaisysDataType_t type, size_t seq_len, size_t inter_size); +} \ No newline at end of file diff --git a/src/ops/swiglu/op.cpp b/src/ops/swiglu/op.cpp index 47edbcc9..5ce3c4bf 100644 --- a/src/ops/swiglu/op.cpp +++ b/src/ops/swiglu/op.cpp @@ -1,7 +1,36 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/swiglu_cpu.hpp" namespace llaisys::ops { void swiglu(tensor_t out, tensor_t gate, tensor_t up) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, gate, up); + CHECK_SAME_DTYPE(out->dtype(), gate->dtype(), up->dtype()); + CHECK_SAME_SHAPE(out->shape(), gate->shape(), up->shape()); + ASSERT(out->isContiguous() && gate->isContiguous() && up->isContiguous(), + "swiglu: all tensors must be contiguous."); + size_t seq_len = out->shape()[0]; + size_t inter_size = out->shape()[1]; + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::swiglu(out->data(), gate->data(), up->data(), + out->dtype(), seq_len, inter_size); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::swiglu(out->data(), gate->data(), up->data(), + out->dtype(), seq_len, inter_size); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + TO_BE_IMPLEMENTED(); + 
return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops From 94eafa51a6c3c73d15c9ac7e75b4c10a69426da5 Mon Sep 17 00:00:00 2001 From: leninist1 <282516536@qq.com> Date: Tue, 3 Feb 2026 16:33:41 +0800 Subject: [PATCH 06/14] inplement inference sys --- python/llaisys/libllaisys/__init__.py | 9 + python/llaisys/libllaisys/models.py | 65 +++++ python/llaisys/models/qwen2.py | 152 +++++++++- src/llaisys/qwen2.cc | 392 ++++++++++++++++++++++++++ 4 files changed, 608 insertions(+), 10 deletions(-) create mode 100644 python/llaisys/libllaisys/models.py create mode 100644 src/llaisys/qwen2.cc diff --git a/python/llaisys/libllaisys/__init__.py b/python/llaisys/libllaisys/__init__.py index f536fb52..fa872903 100644 --- a/python/llaisys/libllaisys/__init__.py +++ b/python/llaisys/libllaisys/__init__.py @@ -12,6 +12,10 @@ from .tensor import llaisysTensor_t from .tensor import load_tensor from .ops import load_ops +# Llaisys infer +from .models import load_models +from .models import LlaisysQwen2Meta +from .models import LlaisysQwen2Weights def load_shared_library(): @@ -38,6 +42,8 @@ def load_shared_library(): load_runtime(LIB_LLAISYS) load_tensor(LIB_LLAISYS) load_ops(LIB_LLAISYS) +# Llaisys load_models +load_models(LIB_LLAISYS) __all__ = [ @@ -52,4 +58,7 @@ def load_shared_library(): "llaisysMemcpyKind_t", "MemcpyKind", "llaisysStream_t", + # Llaisys c side + "LlaisysQwen2Meta", + "LlaisysQwen2Weights", ] diff --git a/python/llaisys/libllaisys/models.py b/python/llaisys/libllaisys/models.py new file mode 100644 index 00000000..ea313487 --- /dev/null +++ b/python/llaisys/libllaisys/models.py @@ -0,0 +1,65 @@ +import ctypes +from ctypes import c_size_t, c_int64, c_int, c_float +from .llaisys_types import llaisysDeviceType_t, llaisysDataType_t +from .tensor import llaisysTensor_t + +# c side wrap + +class LlaisysQwen2Meta(ctypes.Structure): + _fields_ = [ + ("dtype", llaisysDataType_t), + ("nlayer", c_size_t), + ("hs", c_size_t), + ("nh", 
c_size_t), + ("nkvh", c_size_t), + ("dh", c_size_t), + ("di", c_size_t), + ("maxseq", c_size_t), + ("voc", c_size_t), + ("epsilon", c_float), + ("theta", c_float), + ("end_token", c_int64), + ] + + +class LlaisysQwen2Weights(ctypes.Structure): + _fields_ = [ + ("in_embed", llaisysTensor_t), + ("out_embed", llaisysTensor_t), + ("out_norm_w", llaisysTensor_t), + ("attn_norm_w", ctypes.POINTER(llaisysTensor_t)), + ("attn_q_w", ctypes.POINTER(llaisysTensor_t)), + ("attn_q_b", ctypes.POINTER(llaisysTensor_t)), + ("attn_k_w", ctypes.POINTER(llaisysTensor_t)), + ("attn_k_b", ctypes.POINTER(llaisysTensor_t)), + ("attn_v_w", ctypes.POINTER(llaisysTensor_t)), + ("attn_v_b", ctypes.POINTER(llaisysTensor_t)), + ("attn_o_w", ctypes.POINTER(llaisysTensor_t)), + ("mlp_norm_w", ctypes.POINTER(llaisysTensor_t)), + ("mlp_gate_w", ctypes.POINTER(llaisysTensor_t)), + ("mlp_up_w", ctypes.POINTER(llaisysTensor_t)), + ("mlp_down_w", ctypes.POINTER(llaisysTensor_t)), + ] + + +def load_models(lib): + lib.llaisysQwen2ModelCreate.argtypes = [ + ctypes.POINTER(LlaisysQwen2Meta), + llaisysDeviceType_t, + ctypes.POINTER(c_int), + c_int, + ] + lib.llaisysQwen2ModelCreate.restype = ctypes.c_void_p + + lib.llaisysQwen2ModelDestroy.argtypes = [ctypes.c_void_p] + lib.llaisysQwen2ModelDestroy.restype = None + + lib.llaisysQwen2ModelWeights.argtypes = [ctypes.c_void_p] + lib.llaisysQwen2ModelWeights.restype = ctypes.POINTER(LlaisysQwen2Weights) + + lib.llaisysQwen2ModelInfer.argtypes = [ + ctypes.c_void_p, + ctypes.POINTER(c_int64), + c_size_t, + ] + lib.llaisysQwen2ModelInfer.restype = c_int64 diff --git a/python/llaisys/models/qwen2.py b/python/llaisys/models/qwen2.py index 0d07b0b2..98efbe28 100644 --- a/python/llaisys/models/qwen2.py +++ b/python/llaisys/models/qwen2.py @@ -1,23 +1,143 @@ from typing import Sequence from ..libllaisys import LIB_LLAISYS from ..libllaisys import DeviceType +from ..libllaisys import DataType +from ..libllaisys import llaisysDeviceType_t +from ..libllaisys.models 
import LlaisysQwen2Meta from pathlib import Path import safetensors +import json +import ctypes class Qwen2: - def __init__(self, model_path, device: DeviceType = DeviceType.CPU): - # TODO: Implement model constructor - + self._device = device model_path = Path(model_path) + config_path = model_path / "config.json" + # read model config from config.json + with open(config_path, "r", encoding="utf-8") as f: + config = json.load(f) + + # parse dim + hs = int(config["hidden_size"]) + nlayer = int(config["num_hidden_layers"]) + nh = int(config["num_attention_heads"]) + nkvh = int(config.get("num_key_value_heads", nh)) + di = int(config["intermediate_size"]) + dh = int(hs // nh) + + # parse key params + maxseq = int(config["max_position_embeddings"]) + voc = int(config["vocab_size"]) + epsilon = float(config["rms_norm_eps"]) + theta = float(config["rope_theta"]) + end_token = int(config["eos_token_id"]) + + + # construct C struct LlaisysQwen2Meta + meta = LlaisysQwen2Meta( + dtype=DataType.BF16, + nlayer=nlayer, + hs=hs, + nh=nh, + nkvh=nkvh, + dh=dh, + di=di, + maxseq=maxseq, + voc=voc, + epsilon=epsilon, + theta=theta, + end_token=end_token, + ) + + # only use cpu + device_ids = (ctypes.c_int * 1)(0) + # create model instance + self._model = LIB_LLAISYS.llaisysQwen2ModelCreate( + ctypes.byref(meta), llaisysDeviceType_t(device), device_ids, 1 + ) + + # get model weights + self._weights = LIB_LLAISYS.llaisysQwen2ModelWeights(self._model).contents + self._end_token = end_token + # traverse all safetensors files, in fact only one file in qwen2 for file in sorted(model_path.glob("*.safetensors")): - data_ = safetensors.safe_open(file, framework="numpy", device="cpu") + # load on cpu, I use pt framework to load bfloat16 weights here + data_ = safetensors.safe_open(file, framework="pt", device="cpu") for name_ in data_.keys(): - ## TODO: load the model weights - pass + weight = self._match_weight(name_) + if weight is None: + continue + # load weight to c side + arr = 
data_.get_tensor(name_).contiguous() #c-contiguous + + LIB_LLAISYS.tensorLoad(weight, ctypes.c_void_p(arr.data_ptr())) + + + def _match_weight(self, name: str): + # match weight name to c struct field + + w = self._weights + # input embedding + if name == "model.embed_tokens.weight": + return w.in_embed + # output embedding + if name in ("lm_head.weight", "model.lm_head.weight"): + return w.out_embed + # final LayerNorm + if name == "model.norm.weight": + return w.out_norm_w + # only processtransformer layer weights + if not name.startswith("model.layers."): + return None + parts = name.split(".") + if len(parts) < 5: + return None + layer = int(parts[2]) # 提取层索引 + tail = ".".join(parts[3:]) # 剩余后缀 + # Attention Layer + if tail == "input_layernorm.weight": + return w.attn_norm_w[layer] + if tail == "self_attn.q_proj.weight": + return w.attn_q_w[layer] + if tail == "self_attn.q_proj.bias": + return w.attn_q_b[layer] + if tail == "self_attn.k_proj.weight": + return w.attn_k_w[layer] + if tail == "self_attn.k_proj.bias": + return w.attn_k_b[layer] + if tail == "self_attn.v_proj.weight": + return w.attn_v_w[layer] + if tail == "self_attn.v_proj.bias": + return w.attn_v_b[layer] + if tail == "self_attn.o_proj.weight": + return w.attn_o_w[layer] + # FFN layer + if tail == "post_attention_layernorm.weight": + return w.mlp_norm_w[layer] + if tail == "mlp.gate_proj.weight": + return w.mlp_gate_w[layer] + if tail == "mlp.up_proj.weight": + return w.mlp_up_w[layer] + if tail == "mlp.down_proj.weight": + return w.mlp_down_w[layer] + return None + + def _infer(self, tokens: Sequence[int]) -> int: + # step forward infer + + if len(tokens) == 0: + return self._end_token + # convert python list to c int64 array + arr = (ctypes.c_int64 * len(tokens))(*tokens) + return int( + LIB_LLAISYS.llaisysQwen2ModelInfer( + self._model, arr, ctypes.c_size_t(len(tokens)) + ) + ) def generate( self, @@ -27,7 +147,19 @@ def generate( top_p: float = 0.8, temperature: float = 0.8, ): - - # TODO: 
Implement generate function - - return [] + # max new tokens default value:32 + if max_new_tokens is None: + max_new_tokens = 32 + tokens = list(inputs) + if max_new_tokens == 0: + return tokens + # prefill + next_token = self._infer(tokens) + tokens.append(next_token) + # decode + for _ in range(max_new_tokens - 1): + if tokens[-1] == self._end_token: + break + next_token = self._infer([tokens[-1]]) + tokens.append(next_token) + return tokens diff --git a/src/llaisys/qwen2.cc b/src/llaisys/qwen2.cc new file mode 100644 index 00000000..b1c1f41b --- /dev/null +++ b/src/llaisys/qwen2.cc @@ -0,0 +1,392 @@ +#include "llaisys/models/qwen2.h" +#include "llaisys_tensor.hpp" + +#include "../tensor/tensor.hpp" +#include "../ops/add/op.hpp" +#include "../ops/argmax/op.hpp" +#include "../ops/embedding/op.hpp" +#include "../ops/linear/op.hpp" +#include "../ops/rearrange/op.hpp" +#include "../ops/rms_norm/op.hpp" +#include "../ops/rope/op.hpp" +#include "../ops/self_attention/op.hpp" +#include "../ops/swiglu/op.hpp" +#include "../utils.hpp" + +#include +#include +#include + +// wrap c++ tensor to external handle +namespace { + +// based on modelMetaInfo create tensor +llaisys::tensor_t make_tensor( + const LlaisysQwen2Meta &meta, + llaisysDeviceType_t device, + int device_id, + const std::vector &shape) { + return llaisys::Tensor::create(shape, meta.dtype, device, device_id); +} + +// based on dtype create tensor +llaisys::tensor_t make_tensor_dtype( + llaisysDataType_t dtype, + llaisysDeviceType_t device, + int device_id, + const std::vector &shape) { + return llaisys::Tensor::create(shape, dtype, device, device_id); +} + +// set tensor data to zero +void zero_tensor(const llaisys::tensor_t &t) { + CHECK_ARGUMENT(t->deviceType() == LLAISYS_DEVICE_CPU, "Zero: only CPU is supported."); + std::memset(t->data(), 0, t->numel() * t->elementSize()); +} + +// wrap c++ tensor to external handle and record to handles list +llaisysTensor_t wrap_tensor( + const llaisys::tensor_t &t, + 
std::vector &handles) { + auto *h = new LlaisysTensor{t}; + handles.push_back(h); + return h; +} + +} + +// Qwen2 Model Instance Structure +struct LlaisysQwen2Model { + LlaisysQwen2Meta meta; // model meta info + llaisysDeviceType_t device; // device type + int device_id; // device id + + LlaisysQwen2Weights weights; // all weight tensors handle + std::vector handles; // handles list for unified release + + // attn out bias + std::vector attn_o_bias; + // MLP gate bias + std::vector mlp_gate_bias; + // MLP up bias + std::vector mlp_up_bias; + // MLP down bias + std::vector mlp_down_bias; + // out bias + llaisys::tensor_t out_bias; + + // KV cache + std::vector k_cache; + std::vector v_cache; + + size_t cache_len; +}; + +// init model weights, allocate memory and zero bias +static void init_weights(LlaisysQwen2Model *model) { + const auto &m = model->meta; + // input embedding + model->weights.in_embed = wrap_tensor( + make_tensor(m, model->device, model->device_id, {m.voc, m.hs}), + model->handles); + // output embedding + model->weights.out_embed = wrap_tensor( + make_tensor(m, model->device, model->device_id, {m.voc, m.hs}), + model->handles); + // output layer norm + model->weights.out_norm_w = wrap_tensor( + make_tensor(m, model->device, model->device_id, {m.hs}), + model->handles); + + // allocate ptr array for each layer + model->weights.attn_norm_w = new llaisysTensor_t[m.nlayer]; + model->weights.attn_q_w = new llaisysTensor_t[m.nlayer]; + model->weights.attn_q_b = new llaisysTensor_t[m.nlayer]; + model->weights.attn_k_w = new llaisysTensor_t[m.nlayer]; + model->weights.attn_k_b = new llaisysTensor_t[m.nlayer]; + model->weights.attn_v_w = new llaisysTensor_t[m.nlayer]; + model->weights.attn_v_b = new llaisysTensor_t[m.nlayer]; + model->weights.attn_o_w = new llaisysTensor_t[m.nlayer]; + model->weights.mlp_norm_w = new llaisysTensor_t[m.nlayer]; + model->weights.mlp_gate_w = new llaisysTensor_t[m.nlayer]; + model->weights.mlp_up_w = new 
llaisysTensor_t[m.nlayer]; + model->weights.mlp_down_w = new llaisysTensor_t[m.nlayer]; + + // bias + model->attn_o_bias.resize(m.nlayer); + model->mlp_gate_bias.resize(m.nlayer); + model->mlp_up_bias.resize(m.nlayer); + model->mlp_down_bias.resize(m.nlayer); + + // for each layer, create its weight and bias tensor + // first weight tensor + for (size_t i = 0; i < m.nlayer; ++i) { + // attn layer norm + model->weights.attn_norm_w[i] = wrap_tensor( + make_tensor(m, model->device, model->device_id, {m.hs}), + model->handles); + // Q/K/V/O proj weight and bias + model->weights.attn_q_w[i] = wrap_tensor( + make_tensor(m, model->device, model->device_id, {m.nh * m.dh, m.hs}), + model->handles); + model->weights.attn_q_b[i] = wrap_tensor( + make_tensor(m, model->device, model->device_id, {m.nh * m.dh}), + model->handles); + model->weights.attn_k_w[i] = wrap_tensor( + make_tensor(m, model->device, model->device_id, {m.nkvh * m.dh, m.hs}), + model->handles); + model->weights.attn_k_b[i] = wrap_tensor( + make_tensor(m, model->device, model->device_id, {m.nkvh * m.dh}), + model->handles); + model->weights.attn_v_w[i] = wrap_tensor( + make_tensor(m, model->device, model->device_id, {m.nkvh * m.dh, m.hs}), + model->handles); + model->weights.attn_v_b[i] = wrap_tensor( + make_tensor(m, model->device, model->device_id, {m.nkvh * m.dh}), + model->handles); + model->weights.attn_o_w[i] = wrap_tensor( + make_tensor(m, model->device, model->device_id, {m.hs, m.nh * m.dh}), + model->handles); + + // MLP layer norm and gate/up/down weight + model->weights.mlp_norm_w[i] = wrap_tensor( + make_tensor(m, model->device, model->device_id, {m.hs}), + model->handles); + model->weights.mlp_gate_w[i] = wrap_tensor( + make_tensor(m, model->device, model->device_id, {m.di, m.hs}), + model->handles); + model->weights.mlp_up_w[i] = wrap_tensor( + make_tensor(m, model->device, model->device_id, {m.di, m.hs}), + model->handles); + model->weights.mlp_down_w[i] = wrap_tensor( + make_tensor(m, 
model->device, model->device_id, {m.hs, m.di}), + model->handles); + + // bias tensor + model->attn_o_bias[i] = make_tensor(m, model->device, model->device_id, {m.hs}); + model->mlp_gate_bias[i] = make_tensor(m, model->device, model->device_id, {m.di}); + model->mlp_up_bias[i] = make_tensor(m, model->device, model->device_id, {m.di}); + model->mlp_down_bias[i] = make_tensor(m, model->device, model->device_id, {m.hs}); + + // initialize bias tensor to 0 + zero_tensor(model->weights.attn_q_b[i]->tensor); + zero_tensor(model->weights.attn_k_b[i]->tensor); + zero_tensor(model->weights.attn_v_b[i]->tensor); + zero_tensor(model->attn_o_bias[i]); + zero_tensor(model->mlp_gate_bias[i]); + zero_tensor(model->mlp_up_bias[i]); + zero_tensor(model->mlp_down_bias[i]); + } + + // output layerbias + model->out_bias = make_tensor(m, model->device, model->device_id, {m.voc}); + zero_tensor(model->out_bias); +} + +// initialize kv cache tensor +static void init_cache(LlaisysQwen2Model *model) { + const auto &m = model->meta; + model->k_cache.resize(m.nlayer); + model->v_cache.resize(m.nlayer); + for (size_t i = 0; i < m.nlayer; ++i) { + // cache shape: [maxseq, num_heads, head_dim] + model->k_cache[i] = make_tensor(m, model->device, model->device_id, {m.maxseq, m.nkvh, m.dh}); + model->v_cache[i] = make_tensor(m, model->device, model->device_id, {m.maxseq, m.nkvh, m.dh}); + } + model->cache_len = 0; +} + +// inference implementation +static int64_t infer_impl(LlaisysQwen2Model *model, const int64_t *token_ids, size_t ntoken) { + CHECK_ARGUMENT(model != nullptr, "model is null"); + CHECK_ARGUMENT(token_ids != nullptr || ntoken == 0, "token_ids is null"); + CHECK_ARGUMENT(model->device == LLAISYS_DEVICE_CPU, "Only CPU device is supported."); + + if (ntoken == 0) { + return model->meta.end_token; + } + + // prefill + if (ntoken > 1 || model->cache_len == 0) { + model->cache_len = 0; + } + + size_t seqlen = ntoken; + size_t pos_offset = model->cache_len; + + // position ID [pos_offset, 
pos_offset + seqlen) + std::vector pos_ids(seqlen); + for (size_t i = 0; i < seqlen; ++i) { + pos_ids[i] = static_cast(pos_offset + i); + } + + // input token and position ID tensors + auto input_ids = make_tensor_dtype(LLAISYS_DTYPE_I64, model->device, model->device_id, {seqlen}); + input_ids->load(token_ids); + + auto pos_tensor = make_tensor_dtype(LLAISYS_DTYPE_I64, model->device, model->device_id, {seqlen}); + pos_tensor->load(pos_ids.data()); + + // embedding lookup + auto x = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.hs}); + llaisys::ops::embedding(x, input_ids, model->weights.in_embed->tensor); + + // attn scale factor + float scale = 1.0f / std::sqrt(static_cast(model->meta.dh)); + + // layer forward + for (size_t i = 0; i < model->meta.nlayer; ++i) { + // attn input norm + auto x_norm = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.hs}); + llaisys::ops::rms_norm(x_norm, x, model->weights.attn_norm_w[i]->tensor, model->meta.epsilon); + + // Q/K/V linear proj + auto q = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.nh * model->meta.dh}); + auto k = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.nkvh * model->meta.dh}); + auto v = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.nkvh * model->meta.dh}); + + llaisys::ops::linear(q, x_norm, model->weights.attn_q_w[i]->tensor, model->weights.attn_q_b[i]->tensor); + llaisys::ops::linear(k, x_norm, model->weights.attn_k_w[i]->tensor, model->weights.attn_k_b[i]->tensor); + llaisys::ops::linear(v, x_norm, model->weights.attn_v_w[i]->tensor, model->weights.attn_v_b[i]->tensor); + + // transform to multi-head dim + auto q_view = q->view({seqlen, model->meta.nh, model->meta.dh}); + auto k_view = k->view({seqlen, model->meta.nkvh, model->meta.dh}); + auto v_view = v->view({seqlen, model->meta.nkvh, model->meta.dh}); + + // RoPE + auto q_rope = 
make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.nh, model->meta.dh}); + auto k_rope = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.nkvh, model->meta.dh}); + + llaisys::ops::rope(q_rope, q_view, pos_tensor, model->meta.theta); + llaisys::ops::rope(k_rope, k_view, pos_tensor, model->meta.theta); + + // write new k/v to cache + auto k_cache_slice = model->k_cache[i]->slice(0, model->cache_len, model->cache_len + seqlen); + auto v_cache_slice = model->v_cache[i]->slice(0, model->cache_len, model->cache_len + seqlen); + + llaisys::ops::rearrange(k_cache_slice, k_rope); + llaisys::ops::rearrange(v_cache_slice, v_view); + + // get all history k/v + size_t total_len = model->cache_len + seqlen; + auto k_total = model->k_cache[i]->slice(0, 0, total_len); + auto v_total = model->v_cache[i]->slice(0, 0, total_len); + + // self attn + auto attn = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.nh, model->meta.dh}); + llaisys::ops::self_attention(attn, q_rope, k_total, v_total, scale); + + // attn out proj + auto attn_flat = attn->view({seqlen, model->meta.nh * model->meta.dh}); + auto attn_proj = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.hs}); + llaisys::ops::linear(attn_proj, attn_flat, model->weights.attn_o_w[i]->tensor, model->attn_o_bias[i]); + + // first residual conn + auto res1 = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.hs}); + llaisys::ops::add(res1, x, attn_proj); + + // MLP input norm + auto x_norm2 = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.hs}); + llaisys::ops::rms_norm(x_norm2, res1, model->weights.mlp_norm_w[i]->tensor, model->meta.epsilon); + + // MLP gate and up proj + auto gate = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.di}); + auto up = make_tensor(model->meta, model->device, model->device_id, {seqlen, 
model->meta.di}); + + llaisys::ops::linear(gate, x_norm2, model->weights.mlp_gate_w[i]->tensor, model->mlp_gate_bias[i]); + llaisys::ops::linear(up, x_norm2, model->weights.mlp_up_w[i]->tensor, model->mlp_up_bias[i]); + + // SwiGLU activate + auto swiglu_out = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.di}); + llaisys::ops::swiglu(swiglu_out, gate, up); + + // MLP down proj + auto down = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.hs}); + llaisys::ops::linear(down, swiglu_out, model->weights.mlp_down_w[i]->tensor, model->mlp_down_bias[i]); + + // second residual conn + auto res2 = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.hs}); + llaisys::ops::add(res2, res1, down); + + x = res2; + } + + // update cache len + model->cache_len += seqlen; + + // final norm + auto x_norm = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.hs}); + llaisys::ops::rms_norm(x_norm, x, model->weights.out_norm_w->tensor, model->meta.epsilon); + + // output proj to vocab + auto logits = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.voc}); + llaisys::ops::linear(logits, x_norm, model->weights.out_embed->tensor, model->out_bias); + + // get last token logits and argmax + auto last = logits->slice(0, seqlen - 1, seqlen)->view({model->meta.voc}); + auto max_idx = make_tensor_dtype(LLAISYS_DTYPE_I64, model->device, model->device_id, {1}); + auto max_val = make_tensor(model->meta, model->device, model->device_id, {1}); + llaisys::ops::argmax(max_idx, max_val, last); + + return reinterpret_cast(max_idx->data())[0]; +} + +// C API wrapper +__C { + +// ModelCreate: Check args, initialize weights and cache +struct LlaisysQwen2Model *llaisysQwen2ModelCreate( + const LlaisysQwen2Meta *meta, + llaisysDeviceType_t device, + int *device_ids, + int ndevice) { + CHECK_ARGUMENT(meta != nullptr, "meta is null"); + CHECK_ARGUMENT(device == 
LLAISYS_DEVICE_CPU, "Only CPU device is supported."); + CHECK_ARGUMENT(ndevice >= 1, "Invalid device count."); + + auto *model = new LlaisysQwen2Model(); + model->meta = *meta; + model->device = device; + model->device_id = device_ids ? device_ids[0] : 0; + init_weights(model); + init_cache(model); + return model; +} + +// ModelDestroy: Free weights and cache, all handles +void llaisysQwen2ModelDestroy(struct LlaisysQwen2Model *model) { + if (!model) { + return; + } + delete[] model->weights.attn_norm_w; + delete[] model->weights.attn_q_w; + delete[] model->weights.attn_q_b; + delete[] model->weights.attn_k_w; + delete[] model->weights.attn_k_b; + delete[] model->weights.attn_v_w; + delete[] model->weights.attn_v_b; + delete[] model->weights.attn_o_w; + delete[] model->weights.mlp_norm_w; + delete[] model->weights.mlp_gate_w; + delete[] model->weights.mlp_up_w; + delete[] model->weights.mlp_down_w; + + for (auto *h : model->handles) { + delete h; + } + delete model; +} + +// ModelWeights: Get weights pointer for loading pretrained params +struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model *model) { + CHECK_ARGUMENT(model != nullptr, "model is null"); + return &model->weights; +} + +// ModelInfer: Single interface for single token prediction +int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model *model, int64_t *token_ids, size_t ntoken) { + return infer_impl(model, token_ids, ntoken); +} + +} From 2e900c70d6643cddef040f96d622a57fc6b52135 Mon Sep 17 00:00:00 2001 From: leninist1 <282516536@qq.com> Date: Tue, 3 Feb 2026 16:55:41 +0800 Subject: [PATCH 07/14] erase exception throw in qwen2.cc's extern C --- src/llaisys/qwen2.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/llaisys/qwen2.cc b/src/llaisys/qwen2.cc index b1c1f41b..bd070c7d 100644 --- a/src/llaisys/qwen2.cc +++ b/src/llaisys/qwen2.cc @@ -341,9 +341,6 @@ struct LlaisysQwen2Model *llaisysQwen2ModelCreate( llaisysDeviceType_t device, int *device_ids, int ndevice) { - 
CHECK_ARGUMENT(meta != nullptr, "meta is null"); - CHECK_ARGUMENT(device == LLAISYS_DEVICE_CPU, "Only CPU device is supported."); - CHECK_ARGUMENT(ndevice >= 1, "Invalid device count."); auto *model = new LlaisysQwen2Model(); model->meta = *meta; @@ -380,7 +377,6 @@ void llaisysQwen2ModelDestroy(struct LlaisysQwen2Model *model) { // ModelWeights: Get weights pointer for loading pretrained params struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model *model) { - CHECK_ARGUMENT(model != nullptr, "model is null"); return &model->weights; } From e6c10aa0dee7f44410bd2b3cc19982d103acc625 Mon Sep 17 00:00:00 2001 From: leninist1 <282516536@qq.com> Date: Wed, 25 Feb 2026 17:45:03 +0800 Subject: [PATCH 08/14] implement random_sample op to replace argmax && test passed. --- include/llaisys/models/qwen2.h | 3 +- include/llaisys/ops.h | 2 + python/llaisys/libllaisys/models.py | 4 + python/llaisys/libllaisys/ops.py | 13 +- python/llaisys/models/qwen2.py | 16 ++- python/llaisys/ops.py | 22 +++- src/llaisys/ops.cc | 5 + src/llaisys/qwen2.cc | 20 +-- src/ops/rand_sample/cpu/rand_sample_cpu.cpp | 129 ++++++++++++++++++++ src/ops/rand_sample/cpu/rand_sample_cpu.hpp | 9 ++ src/ops/rand_sample/op.cpp | 53 ++++++++ src/ops/rand_sample/op.hpp | 8 ++ 12 files changed, 268 insertions(+), 16 deletions(-) create mode 100644 src/ops/rand_sample/cpu/rand_sample_cpu.cpp create mode 100644 src/ops/rand_sample/cpu/rand_sample_cpu.hpp create mode 100644 src/ops/rand_sample/op.cpp create mode 100644 src/ops/rand_sample/op.hpp diff --git a/include/llaisys/models/qwen2.h b/include/llaisys/models/qwen2.h index 7054626d..8840660f 100644 --- a/include/llaisys/models/qwen2.h +++ b/include/llaisys/models/qwen2.h @@ -37,6 +37,7 @@ __C { __export struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model * model); - __export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken); + __export int64_t 
llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken, + float temperature, size_t topK, float topP, int64_t seed); } #endif // LLAISYS_MODELS_QWEN2_H diff --git a/include/llaisys/ops.h b/include/llaisys/ops.h index ddb3be24..15b70067 100644 --- a/include/llaisys/ops.h +++ b/include/llaisys/ops.h @@ -6,6 +6,8 @@ __C { __export void llaisysAdd(llaisysTensor_t c, llaisysTensor_t a, llaisysTensor_t b); __export void llaisysArgmax(llaisysTensor_t max_idx, llaisysTensor_t max_val, llaisysTensor_t vals); + __export void llaisysRandSample(llaisysTensor_t sample_idx, llaisysTensor_t sample_val, llaisysTensor_t vals, + float temperature, size_t topK, float topP, int64_t seed); __export void llaisysEmbedding(llaisysTensor_t out, llaisysTensor_t index, llaisysTensor_t weight); __export void llaisysLinear(llaisysTensor_t out, llaisysTensor_t in, llaisysTensor_t weight, llaisysTensor_t bias); __export void llaisysRearrange(llaisysTensor_t out, llaisysTensor_t in); diff --git a/python/llaisys/libllaisys/models.py b/python/llaisys/libllaisys/models.py index ea313487..a3807fae 100644 --- a/python/llaisys/libllaisys/models.py +++ b/python/llaisys/libllaisys/models.py @@ -61,5 +61,9 @@ def load_models(lib): ctypes.c_void_p, ctypes.POINTER(c_int64), c_size_t, + c_float, + c_size_t, + c_float, + c_int64, ] lib.llaisysQwen2ModelInfer.restype = c_int64 diff --git a/python/llaisys/libllaisys/ops.py b/python/llaisys/libllaisys/ops.py index 5be095ef..e6ecfaa6 100644 --- a/python/llaisys/libllaisys/ops.py +++ b/python/llaisys/libllaisys/ops.py @@ -1,5 +1,5 @@ from .tensor import llaisysTensor_t -from ctypes import c_float +from ctypes import c_float, c_int64, c_size_t def load_ops(lib): lib.llaisysAdd.argtypes = [llaisysTensor_t, llaisysTensor_t, llaisysTensor_t] @@ -8,6 +8,17 @@ def load_ops(lib): lib.llaisysArgmax.argtypes = [llaisysTensor_t, llaisysTensor_t, llaisysTensor_t] lib.llaisysArgmax.restype = None + lib.llaisysRandSample.argtypes = [ + 
llaisysTensor_t, + llaisysTensor_t, + llaisysTensor_t, + c_float, + c_size_t, + c_float, + c_int64, + ] + lib.llaisysRandSample.restype = None + lib.llaisysEmbedding.argtypes = [llaisysTensor_t, llaisysTensor_t, llaisysTensor_t] lib.llaisysEmbedding.restype = None diff --git a/python/llaisys/models/qwen2.py b/python/llaisys/models/qwen2.py index 98efbe28..dcd1263b 100644 --- a/python/llaisys/models/qwen2.py +++ b/python/llaisys/models/qwen2.py @@ -126,7 +126,7 @@ def _match_weight(self, name: str): return w.mlp_down_w[layer] return None - def _infer(self, tokens: Sequence[int]) -> int: + def _infer(self, tokens: Sequence[int], temperature: float, top_k: int, top_p: float, seed: int) -> int: # step forward infer if len(tokens) == 0: @@ -135,7 +135,13 @@ def _infer(self, tokens: Sequence[int]) -> int: arr = (ctypes.c_int64 * len(tokens))(*tokens) return int( LIB_LLAISYS.llaisysQwen2ModelInfer( - self._model, arr, ctypes.c_size_t(len(tokens)) + self._model, + arr, + ctypes.c_size_t(len(tokens)), + ctypes.c_float(temperature), + ctypes.c_size_t(top_k), + ctypes.c_float(top_p), + ctypes.c_int64(seed), ) ) @@ -146,6 +152,7 @@ def generate( top_k: int = 1, top_p: float = 0.8, temperature: float = 0.8, + seed: int = 0, ): # max new tokens default value:32 if max_new_tokens is None: @@ -154,12 +161,13 @@ def generate( if max_new_tokens == 0: return tokens # prefill - next_token = self._infer(tokens) + next_token = self._infer(tokens, temperature, top_k, top_p, seed) tokens.append(next_token) # decode for _ in range(max_new_tokens - 1): if tokens[-1] == self._end_token: break - next_token = self._infer([tokens[-1]]) + seed += 1 + next_token = self._infer([tokens[-1]], temperature, top_k, top_p, seed) tokens.append(next_token) return tokens diff --git a/python/llaisys/ops.py b/python/llaisys/ops.py index ed0180bc..8c835eeb 100644 --- a/python/llaisys/ops.py +++ b/python/llaisys/ops.py @@ -1,6 +1,6 @@ from .libllaisys import LIB_LLAISYS from .tensor import Tensor -from ctypes 
import c_float, c_int +from ctypes import c_float, c_int, c_int64, c_size_t class Ops: @@ -12,6 +12,26 @@ def add(c: Tensor, a: Tensor, b: Tensor): def argmax(max_idx: Tensor, max_val: Tensor, vals: Tensor): LIB_LLAISYS.llaisysArgmax(max_idx.lib_tensor(), max_val.lib_tensor(), vals.lib_tensor()) + @staticmethod + def rand_sample( + sample_idx: Tensor, + sample_val: Tensor, + vals: Tensor, + temperature: float, + top_k: int, + top_p: float, + seed: int, + ): + LIB_LLAISYS.llaisysRandSample( + sample_idx.lib_tensor(), + sample_val.lib_tensor(), + vals.lib_tensor(), + c_float(temperature), + c_size_t(top_k), + c_float(top_p), + c_int64(seed), + ) + @staticmethod def embedding(out: Tensor, index: Tensor, weight: Tensor): LIB_LLAISYS.llaisysEmbedding( diff --git a/src/llaisys/ops.cc b/src/llaisys/ops.cc index c99fbc32..50186759 100644 --- a/src/llaisys/ops.cc +++ b/src/llaisys/ops.cc @@ -6,6 +6,7 @@ #include "../ops/argmax/op.hpp" #include "../ops/embedding/op.hpp" #include "../ops/linear/op.hpp" +#include "../ops/rand_sample/op.hpp" #include "../ops/rearrange/op.hpp" #include "../ops/rms_norm/op.hpp" #include "../ops/rope/op.hpp" @@ -19,6 +20,10 @@ __C { void llaisysArgmax(llaisysTensor_t max_idx, llaisysTensor_t max_val, llaisysTensor_t vals) { llaisys::ops::argmax(max_idx->tensor, max_val->tensor, vals->tensor); } + void llaisysRandSample(llaisysTensor_t sample_idx, llaisysTensor_t sample_val, llaisysTensor_t vals, + float temperature, size_t topK, float topP, int64_t seed) { + llaisys::ops::rand_sample(sample_idx->tensor, sample_val->tensor, vals->tensor, temperature, topK, topP, seed); + } void llaisysEmbedding(llaisysTensor_t out, llaisysTensor_t index, llaisysTensor_t weight) { llaisys::ops::embedding(out->tensor, index->tensor, weight->tensor); } diff --git a/src/llaisys/qwen2.cc b/src/llaisys/qwen2.cc index bd070c7d..e9d123b1 100644 --- a/src/llaisys/qwen2.cc +++ b/src/llaisys/qwen2.cc @@ -3,7 +3,7 @@ #include "../tensor/tensor.hpp" #include "../ops/add/op.hpp" 
-#include "../ops/argmax/op.hpp" +#include "../ops/rand_sample/op.hpp" #include "../ops/embedding/op.hpp" #include "../ops/linear/op.hpp" #include "../ops/rearrange/op.hpp" @@ -197,7 +197,8 @@ static void init_cache(LlaisysQwen2Model *model) { } // inference implementation -static int64_t infer_impl(LlaisysQwen2Model *model, const int64_t *token_ids, size_t ntoken) { +static int64_t infer_impl(LlaisysQwen2Model *model, const int64_t *token_ids, size_t ntoken, float temperature, + size_t topK, float topP, int64_t seed) { CHECK_ARGUMENT(model != nullptr, "model is null"); CHECK_ARGUMENT(token_ids != nullptr || ntoken == 0, "token_ids is null"); CHECK_ARGUMENT(model->device == LLAISYS_DEVICE_CPU, "Only CPU device is supported."); @@ -323,13 +324,13 @@ static int64_t infer_impl(LlaisysQwen2Model *model, const int64_t *token_ids, si auto logits = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.voc}); llaisys::ops::linear(logits, x_norm, model->weights.out_embed->tensor, model->out_bias); - // get last token logits and argmax + // get last token logits and sample auto last = logits->slice(0, seqlen - 1, seqlen)->view({model->meta.voc}); - auto max_idx = make_tensor_dtype(LLAISYS_DTYPE_I64, model->device, model->device_id, {1}); - auto max_val = make_tensor(model->meta, model->device, model->device_id, {1}); - llaisys::ops::argmax(max_idx, max_val, last); + auto sample_idx = make_tensor_dtype(LLAISYS_DTYPE_I64, model->device, model->device_id, {1}); + auto sample_val = make_tensor(model->meta, model->device, model->device_id, {1}); + llaisys::ops::rand_sample(sample_idx, sample_val, last, temperature, topK, topP, seed); - return reinterpret_cast(max_idx->data())[0]; + return reinterpret_cast(sample_idx->data())[0]; } // C API wrapper @@ -381,8 +382,9 @@ struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model *m } // ModelInfer: Single interface for single token prediction -int64_t llaisysQwen2ModelInfer(struct 
LlaisysQwen2Model *model, int64_t *token_ids, size_t ntoken) { - return infer_impl(model, token_ids, ntoken); +int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model *model, int64_t *token_ids, size_t ntoken, float temperature, + size_t topK, float topP, int64_t seed) { + return infer_impl(model, token_ids, ntoken, temperature, topK, topP, seed); } } diff --git a/src/ops/rand_sample/cpu/rand_sample_cpu.cpp b/src/ops/rand_sample/cpu/rand_sample_cpu.cpp new file mode 100644 index 00000000..aa8ffb5e --- /dev/null +++ b/src/ops/rand_sample/cpu/rand_sample_cpu.cpp @@ -0,0 +1,129 @@ +#include "rand_sample_cpu.hpp" + +#include "../../../utils.hpp" + +#include +#include +#include +#include + +template +static void rand_sample_(std::byte *sample_idx, std::byte *sample_val, const std::byte *vals, const float temperature, + const size_t topK, const float topP, size_t numel, const int64_t batch_size, const int64_t seed) { + const T *v_all = reinterpret_cast(vals); + auto *out_idx = reinterpret_cast(sample_idx); + auto *out_val = reinterpret_cast(sample_val); + float temp = temperature; + if (temp <= 1e-6f) { + temp = 1e-6f; + } + + std::vector scores(numel); + std::vector> sorted_scores(numel); + std::vector> candidates; + candidates.reserve(numel); + + std::mt19937_64 rng(static_cast(seed)); + std::uniform_real_distribution dist(0.0f, 1.0f); + + for (int64_t b = 0; b < batch_size; ++b) { + const T *v = v_all + b * numel; + size_t max_i = 0; + float max_v = llaisys::utils::cast(v[0]); + for (size_t i = 1; i < numel; ++i) { + float cur = llaisys::utils::cast(v[i]); + if (cur > max_v) { + max_v = cur; + max_i = i; + } + } + + for (size_t i = 0; i < numel; ++i) { + float cur = llaisys::utils::cast(v[i]); + scores[i] = std::exp((cur - max_v) / temp); + } + float sum = 0.0f; + for (size_t i = 0; i < numel; ++i) { + sum += scores[i]; + } + if (sum <= 0.0f) { + for (size_t i = 0; i < numel; ++i) { + scores[i] = 0.0f; + } + scores[max_i] = 1.0f; + sum = 1.0f; + } else { + for (size_t 
i = 0; i < numel; ++i) { + scores[i] /= sum; + } + } + + for (size_t i = 0; i < numel; ++i) { + sorted_scores[i] = {scores[i], i}; + } + std::sort(sorted_scores.begin(), sorted_scores.end(), [](const auto &a, const auto &b) { return a.first > b.first; }); + + size_t k = numel; + if (topK > 0 && topK < numel) { + k = topK; + } + + candidates.clear(); + if (topP > 0.0f && topP < 1.0f) { + float cum_score = 0.0f; + for (size_t i = 0; i < k; ++i) { + candidates.push_back(sorted_scores[i]); + cum_score += sorted_scores[i].first; + if (cum_score >= topP) { + break; + } + } + } else { + for (size_t i = 0; i < k; ++i) { + candidates.push_back(sorted_scores[i]); + } + } + if (candidates.empty()) { + candidates.push_back(sorted_scores[0]); + } + + float cand_sum = 0.0f; + for (const auto &item : candidates) { + cand_sum += item.first; + } + + size_t chosen = candidates[0].second; + if (cand_sum > 0.0f) { + float r = dist(rng); + for (const auto &item : candidates) { + r -= item.first / cand_sum; + if (r <= 0.0f) { + chosen = item.second; + break; + } + } + if (r > 0.0f) { + chosen = candidates.back().second; + } + } + + out_idx[b] = static_cast(chosen); + out_val[b] = v[chosen]; + } +} + +namespace llaisys::ops::cpu { +void rand_sample(std::byte *sample_idx, std::byte *sample_val, const std::byte *vals, llaisysDataType_t type, size_t numel, + const int64_t batch_size, const float temperature, const size_t topK, const float topP, const int64_t seed) { + switch (type) { + case LLAISYS_DTYPE_F32: + return rand_sample_(sample_idx, sample_val, vals, temperature, topK, topP, numel, batch_size, seed); + case LLAISYS_DTYPE_BF16: + return rand_sample_(sample_idx, sample_val, vals, temperature, topK, topP, numel, batch_size, seed); + case LLAISYS_DTYPE_F16: + return rand_sample_(sample_idx, sample_val, vals, temperature, topK, topP, numel, batch_size, seed); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} diff --git a/src/ops/rand_sample/cpu/rand_sample_cpu.hpp 
b/src/ops/rand_sample/cpu/rand_sample_cpu.hpp new file mode 100644 index 00000000..59ad95e7 --- /dev/null +++ b/src/ops/rand_sample/cpu/rand_sample_cpu.hpp @@ -0,0 +1,9 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void rand_sample(std::byte *sample_idx, std::byte *sample_val, const std::byte *vals, llaisysDataType_t type, size_t numel, + const int64_t batch_size, const float temperature, const size_t topK, const float topP, const int64_t seed); +} diff --git a/src/ops/rand_sample/op.cpp b/src/ops/rand_sample/op.cpp new file mode 100644 index 00000000..891aa2dc --- /dev/null +++ b/src/ops/rand_sample/op.cpp @@ -0,0 +1,53 @@ +#include "op.hpp" + +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/rand_sample_cpu.hpp" + +namespace llaisys::ops { +void rand_sample(tensor_t sample_idx, tensor_t sample_val, tensor_t vals, float temperature, size_t topK, float topP, + int64_t seed) { + CHECK_SAME_DEVICE(sample_idx, sample_val, vals); + CHECK_SAME_DTYPE(sample_val->dtype(), vals->dtype()); + ASSERT(sample_idx->dtype() == LLAISYS_DTYPE_I64, "rand_sample: sample_idx must be I64."); + ASSERT(sample_idx->isContiguous() && sample_val->isContiguous() && vals->isContiguous(), + "rand_sample: all tensors must be contiguous."); + + size_t batch_size = 1; + size_t numel = 0; + if (vals->ndim() == 1) { + numel = vals->shape()[0]; + ASSERT(sample_idx->ndim() == 1 && sample_idx->shape()[0] == 1, "rand_sample: sample_idx shape must be (1, )."); + ASSERT(sample_val->ndim() == 1 && sample_val->shape()[0] == 1, "rand_sample: sample_val shape must be (1, )."); + } else { + ASSERT(vals->ndim() == 2, "rand_sample: vals must be 1D or 2D."); + batch_size = vals->shape()[0]; + numel = vals->shape()[1]; + ASSERT(sample_idx->ndim() == 1 && sample_idx->shape()[0] == batch_size, + "rand_sample: sample_idx shape must be (batch, )."); + ASSERT(sample_val->ndim() == 1 && sample_val->shape()[0] == batch_size, + "rand_sample: 
sample_val shape must be (batch, )."); + } + + if (vals->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::rand_sample(sample_idx->data(), sample_val->data(), vals->data(), vals->dtype(), numel, + static_cast(batch_size), temperature, topK, topP, seed); + } + + llaisys::core::context().setDevice(vals->deviceType(), vals->deviceId()); + + switch (vals->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::rand_sample(sample_idx->data(), sample_val->data(), vals->data(), vals->dtype(), numel, + static_cast(batch_size), temperature, topK, topP, seed); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + TO_BE_IMPLEMENTED(); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } +} +} // namespace llaisys::ops diff --git a/src/ops/rand_sample/op.hpp b/src/ops/rand_sample/op.hpp new file mode 100644 index 00000000..8172f04d --- /dev/null +++ b/src/ops/rand_sample/op.hpp @@ -0,0 +1,8 @@ +#pragma once + +#include "../../tensor/tensor.hpp" + +namespace llaisys::ops { +void rand_sample(tensor_t sample_idx, tensor_t sample_val, tensor_t vals, float temperature, size_t topK, float topP, + int64_t seed); +} From bd737c7020c561d8b8520465d529b9abe79acc6c Mon Sep 17 00:00:00 2001 From: leninist1 <282516536@qq.com> Date: Wed, 25 Feb 2026 22:06:47 +0800 Subject: [PATCH 09/14] complete chat server && test passed. 
--- python/llaisys/server/chat_server.py | 242 ++++++++++++++++++++ python/setup.cfg | 3 + src/ops/rand_sample/cpu/rand_sample_cpu.cpp | 12 +- 3 files changed, 253 insertions(+), 4 deletions(-) create mode 100644 python/llaisys/server/chat_server.py diff --git a/python/llaisys/server/chat_server.py b/python/llaisys/server/chat_server.py new file mode 100644 index 00000000..cbbbc5d2 --- /dev/null +++ b/python/llaisys/server/chat_server.py @@ -0,0 +1,242 @@ +import os +import time +import uuid +import json +from typing import List, Optional, Iterable + +from fastapi import FastAPI, HTTPException +from pydantic import BaseModel +from starlette.responses import StreamingResponse, JSONResponse +from transformers import AutoTokenizer + +import llaisys + +# define Message format +class ChatMessage(BaseModel): + role: str + content: str + +# define Request format +class ChatRequest(BaseModel): + model: Optional[str] = None + messages: List[ChatMessage] + max_tokens: Optional[int] = 128 + temperature: float = 1.0 + top_p: float = 0.8 + top_k: int = 50 + stream: bool = False + seed: int = 0 + + +class AppState: + tokenizer = None + model = None + model_path = None + model_name = None + device = None + + +state = AppState() +app = FastAPI() + +# map device name to llaisys device type +def llaisys_device(device_name: str): + if device_name == "cpu": + return llaisys.DeviceType.CPU + if device_name == "nvidia": + return llaisys.DeviceType.NVIDIA + raise ValueError(f"Unsupported device name: {device_name}") + +# set default values for model path, device, and model name +# if not provided +def ensure_state_loaded(): + if state.model is not None and state.tokenizer is not None: + return + model_path = os.environ.get("LLAISYS_MODEL_PATH", "./models") + if not model_path: + raise RuntimeError("LLAISYS_MODEL_PATH is required") + device = os.environ.get("LLAISYS_DEVICE", "cpu") + model_name = os.environ.get("LLAISYS_MODEL_NAME", "llaisys-qwen2") + state.model_path = model_path + 
state.device = device + state.model_name = model_name + state.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + state.model = llaisys.models.Qwen2(model_path, llaisys_device(device)) + +# build prompt from messages +def build_prompt(messages: List[ChatMessage]) -> str: + conversation = [{"role": m.role, "content": m.content} for m in messages] + return state.tokenizer.apply_chat_template( + conversation=conversation, + add_generation_prompt=True, + tokenize=False, + ) + +# decode output token id and mask prompt tokens +def decode_completion(input_ids: List[int], output_ids: List[int]) -> str: + prompt_text = state.tokenizer.decode(input_ids, skip_special_tokens=True) + full_text = state.tokenizer.decode(output_ids, skip_special_tokens=True) + return full_text[len(prompt_text):] + +# generate full completion +def generate_full( + input_ids: List[int], + max_tokens: int, + temperature: float, + top_k: int, + top_p: float, + seed: int, +): + output_ids = state.model.generate( + input_ids, + max_new_tokens=max_tokens, + top_k=top_k, + top_p=top_p, + temperature=temperature, + seed=seed, + ) + completion_text = decode_completion(input_ids, output_ids) + completion_tokens = max(0, len(output_ids) - len(input_ids)) + return completion_text, completion_tokens + +# stream mode +def generate_stream( + input_ids: List[int], + max_tokens: int, + temperature: float, + top_k: int, + top_p: float, + seed: int, +) -> Iterable[str]: + tokens = list(input_ids) + prompt_text = state.tokenizer.decode(tokens, skip_special_tokens=True) + prev_text = prompt_text + if max_tokens <= 0: + return + # generate first output token, then append it to prompt + next_token = state.model._infer(tokens, temperature, top_k, top_p, seed) + tokens.append(next_token) + text = state.tokenizer.decode(tokens, skip_special_tokens=True) + delta = text[len(prev_text):] + prev_text = text + # if we decode a non-empty delta, yield it, that's the first token + if delta: + yield delta 
+ # continue generate subsequent tokens, until max_tokens or end_token + for _ in range(max_tokens - 1): + if tokens[-1] == state.model._end_token: + break + seed += 1 + # because of kv-cache, we only need to infer the last token + next_token = state.model._infer([tokens[-1]], temperature, top_k, top_p, seed) + tokens.append(next_token) + text = state.tokenizer.decode(tokens, skip_special_tokens=True) + delta = text[len(prev_text):] + prev_text = text + if delta: + yield delta + + +def sse_chunk(payload: dict) -> str: + return "data: " + json.dumps(payload, ensure_ascii=False) + "\n\n" + + +@app.post("/v1/chat/completions") +def chat_completions(req: ChatRequest): + ensure_state_loaded() + if not req.messages: + raise HTTPException(status_code=400, detail="messages is required") + prompt = build_prompt(req.messages) + input_ids = state.tokenizer.encode(prompt) + max_tokens = req.max_tokens if req.max_tokens is not None else 128 + created = int(time.time()) + request_id = "chatcmpl-" + uuid.uuid4().hex + + if not req.stream: + completion_text, completion_tokens = generate_full( + input_ids=input_ids, + max_tokens=max_tokens, + temperature=req.temperature, + top_k=req.top_k, + top_p=req.top_p, + seed=req.seed, + ) + response = { + "id": request_id, + "object": "chat.completion", + "created": created, + "model": req.model or state.model_name, + "choices": [ + { + "index": 0, + "message": {"role": "assistant", "content": completion_text}, + "finish_reason": "stop", + } + ], + "usage": { + "prompt_tokens": len(input_ids), + "completion_tokens": completion_tokens, + "total_tokens": len(input_ids) + completion_tokens, + }, + } + return JSONResponse(response) + + def event_stream(): + yield sse_chunk( + { + "id": request_id, + "object": "chat.completion.chunk", + "created": created, + "model": req.model or state.model_name, + "choices": [{"index": 0, "delta": {"role": "assistant"}, "finish_reason": None}], + } + ) + for delta in generate_stream( + input_ids=input_ids, + 
max_tokens=max_tokens, + temperature=req.temperature, + top_k=req.top_k, + top_p=req.top_p, + seed=req.seed, + ): + yield sse_chunk( + { + "id": request_id, + "object": "chat.completion.chunk", + "created": created, + "model": req.model or state.model_name, + "choices": [{"index": 0, "delta": {"content": delta}, "finish_reason": None}], + } + ) + yield sse_chunk( + { + "id": request_id, + "object": "chat.completion.chunk", + "created": created, + "model": req.model or state.model_name, + "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}], + } + ) + yield "data: [DONE]\n\n" + + return StreamingResponse(event_stream(), media_type="text/event-stream") + + +if __name__ == "__main__": + import argparse + import uvicorn + + parser = argparse.ArgumentParser() + parser.add_argument("--host", default="0.0.0.0") + parser.add_argument("--port", default=8000, type=int) + parser.add_argument("--model_path", default=None) + parser.add_argument("--device", default="cpu") + parser.add_argument("--model_name", default="llaisys-qwen2") + args = parser.parse_args() + + if args.model_path: + os.environ["LLAISYS_MODEL_PATH"] = args.model_path + os.environ["LLAISYS_DEVICE"] = args.device + os.environ["LLAISYS_MODEL_NAME"] = args.model_name + + uvicorn.run(app, host=args.host, port=args.port) \ No newline at end of file diff --git a/python/setup.cfg b/python/setup.cfg index b35fc65f..88ff8f6e 100644 --- a/python/setup.cfg +++ b/python/setup.cfg @@ -13,6 +13,9 @@ install_requires = torch>=2.4.0 transformers accelerate + fastapi + uvicorn + sse-starlette [options.package_data] llaisys = diff --git a/src/ops/rand_sample/cpu/rand_sample_cpu.cpp b/src/ops/rand_sample/cpu/rand_sample_cpu.cpp index aa8ffb5e..9ca5e0e5 100644 --- a/src/ops/rand_sample/cpu/rand_sample_cpu.cpp +++ b/src/ops/rand_sample/cpu/rand_sample_cpu.cpp @@ -61,7 +61,8 @@ static void rand_sample_(std::byte *sample_idx, std::byte *sample_val, const std for (size_t i = 0; i < numel; ++i) { sorted_scores[i] = 
{scores[i], i}; } - std::sort(sorted_scores.begin(), sorted_scores.end(), [](const auto &a, const auto &b) { return a.first > b.first; }); + std::sort(sorted_scores.begin(), sorted_scores.end(), + [](const auto &a, const auto &b) { return a.first > b.first; }); size_t k = numel; if (topK > 0 && topK < numel) { @@ -117,11 +118,14 @@ void rand_sample(std::byte *sample_idx, std::byte *sample_val, const std::byte * const int64_t batch_size, const float temperature, const size_t topK, const float topP, const int64_t seed) { switch (type) { case LLAISYS_DTYPE_F32: - return rand_sample_(sample_idx, sample_val, vals, temperature, topK, topP, numel, batch_size, seed); + return rand_sample_(sample_idx, sample_val, vals, + temperature, topK, topP, numel, batch_size, seed); case LLAISYS_DTYPE_BF16: - return rand_sample_(sample_idx, sample_val, vals, temperature, topK, topP, numel, batch_size, seed); + return rand_sample_(sample_idx, sample_val, vals, + temperature, topK, topP, numel, batch_size, seed); case LLAISYS_DTYPE_F16: - return rand_sample_(sample_idx, sample_val, vals, temperature, topK, topP, numel, batch_size, seed); + return rand_sample_(sample_idx, sample_val, vals, + temperature, topK, topP, numel, batch_size, seed); default: EXCEPTION_UNSUPPORTED_DATATYPE(type); } From e20e680ae0b4ae8c1906355ce7086c1eb7afae8c Mon Sep 17 00:00:00 2001 From: leninist1 <282516536@qq.com> Date: Thu, 26 Feb 2026 14:27:02 +0800 Subject: [PATCH 10/14] complete webUI for chatServer && test passed. 
--- python/llaisys/server/chat_server.py | 98 +++++--- python/llaisys/webUI/index.html | 342 +++++++++++++++++++++++++++ 2 files changed, 402 insertions(+), 38 deletions(-) create mode 100644 python/llaisys/webUI/index.html diff --git a/python/llaisys/server/chat_server.py b/python/llaisys/server/chat_server.py index cbbbc5d2..4682a0f6 100644 --- a/python/llaisys/server/chat_server.py +++ b/python/llaisys/server/chat_server.py @@ -2,11 +2,14 @@ import time import uuid import json +import threading +from pathlib import Path from typing import List, Optional, Iterable from fastapi import FastAPI, HTTPException from pydantic import BaseModel -from starlette.responses import StreamingResponse, JSONResponse +from starlette.responses import StreamingResponse, JSONResponse, FileResponse +from starlette.staticfiles import StaticFiles from transformers import AutoTokenizer import llaisys @@ -38,6 +41,13 @@ class AppState: state = AppState() app = FastAPI() +model_lock = threading.Lock() + +WEBUI_INDEX = Path(__file__).resolve().parent.parent / "webUI" / "index.html" +WEBUI_DIR = WEBUI_INDEX.parent.parent +WEBUI_ENABLED = WEBUI_INDEX.is_file() +if WEBUI_ENABLED: + app.mount("/webUI", StaticFiles(directory=WEBUI_DIR), name="webUI") # map device name to llaisys device type def llaisys_device(device_name: str): @@ -153,14 +163,15 @@ def chat_completions(req: ChatRequest): request_id = "chatcmpl-" + uuid.uuid4().hex if not req.stream: - completion_text, completion_tokens = generate_full( - input_ids=input_ids, - max_tokens=max_tokens, - temperature=req.temperature, - top_k=req.top_k, - top_p=req.top_p, - seed=req.seed, - ) + with model_lock: + completion_text, completion_tokens = generate_full( + input_ids=input_ids, + max_tokens=max_tokens, + temperature=req.temperature, + top_k=req.top_k, + top_p=req.top_p, + seed=req.seed, + ) response = { "id": request_id, "object": "chat.completion", @@ -182,46 +193,57 @@ def chat_completions(req: ChatRequest): return JSONResponse(response) 
def event_stream(): - yield sse_chunk( - { - "id": request_id, - "object": "chat.completion.chunk", - "created": created, - "model": req.model or state.model_name, - "choices": [{"index": 0, "delta": {"role": "assistant"}, "finish_reason": None}], - } - ) - for delta in generate_stream( - input_ids=input_ids, - max_tokens=max_tokens, - temperature=req.temperature, - top_k=req.top_k, - top_p=req.top_p, - seed=req.seed, - ): + model_lock.acquire() + try: + yield sse_chunk( + { + "id": request_id, + "object": "chat.completion.chunk", + "created": created, + "model": req.model or state.model_name, + "choices": [{"index": 0, "delta": {"role": "assistant"}, "finish_reason": None}], + } + ) + for delta in generate_stream( + input_ids=input_ids, + max_tokens=max_tokens, + temperature=req.temperature, + top_k=req.top_k, + top_p=req.top_p, + seed=req.seed, + ): + yield sse_chunk( + { + "id": request_id, + "object": "chat.completion.chunk", + "created": created, + "model": req.model or state.model_name, + "choices": [{"index": 0, "delta": {"content": delta}, "finish_reason": None}], + } + ) yield sse_chunk( { "id": request_id, "object": "chat.completion.chunk", "created": created, "model": req.model or state.model_name, - "choices": [{"index": 0, "delta": {"content": delta}, "finish_reason": None}], + "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}], } ) - yield sse_chunk( - { - "id": request_id, - "object": "chat.completion.chunk", - "created": created, - "model": req.model or state.model_name, - "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}], - } - ) - yield "data: [DONE]\n\n" + yield "data: [DONE]\n\n" + finally: + model_lock.release() return StreamingResponse(event_stream(), media_type="text/event-stream") +@app.get("/") +def chat_ui(): + if WEBUI_ENABLED: + return FileResponse(WEBUI_INDEX) + raise HTTPException(status_code=404, detail="Web UI not found") + + if __name__ == "__main__": import argparse import uvicorn @@ -239,4 +261,4 @@ def 
event_stream(): os.environ["LLAISYS_DEVICE"] = args.device os.environ["LLAISYS_MODEL_NAME"] = args.model_name - uvicorn.run(app, host=args.host, port=args.port) \ No newline at end of file + uvicorn.run(app, host=args.host, port=args.port) diff --git a/python/llaisys/webUI/index.html b/python/llaisys/webUI/index.html new file mode 100644 index 00000000..b8ac1a3d --- /dev/null +++ b/python/llaisys/webUI/index.html @@ -0,0 +1,342 @@ + + + + + + LLAISYS Chat UI + + + +
+ +
+
+

新对话

+
+ + + + + +
+
+
+
+ + +
+
+
+ + + From c2727b364dabe36240e5b55b0def92646e69e755 Mon Sep 17 00:00:00 2001 From: leninist1 <282516536@qq.com> Date: Fri, 27 Feb 2026 00:26:34 +0800 Subject: [PATCH 11/14] complete cuda ops. --- .gitignore | 3 +- src/device/nvidia/nvidia_runtime_api.cu | 69 +++++-- src/ops/add/nvidia/add_nvidia.cu | 54 +++++ src/ops/add/nvidia/add_nvidia.hpp | 9 + src/ops/add/op.cpp | 4 +- src/ops/argmax/nvidia/argmax_nvidia.cu | 66 ++++++ src/ops/argmax/nvidia/argmax_nvidia.hpp | 9 + src/ops/argmax/op.cpp | 4 +- src/ops/embedding/nvidia/embedding_nvidia.cu | 56 ++++++ src/ops/embedding/nvidia/embedding_nvidia.hpp | 9 + src/ops/embedding/op.cpp | 4 +- src/ops/linear/nvidia/linear_nvidia.cu | 75 +++++++ src/ops/linear/nvidia/linear_nvidia.hpp | 9 + src/ops/linear/op.cpp | 4 +- src/ops/nvidia_utils.cuh | 123 ++++++++++++ .../rand_sample/nvidia/rand_sample_nvidia.cu | 29 +++ .../rand_sample/nvidia/rand_sample_nvidia.hpp | 11 + src/ops/rand_sample/op.cpp | 5 +- src/ops/rearrange/nvidia/rearrange_nvidia.cu | 88 ++++++++ src/ops/rearrange/nvidia/rearrange_nvidia.hpp | 11 + src/ops/rearrange/op.cpp | 5 +- src/ops/rms_norm/nvidia/rms_norm_nvidia.cu | 179 +++++++++++++++++ src/ops/rms_norm/nvidia/rms_norm_nvidia.hpp | 10 + src/ops/rms_norm/op.cpp | 4 +- src/ops/rope/nvidia/rope_nvidia.cu | 188 ++++++++++++++++++ src/ops/rope/nvidia/rope_nvidia.hpp | 10 + src/ops/rope/op.cpp | 4 +- .../nvidia/self_attention_nvidia.cu | 34 ++++ .../nvidia/self_attention_nvidia.hpp | 10 + src/ops/self_attention/op.cpp | 5 +- src/ops/swiglu/nvidia/swiglu_nvidia.cu | 172 ++++++++++++++++ src/ops/swiglu/nvidia/swiglu_nvidia.hpp | 9 + src/ops/swiglu/op.cpp | 4 +- xmake.lua | 25 ++- xmake/nvidia.lua | 32 +++ 35 files changed, 1292 insertions(+), 41 deletions(-) create mode 100644 src/ops/add/nvidia/add_nvidia.cu create mode 100644 src/ops/add/nvidia/add_nvidia.hpp create mode 100644 src/ops/argmax/nvidia/argmax_nvidia.cu create mode 100644 src/ops/argmax/nvidia/argmax_nvidia.hpp create mode 100644 
src/ops/embedding/nvidia/embedding_nvidia.cu create mode 100644 src/ops/embedding/nvidia/embedding_nvidia.hpp create mode 100644 src/ops/linear/nvidia/linear_nvidia.cu create mode 100644 src/ops/linear/nvidia/linear_nvidia.hpp create mode 100644 src/ops/nvidia_utils.cuh create mode 100644 src/ops/rand_sample/nvidia/rand_sample_nvidia.cu create mode 100644 src/ops/rand_sample/nvidia/rand_sample_nvidia.hpp create mode 100644 src/ops/rearrange/nvidia/rearrange_nvidia.cu create mode 100644 src/ops/rearrange/nvidia/rearrange_nvidia.hpp create mode 100644 src/ops/rms_norm/nvidia/rms_norm_nvidia.cu create mode 100644 src/ops/rms_norm/nvidia/rms_norm_nvidia.hpp create mode 100644 src/ops/rope/nvidia/rope_nvidia.cu create mode 100644 src/ops/rope/nvidia/rope_nvidia.hpp create mode 100644 src/ops/self_attention/nvidia/self_attention_nvidia.cu create mode 100644 src/ops/self_attention/nvidia/self_attention_nvidia.hpp create mode 100644 src/ops/swiglu/nvidia/swiglu_nvidia.cu create mode 100644 src/ops/swiglu/nvidia/swiglu_nvidia.hpp create mode 100644 xmake/nvidia.lua diff --git a/.gitignore b/.gitignore index e92328ca..e9960a4d 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,8 @@ # Xmake cache .xmake/ build/ - +# md docs +*.md # Binaries bin/ lib/ diff --git a/src/device/nvidia/nvidia_runtime_api.cu b/src/device/nvidia/nvidia_runtime_api.cu index cab92826..adc39208 100644 --- a/src/device/nvidia/nvidia_runtime_api.cu +++ b/src/device/nvidia/nvidia_runtime_api.cu @@ -1,56 +1,97 @@ #include "../runtime_api.hpp" +#include #include #include +#include +#include namespace llaisys::device::nvidia { namespace runtime_api { +static void checkCuda(cudaError_t err) { + if (err != cudaSuccess) { + throw std::runtime_error(std::string("CUDA error: ") + cudaGetErrorString(err)); + } +} + +static cudaMemcpyKind toCudaMemcpyKind(llaisysMemcpyKind_t kind) { + switch (kind) { + case LLAISYS_MEMCPY_H2H: + return cudaMemcpyHostToHost; + case LLAISYS_MEMCPY_H2D: + return 
cudaMemcpyHostToDevice; + case LLAISYS_MEMCPY_D2H: + return cudaMemcpyDeviceToHost; + case LLAISYS_MEMCPY_D2D: + return cudaMemcpyDeviceToDevice; + default: + throw std::runtime_error("Unsupported memcpy kind"); + } +} + int getDeviceCount() { - TO_BE_IMPLEMENTED(); + int count = 0; + checkCuda(cudaGetDeviceCount(&count)); + return count; } -void setDevice(int) { - TO_BE_IMPLEMENTED(); +void setDevice(int device_id) { + checkCuda(cudaSetDevice(device_id)); } void deviceSynchronize() { - TO_BE_IMPLEMENTED(); + checkCuda(cudaDeviceSynchronize()); } llaisysStream_t createStream() { - TO_BE_IMPLEMENTED(); + cudaStream_t stream = nullptr; + checkCuda(cudaStreamCreate(&stream)); + return reinterpret_cast(stream); } void destroyStream(llaisysStream_t stream) { - TO_BE_IMPLEMENTED(); + if (stream == nullptr) { + return; + } + checkCuda(cudaStreamDestroy(reinterpret_cast(stream))); } void streamSynchronize(llaisysStream_t stream) { - TO_BE_IMPLEMENTED(); + checkCuda(cudaStreamSynchronize(reinterpret_cast(stream))); } void *mallocDevice(size_t size) { - TO_BE_IMPLEMENTED(); + void *ptr = nullptr; + checkCuda(cudaMalloc(&ptr, size)); + return ptr; } void freeDevice(void *ptr) { - TO_BE_IMPLEMENTED(); + if (ptr == nullptr) { + return; + } + checkCuda(cudaFree(ptr)); } void *mallocHost(size_t size) { - TO_BE_IMPLEMENTED(); + void *ptr = nullptr; + checkCuda(cudaMallocHost(&ptr, size)); + return ptr; } void freeHost(void *ptr) { - TO_BE_IMPLEMENTED(); + if (ptr == nullptr) { + return; + } + checkCuda(cudaFreeHost(ptr)); } void memcpySync(void *dst, const void *src, size_t size, llaisysMemcpyKind_t kind) { - TO_BE_IMPLEMENTED(); + checkCuda(cudaMemcpy(dst, src, size, toCudaMemcpyKind(kind))); } -void memcpyAsync(void *dst, const void *src, size_t size, llaisysMemcpyKind_t kind) { - TO_BE_IMPLEMENTED(); +void memcpyAsync(void *dst, const void *src, size_t size, llaisysMemcpyKind_t kind, llaisysStream_t stream) { + checkCuda(cudaMemcpyAsync(dst, src, size, toCudaMemcpyKind(kind), 
reinterpret_cast(stream))); } static const LlaisysRuntimeAPI RUNTIME_API = { diff --git a/src/ops/add/nvidia/add_nvidia.cu b/src/ops/add/nvidia/add_nvidia.cu new file mode 100644 index 00000000..e7307d64 --- /dev/null +++ b/src/ops/add/nvidia/add_nvidia.cu @@ -0,0 +1,54 @@ +#include "add_nvidia.hpp" + +#include "../../nvidia_utils.cuh" +#include "../../../core/llaisys_core.hpp" + +namespace llaisys::ops::nvidia { +template +__global__ void add_kernel(T *c, const T *a, const T *b, size_t numel) { + size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx >= numel) { + return; + } + if constexpr (std::is_same_v || std::is_same_v) { + float av = detail::to_float(a[idx]); + float bv = detail::to_float(b[idx]); + c[idx] = detail::from_float(av + bv); + } else { + c[idx] = a[idx] + b[idx]; + } +} + +void add(std::byte *c, const std::byte *a, const std::byte *b, llaisysDataType_t type, size_t numel) { + int threads = 256; + int blocks = static_cast((numel + threads - 1) / threads); + auto stream = reinterpret_cast(llaisys::core::context().runtime().stream()); + switch (type) { + case LLAISYS_DTYPE_F32: + add_kernel<<>>( + reinterpret_cast(c), + reinterpret_cast(a), + reinterpret_cast(b), + numel); + break; + case LLAISYS_DTYPE_BF16: + add_kernel<<>>( + reinterpret_cast(c), + reinterpret_cast(a), + reinterpret_cast(b), + numel); + break; + case LLAISYS_DTYPE_F16: + add_kernel<<>>( + reinterpret_cast(c), + reinterpret_cast(a), + reinterpret_cast(b), + numel); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + detail::checkCuda(cudaGetLastError()); + detail::checkCuda(cudaStreamSynchronize(stream)); +} +} diff --git a/src/ops/add/nvidia/add_nvidia.hpp b/src/ops/add/nvidia/add_nvidia.hpp new file mode 100644 index 00000000..2e67b497 --- /dev/null +++ b/src/ops/add/nvidia/add_nvidia.hpp @@ -0,0 +1,9 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void add(std::byte *c, const std::byte *a, const 
std::byte *b, llaisysDataType_t type, size_t numel); +} diff --git a/src/ops/add/op.cpp b/src/ops/add/op.cpp index a057330d..9b36ba8e 100644 --- a/src/ops/add/op.cpp +++ b/src/ops/add/op.cpp @@ -4,6 +4,7 @@ #include "../../utils.hpp" #include "cpu/add_cpu.hpp" +#include "nvidia/add_nvidia.hpp" namespace llaisys::ops { void add(tensor_t c, tensor_t a, tensor_t b) { @@ -25,8 +26,7 @@ void add(tensor_t c, tensor_t a, tensor_t b) { return cpu::add(c->data(), a->data(), b->data(), c->dtype(), c->numel()); #ifdef ENABLE_NVIDIA_API case LLAISYS_DEVICE_NVIDIA: - TO_BE_IMPLEMENTED(); - return; + return nvidia::add(c->data(), a->data(), b->data(), c->dtype(), c->numel()); #endif default: EXCEPTION_UNSUPPORTED_DEVICE; diff --git a/src/ops/argmax/nvidia/argmax_nvidia.cu b/src/ops/argmax/nvidia/argmax_nvidia.cu new file mode 100644 index 00000000..79a51059 --- /dev/null +++ b/src/ops/argmax/nvidia/argmax_nvidia.cu @@ -0,0 +1,66 @@ +#include "argmax_nvidia.hpp" + +#include "../../nvidia_utils.cuh" +#include "../../../core/llaisys_core.hpp" + +namespace llaisys::ops::nvidia { +template +__global__ void argmax_kernel(int64_t *max_idx, T *max_val, const T *vals, size_t numel) { + if (blockIdx.x != 0 || threadIdx.x != 0) { + return; + } + size_t max_i = 0; + if constexpr (std::is_same_v || std::is_same_v) { + float max_f = detail::to_float(vals[0]); + for (size_t i = 1; i < numel; ++i) { + float cur = detail::to_float(vals[i]); + if (cur > max_f) { + max_f = cur; + max_i = i; + } + } + *max_val = detail::from_float(max_f); + } else { + T max_v = vals[0]; + for (size_t i = 1; i < numel; ++i) { + if (vals[i] > max_v) { + max_v = vals[i]; + max_i = i; + } + } + *max_val = max_v; + } + *max_idx = static_cast(max_i); +} + +void argmax(std::byte *max_idx, std::byte *max_val, const std::byte *vals, llaisysDataType_t type, size_t numel) { + auto stream = reinterpret_cast(llaisys::core::context().runtime().stream()); + switch (type) { + case LLAISYS_DTYPE_F32: + argmax_kernel<<<1, 1, 0, 
stream>>>( + reinterpret_cast(max_idx), + reinterpret_cast(max_val), + reinterpret_cast(vals), + numel); + break; + case LLAISYS_DTYPE_BF16: + argmax_kernel<<<1, 1, 0, stream>>>( + reinterpret_cast(max_idx), + reinterpret_cast(max_val), + reinterpret_cast(vals), + numel); + break; + case LLAISYS_DTYPE_F16: + argmax_kernel<<<1, 1, 0, stream>>>( + reinterpret_cast(max_idx), + reinterpret_cast(max_val), + reinterpret_cast(vals), + numel); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + detail::checkCuda(cudaGetLastError()); + detail::checkCuda(cudaStreamSynchronize(stream)); +} +} diff --git a/src/ops/argmax/nvidia/argmax_nvidia.hpp b/src/ops/argmax/nvidia/argmax_nvidia.hpp new file mode 100644 index 00000000..054fa353 --- /dev/null +++ b/src/ops/argmax/nvidia/argmax_nvidia.hpp @@ -0,0 +1,9 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void argmax(std::byte *max_idx, std::byte *max_val, const std::byte *vals, llaisysDataType_t type, size_t numel); +} diff --git a/src/ops/argmax/op.cpp b/src/ops/argmax/op.cpp index 034bc431..3482db29 100644 --- a/src/ops/argmax/op.cpp +++ b/src/ops/argmax/op.cpp @@ -4,6 +4,7 @@ #include "../../utils.hpp" #include "cpu/argmax_cpu.hpp" +#include "nvidia/argmax_nvidia.hpp" namespace llaisys::ops { void argmax(tensor_t max_idx, tensor_t max_val, tensor_t vals) { @@ -26,8 +27,7 @@ void argmax(tensor_t max_idx, tensor_t max_val, tensor_t vals) { return cpu::argmax(max_idx->data(), max_val->data(), vals->data(), vals->dtype(), vals->numel()); #ifdef ENABLE_NVIDIA_API case LLAISYS_DEVICE_NVIDIA: - TO_BE_IMPLEMENTED(); - return; + return nvidia::argmax(max_idx->data(), max_val->data(), vals->data(), vals->dtype(), vals->numel()); #endif default: EXCEPTION_UNSUPPORTED_DEVICE; diff --git a/src/ops/embedding/nvidia/embedding_nvidia.cu b/src/ops/embedding/nvidia/embedding_nvidia.cu new file mode 100644 index 00000000..c8832472 --- /dev/null +++ 
b/src/ops/embedding/nvidia/embedding_nvidia.cu @@ -0,0 +1,56 @@ +#include "embedding_nvidia.hpp" + +#include "../../nvidia_utils.cuh" +#include "../../../core/llaisys_core.hpp" + +namespace llaisys::ops::nvidia { +template +__global__ void embedding_kernel(T *out, const int64_t *index, const T *weight, size_t index_len, size_t row_len) { + size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + size_t total = index_len * row_len; + if (idx >= total) { + return; + } + size_t i = idx / row_len; + size_t j = idx - i * row_len; + int64_t row = index[i]; + out[idx] = weight[static_cast(row) * row_len + j]; +} + +void embedding(std::byte *out, const std::byte *index, const std::byte *weight, llaisysDataType_t type, size_t index_len, size_t row_len) { + size_t total = index_len * row_len; + int threads = 256; + int blocks = static_cast((total + threads - 1) / threads); + auto stream = reinterpret_cast(llaisys::core::context().runtime().stream()); + switch (type) { + case LLAISYS_DTYPE_F32: + embedding_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(index), + reinterpret_cast(weight), + index_len, + row_len); + break; + case LLAISYS_DTYPE_BF16: + embedding_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(index), + reinterpret_cast(weight), + index_len, + row_len); + break; + case LLAISYS_DTYPE_F16: + embedding_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(index), + reinterpret_cast(weight), + index_len, + row_len); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + detail::checkCuda(cudaGetLastError()); + detail::checkCuda(cudaStreamSynchronize(stream)); +} +} diff --git a/src/ops/embedding/nvidia/embedding_nvidia.hpp b/src/ops/embedding/nvidia/embedding_nvidia.hpp new file mode 100644 index 00000000..29ce3a28 --- /dev/null +++ b/src/ops/embedding/nvidia/embedding_nvidia.hpp @@ -0,0 +1,9 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void embedding(std::byte *out, const std::byte 
*index, const std::byte *weight, llaisysDataType_t type, size_t index_len, size_t row_len); +} diff --git a/src/ops/embedding/op.cpp b/src/ops/embedding/op.cpp index 3c33c5d5..01abaac7 100644 --- a/src/ops/embedding/op.cpp +++ b/src/ops/embedding/op.cpp @@ -2,6 +2,7 @@ #include "../../core/llaisys_core.hpp" #include "../../utils.hpp" #include "cpu/embedding_cpu.hpp" +#include "nvidia/embedding_nvidia.hpp" namespace llaisys::ops { void embedding(tensor_t out, tensor_t index, tensor_t weight) { @@ -29,8 +30,7 @@ void embedding(tensor_t out, tensor_t index, tensor_t weight) { return cpu::embedding(out->data(), index->data(), weight->data(), out->dtype(), index_len, row_len); #ifdef ENABLE_NVIDIA_API case LLAISYS_DEVICE_NVIDIA: - TO_BE_IMPLEMENTED(); - return; + return nvidia::embedding(out->data(), index->data(), weight->data(), out->dtype(), index_len, row_len); #endif default: EXCEPTION_UNSUPPORTED_DEVICE; diff --git a/src/ops/linear/nvidia/linear_nvidia.cu b/src/ops/linear/nvidia/linear_nvidia.cu new file mode 100644 index 00000000..fc7066f3 --- /dev/null +++ b/src/ops/linear/nvidia/linear_nvidia.cu @@ -0,0 +1,75 @@ +#include "linear_nvidia.hpp" + +#include "../../nvidia_utils.cuh" +#include "../../../core/llaisys_core.hpp" + +namespace llaisys::ops::nvidia { +template +__global__ void linear_kernel(T *out, const T *in, const T *W, const T *bias, size_t batch, size_t in_dim, size_t out_dim) { + size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + size_t total = batch * out_dim; + if (idx >= total) { + return; + } + size_t i = idx / out_dim; + size_t j = idx - i * out_dim; + const T *in_row = in + i * in_dim; + const T *w_row = W + j * in_dim; + if constexpr (std::is_same_v || std::is_same_v) { + float acc = 0.0f; + for (size_t k = 0; k < in_dim; ++k) { + acc += detail::to_float(in_row[k]) * detail::to_float(w_row[k]); + } + if (bias) { + acc += detail::to_float(bias[j]); + } + out[idx] = detail::from_float(acc); + } else { + T acc = static_cast(0); + 
for (size_t k = 0; k < in_dim; ++k) { + acc += in_row[k] * w_row[k]; + } + if (bias) { + acc += bias[j]; + } + out[idx] = acc; + } +} + +void linear(std::byte *out, const std::byte *in, const std::byte *W, const std::byte *bias, llaisysDataType_t type, size_t batch, size_t in_dim, size_t out_dim) { + size_t total = batch * out_dim; + int threads = 256; + int blocks = static_cast((total + threads - 1) / threads); + auto stream = reinterpret_cast(llaisys::core::context().runtime().stream()); + switch (type) { + case LLAISYS_DTYPE_F32: + linear_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(W), + reinterpret_cast(bias), + batch, in_dim, out_dim); + break; + case LLAISYS_DTYPE_BF16: + linear_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(W), + reinterpret_cast(bias), + batch, in_dim, out_dim); + break; + case LLAISYS_DTYPE_F16: + linear_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(W), + reinterpret_cast(bias), + batch, in_dim, out_dim); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + detail::checkCuda(cudaGetLastError()); + detail::checkCuda(cudaStreamSynchronize(stream)); +} +} diff --git a/src/ops/linear/nvidia/linear_nvidia.hpp b/src/ops/linear/nvidia/linear_nvidia.hpp new file mode 100644 index 00000000..f31ea3fb --- /dev/null +++ b/src/ops/linear/nvidia/linear_nvidia.hpp @@ -0,0 +1,9 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void linear(std::byte *out, const std::byte *in, const std::byte *W, const std::byte *bias, llaisysDataType_t type, size_t batch, size_t in_dim, size_t out_dim); +} diff --git a/src/ops/linear/op.cpp b/src/ops/linear/op.cpp index 9585068a..0804ca50 100644 --- a/src/ops/linear/op.cpp +++ b/src/ops/linear/op.cpp @@ -2,6 +2,7 @@ #include "../../core/llaisys_core.hpp" #include "../../utils.hpp" #include "cpu/linear_cpu.hpp" +#include "nvidia/linear_nvidia.hpp" namespace llaisys::ops 
{ void linear(tensor_t out, tensor_t in, tensor_t weight, tensor_t bias) { ASSERT(out->isContiguous() && in->isContiguous() @@ -40,8 +41,7 @@ void linear(tensor_t out, tensor_t in, tensor_t weight, tensor_t bias) { weight->data(), bias_data, bias->dtype(), batch, in_dim, out_dim); #ifdef ENABLE_NVIDIA_API case LLAISYS_DEVICE_NVIDIA: - TO_BE_IMPLEMENTED(); - return; + return nvidia::linear(out->data(), in->data(), weight->data(), bias_data, out->dtype(), batch, in_dim, out_dim); #endif default: EXCEPTION_UNSUPPORTED_DEVICE; diff --git a/src/ops/nvidia_utils.cuh b/src/ops/nvidia_utils.cuh new file mode 100644 index 00000000..04ed31f2 --- /dev/null +++ b/src/ops/nvidia_utils.cuh @@ -0,0 +1,123 @@ +#pragma once + +#include "llaisys.h" +#include "../utils.hpp" +#include +#include +#include +#include +#include + +namespace llaisys::ops::nvidia::detail { + +inline void checkCuda(cudaError_t err) { + if (err != cudaSuccess) { + throw std::runtime_error(std::string("CUDA error: ") + cudaGetErrorString(err)); + } +} + +__device__ inline float f16_to_f32(llaisys::fp16_t val) { + uint16_t h = val._v; + uint32_t sign = (h & 0x8000) << 16; + int32_t exponent = (h >> 10) & 0x1F; + uint32_t mantissa = h & 0x3FF; + uint32_t f32; + if (exponent == 31) { + f32 = mantissa != 0 ? 
(sign | 0x7F800000 | (mantissa << 13)) : (sign | 0x7F800000); + } else if (exponent == 0) { + if (mantissa == 0) { + f32 = sign; + } else { + exponent = -14; + while ((mantissa & 0x400) == 0) { + mantissa <<= 1; + exponent--; + } + mantissa &= 0x3FF; + f32 = sign | ((exponent + 127) << 23) | (mantissa << 13); + } + } else { + f32 = sign | ((exponent + 127 - 15) << 23) | (mantissa << 13); + } + union { + uint32_t u; + float f; + } tmp; + tmp.u = f32; + return tmp.f; +} + +__device__ inline llaisys::fp16_t f32_to_f16(float val) { + union { + uint32_t u; + float f; + } tmp; + tmp.f = val; + uint32_t f32 = tmp.u; + uint16_t sign = (f32 >> 16) & 0x8000; + int32_t exponent = ((f32 >> 23) & 0xFF) - 127; + uint32_t mantissa = f32 & 0x7FFFFF; + if (exponent >= 16) { + if (exponent == 128 && mantissa != 0) { + return llaisys::fp16_t{static_cast(sign | 0x7E00)}; + } + return llaisys::fp16_t{static_cast(sign | 0x7C00)}; + } else if (exponent >= -14) { + return llaisys::fp16_t{static_cast(sign | ((exponent + 15) << 10) | (mantissa >> 13))}; + } else if (exponent >= -24) { + mantissa |= 0x800000; + mantissa >>= (-14 - exponent); + return llaisys::fp16_t{static_cast(sign | (mantissa >> 13))}; + } + return llaisys::fp16_t{static_cast(sign)}; +} + +__device__ inline float bf16_to_f32(llaisys::bf16_t val) { + uint32_t bits32 = static_cast(val._v) << 16; + union { + uint32_t u; + float f; + } tmp; + tmp.u = bits32; + return tmp.f; +} + +__device__ inline llaisys::bf16_t f32_to_bf16(float val) { + union { + uint32_t u; + float f; + } tmp; + tmp.f = val; + uint32_t bits32 = tmp.u; + const uint32_t rounding_bias = 0x00007FFF + ((bits32 >> 16) & 1); + uint16_t bf16_bits = static_cast((bits32 + rounding_bias) >> 16); + return llaisys::bf16_t{bf16_bits}; +} + +template +__device__ inline float to_float(T v) { + if constexpr (std::is_same_v) { + return v; + } else if constexpr (std::is_same_v) { + return f16_to_f32(v); + } else if constexpr (std::is_same_v) { + return bf16_to_f32(v); + } 
else { + return static_cast(v); + } +} + +template +__device__ inline T from_float(float v) { + if constexpr (std::is_same_v) { + return v; + } else if constexpr (std::is_same_v) { + return f32_to_f16(v); + } else if constexpr (std::is_same_v) { + return f32_to_bf16(v); + } else { + return static_cast(v); + } +} + +} diff --git a/src/ops/rand_sample/nvidia/rand_sample_nvidia.cu b/src/ops/rand_sample/nvidia/rand_sample_nvidia.cu new file mode 100644 index 00000000..b0326878 --- /dev/null +++ b/src/ops/rand_sample/nvidia/rand_sample_nvidia.cu @@ -0,0 +1,29 @@ +#include "rand_sample_nvidia.hpp" + +#include "../../../core/llaisys_core.hpp" +#include "../../../utils.hpp" +#include "../../nvidia_utils.cuh" +#include "../cpu/rand_sample_cpu.hpp" + +namespace llaisys::ops::nvidia { +void rand_sample(std::byte *sample_idx, std::byte *sample_val, const std::byte *vals, llaisysDataType_t type, size_t numel, + int64_t batch_size, float temperature, size_t topK, float topP, int64_t seed) { + auto &runtime = llaisys::core::context().runtime(); + size_t vals_bytes = numel * static_cast(batch_size) * llaisys::utils::dsize(type); + size_t idx_bytes = static_cast(batch_size) * sizeof(int64_t); + size_t val_bytes = static_cast(batch_size) * llaisys::utils::dsize(type); + + auto host_vals = static_cast(runtime.api()->malloc_host(vals_bytes)); + auto host_idx = static_cast(runtime.api()->malloc_host(idx_bytes)); + auto host_val = static_cast(runtime.api()->malloc_host(val_bytes)); + + runtime.api()->memcpy_sync(host_vals, vals, vals_bytes, LLAISYS_MEMCPY_D2H); + cpu::rand_sample(host_idx, host_val, host_vals, type, numel, batch_size, temperature, topK, topP, seed); + runtime.api()->memcpy_sync(sample_idx, host_idx, idx_bytes, LLAISYS_MEMCPY_H2D); + runtime.api()->memcpy_sync(sample_val, host_val, val_bytes, LLAISYS_MEMCPY_H2D); + + runtime.api()->free_host(host_vals); + runtime.api()->free_host(host_idx); + runtime.api()->free_host(host_val); +} +} diff --git 
a/src/ops/rand_sample/nvidia/rand_sample_nvidia.hpp b/src/ops/rand_sample/nvidia/rand_sample_nvidia.hpp new file mode 100644 index 00000000..e9c9bcab --- /dev/null +++ b/src/ops/rand_sample/nvidia/rand_sample_nvidia.hpp @@ -0,0 +1,11 @@ +#pragma once + +#include "llaisys.h" + +#include +#include + +namespace llaisys::ops::nvidia { +void rand_sample(std::byte *sample_idx, std::byte *sample_val, const std::byte *vals, llaisysDataType_t type, size_t numel, + int64_t batch_size, float temperature, size_t topK, float topP, int64_t seed); +} diff --git a/src/ops/rand_sample/op.cpp b/src/ops/rand_sample/op.cpp index 891aa2dc..8cf7bae6 100644 --- a/src/ops/rand_sample/op.cpp +++ b/src/ops/rand_sample/op.cpp @@ -4,6 +4,7 @@ #include "../../utils.hpp" #include "cpu/rand_sample_cpu.hpp" +#include "nvidia/rand_sample_nvidia.hpp" namespace llaisys::ops { void rand_sample(tensor_t sample_idx, tensor_t sample_val, tensor_t vals, float temperature, size_t topK, float topP, @@ -43,8 +44,8 @@ void rand_sample(tensor_t sample_idx, tensor_t sample_val, tensor_t vals, float static_cast(batch_size), temperature, topK, topP, seed); #ifdef ENABLE_NVIDIA_API case LLAISYS_DEVICE_NVIDIA: - TO_BE_IMPLEMENTED(); - return; + return nvidia::rand_sample(sample_idx->data(), sample_val->data(), vals->data(), vals->dtype(), numel, + static_cast(batch_size), temperature, topK, topP, seed); #endif default: EXCEPTION_UNSUPPORTED_DEVICE; diff --git a/src/ops/rearrange/nvidia/rearrange_nvidia.cu b/src/ops/rearrange/nvidia/rearrange_nvidia.cu new file mode 100644 index 00000000..2cea4da8 --- /dev/null +++ b/src/ops/rearrange/nvidia/rearrange_nvidia.cu @@ -0,0 +1,88 @@ +#include "rearrange_nvidia.hpp" + +#include "../../../core/llaisys_core.hpp" +#include "../../../utils.hpp" + +#include +#include +#include +#include +#include + +namespace llaisys::ops::nvidia { +namespace detail { +inline void checkCuda(cudaError_t err) { + if (err != cudaSuccess) { + throw std::runtime_error(std::string("CUDA error: ") + 
cudaGetErrorString(err)); + } +} +} + +template +__global__ void rearrange_kernel(T *out, const T *in, const size_t *shape, const ptrdiff_t *out_strides, const ptrdiff_t *in_strides, + size_t numel, size_t ndim) { + size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx >= numel) { + return; + } + size_t tmp = idx; + ptrdiff_t in_offset = 0; + ptrdiff_t out_offset = 0; + for (size_t d = ndim; d-- > 0;) { + size_t id = tmp % shape[d]; + tmp /= shape[d]; + in_offset += static_cast(id) * in_strides[d]; + out_offset += static_cast(id) * out_strides[d]; + } + out[out_offset] = in[in_offset]; +} + +void rearrange(std::byte *out, const std::byte *in, llaisysDataType_t type, const size_t *shape, const ptrdiff_t *out_strides, + const ptrdiff_t *in_strides, size_t numel, size_t ndim) { + auto &runtime = llaisys::core::context().runtime(); + size_t shape_bytes = ndim * sizeof(size_t); + size_t stride_bytes = ndim * sizeof(ptrdiff_t); + auto d_shape = static_cast(runtime.api()->malloc_device(shape_bytes)); + auto d_out_strides = static_cast(runtime.api()->malloc_device(stride_bytes)); + auto d_in_strides = static_cast(runtime.api()->malloc_device(stride_bytes)); + + runtime.api()->memcpy_sync(d_shape, shape, shape_bytes, LLAISYS_MEMCPY_H2D); + runtime.api()->memcpy_sync(d_out_strides, out_strides, stride_bytes, LLAISYS_MEMCPY_H2D); + runtime.api()->memcpy_sync(d_in_strides, in_strides, stride_bytes, LLAISYS_MEMCPY_H2D); + + int threads = 256; + int blocks = static_cast((numel + threads - 1) / threads); + auto stream = reinterpret_cast(runtime.stream()); + switch (type) { + case LLAISYS_DTYPE_F32: + rearrange_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + d_shape, d_out_strides, d_in_strides, numel, ndim); + break; + case LLAISYS_DTYPE_BF16: + rearrange_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + d_shape, d_out_strides, d_in_strides, numel, ndim); + break; + case LLAISYS_DTYPE_F16: + rearrange_kernel<<>>( + 
reinterpret_cast(out), + reinterpret_cast(in), + d_shape, d_out_strides, d_in_strides, numel, ndim); + break; + default: + runtime.api()->free_device(d_shape); + runtime.api()->free_device(d_out_strides); + runtime.api()->free_device(d_in_strides); + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + detail::checkCuda(cudaGetLastError()); + detail::checkCuda(cudaStreamSynchronize(stream)); + + runtime.api()->free_device(d_shape); + runtime.api()->free_device(d_out_strides); + runtime.api()->free_device(d_in_strides); +} +} diff --git a/src/ops/rearrange/nvidia/rearrange_nvidia.hpp b/src/ops/rearrange/nvidia/rearrange_nvidia.hpp new file mode 100644 index 00000000..19bdbc2d --- /dev/null +++ b/src/ops/rearrange/nvidia/rearrange_nvidia.hpp @@ -0,0 +1,11 @@ +#pragma once + +#include "llaisys.h" + +#include +#include + +namespace llaisys::ops::nvidia { +void rearrange(std::byte *out, const std::byte *in, llaisysDataType_t type, const size_t *shape, const ptrdiff_t *out_strides, + const ptrdiff_t *in_strides, size_t numel, size_t ndim); +} diff --git a/src/ops/rearrange/op.cpp b/src/ops/rearrange/op.cpp index ae1523fc..74a89df5 100644 --- a/src/ops/rearrange/op.cpp +++ b/src/ops/rearrange/op.cpp @@ -4,6 +4,7 @@ #include "../../utils.hpp" #include "cpu/rearrange_cpu.hpp" +#include "nvidia/rearrange_nvidia.hpp" namespace llaisys::ops { void rearrange(tensor_t out, tensor_t in) { @@ -30,8 +31,8 @@ void rearrange(tensor_t out, tensor_t in) { in_strides.data(), numel, ndim); #ifdef ENABLE_NVIDIA_API case LLAISYS_DEVICE_NVIDIA: - TO_BE_IMPLEMENTED(); - return; + return nvidia::rearrange(out->data(), in->data(), out->dtype(), shape.data(), out_strides.data(), + in_strides.data(), numel, ndim); #endif default: EXCEPTION_UNSUPPORTED_DEVICE; diff --git a/src/ops/rms_norm/nvidia/rms_norm_nvidia.cu b/src/ops/rms_norm/nvidia/rms_norm_nvidia.cu new file mode 100644 index 00000000..623d6aba --- /dev/null +++ b/src/ops/rms_norm/nvidia/rms_norm_nvidia.cu @@ -0,0 +1,179 @@ +#include 
"rms_norm_nvidia.hpp" + +#include "../../../core/llaisys_core.hpp" +#include "../../../utils.hpp" + +#include +#include +#include +#include +#include +#include + +namespace llaisys::ops::nvidia { +namespace detail { +inline void checkCuda(cudaError_t err) { + if (err != cudaSuccess) { + throw std::runtime_error(std::string("CUDA error: ") + cudaGetErrorString(err)); + } +} + +__device__ inline float f16_to_f32(llaisys::fp16_t val) { + uint16_t h = val._v; + uint32_t sign = (h & 0x8000) << 16; + int32_t exponent = (h >> 10) & 0x1F; + uint32_t mantissa = h & 0x3FF; + uint32_t f32; + if (exponent == 31) { + f32 = mantissa != 0 ? (sign | 0x7F800000 | (mantissa << 13)) : (sign | 0x7F800000); + } else if (exponent == 0) { + if (mantissa == 0) { + f32 = sign; + } else { + exponent = -14; + while ((mantissa & 0x400) == 0) { + mantissa <<= 1; + exponent--; + } + mantissa &= 0x3FF; + f32 = sign | ((exponent + 127) << 23) | (mantissa << 13); + } + } else { + f32 = sign | ((exponent + 127 - 15) << 23) | (mantissa << 13); + } + union { + uint32_t u; + float f; + } tmp; + tmp.u = f32; + return tmp.f; +} + +__device__ inline llaisys::fp16_t f32_to_f16(float val) { + union { + uint32_t u; + float f; + } tmp; + tmp.f = val; + uint32_t f32 = tmp.u; + uint16_t sign = (f32 >> 16) & 0x8000; + int32_t exponent = ((f32 >> 23) & 0xFF) - 127; + uint32_t mantissa = f32 & 0x7FFFFF; + if (exponent >= 16) { + if (exponent == 128 && mantissa != 0) { + return llaisys::fp16_t{static_cast(sign | 0x7E00)}; + } + return llaisys::fp16_t{static_cast(sign | 0x7C00)}; + } else if (exponent >= -14) { + return llaisys::fp16_t{static_cast(sign | ((exponent + 15) << 10) | (mantissa >> 13))}; + } else if (exponent >= -24) { + mantissa |= 0x800000; + mantissa >>= (-14 - exponent); + return llaisys::fp16_t{static_cast(sign | (mantissa >> 13))}; + } + return llaisys::fp16_t{static_cast(sign)}; +} + +__device__ inline float bf16_to_f32(llaisys::bf16_t val) { + uint32_t bits32 = static_cast(val._v) << 16; + union 
{ + uint32_t u; + float f; + } tmp; + tmp.u = bits32; + return tmp.f; +} + +__device__ inline llaisys::bf16_t f32_to_bf16(float val) { + union { + uint32_t u; + float f; + } tmp; + tmp.f = val; + uint32_t bits32 = tmp.u; + const uint32_t rounding_bias = 0x00007FFF + ((bits32 >> 16) & 1); + uint16_t bf16_bits = static_cast((bits32 + rounding_bias) >> 16); + return llaisys::bf16_t{bf16_bits}; +} + +template +__device__ inline float to_float(T v) { + if constexpr (std::is_same_v) { + return v; + } else if constexpr (std::is_same_v) { + return f16_to_f32(v); + } else if constexpr (std::is_same_v) { + return bf16_to_f32(v); + } else { + return static_cast(v); + } +} + +template +__device__ inline T from_float(float v) { + if constexpr (std::is_same_v) { + return v; + } else if constexpr (std::is_same_v) { + return f32_to_f16(v); + } else if constexpr (std::is_same_v) { + return f32_to_bf16(v); + } else { + return static_cast(v); + } +} +} + +template +__global__ void rms_norm_kernel(T *out, const T *in, const T *weight, float eps, size_t batch, size_t in_dim) { + size_t row = blockIdx.x; + if (row >= batch) { + return; + } + const T *in_row = in + row * in_dim; + T *out_row = out + row * in_dim; + float acc = 0.0f; + for (size_t j = 0; j < in_dim; ++j) { + float v = detail::to_float(in_row[j]); + acc += v * v; + } + float denom = sqrtf(acc / static_cast(in_dim) + eps); + for (size_t j = 0; j < in_dim; ++j) { + float v = detail::to_float(in_row[j]) * detail::to_float(weight[j]) / denom; + out_row[j] = detail::from_float(v); + } +} + +void rms_norm(std::byte *out, const std::byte *in, const std::byte *weight, float eps, llaisysDataType_t type, size_t batch, + size_t in_dim) { + int threads = 1; + int blocks = static_cast(batch); + auto stream = reinterpret_cast(llaisys::core::context().runtime().stream()); + switch (type) { + case LLAISYS_DTYPE_F32: + rms_norm_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(weight), + eps, batch, in_dim); + 
break; + case LLAISYS_DTYPE_BF16: + rms_norm_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(weight), + eps, batch, in_dim); + break; + case LLAISYS_DTYPE_F16: + rms_norm_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(weight), + eps, batch, in_dim); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + detail::checkCuda(cudaGetLastError()); + detail::checkCuda(cudaStreamSynchronize(stream)); +} +} diff --git a/src/ops/rms_norm/nvidia/rms_norm_nvidia.hpp b/src/ops/rms_norm/nvidia/rms_norm_nvidia.hpp new file mode 100644 index 00000000..15922384 --- /dev/null +++ b/src/ops/rms_norm/nvidia/rms_norm_nvidia.hpp @@ -0,0 +1,10 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void rms_norm(std::byte *out, const std::byte *in, const std::byte *weight, float eps, llaisysDataType_t type, size_t batch, + size_t in_dim); +} diff --git a/src/ops/rms_norm/op.cpp b/src/ops/rms_norm/op.cpp index d57c96cf..fb8cbc12 100644 --- a/src/ops/rms_norm/op.cpp +++ b/src/ops/rms_norm/op.cpp @@ -2,6 +2,7 @@ #include "../../core/llaisys_core.hpp" #include "../../utils.hpp" #include "cpu/rms_norm_cpu.hpp" +#include "nvidia/rms_norm_nvidia.hpp" namespace llaisys::ops { void rms_norm(tensor_t out, tensor_t in, tensor_t weight, float eps) { CHECK_SAME_DEVICE(out, in, weight); @@ -27,8 +28,7 @@ void rms_norm(tensor_t out, tensor_t in, tensor_t weight, float eps) { weight->data(), eps, out->dtype(), batch, in_dim); #ifdef ENABLE_NVIDIA_API case LLAISYS_DEVICE_NVIDIA: - TO_BE_IMPLEMENTED(); - return; + return nvidia::rms_norm(out->data(), in->data(), weight->data(), eps, out->dtype(), batch, in_dim); #endif default: EXCEPTION_UNSUPPORTED_DEVICE; diff --git a/src/ops/rope/nvidia/rope_nvidia.cu b/src/ops/rope/nvidia/rope_nvidia.cu new file mode 100644 index 00000000..f098db48 --- /dev/null +++ b/src/ops/rope/nvidia/rope_nvidia.cu @@ -0,0 +1,188 @@ +#include "rope_nvidia.hpp" + +#include 
"../../../core/llaisys_core.hpp" +#include "../../../utils.hpp" + +#include +#include +#include +#include +#include +#include + +namespace llaisys::ops::nvidia { +namespace detail { +inline void checkCuda(cudaError_t err) { + if (err != cudaSuccess) { + throw std::runtime_error(std::string("CUDA error: ") + cudaGetErrorString(err)); + } +} + +__device__ inline float f16_to_f32(llaisys::fp16_t val) { + uint16_t h = val._v; + uint32_t sign = (h & 0x8000) << 16; + int32_t exponent = (h >> 10) & 0x1F; + uint32_t mantissa = h & 0x3FF; + uint32_t f32; + if (exponent == 31) { + f32 = mantissa != 0 ? (sign | 0x7F800000 | (mantissa << 13)) : (sign | 0x7F800000); + } else if (exponent == 0) { + if (mantissa == 0) { + f32 = sign; + } else { + exponent = -14; + while ((mantissa & 0x400) == 0) { + mantissa <<= 1; + exponent--; + } + mantissa &= 0x3FF; + f32 = sign | ((exponent + 127) << 23) | (mantissa << 13); + } + } else { + f32 = sign | ((exponent + 127 - 15) << 23) | (mantissa << 13); + } + union { + uint32_t u; + float f; + } tmp; + tmp.u = f32; + return tmp.f; +} + +__device__ inline llaisys::fp16_t f32_to_f16(float val) { + union { + uint32_t u; + float f; + } tmp; + tmp.f = val; + uint32_t f32 = tmp.u; + uint16_t sign = (f32 >> 16) & 0x8000; + int32_t exponent = ((f32 >> 23) & 0xFF) - 127; + uint32_t mantissa = f32 & 0x7FFFFF; + if (exponent >= 16) { + if (exponent == 128 && mantissa != 0) { + return llaisys::fp16_t{static_cast(sign | 0x7E00)}; + } + return llaisys::fp16_t{static_cast(sign | 0x7C00)}; + } else if (exponent >= -14) { + return llaisys::fp16_t{static_cast(sign | ((exponent + 15) << 10) | (mantissa >> 13))}; + } else if (exponent >= -24) { + mantissa |= 0x800000; + mantissa >>= (-14 - exponent); + return llaisys::fp16_t{static_cast(sign | (mantissa >> 13))}; + } + return llaisys::fp16_t{static_cast(sign)}; +} + +__device__ inline float bf16_to_f32(llaisys::bf16_t val) { + uint32_t bits32 = static_cast(val._v) << 16; + union { + uint32_t u; + float f; + } 
tmp; + tmp.u = bits32; + return tmp.f; +} + +__device__ inline llaisys::bf16_t f32_to_bf16(float val) { + union { + uint32_t u; + float f; + } tmp; + tmp.f = val; + uint32_t bits32 = tmp.u; + const uint32_t rounding_bias = 0x00007FFF + ((bits32 >> 16) & 1); + uint16_t bf16_bits = static_cast((bits32 + rounding_bias) >> 16); + return llaisys::bf16_t{bf16_bits}; +} + +template +__device__ inline float to_float(T v) { + if constexpr (std::is_same_v) { + return v; + } else if constexpr (std::is_same_v) { + return f16_to_f32(v); + } else if constexpr (std::is_same_v) { + return bf16_to_f32(v); + } else { + return static_cast(v); + } +} + +template +__device__ inline T from_float(float v) { + if constexpr (std::is_same_v) { + return v; + } else if constexpr (std::is_same_v) { + return f32_to_f16(v); + } else if constexpr (std::is_same_v) { + return f32_to_bf16(v); + } else { + return static_cast(v); + } +} +} + +template +__global__ void rope_kernel(T *out, const T *in, const int64_t *pos_ids, float theta, size_t seq_len, size_t n_heads, + size_t head_dim) { + size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + size_t half = head_dim / 2; + size_t total = seq_len * n_heads * half; + if (idx >= total) { + return; + } + size_t tmp = idx; + size_t j = tmp % half; + tmp /= half; + size_t h = tmp % n_heads; + size_t s = tmp / n_heads; + int64_t p = pos_ids[s]; + float angle = static_cast(p) / powf(theta, 2.0f * static_cast(j) / static_cast(head_dim)); + float cosine = cosf(angle); + float sine = sinf(angle); + size_t base = (s * n_heads + h) * head_dim; + T a = in[base + j]; + T b = in[base + j + half]; + float af = detail::to_float(a); + float bf = detail::to_float(b); + out[base + j] = detail::from_float(af * cosine - bf * sine); + out[base + j + half] = detail::from_float(af * sine + bf * cosine); +} + +void rope(std::byte *out, const std::byte *in, const std::byte *pos_ids, float theta, llaisysDataType_t type, size_t seq_len, + size_t n_heads, size_t 
head_dim) { + size_t half = head_dim / 2; + size_t total = seq_len * n_heads * half; + int threads = 256; + int blocks = static_cast((total + threads - 1) / threads); + auto stream = reinterpret_cast(llaisys::core::context().runtime().stream()); + switch (type) { + case LLAISYS_DTYPE_F32: + rope_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(pos_ids), + theta, seq_len, n_heads, head_dim); + break; + case LLAISYS_DTYPE_BF16: + rope_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(pos_ids), + theta, seq_len, n_heads, head_dim); + break; + case LLAISYS_DTYPE_F16: + rope_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(pos_ids), + theta, seq_len, n_heads, head_dim); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + detail::checkCuda(cudaGetLastError()); + detail::checkCuda(cudaStreamSynchronize(stream)); +} +} diff --git a/src/ops/rope/nvidia/rope_nvidia.hpp b/src/ops/rope/nvidia/rope_nvidia.hpp new file mode 100644 index 00000000..ea72276b --- /dev/null +++ b/src/ops/rope/nvidia/rope_nvidia.hpp @@ -0,0 +1,10 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void rope(std::byte *out, const std::byte *in, const std::byte *pos_ids, float theta, llaisysDataType_t type, size_t seq_len, + size_t n_heads, size_t head_dim); +} diff --git a/src/ops/rope/op.cpp b/src/ops/rope/op.cpp index d3cab559..64250fa1 100644 --- a/src/ops/rope/op.cpp +++ b/src/ops/rope/op.cpp @@ -2,6 +2,7 @@ #include "../../core/llaisys_core.hpp" #include "../../utils.hpp" #include "cpu/rope_cpu.hpp" +#include "nvidia/rope_nvidia.hpp" namespace llaisys::ops { void rope(tensor_t out, tensor_t in, tensor_t pos_ids, float theta) { CHECK_SAME_DEVICE(out, in, pos_ids); @@ -28,8 +29,7 @@ void rope(tensor_t out, tensor_t in, tensor_t pos_ids, float theta) { pos_ids->data(), theta, out->dtype(), seq_len, n_heads, head_dim); #ifdef ENABLE_NVIDIA_API case 
LLAISYS_DEVICE_NVIDIA: - TO_BE_IMPLEMENTED(); - return; + return nvidia::rope(out->data(), in->data(), pos_ids->data(), theta, out->dtype(), seq_len, n_heads, head_dim); #endif default: EXCEPTION_UNSUPPORTED_DEVICE; diff --git a/src/ops/self_attention/nvidia/self_attention_nvidia.cu b/src/ops/self_attention/nvidia/self_attention_nvidia.cu new file mode 100644 index 00000000..aae4d79e --- /dev/null +++ b/src/ops/self_attention/nvidia/self_attention_nvidia.cu @@ -0,0 +1,34 @@ +#include "self_attention_nvidia.hpp" + +#include "../../../core/llaisys_core.hpp" +#include "../../../utils.hpp" +#include "../cpu/self_attention_cpu.hpp" + +namespace llaisys::ops::nvidia { +void self_attention(std::byte *out, const std::byte *q, const std::byte *k, const std::byte *v, float scale, + llaisysDataType_t type, size_t qlen, size_t kvlen, size_t nh, size_t nkvh, size_t hd, size_t dv) { + auto &runtime = llaisys::core::context().runtime(); + size_t elem_size = llaisys::utils::dsize(type); + size_t q_bytes = qlen * nh * hd * elem_size; + size_t k_bytes = kvlen * nkvh * hd * elem_size; + size_t v_bytes = kvlen * nkvh * dv * elem_size; + size_t out_bytes = qlen * nh * dv * elem_size; + + auto host_q = static_cast(runtime.api()->malloc_host(q_bytes)); + auto host_k = static_cast(runtime.api()->malloc_host(k_bytes)); + auto host_v = static_cast(runtime.api()->malloc_host(v_bytes)); + auto host_out = static_cast(runtime.api()->malloc_host(out_bytes)); + + runtime.api()->memcpy_sync(host_q, q, q_bytes, LLAISYS_MEMCPY_D2H); + runtime.api()->memcpy_sync(host_k, k, k_bytes, LLAISYS_MEMCPY_D2H); + runtime.api()->memcpy_sync(host_v, v, v_bytes, LLAISYS_MEMCPY_D2H); + + cpu::self_attention(host_out, host_q, host_k, host_v, scale, type, qlen, kvlen, nh, nkvh, hd, dv); + runtime.api()->memcpy_sync(out, host_out, out_bytes, LLAISYS_MEMCPY_H2D); + + runtime.api()->free_host(host_q); + runtime.api()->free_host(host_k); + runtime.api()->free_host(host_v); + runtime.api()->free_host(host_out); +} +} 
diff --git a/src/ops/self_attention/nvidia/self_attention_nvidia.hpp b/src/ops/self_attention/nvidia/self_attention_nvidia.hpp new file mode 100644 index 00000000..fe25c256 --- /dev/null +++ b/src/ops/self_attention/nvidia/self_attention_nvidia.hpp @@ -0,0 +1,10 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void self_attention(std::byte *out, const std::byte *q, const std::byte *k, const std::byte *v, float scale, + llaisysDataType_t type, size_t qlen, size_t kvlen, size_t nh, size_t nkvh, size_t hd, size_t dv); +} diff --git a/src/ops/self_attention/op.cpp b/src/ops/self_attention/op.cpp index abfe1964..cd8220d7 100644 --- a/src/ops/self_attention/op.cpp +++ b/src/ops/self_attention/op.cpp @@ -2,6 +2,7 @@ #include "../../core/llaisys_core.hpp" #include "../../utils.hpp" #include "cpu/self_attention_cpu.hpp" +#include "nvidia/self_attention_nvidia.hpp" namespace llaisys::ops { void self_attention(tensor_t attn_val, tensor_t q, tensor_t k, tensor_t v, float scale) { @@ -39,8 +40,8 @@ void self_attention(tensor_t attn_val, tensor_t q, tensor_t k, tensor_t v, float attn_val->dtype(), qlen, kvlen, nh, nkvh, hd, dv); #ifdef ENABLE_NVIDIA_API case LLAISYS_DEVICE_NVIDIA: - TO_BE_IMPLEMENTED(); - return; + return nvidia::self_attention(attn_val->data(), q->data(), k->data(), v->data(), scale, attn_val->dtype(), qlen, + kvlen, nh, nkvh, hd, dv); #endif default: EXCEPTION_UNSUPPORTED_DEVICE; diff --git a/src/ops/swiglu/nvidia/swiglu_nvidia.cu b/src/ops/swiglu/nvidia/swiglu_nvidia.cu new file mode 100644 index 00000000..e4d416e7 --- /dev/null +++ b/src/ops/swiglu/nvidia/swiglu_nvidia.cu @@ -0,0 +1,172 @@ +#include "swiglu_nvidia.hpp" + +#include "../../../core/llaisys_core.hpp" +#include "../../../utils.hpp" + +#include +#include +#include +#include +#include +#include + +namespace llaisys::ops::nvidia { +namespace detail { +inline void checkCuda(cudaError_t err) { + if (err != cudaSuccess) { + throw 
std::runtime_error(std::string("CUDA error: ") + cudaGetErrorString(err)); + } +} + +__device__ inline float f16_to_f32(llaisys::fp16_t val) { + uint16_t h = val._v; + uint32_t sign = (h & 0x8000) << 16; + int32_t exponent = (h >> 10) & 0x1F; + uint32_t mantissa = h & 0x3FF; + uint32_t f32; + if (exponent == 31) { + f32 = mantissa != 0 ? (sign | 0x7F800000 | (mantissa << 13)) : (sign | 0x7F800000); + } else if (exponent == 0) { + if (mantissa == 0) { + f32 = sign; + } else { + exponent = -14; + while ((mantissa & 0x400) == 0) { + mantissa <<= 1; + exponent--; + } + mantissa &= 0x3FF; + f32 = sign | ((exponent + 127) << 23) | (mantissa << 13); + } + } else { + f32 = sign | ((exponent + 127 - 15) << 23) | (mantissa << 13); + } + union { + uint32_t u; + float f; + } tmp; + tmp.u = f32; + return tmp.f; +} + +__device__ inline llaisys::fp16_t f32_to_f16(float val) { + union { + uint32_t u; + float f; + } tmp; + tmp.f = val; + uint32_t f32 = tmp.u; + uint16_t sign = (f32 >> 16) & 0x8000; + int32_t exponent = ((f32 >> 23) & 0xFF) - 127; + uint32_t mantissa = f32 & 0x7FFFFF; + if (exponent >= 16) { + if (exponent == 128 && mantissa != 0) { + return llaisys::fp16_t{static_cast(sign | 0x7E00)}; + } + return llaisys::fp16_t{static_cast(sign | 0x7C00)}; + } else if (exponent >= -14) { + return llaisys::fp16_t{static_cast(sign | ((exponent + 15) << 10) | (mantissa >> 13))}; + } else if (exponent >= -24) { + mantissa |= 0x800000; + mantissa >>= (-14 - exponent); + return llaisys::fp16_t{static_cast(sign | (mantissa >> 13))}; + } + return llaisys::fp16_t{static_cast(sign)}; +} + +__device__ inline float bf16_to_f32(llaisys::bf16_t val) { + uint32_t bits32 = static_cast(val._v) << 16; + union { + uint32_t u; + float f; + } tmp; + tmp.u = bits32; + return tmp.f; +} + +__device__ inline llaisys::bf16_t f32_to_bf16(float val) { + union { + uint32_t u; + float f; + } tmp; + tmp.f = val; + uint32_t bits32 = tmp.u; + const uint32_t rounding_bias = 0x00007FFF + ((bits32 >> 16) & 1); + 
uint16_t bf16_bits = static_cast((bits32 + rounding_bias) >> 16); + return llaisys::bf16_t{bf16_bits}; +} + +template +__device__ inline float to_float(T v) { + if constexpr (std::is_same_v) { + return v; + } else if constexpr (std::is_same_v) { + return f16_to_f32(v); + } else if constexpr (std::is_same_v) { + return bf16_to_f32(v); + } else { + return static_cast(v); + } +} + +template +__device__ inline T from_float(float v) { + if constexpr (std::is_same_v) { + return v; + } else if constexpr (std::is_same_v) { + return f32_to_f16(v); + } else if constexpr (std::is_same_v) { + return f32_to_bf16(v); + } else { + return static_cast(v); + } +} +} + +template +__global__ void swiglu_kernel(T *out, const T *gate, const T *up, size_t seq_len, size_t inter_size) { + size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + size_t total = seq_len * inter_size; + if (idx >= total) { + return; + } + float gate_v = detail::to_float(gate[idx]); + float up_v = detail::to_float(up[idx]); + float out_v = up_v * gate_v / (1.0f + expf(-gate_v)); + out[idx] = detail::from_float(out_v); +} + +void swiglu(std::byte *out, const std::byte *gate, const std::byte *up, llaisysDataType_t type, size_t seq_len, size_t inter_size) { + size_t total = seq_len * inter_size; + int threads = 256; + int blocks = static_cast((total + threads - 1) / threads); + auto stream = reinterpret_cast(llaisys::core::context().runtime().stream()); + switch (type) { + case LLAISYS_DTYPE_F32: + swiglu_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(gate), + reinterpret_cast(up), + seq_len, inter_size); + break; + case LLAISYS_DTYPE_BF16: + swiglu_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(gate), + reinterpret_cast(up), + seq_len, inter_size); + break; + case LLAISYS_DTYPE_F16: + swiglu_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(gate), + reinterpret_cast(up), + seq_len, inter_size); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + 
detail::checkCuda(cudaGetLastError()); + detail::checkCuda(cudaStreamSynchronize(stream)); +} +} diff --git a/src/ops/swiglu/nvidia/swiglu_nvidia.hpp b/src/ops/swiglu/nvidia/swiglu_nvidia.hpp new file mode 100644 index 00000000..e4b9b506 --- /dev/null +++ b/src/ops/swiglu/nvidia/swiglu_nvidia.hpp @@ -0,0 +1,9 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::nvidia { +void swiglu(std::byte *out, const std::byte *gate, const std::byte *up, llaisysDataType_t type, size_t seq_len, size_t inter_size); +} diff --git a/src/ops/swiglu/op.cpp b/src/ops/swiglu/op.cpp index 5ce3c4bf..1390d779 100644 --- a/src/ops/swiglu/op.cpp +++ b/src/ops/swiglu/op.cpp @@ -2,6 +2,7 @@ #include "../../core/llaisys_core.hpp" #include "../../utils.hpp" #include "cpu/swiglu_cpu.hpp" +#include "nvidia/swiglu_nvidia.hpp" namespace llaisys::ops { void swiglu(tensor_t out, tensor_t gate, tensor_t up) { @@ -26,8 +27,7 @@ void swiglu(tensor_t out, tensor_t gate, tensor_t up) { out->dtype(), seq_len, inter_size); #ifdef ENABLE_NVIDIA_API case LLAISYS_DEVICE_NVIDIA: - TO_BE_IMPLEMENTED(); - return; + return nvidia::swiglu(out->data(), gate->data(), up->data(), out->dtype(), seq_len, inter_size); #endif default: EXCEPTION_UNSUPPORTED_DEVICE; diff --git a/xmake.lua b/xmake.lua index 1f65f7a9..3a376751 100644 --- a/xmake.lua +++ b/xmake.lua @@ -15,7 +15,6 @@ option_end() if has_config("nv-gpu") then add_defines("ENABLE_NVIDIA_API") - includes("xmake/nvidia.lua") end target("llaisys-utils") @@ -26,6 +25,10 @@ target("llaisys-utils") if not is_plat("windows") then add_cxflags("-fPIC", "-Wno-unknown-pragmas") end + if has_config("nv-gpu") then + add_cuflags("-Xcompiler=-fPIC") + add_cuflags("-rdc=true") + end add_files("src/utils/*.cpp") @@ -37,13 +40,11 @@ target("llaisys-device") set_kind("static") add_deps("llaisys-utils") add_deps("llaisys-device-cpu") - set_languages("cxx17") set_warnings("all", "error") if not is_plat("windows") then add_cxflags("-fPIC", 
"-Wno-unknown-pragmas") end - add_files("src/device/*.cpp") on_install(function (target) end) @@ -83,7 +84,6 @@ target_end() target("llaisys-ops") set_kind("static") add_deps("llaisys-ops-cpu") - set_languages("cxx17") set_warnings("all", "error") if not is_plat("windows") then @@ -103,7 +103,20 @@ target("llaisys") add_deps("llaisys-tensor") add_deps("llaisys-ops") - set_languages("cxx17") + if has_config("nv-gpu") then + set_languages("cxx17", "cuda") + add_syslinks("cudart") + add_links("cudadevrt") + set_policy("build.cuda.devlink", true) + add_cuflags("-rdc=true") + add_cuflags("-Xcompiler=-fPIC") + add_files("src/device/nvidia/*.cu") + add_files("src/device/nvidia/*.cpp") + add_files("src/ops/*/nvidia/*.cu") + add_files("src/ops/*/nvidia/*.cpp") + else + set_languages("cxx17") + end set_warnings("all", "error") add_files("src/llaisys/*.cc") set_installdir(".") @@ -119,4 +132,4 @@ target("llaisys") os.cp("lib/*.so", "python/llaisys/libllaisys/") end end) -target_end() \ No newline at end of file +target_end() diff --git a/xmake/nvidia.lua b/xmake/nvidia.lua new file mode 100644 index 00000000..cedaa090 --- /dev/null +++ b/xmake/nvidia.lua @@ -0,0 +1,32 @@ +target("llaisys-device-nvidia") + set_kind("static") + set_languages("cxx17", "cuda") + set_warnings("all", "error") + if not is_plat("windows") then + add_cxflags("-fPIC", "-Wno-unknown-pragmas") + add_cuflags("-Xcompiler=-fPIC") + end + add_cuflags("-rdc=true") + + add_files("../src/device/nvidia/*.cu") + add_files("../src/device/nvidia/*.cpp") + + on_install(function (target) end) +target_end() + +target("llaisys-ops-nvidia") + set_kind("static") + add_deps("llaisys-tensor") + set_languages("cxx17", "cuda") + set_warnings("all", "error") + if not is_plat("windows") then + add_cxflags("-fPIC", "-Wno-unknown-pragmas") + add_cuflags("-Xcompiler=-fPIC") + end + add_cuflags("-rdc=true") + + add_files("../src/ops/*/nvidia/*.cu") + add_files("../src/ops/*/nvidia/*.cpp") + + on_install(function (target) end) 
+target_end() From ca565603880d3d5dcade422c799d09294a977cd0 Mon Sep 17 00:00:00 2001 From: leninist1 <282516536@qq.com> Date: Fri, 27 Feb 2026 21:56:10 +0800 Subject: [PATCH 12/14] implement: inference using cuda. && test passed. --- python/llaisys/models/qwen2.py | 34 +- python/llaisys/server/chat_server.py | 2 +- python/llaisys/webUI/index.html | 29 +- src/llaisys/qwen2.cc | 487 +++++++++++++++++++++++++-- 4 files changed, 514 insertions(+), 38 deletions(-) diff --git a/python/llaisys/models/qwen2.py b/python/llaisys/models/qwen2.py index dcd1263b..c89cbb04 100644 --- a/python/llaisys/models/qwen2.py +++ b/python/llaisys/models/qwen2.py @@ -6,9 +6,11 @@ from ..libllaisys.models import LlaisysQwen2Meta from pathlib import Path +import os import safetensors import json import ctypes +import torch class Qwen2: @@ -36,9 +38,11 @@ def __init__(self, model_path, device: DeviceType = DeviceType.CPU): end_token = int(config["eos_token_id"]) + dtype = self._select_dtype(device) + self._dtype = dtype # construct C struct LlaisysQwen2Meta meta = LlaisysQwen2Meta( - dtype=DataType.BF16, + dtype=dtype, nlayer=nlayer, hs=hs, nh=nh, @@ -52,7 +56,6 @@ def __init__(self, model_path, device: DeviceType = DeviceType.CPU): end_token=end_token, ) - # only use cpu device_ids = (ctypes.c_int * 1)(0) # create model instance self._model = LIB_LLAISYS.llaisysQwen2ModelCreate( @@ -72,7 +75,11 @@ def __init__(self, model_path, device: DeviceType = DeviceType.CPU): if weight is None: continue # load weight to c side - arr = data_.get_tensor(name_).contiguous() #c-contiguous + arr = data_.get_tensor(name_) + torch_dtype = self._torch_dtype(self._dtype) + if arr.dtype != torch_dtype: + arr = arr.to(torch_dtype) + arr = arr.contiguous() LIB_LLAISYS.tensorLoad(weight, ctypes.c_void_p(arr.data_ptr())) @@ -126,6 +133,27 @@ def _match_weight(self, name: str): return w.mlp_down_w[layer] return None + def _select_dtype(self, device: DeviceType) -> DataType: + dtype_env = 
os.environ.get("LLAISYS_DTYPE", "").strip().lower() + if dtype_env in ("f16", "float16"): + return DataType.F16 + if dtype_env in ("f32", "float32"): + return DataType.F32 + if dtype_env in ("bf16", "bfloat16"): + return DataType.BF16 + if device == DeviceType.NVIDIA: + return DataType.F32 + return DataType.BF16 + + def _torch_dtype(self, dtype: DataType): + if dtype == DataType.F16: + return torch.float16 + if dtype == DataType.F32: + return torch.float32 + if dtype == DataType.BF16: + return torch.bfloat16 + return torch.float32 + def _infer(self, tokens: Sequence[int], temperature: float, top_k: int, top_p: float, seed: int) -> int: # step forward infer diff --git a/python/llaisys/server/chat_server.py b/python/llaisys/server/chat_server.py index 4682a0f6..72af35ff 100644 --- a/python/llaisys/server/chat_server.py +++ b/python/llaisys/server/chat_server.py @@ -252,7 +252,7 @@ def chat_ui(): parser.add_argument("--host", default="0.0.0.0") parser.add_argument("--port", default=8000, type=int) parser.add_argument("--model_path", default=None) - parser.add_argument("--device", default="cpu") + parser.add_argument("--device", default="nvidia") parser.add_argument("--model_name", default="llaisys-qwen2") args = parser.parse_args() diff --git a/python/llaisys/webUI/index.html b/python/llaisys/webUI/index.html index b8ac1a3d..6da35694 100644 --- a/python/llaisys/webUI/index.html +++ b/python/llaisys/webUI/index.html @@ -273,6 +273,23 @@

新对话

}; } + function safeParseJson(data) { + try { + return JSON.parse(data); + } catch (err) { + const first = data.indexOf("{"); + const last = data.lastIndexOf("}"); + if (first !== -1 && last !== -1 && last > first) { + try { + return JSON.parse(data.slice(first, last + 1)); + } catch (innerErr) { + return null; + } + } + return null; + } + } + async function requestCompletion(messages, assistantIndex) { const payload = buildRequest(messages); try { @@ -283,7 +300,9 @@

新对话

body: JSON.stringify(payload) }); if (!res.ok) throw new Error("请求失败: " + res.status); - const data = await res.json(); + const text = await res.text(); + const data = safeParseJson(text); + if (!data) throw new Error("响应不是合法JSON"); const content = data.choices?.[0]?.message?.content || ""; updateMessageContent(assistantIndex, content); return; @@ -302,10 +321,11 @@

新对话

const { value, done } = await reader.read(); if (done) break; buffer += decoder.decode(value, { stream: true }); - const parts = buffer.split("\\n\\n"); + buffer = buffer.replace(/\r\n/g, "\n").replace(/\r/g, "\n"); + const parts = buffer.split("\n\n"); buffer = parts.pop(); for (const part of parts) { - const lines = part.split("\\n"); + const lines = part.split("\n"); for (const line of lines) { if (!line.startsWith("data: ")) continue; const data = line.slice(6).trim(); @@ -313,7 +333,8 @@

新对话

updateMessageContent(assistantIndex, current); return; } - const payload = JSON.parse(data); + const payload = safeParseJson(data); + if (!payload) continue; const delta = payload.choices?.[0]?.delta?.content || ""; if (delta) { current += delta; diff --git a/src/llaisys/qwen2.cc b/src/llaisys/qwen2.cc index e9d123b1..52b25db9 100644 --- a/src/llaisys/qwen2.cc +++ b/src/llaisys/qwen2.cc @@ -2,6 +2,7 @@ #include "llaisys_tensor.hpp" #include "../tensor/tensor.hpp" +#include "../core/llaisys_core.hpp" #include "../ops/add/op.hpp" #include "../ops/rand_sample/op.hpp" #include "../ops/embedding/op.hpp" @@ -14,7 +15,12 @@ #include "../utils.hpp" #include +#include #include +#include +#include +#include +#include #include // wrap c++ tensor to external handle @@ -40,8 +46,13 @@ llaisys::tensor_t make_tensor_dtype( // set tensor data to zero void zero_tensor(const llaisys::tensor_t &t) { - CHECK_ARGUMENT(t->deviceType() == LLAISYS_DEVICE_CPU, "Zero: only CPU is supported."); - std::memset(t->data(), 0, t->numel() * t->elementSize()); + size_t size = t->numel() * t->elementSize(); + if (t->deviceType() == LLAISYS_DEVICE_CPU) { + std::memset(t->data(), 0, size); + return; + } + std::vector zeros(size); + t->load(zeros.data()); } // wrap c++ tensor to external handle and record to handles list @@ -53,6 +64,315 @@ llaisysTensor_t wrap_tensor( return h; } +size_t ceil_div(size_t a, size_t b) { + return (a + b - 1) / b; +} + +size_t read_env_size_t(const char *name, size_t default_value) { + const char *value = std::getenv(name); + if (!value) { + return default_value; + } + char *end = nullptr; + unsigned long long parsed = std::strtoull(value, &end, 10); + if (end == value || *end != '\0') { + return default_value; + } + return static_cast(parsed); +} + +uint64_t hash_mix(uint64_t h, int64_t v, uint64_t base) { + return h * base + (static_cast(v) + 0x9e3779b97f4a7c15ull); +} + +struct KVPagePoolLayer { + std::vector k_pages; + std::vector v_pages; + std::vector refcnt; + 
std::vector last_access; + std::vector free_ids; + std::vector is_free; +}; + +class KVCachePool { +public: + void init(const LlaisysQwen2Meta &meta, llaisysDeviceType_t device, int device_id, size_t page_len, size_t max_pages) { + meta_ = meta; + device_ = device; + device_id_ = device_id; + page_len_ = page_len; + max_pages_ = max_pages; + access_clock_ = 1; + layers_.assign(meta.nlayer, KVPagePoolLayer{}); + } + + size_t page_len() const { + return page_len_; + } + + size_t acquire_page(size_t layer) { + auto &pool = layers_[layer]; + while (!pool.free_ids.empty()) { + size_t id = pool.free_ids.back(); + pool.free_ids.pop_back(); + if (pool.is_free[id] && pool.refcnt[id] == 0) { + pool.is_free[id] = 0; + pool.last_access[id] = access_clock_++; + return id; + } + } + + if (pool.k_pages.size() < max_pages_) { + size_t id = pool.k_pages.size(); + pool.k_pages.push_back(make_tensor(meta_, device_, device_id_, {page_len_, meta_.nkvh, meta_.dh})); + pool.v_pages.push_back(make_tensor(meta_, device_, device_id_, {page_len_, meta_.nkvh, meta_.dh})); + pool.refcnt.push_back(0); + pool.last_access.push_back(access_clock_++); + pool.is_free.push_back(0); + return id; + } + + size_t selected = std::numeric_limits::max(); + uint64_t best_access = std::numeric_limits::max(); + for (size_t i = 0; i < pool.k_pages.size(); ++i) { + if (pool.refcnt[i] == 0 && pool.last_access[i] <= best_access) { + best_access = pool.last_access[i]; + selected = i; + } + } + CHECK_ARGUMENT(selected != std::numeric_limits::max(), "KV cache pool has no free page."); + pool.is_free[selected] = 0; + pool.last_access[selected] = access_clock_++; + return selected; + } + + void incref(size_t layer, size_t page_id) { + auto &pool = layers_[layer]; + if (pool.is_free[page_id]) { + pool.is_free[page_id] = 0; + } + pool.refcnt[page_id] += 1; + pool.last_access[page_id] = access_clock_++; + } + + void decref(size_t layer, size_t page_id) { + auto &pool = layers_[layer]; + if (pool.refcnt[page_id] > 0) { + 
pool.refcnt[page_id] -= 1; + if (pool.refcnt[page_id] == 0 && !pool.is_free[page_id]) { + pool.is_free[page_id] = 1; + pool.free_ids.push_back(page_id); + } + } + pool.last_access[page_id] = access_clock_++; + } + + llaisys::tensor_t k_page(size_t layer, size_t page_id) { + auto &pool = layers_[layer]; + pool.last_access[page_id] = access_clock_++; + return pool.k_pages[page_id]; + } + + llaisys::tensor_t v_page(size_t layer, size_t page_id) { + auto &pool = layers_[layer]; + pool.last_access[page_id] = access_clock_++; + return pool.v_pages[page_id]; + } + +private: + LlaisysQwen2Meta meta_; + llaisysDeviceType_t device_; + int device_id_; + size_t page_len_; + size_t max_pages_; + uint64_t access_clock_ = 1; + std::vector layers_; +}; + +struct KVHandle { + std::vector> layer_pages; + size_t token_count = 0; + uint64_t last_access = 0; + std::vector tokens; + std::vector hash_keys; +}; + +class PrefixCacheIndex { +public: + void init(size_t max_handles) { + max_handles_ = max_handles; + access_clock_ = 1; + handles_.clear(); + alive_.clear(); + key_to_handles_.clear(); + free_handle_ids_.clear(); + } + + size_t find_longest_prefix(const int64_t *tokens, size_t ntoken, const KVCachePool &pool, KVHandle &out_handle) { + if (ntoken == 0) { + return 0; + } + auto hashes = prefix_hashes(tokens, ntoken); + for (size_t len = ntoken; len > 0; --len) { + uint64_t key = make_key(hashes[len], len); + auto it = key_to_handles_.find(key); + if (it == key_to_handles_.end()) { + continue; + } + for (size_t handle_id : it->second) { + if (handle_id >= handles_.size() || !alive_[handle_id]) { + continue; + } + const auto &h = handles_[handle_id]; + if (h.tokens.size() < len) { + continue; + } + if (!std::equal(tokens, tokens + len, h.tokens.begin())) { + continue; + } + size_t pages_needed = ceil_div(len, pool.page_len()); + out_handle.layer_pages.assign(h.layer_pages.size(), {}); + for (size_t i = 0; i < h.layer_pages.size(); ++i) { + 
out_handle.layer_pages[i].assign(h.layer_pages[i].begin(), h.layer_pages[i].begin() + pages_needed); + } + out_handle.token_count = len; + out_handle.tokens.assign(tokens, tokens + len); + out_handle.last_access = access_clock_++; + handles_[handle_id].last_access = out_handle.last_access; + return len; + } + } + return 0; + } + + void insert_handle(const KVHandle &handle, const int64_t *tokens, size_t ntoken, KVCachePool &pool) { + if (ntoken == 0) { + return; + } + size_t handle_id = acquire_handle_id(); + if (handle_id >= handles_.size()) { + handles_.resize(handle_id + 1); + alive_.resize(handle_id + 1, 0); + } + if (alive_[handle_id]) { + release_handle(handle_id, pool); + } + KVHandle stored; + stored.token_count = ntoken; + stored.tokens.assign(tokens, tokens + ntoken); + stored.layer_pages = handle.layer_pages; + size_t pages_needed = ceil_div(ntoken, pool.page_len()); + for (size_t i = 0; i < stored.layer_pages.size(); ++i) { + if (stored.layer_pages[i].size() > pages_needed) { + stored.layer_pages[i].resize(pages_needed); + } + for (size_t page_id : stored.layer_pages[i]) { + pool.incref(i, page_id); + } + } + auto hashes = prefix_hashes(tokens, ntoken); + stored.hash_keys.reserve(ntoken); + for (size_t len = 1; len <= ntoken; ++len) { + uint64_t key = make_key(hashes[len], len); + key_to_handles_[key].push_back(handle_id); + stored.hash_keys.push_back(key); + } + stored.last_access = access_clock_++; + handles_[handle_id] = std::move(stored); + alive_[handle_id] = 1; + enforce_capacity(pool); + } + + void release_handle(size_t handle_id, KVCachePool &pool) { + if (handle_id >= handles_.size() || !alive_[handle_id]) { + return; + } + auto &h = handles_[handle_id]; + for (size_t i = 0; i < h.layer_pages.size(); ++i) { + for (size_t page_id : h.layer_pages[i]) { + pool.decref(i, page_id); + } + } + for (uint64_t key : h.hash_keys) { + auto it = key_to_handles_.find(key); + if (it == key_to_handles_.end()) { + continue; + } + auto &vec = it->second; + 
vec.erase(std::remove(vec.begin(), vec.end(), handle_id), vec.end()); + if (vec.empty()) { + key_to_handles_.erase(it); + } + } + h.layer_pages.clear(); + h.tokens.clear(); + h.hash_keys.clear(); + alive_[handle_id] = 0; + free_handle_ids_.push_back(handle_id); + } + +private: + uint64_t make_key(uint64_t hash, size_t len) const { + return hash ^ (salt_ * static_cast(len)); + } + + std::vector prefix_hashes(const int64_t *tokens, size_t ntoken) const { + std::vector hashes(ntoken + 1); + uint64_t h = 0; + for (size_t i = 0; i < ntoken; ++i) { + h = hash_mix(h, tokens[i], base_); + hashes[i + 1] = h; + } + return hashes; + } + + size_t acquire_handle_id() { + if (!free_handle_ids_.empty()) { + size_t id = free_handle_ids_.back(); + free_handle_ids_.pop_back(); + return id; + } + return handles_.size(); + } + + void enforce_capacity(KVCachePool &pool) { + if (max_handles_ == 0) { + return; + } + size_t alive_count = 0; + for (uint8_t v : alive_) { + alive_count += v; + } + while (alive_count > max_handles_) { + size_t oldest = std::numeric_limits::max(); + uint64_t oldest_access = std::numeric_limits::max(); + for (size_t i = 0; i < handles_.size(); ++i) { + if (!alive_[i]) { + continue; + } + if (handles_[i].last_access <= oldest_access) { + oldest_access = handles_[i].last_access; + oldest = i; + } + } + if (oldest == std::numeric_limits::max()) { + break; + } + release_handle(oldest, pool); + alive_count -= 1; + } + } + + uint64_t base_ = 1469598103934665603ull; + uint64_t salt_ = 1099511628211ull; + size_t max_handles_ = 64; + uint64_t access_clock_ = 1; + std::unordered_map> key_to_handles_; + std::vector handles_; + std::vector alive_; + std::vector free_handle_ids_; +}; + } // Qwen2 Model Instance Structure @@ -76,12 +396,28 @@ struct LlaisysQwen2Model { llaisys::tensor_t out_bias; // KV cache - std::vector k_cache; - std::vector v_cache; - + KVCachePool kv_pool; + PrefixCacheIndex prefix_index; + KVHandle active_handle; + bool active_valid; size_t cache_len; 
}; +static void release_active_handle(LlaisysQwen2Model *model) { + if (!model || !model->active_valid) { + return; + } + for (size_t i = 0; i < model->active_handle.layer_pages.size(); ++i) { + for (size_t page_id : model->active_handle.layer_pages[i]) { + model->kv_pool.decref(i, page_id); + } + } + model->active_handle.layer_pages.assign(model->meta.nlayer, {}); + model->active_handle.tokens.clear(); + model->active_handle.token_count = 0; + model->active_valid = false; +} + // init model weights, allocate memory and zero bias static void init_weights(LlaisysQwen2Model *model) { const auto &m = model->meta; @@ -186,13 +522,15 @@ static void init_weights(LlaisysQwen2Model *model) { // initialize kv cache tensor static void init_cache(LlaisysQwen2Model *model) { const auto &m = model->meta; - model->k_cache.resize(m.nlayer); - model->v_cache.resize(m.nlayer); - for (size_t i = 0; i < m.nlayer; ++i) { - // cache shape: [maxseq, num_heads, head_dim] - model->k_cache[i] = make_tensor(m, model->device, model->device_id, {m.maxseq, m.nkvh, m.dh}); - model->v_cache[i] = make_tensor(m, model->device, model->device_id, {m.maxseq, m.nkvh, m.dh}); - } + size_t page_len = read_env_size_t("LLAISYS_KV_PAGE_LEN", 128); + size_t max_pages = read_env_size_t("LLAISYS_KV_MAX_PAGES", ceil_div(m.maxseq, page_len)); + size_t max_handles = read_env_size_t("LLAISYS_KV_MAX_HANDLES", 64); + model->kv_pool.init(m, model->device, model->device_id, page_len, max_pages); + model->prefix_index.init(max_handles); + model->active_handle.layer_pages.assign(m.nlayer, {}); + model->active_handle.tokens.clear(); + model->active_handle.token_count = 0; + model->active_valid = false; model->cache_len = 0; } @@ -201,18 +539,44 @@ static int64_t infer_impl(LlaisysQwen2Model *model, const int64_t *token_ids, si size_t topK, float topP, int64_t seed) { CHECK_ARGUMENT(model != nullptr, "model is null"); CHECK_ARGUMENT(token_ids != nullptr || ntoken == 0, "token_ids is null"); - CHECK_ARGUMENT(model->device 
== LLAISYS_DEVICE_CPU, "Only CPU device is supported."); + CHECK_ARGUMENT(model->device == LLAISYS_DEVICE_CPU || model->device == LLAISYS_DEVICE_NVIDIA, + "Unsupported device type."); if (ntoken == 0) { return model->meta.end_token; } - // prefill - if (ntoken > 1 || model->cache_len == 0) { - model->cache_len = 0; + size_t reuse_len = 0; + if (ntoken > 1) { + release_active_handle(model); + reuse_len = model->prefix_index.find_longest_prefix(token_ids, ntoken, model->kv_pool, model->active_handle); + if (reuse_len > 0) { + model->active_valid = true; + for (size_t i = 0; i < model->active_handle.layer_pages.size(); ++i) { + for (size_t page_id : model->active_handle.layer_pages[i]) { + model->kv_pool.incref(i, page_id); + } + } + } + if (reuse_len >= ntoken) { + reuse_len = ntoken - 1; + } + if (model->active_valid) { + size_t max_pages = ceil_div(reuse_len, model->kv_pool.page_len()); + for (size_t i = 0; i < model->active_handle.layer_pages.size(); ++i) { + while (model->active_handle.layer_pages[i].size() > max_pages) { + size_t page_id = model->active_handle.layer_pages[i].back(); + model->active_handle.layer_pages[i].pop_back(); + model->kv_pool.decref(i, page_id); + } + } + model->active_handle.token_count = reuse_len; + model->active_handle.tokens.assign(token_ids, token_ids + reuse_len); + } + model->cache_len = reuse_len; } - size_t seqlen = ntoken; + size_t seqlen = ntoken - reuse_len; size_t pos_offset = model->cache_len; // position ID [pos_offset, pos_offset + seqlen) @@ -223,7 +587,7 @@ static int64_t infer_impl(LlaisysQwen2Model *model, const int64_t *token_ids, si // input token and position ID tensors auto input_ids = make_tensor_dtype(LLAISYS_DTYPE_I64, model->device, model->device_id, {seqlen}); - input_ids->load(token_ids); + input_ids->load(token_ids + reuse_len); auto pos_tensor = make_tensor_dtype(LLAISYS_DTYPE_I64, model->device, model->device_id, {seqlen}); pos_tensor->load(pos_ids.data()); @@ -235,6 +599,21 @@ static int64_t 
infer_impl(LlaisysQwen2Model *model, const int64_t *token_ids, si // attn scale factor float scale = 1.0f / std::sqrt(static_cast(model->meta.dh)); + size_t total_len = model->cache_len + seqlen; + size_t page_len = model->kv_pool.page_len(); + size_t pages_needed = ceil_div(total_len, page_len); + if (model->active_handle.layer_pages.size() != model->meta.nlayer) { + model->active_handle.layer_pages.assign(model->meta.nlayer, {}); + } + for (size_t i = 0; i < model->meta.nlayer; ++i) { + while (model->active_handle.layer_pages[i].size() < pages_needed) { + size_t page_id = model->kv_pool.acquire_page(i); + model->kv_pool.incref(i, page_id); + model->active_handle.layer_pages[i].push_back(page_id); + model->active_valid = true; + } + } + // layer forward for (size_t i = 0; i < model->meta.nlayer; ++i) { // attn input norm @@ -262,17 +641,43 @@ static int64_t infer_impl(LlaisysQwen2Model *model, const int64_t *token_ids, si llaisys::ops::rope(q_rope, q_view, pos_tensor, model->meta.theta); llaisys::ops::rope(k_rope, k_view, pos_tensor, model->meta.theta); - // write new k/v to cache - auto k_cache_slice = model->k_cache[i]->slice(0, model->cache_len, model->cache_len + seqlen); - auto v_cache_slice = model->v_cache[i]->slice(0, model->cache_len, model->cache_len + seqlen); - - llaisys::ops::rearrange(k_cache_slice, k_rope); - llaisys::ops::rearrange(v_cache_slice, v_view); - - // get all history k/v - size_t total_len = model->cache_len + seqlen; - auto k_total = model->k_cache[i]->slice(0, 0, total_len); - auto v_total = model->v_cache[i]->slice(0, 0, total_len); + size_t write_offset = model->cache_len; + size_t remaining = seqlen; + size_t src_offset = 0; + while (remaining > 0) { + size_t page_index = write_offset / page_len; + size_t page_offset = write_offset % page_len; + size_t chunk = std::min(remaining, page_len - page_offset); + size_t page_id = model->active_handle.layer_pages[i][page_index]; + auto k_page = model->kv_pool.k_page(i, page_id); + auto 
v_page = model->kv_pool.v_page(i, page_id); + auto k_page_slice = k_page->slice(0, page_offset, page_offset + chunk); + auto v_page_slice = v_page->slice(0, page_offset, page_offset + chunk); + auto k_chunk = k_rope->slice(0, src_offset, src_offset + chunk); + auto v_chunk = v_view->slice(0, src_offset, src_offset + chunk); + llaisys::ops::rearrange(k_page_slice, k_chunk); + llaisys::ops::rearrange(v_page_slice, v_chunk); + write_offset += chunk; + src_offset += chunk; + remaining -= chunk; + } + + auto k_total = make_tensor(model->meta, model->device, model->device_id, {total_len, model->meta.nkvh, model->meta.dh}); + auto v_total = make_tensor(model->meta, model->device, model->device_id, {total_len, model->meta.nkvh, model->meta.dh}); + size_t read_offset = 0; + for (size_t page_index = 0; page_index < pages_needed; ++page_index) { + size_t chunk = std::min(page_len, total_len - read_offset); + size_t page_id = model->active_handle.layer_pages[i][page_index]; + auto k_page = model->kv_pool.k_page(i, page_id); + auto v_page = model->kv_pool.v_page(i, page_id); + auto k_page_slice = k_page->slice(0, 0, chunk); + auto v_page_slice = v_page->slice(0, 0, chunk); + auto k_total_slice = k_total->slice(0, read_offset, read_offset + chunk); + auto v_total_slice = v_total->slice(0, read_offset, read_offset + chunk); + llaisys::ops::rearrange(k_total_slice, k_page_slice); + llaisys::ops::rearrange(v_total_slice, v_page_slice); + read_offset += chunk; + } // self attn auto attn = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.nh, model->meta.dh}); @@ -315,6 +720,18 @@ static int64_t infer_impl(LlaisysQwen2Model *model, const int64_t *token_ids, si // update cache len model->cache_len += seqlen; + model->active_handle.token_count = model->cache_len; + if (model->active_valid) { + if (model->active_handle.tokens.size() < model->cache_len) { + model->active_handle.tokens.insert( + model->active_handle.tokens.end(), + token_ids + reuse_len, + 
token_ids + reuse_len + seqlen); + } + } + if (ntoken > 1 && model->active_valid) { + model->prefix_index.insert_handle(model->active_handle, token_ids, ntoken, model->kv_pool); + } // final norm auto x_norm = make_tensor(model->meta, model->device, model->device_id, {seqlen, model->meta.hs}); @@ -330,7 +747,17 @@ static int64_t infer_impl(LlaisysQwen2Model *model, const int64_t *token_ids, si auto sample_val = make_tensor(model->meta, model->device, model->device_id, {1}); llaisys::ops::rand_sample(sample_idx, sample_val, last, temperature, topK, topP, seed); - return reinterpret_cast(sample_idx->data())[0]; + if (model->device == LLAISYS_DEVICE_CPU) { + return reinterpret_cast(sample_idx->data())[0]; + } + int64_t host_value = 0; + llaisys::core::context().setDevice(model->device, model->device_id); + llaisys::core::context().runtime().api()->memcpy_sync( + &host_value, + sample_idx->data(), + sizeof(int64_t), + LLAISYS_MEMCPY_D2H); + return host_value; } // C API wrapper From 101e733a11141b216da0ea76f948af14f120c8d1 Mon Sep 17 00:00:00 2001 From: leninist1 <282516536@qq.com> Date: Sun, 15 Mar 2026 11:06:05 +0800 Subject: [PATCH 13/14] implement cuda ops: rand_sample && self attention. 
--- .../rand_sample/nvidia/rand_sample_nvidia.cu | 179 ++++++++++++++++-- .../nvidia/self_attention_nvidia.cu | 151 ++++++++++++--- 2 files changed, 288 insertions(+), 42 deletions(-) diff --git a/src/ops/rand_sample/nvidia/rand_sample_nvidia.cu b/src/ops/rand_sample/nvidia/rand_sample_nvidia.cu index b0326878..cdb00f7b 100644 --- a/src/ops/rand_sample/nvidia/rand_sample_nvidia.cu +++ b/src/ops/rand_sample/nvidia/rand_sample_nvidia.cu @@ -1,29 +1,172 @@ #include "rand_sample_nvidia.hpp" #include "../../../core/llaisys_core.hpp" -#include "../../../utils.hpp" #include "../../nvidia_utils.cuh" -#include "../cpu/rand_sample_cpu.hpp" + +#include +#include +#include +#include namespace llaisys::ops::nvidia { -void rand_sample(std::byte *sample_idx, std::byte *sample_val, const std::byte *vals, llaisysDataType_t type, size_t numel, - int64_t batch_size, float temperature, size_t topK, float topP, int64_t seed) { - auto &runtime = llaisys::core::context().runtime(); - size_t vals_bytes = numel * static_cast(batch_size) * llaisys::utils::dsize(type); - size_t idx_bytes = static_cast(batch_size) * sizeof(int64_t); - size_t val_bytes = static_cast(batch_size) * llaisys::utils::dsize(type); +__device__ inline uint64_t lcg_next(uint64_t &state) { + state = state * 6364136223846793005ULL + 1ULL; + return state; +} - auto host_vals = static_cast(runtime.api()->malloc_host(vals_bytes)); - auto host_idx = static_cast(runtime.api()->malloc_host(idx_bytes)); - auto host_val = static_cast(runtime.api()->malloc_host(val_bytes)); +__device__ inline float rng_uniform(uint64_t &state) { + uint64_t x = lcg_next(state); + uint32_t mant = static_cast(x >> 40); + return static_cast(mant) * (1.0f / 16777216.0f); +} - runtime.api()->memcpy_sync(host_vals, vals, vals_bytes, LLAISYS_MEMCPY_D2H); - cpu::rand_sample(host_idx, host_val, host_vals, type, numel, batch_size, temperature, topK, topP, seed); - runtime.api()->memcpy_sync(sample_idx, host_idx, idx_bytes, LLAISYS_MEMCPY_H2D); - 
runtime.api()->memcpy_sync(sample_val, host_val, val_bytes, LLAISYS_MEMCPY_H2D); +template +__global__ void rand_sample_kernel(int64_t *out_idx, T *out_val, const T *vals, size_t numel, int64_t batch_size, + float temperature, size_t topK, float topP, int64_t seed, float *probs, int *idx) { + int64_t b = static_cast(blockIdx.x); + if (b >= batch_size) { + return; + } + if (threadIdx.x != 0) { + return; + } + const T *v = vals + static_cast(b) * numel; + float *p = probs + static_cast(b) * numel; + int *id = idx + static_cast(b) * numel; + float temp = temperature <= 1e-6f ? 1e-6f : temperature; + size_t max_i = 0; + float max_v = detail::to_float(v[0]); + for (size_t i = 1; i < numel; ++i) { + float cur = detail::to_float(v[i]); + if (cur > max_v) { + max_v = cur; + max_i = i; + } + } + float sum = 0.0f; + for (size_t i = 0; i < numel; ++i) { + float cur = detail::to_float(v[i]); + float val = expf((cur - max_v) / temp); + p[i] = val; + sum += val; + id[i] = static_cast(i); + } + if (sum <= 0.0f) { + for (size_t i = 0; i < numel; ++i) { + p[i] = 0.0f; + } + p[max_i] = 1.0f; + sum = 1.0f; + } else { + float inv = 1.0f / sum; + for (size_t i = 0; i < numel; ++i) { + p[i] *= inv; + } + } + size_t k = numel; + if (topK > 0 && topK < numel) { + k = topK; + } + if ((topK > 0 && topK < numel) || (topP > 0.0f && topP < 1.0f)) { + for (size_t i = 0; i < k; ++i) { + size_t max_pos = i; + float max_val = p[i]; + for (size_t j = i + 1; j < numel; ++j) { + float vj = p[j]; + if (vj > max_val) { + max_val = vj; + max_pos = j; + } + } + if (max_pos != i) { + float tmp = p[i]; + p[i] = p[max_pos]; + p[max_pos] = tmp; + int tmp_i = id[i]; + id[i] = id[max_pos]; + id[max_pos] = tmp_i; + } + } + } + size_t cand = k; + if (topP > 0.0f && topP < 1.0f) { + float cum = 0.0f; + cand = 0; + for (size_t i = 0; i < k; ++i) { + cum += p[i]; + cand = i + 1; + if (cum >= topP) { + break; + } + } + if (cand == 0) { + cand = 1; + } + } + float cand_sum = 0.0f; + for (size_t i = 0; i < cand; ++i) 
{ + cand_sum += p[i]; + } + size_t chosen = static_cast(id[0]); + if (cand_sum > 0.0f) { + uint64_t state = static_cast(seed) ^ (static_cast(b + 1) * 0x9e3779b97f4a7c15ULL); + float r = rng_uniform(state) * cand_sum; + for (size_t i = 0; i < cand; ++i) { + r -= p[i]; + if (r <= 0.0f) { + chosen = static_cast(id[i]); + break; + } + if (i == cand - 1) { + chosen = static_cast(id[i]); + } + } + } + out_idx[b] = static_cast(chosen); + out_val[b] = v[chosen]; +} - runtime.api()->free_host(host_vals); - runtime.api()->free_host(host_idx); - runtime.api()->free_host(host_val); +void rand_sample(std::byte *sample_idx, std::byte *sample_val, const std::byte *vals, llaisysDataType_t type, size_t numel, + int64_t batch_size, float temperature, size_t topK, float topP, int64_t seed) { + auto &runtime = llaisys::core::context().runtime(); + if (batch_size <= 0 || numel == 0) { + return; + } + auto d_probs = static_cast(runtime.api()->malloc_device(sizeof(float) * static_cast(batch_size) * numel)); + auto d_idx = static_cast(runtime.api()->malloc_device(sizeof(int) * static_cast(batch_size) * numel)); + dim3 grid(static_cast(batch_size), 1, 1); + int threads = 1; + auto stream = reinterpret_cast(runtime.stream()); + switch (type) { + case LLAISYS_DTYPE_F32: + rand_sample_kernel<<>>( + reinterpret_cast(sample_idx), + reinterpret_cast(sample_val), + reinterpret_cast(vals), + numel, batch_size, temperature, topK, topP, seed, d_probs, d_idx); + break; + case LLAISYS_DTYPE_BF16: + rand_sample_kernel<<>>( + reinterpret_cast(sample_idx), + reinterpret_cast(sample_val), + reinterpret_cast(vals), + numel, batch_size, temperature, topK, topP, seed, d_probs, d_idx); + break; + case LLAISYS_DTYPE_F16: + rand_sample_kernel<<>>( + reinterpret_cast(sample_idx), + reinterpret_cast(sample_val), + reinterpret_cast(vals), + numel, batch_size, temperature, topK, topP, seed, d_probs, d_idx); + break; + default: + runtime.api()->free_device(d_probs); + runtime.api()->free_device(d_idx); + 
EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + detail::checkCuda(cudaGetLastError()); + detail::checkCuda(cudaStreamSynchronize(stream)); + runtime.api()->free_device(d_probs); + runtime.api()->free_device(d_idx); } } diff --git a/src/ops/self_attention/nvidia/self_attention_nvidia.cu b/src/ops/self_attention/nvidia/self_attention_nvidia.cu index aae4d79e..110217ff 100644 --- a/src/ops/self_attention/nvidia/self_attention_nvidia.cu +++ b/src/ops/self_attention/nvidia/self_attention_nvidia.cu @@ -1,34 +1,137 @@ #include "self_attention_nvidia.hpp" #include "../../../core/llaisys_core.hpp" -#include "../../../utils.hpp" -#include "../cpu/self_attention_cpu.hpp" +#include "../../nvidia_utils.cuh" + +#include +#include +#include namespace llaisys::ops::nvidia { +template +__global__ void self_attention_kernel(T *out, const T *q, const T *k, const T *v, float scale, size_t qlen, size_t kvlen, + size_t nh, size_t nkvh, size_t hd, size_t dv) { + size_t i = static_cast(blockIdx.x); + size_t h = static_cast(blockIdx.y); + if (i >= qlen || h >= nh) { + return; + } + size_t kv_group = nh / nkvh; + size_t kvh = h / kv_group; + const T *q_ptr = q + (i * nh + h) * hd; + extern __shared__ float shared[]; + float *smax = shared; + float *ssum = shared + blockDim.x; + int tid = threadIdx.x; + float local_max = -INFINITY; + int64_t limit = static_cast(i) + static_cast(kvlen) - static_cast(qlen); + for (size_t j = static_cast(tid); j < kvlen; j += static_cast(blockDim.x)) { + if (static_cast(j) > limit) { + continue; + } + const T *k_ptr = k + (j * nkvh + kvh) * hd; + float acc = 0.0f; + for (size_t d = 0; d < hd; ++d) { + acc += detail::to_float(q_ptr[d]) * detail::to_float(k_ptr[d]); + } + acc *= scale; + if (acc > local_max) { + local_max = acc; + } + } + smax[tid] = local_max; + __syncthreads(); + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < static_cast(s)) { + float other = smax[tid + s]; + if (other > smax[tid]) { + smax[tid] = other; + } + } + 
__syncthreads(); + } + float max_score = smax[0]; + float local_sum = 0.0f; + for (size_t j = static_cast(tid); j < kvlen; j += static_cast(blockDim.x)) { + if (static_cast(j) > limit) { + continue; + } + const T *k_ptr = k + (j * nkvh + kvh) * hd; + float acc = 0.0f; + for (size_t d = 0; d < hd; ++d) { + acc += detail::to_float(q_ptr[d]) * detail::to_float(k_ptr[d]); + } + acc *= scale; + local_sum += expf(acc - max_score); + } + ssum[tid] = local_sum; + __syncthreads(); + for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < static_cast(s)) { + ssum[tid] += ssum[tid + s]; + } + __syncthreads(); + } + float sum_score = ssum[0]; + if (sum_score <= 0.0f) { + sum_score = 1.0f; + } + for (size_t d = static_cast(tid); d < dv; d += static_cast(blockDim.x)) { + float acc = 0.0f; + for (size_t j = 0; j < kvlen; ++j) { + if (static_cast(j) > limit) { + continue; + } + const T *k_ptr = k + (j * nkvh + kvh) * hd; + float score = 0.0f; + for (size_t kd = 0; kd < hd; ++kd) { + score += detail::to_float(q_ptr[kd]) * detail::to_float(k_ptr[kd]); + } + score *= scale; + float w = expf(score - max_score) / sum_score; + const T *v_ptr = v + (j * nkvh + kvh) * dv; + acc += w * detail::to_float(v_ptr[d]); + } + out[(i * nh + h) * dv + d] = detail::from_float(acc); + } +} + void self_attention(std::byte *out, const std::byte *q, const std::byte *k, const std::byte *v, float scale, llaisysDataType_t type, size_t qlen, size_t kvlen, size_t nh, size_t nkvh, size_t hd, size_t dv) { auto &runtime = llaisys::core::context().runtime(); - size_t elem_size = llaisys::utils::dsize(type); - size_t q_bytes = qlen * nh * hd * elem_size; - size_t k_bytes = kvlen * nkvh * hd * elem_size; - size_t v_bytes = kvlen * nkvh * dv * elem_size; - size_t out_bytes = qlen * nh * dv * elem_size; - - auto host_q = static_cast(runtime.api()->malloc_host(q_bytes)); - auto host_k = static_cast(runtime.api()->malloc_host(k_bytes)); - auto host_v = static_cast(runtime.api()->malloc_host(v_bytes)); - 
auto host_out = static_cast(runtime.api()->malloc_host(out_bytes)); - - runtime.api()->memcpy_sync(host_q, q, q_bytes, LLAISYS_MEMCPY_D2H); - runtime.api()->memcpy_sync(host_k, k, k_bytes, LLAISYS_MEMCPY_D2H); - runtime.api()->memcpy_sync(host_v, v, v_bytes, LLAISYS_MEMCPY_D2H); - - cpu::self_attention(host_out, host_q, host_k, host_v, scale, type, qlen, kvlen, nh, nkvh, hd, dv); - runtime.api()->memcpy_sync(out, host_out, out_bytes, LLAISYS_MEMCPY_H2D); - - runtime.api()->free_host(host_q); - runtime.api()->free_host(host_k); - runtime.api()->free_host(host_v); - runtime.api()->free_host(host_out); + dim3 grid(static_cast(qlen), static_cast(nh), 1); + int threads = 256; + size_t shared_bytes = static_cast(threads) * sizeof(float) * 2; + auto stream = reinterpret_cast(runtime.stream()); + switch (type) { + case LLAISYS_DTYPE_F32: + self_attention_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + scale, qlen, kvlen, nh, nkvh, hd, dv); + break; + case LLAISYS_DTYPE_BF16: + self_attention_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + scale, qlen, kvlen, nh, nkvh, hd, dv); + break; + case LLAISYS_DTYPE_F16: + self_attention_kernel<<>>( + reinterpret_cast(out), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + scale, qlen, kvlen, nh, nkvh, hd, dv); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + detail::checkCuda(cudaGetLastError()); + detail::checkCuda(cudaStreamSynchronize(stream)); } } From 4643f0a62a127dd2db85e675ce7cea093cd06136 Mon Sep 17 00:00:00 2001 From: leninist1 <282516536@qq.com> Date: Sun, 15 Mar 2026 11:24:52 +0800 Subject: [PATCH 14/14] fix: build error on windows. 
--- src/llaisys/qwen2.cc | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/llaisys/qwen2.cc b/src/llaisys/qwen2.cc index 52b25db9..286ec4ea 100644 --- a/src/llaisys/qwen2.cc +++ b/src/llaisys/qwen2.cc @@ -69,6 +69,20 @@ size_t ceil_div(size_t a, size_t b) { } size_t read_env_size_t(const char *name, size_t default_value) { +#ifdef _WIN32 + char *value = nullptr; + size_t len = 0; + if (_dupenv_s(&value, &len, name) != 0 || value == nullptr) { + return default_value; + } + char *end = nullptr; + unsigned long long parsed = std::strtoull(value, &end, 10); + std::free(value); + if (end == value || *end != '\0') { + return default_value; + } + return static_cast(parsed); +#else const char *value = std::getenv(name); if (!value) { return default_value; @@ -79,6 +93,7 @@ size_t read_env_size_t(const char *name, size_t default_value) { return default_value; } return static_cast(parsed); +#endif } uint64_t hash_mix(uint64_t h, int64_t v, uint64_t base) {