From dfe45c514410c784083009dafe24b5068a1cf9f5 Mon Sep 17 00:00:00 2001
From: your_name
Date: Mon, 16 Mar 2026 20:33:56 +0800
Subject: [PATCH] =?UTF-8?q?=E3=80=90=E8=AE=AD=E7=BB=83=E8=90=A5=E3=80=91BF?=
 =?UTF-8?q?16=20Elementwise=20Kernel=20=E4=BC=98=E5=8C=96?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Reviewer notes (free text in this section is not part of the commit).

Subject, decoded: "[Training camp] BF16 elementwise kernel optimization".

What this patch changes in elementwise.cu:
  * Re-points two includes:
      include/core/runtime/device_guard.h        -> include/core/device_guard.h
      src/core/runtime/cuda/cuda_runtime_common.h -> src/core/cuda/cuda_stream.h
  * Adds AlignedVector<T, VecSize>: a struct wrapping `T val[VecSize]` whose
    alignment is sizeof(T) * VecSize bytes. alignas only accepts power-of-two
    values, so sizeof(T) * VecSize must be a power of two.
  * Adds IsAligned(ptr, vec_size): returns true when the address held by ptr
    is a multiple of sizeof(T) * vec_size bytes.
  * Only the two helper definitions are added; no kernel in this hunk calls
    them yet.

NOTE(review): this copy of the patch had lost its line breaks and every
angle-bracketed token. The template parameter lists and the reinterpret_cast
target type below were reconstructed from how T, VecSize and Func are used;
`int VecSize`, `uintptr_t` and the position of blank context lines are
assumptions - confirm against commit dfe45c51 before applying.

 infini_train/src/kernels/cuda/elementwise.cu | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/infini_train/src/kernels/cuda/elementwise.cu b/infini_train/src/kernels/cuda/elementwise.cu
index 6b356156..d03abd24 100644
--- a/infini_train/src/kernels/cuda/elementwise.cu
+++ b/infini_train/src/kernels/cuda/elementwise.cu
@@ -4,17 +4,27 @@
 
 #include "infini_train/include/common/cuda/common_cuda.h"
 #include "infini_train/include/common/cuda/kernel_helper.cuh"
-#include "infini_train/include/core/runtime/device_guard.h"
+#include "infini_train/include/core/device_guard.h"
 #include "infini_train/include/dispatcher.h"
 #include "infini_train/include/tensor.h"
-#include "infini_train/src/core/runtime/cuda/cuda_runtime_common.h"
+#include "infini_train/src/core/cuda/cuda_stream.h"
 
 namespace infini_train::kernels::cuda {
 namespace {
 using namespace infini_train::common::cuda;
 
 constexpr int kWarpSize = 32;
 
+template <typename T, int VecSize>
+struct alignas(sizeof(T) * VecSize) AlignedVector {
+    T val[VecSize];
+};
+
+template <typename T>
+__device__ __host__ inline bool IsAligned(const T* ptr, int vec_size) {
+    return reinterpret_cast<uintptr_t>(ptr) % (sizeof(T) * vec_size) == 0;
+}
+
 template <typename T, typename Func>
 __global__ void UnaryForwardKernel(T *output, Func fn, size_t num_elements, size_t offset, const T *input) {
     size_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset;