diff --git a/infini_train/src/kernels/cuda/elementwise.cu b/infini_train/src/kernels/cuda/elementwise.cu index 6b356156..d03abd24 100644 --- a/infini_train/src/kernels/cuda/elementwise.cu +++ b/infini_train/src/kernels/cuda/elementwise.cu @@ -4,17 +4,27 @@ #include "infini_train/include/common/cuda/common_cuda.h" #include "infini_train/include/common/cuda/kernel_helper.cuh" -#include "infini_train/include/core/runtime/device_guard.h" +#include "infini_train/include/core/device_guard.h" #include "infini_train/include/dispatcher.h" #include "infini_train/include/tensor.h" -#include "infini_train/src/core/runtime/cuda/cuda_runtime_common.h" +#include "infini_train/src/core/cuda/cuda_stream.h" namespace infini_train::kernels::cuda { namespace { using namespace infini_train::common::cuda; constexpr int kWarpSize = 32; +template +struct alignas(sizeof(T) * VecSize) AlignedVector { + T val[VecSize]; +}; + +template +__device__ __host__ inline bool IsAligned(const T* ptr, int vec_size) { + return reinterpret_cast(ptr) % (sizeof(T) * vec_size) == 0; +} + template __global__ void UnaryForwardKernel(T *output, Func fn, size_t num_elements, size_t offset, const T *input) { size_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset;