Skip to content
40 changes: 40 additions & 0 deletions include/infiniop/ops/linear.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#ifndef __INFINIOP_LINEAR_API_H__
#define __INFINIOP_LINEAR_API_H__

#include "../operator_descriptor.h"

// Opaque handle for a quantized linear (GEMM epilogue) operator:
// D = alpha * dequant(X_q @ W_q) + beta * C (+ bias), where X and W are
// int8-quantized with per-channel scale and (optional) zero-point tensors.
// NOTE(review): exact dequantization math lives in the backend kernels
// (src/infiniop/ops/linear); confirm symmetric vs. asymmetric handling there.
typedef InfiniopDescriptor *infiniopLinearDescriptor_t;

// Creates a linear descriptor, validating dtypes and shapes of all operands.
//   d_desc             - output tensor D (M x N)
//   c_desc             - additive input C (same shape as D, scaled by beta)
//   bias_desc          - per-output-column bias (may describe the no-bias case)
//   x_desc             - quantized activations X_q (M x K, int8)
//   x_scale_desc       - per-row activation scales
//   x_zero_desc        - per-row activation zero points (asymmetric quant)
//   weights_desc       - quantized weights W_q (int8)
//   weights_scale_desc - per-column weight scales
//   weights_zero_desc  - per-column weight zero points (asymmetric quant)
//   alpha, beta        - GEMM epilogue scalars
__C __export infiniStatus_t infiniopCreateLinearDescriptor(infiniopHandle_t handle,
infiniopLinearDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t d_desc,
infiniopTensorDescriptor_t c_desc,
infiniopTensorDescriptor_t bias_desc,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t x_scale_desc,
infiniopTensorDescriptor_t x_zero_desc,
infiniopTensorDescriptor_t weights_desc,
infiniopTensorDescriptor_t weights_scale_desc,
infiniopTensorDescriptor_t weights_zero_desc,
float alpha,
float beta);

// Reports the minimum device workspace (in bytes) required by infiniopLinear
// for this descriptor.
__C __export infiniStatus_t infiniopGetLinearWorkspaceSize(infiniopLinearDescriptor_t desc, size_t *size);

// Executes the linear op on the given stream. All data pointers are device
// pointers matching the descriptors passed at creation time; workspace must
// be at least the size reported by infiniopGetLinearWorkspaceSize.
__C __export infiniStatus_t infiniopLinear(infiniopLinearDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *d,
const void *c,
const void *bias,
const void *x,
const void *x_scale,
const void *x_zero,
const void *weights,
const void *weights_scale,
const void *weights_zero,
void *stream);

// Releases all resources owned by the descriptor.
__C __export infiniStatus_t infiniopDestroyLinearDescriptor(infiniopLinearDescriptor_t desc);

#endif
28 changes: 28 additions & 0 deletions include/infiniop/ops/quant/per_channel_quant_int8.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#ifndef __INFINIOP_PER_CHANNEL_QUANT_INT8_API_H__
#define __INFINIOP_PER_CHANNEL_QUANT_INT8_API_H__

#include "../../operator_descriptor.h"

// Opaque handle for per-channel int8 quantization: packs a floating-point
// tensor x into int8 values (x_packed) plus per-channel scale and zero-point
// tensors (x_scale, x_zero).
typedef InfiniopDescriptor *infiniopPerChannelQuantI8Descriptor_t;

// Creates a quantization descriptor, validating shapes/dtypes of the packed
// output, the per-channel scale/zero tensors, and the source tensor x.
__C __export infiniStatus_t infiniopCreatePerChannelQuantI8Descriptor(infiniopHandle_t handle,
infiniopPerChannelQuantI8Descriptor_t *desc_ptr,
infiniopTensorDescriptor_t x_packed_desc,
infiniopTensorDescriptor_t x_scale_desc,
infiniopTensorDescriptor_t x_zero_desc,
infiniopTensorDescriptor_t x_desc);

// Reports the minimum device workspace (in bytes) required by
// infiniopPerChannelQuantI8 for this descriptor.
__C __export infiniStatus_t infiniopGetPerChannelQuantI8WorkspaceSize(infiniopPerChannelQuantI8Descriptor_t desc, size_t *size);

// Quantizes x into x_packed and writes the computed per-channel scale and
// zero-point values. All data pointers are device pointers; runs on `stream`.
__C __export infiniStatus_t infiniopPerChannelQuantI8(infiniopPerChannelQuantI8Descriptor_t desc,
void *workspace,
size_t workspace_size,
void *x_packed,
void *x_scale,
void *x_zero,
const void *x,
void *stream);

// Releases all resources owned by the descriptor.
__C __export infiniStatus_t infiniopDestroyPerChannelQuantI8Descriptor(infiniopPerChannelQuantI8Descriptor_t desc);

#endif
76 changes: 76 additions & 0 deletions src/infiniop/ops/linear/cuda/kernel.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#ifndef __LINEAR_KERNEL_CUH__
#define __LINEAR_KERNEL_CUH__

// Dequantization epilogue for an asymmetric int8 GEMM, with bias.
// Given the raw int32 accumulator y_packed = X_q @ W_q (M x N, row-major),
// per-row activation quant params (x_scale, x_zero) and per-column weight
// quant params (w_scale, w_zero), writes
//     y = alpha * dequant + beta * c + bias
// using the standard expansion (with x = xs*(xq - xz), w = ws*(wq - wz)):
//     sum_k x*w = xs*ws * (acc - wz*sum_k(xq) - xz*sum_k(wq) + K*xz*wz).
// Expects a 2D launch with one thread per (row, col) output element.
// NOTE: x_packed reads are warp-uniform per row; w_packed reads are coalesced.
template <typename Tdata>
__device__ void postKernel(Tdata *y, int32_t *y_packed, const Tdata *c, const Tdata *bias, const int8_t *x_packed, const float *x_scale, const float *x_zero, const int8_t *w_packed, const float *w_scale, const float *w_zero, int M, int K, int N, float alpha, float beta) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= M || col >= N) {
        return;
    }
    int idx = row * N + col;
    // Accumulate the row/column int8 sums exactly in int32 and apply the
    // loop-invariant scale factors once afterwards, instead of multiplying
    // by a constant and rounding on every iteration.
    int32_t x_row_sum = 0; // sum_k x_packed[row, k]
    int32_t w_col_sum = 0; // sum_k w_packed[k, col]
    for (int ind = 0; ind < K; ind++) {
        x_row_sum += (int32_t)x_packed[row * K + ind];
        w_col_sum += (int32_t)w_packed[ind * N + col];
    }
    float xs = x_scale[row], xz = x_zero[row];
    float ws = w_scale[col], wz = w_zero[col];
    float dequant = xs * ws * ((float)y_packed[idx] + (float)K * xz * wz - wz * (float)x_row_sum - xz * (float)w_col_sum);
    float output = alpha * dequant + beta * (float)c[idx] + (float)bias[col];

    y[idx] = static_cast<Tdata>(output);
}

// Dequantization epilogue for an asymmetric int8 GEMM, without bias.
// Identical to the bias overload except the final add:
//     y = alpha * dequant + beta * c
// where dequant recovers sum_k xs*(xq-xz) * ws*(wq-wz) from the raw int32
// accumulator y_packed = X_q @ W_q via
//     xs*ws * (acc - wz*sum_k(xq) - xz*sum_k(wq) + K*xz*wz).
// Expects a 2D launch with one thread per (row, col) output element.
template <typename Tdata>
__device__ void postKernel(Tdata *y, int32_t *y_packed, const Tdata *c, const int8_t *x_packed, const float *x_scale, const float *x_zero, const int8_t *w_packed, const float *w_scale, const float *w_zero, int M, int K, int N, float alpha, float beta) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= M || col >= N) {
        return;
    }
    int idx = row * N + col;
    // Exact int32 accumulation of the int8 sums; the loop-invariant scale
    // factors are applied once after the loop rather than per iteration.
    int32_t x_row_sum = 0; // sum_k x_packed[row, k]
    int32_t w_col_sum = 0; // sum_k w_packed[k, col]
    for (int ind = 0; ind < K; ind++) {
        x_row_sum += (int32_t)x_packed[row * K + ind];
        w_col_sum += (int32_t)w_packed[ind * N + col];
    }
    float xs = x_scale[row], xz = x_zero[row];
    float ws = w_scale[col], wz = w_zero[col];
    float dequant = xs * ws * ((float)y_packed[idx] + (float)K * xz * wz - wz * (float)x_row_sum - xz * (float)w_col_sum);
    float output = alpha * dequant + beta * (float)c[idx];

    y[idx] = static_cast<Tdata>(output);
}

// Dequantization epilogue for a symmetric int8 GEMM (zero points are zero),
// with bias: y = alpha * xs * ws * acc + beta * c + bias.
// One thread per (row, col) element of the M x N row-major output.
// x_packed, w_packed and K are unused here; they are kept so the symmetric
// and asymmetric overload sets share a call shape.
template <typename Tdata>
__device__ void postSymKernel(Tdata *y, int32_t *y_packed, const Tdata *c, const Tdata *bias, const int8_t *x_packed, const float *x_scale, const int8_t *w_packed, const float *w_scale, int M, int K, int N, float alpha, float beta) {
    const int r = blockIdx.y * blockDim.y + threadIdx.y;
    const int cc = blockIdx.x * blockDim.x + threadIdx.x;
    if (r < M && cc < N) {
        const int flat = r * N + cc;
        const float dequant = x_scale[r] * w_scale[cc] * ((float)y_packed[flat]);
        y[flat] = static_cast<Tdata>(alpha * dequant + beta * (float)c[flat] + (float)bias[cc]);
    }
}
// Dequantization epilogue for a symmetric int8 GEMM (zero points are zero),
// without bias: y = alpha * xs * ws * acc + beta * c.
// One thread per (row, col) element of the M x N row-major output.
// x_packed, w_packed and K are unused here; kept for signature parity with
// the asymmetric overloads.
template <typename Tdata>
__device__ void postSymKernel(Tdata *y, int32_t *y_packed, const Tdata *c, const int8_t *x_packed, const float *x_scale, const int8_t *w_packed, const float *w_scale, int M, int K, int N, float alpha, float beta) {
    const int r = blockIdx.y * blockDim.y + threadIdx.y;
    const int cc = blockIdx.x * blockDim.x + threadIdx.x;
    if (r < M && cc < N) {
        const int flat = r * N + cc;
        const float dequant = x_scale[r] * w_scale[cc] * ((float)y_packed[flat]);
        y[flat] = static_cast<Tdata>(alpha * dequant + beta * (float)c[flat]);
    }
}
#endif // __LINEAR_KERNEL_CUH__
77 changes: 77 additions & 0 deletions src/infiniop/ops/linear/info.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#ifndef __LINEAR_INFO_H__
#define __LINEAR_INFO_H__

#include "../../../utils.h"
#include "../../operator.h"
#include "../../tensor.h"

namespace op::linear {

// Validated metadata for the quantized linear op
//     D = alpha * dequant(X_q @ W_q) + beta * C (+ bias):
// the output dtype, the packed (quantized) dtype, the GEMM problem sizes,
// and the epilogue scalars.
class LinearInfo {
private:
    LinearInfo() = default;

public:
    infiniDtype_t dtype, packed_type; // output dtype; int8 packed dtype
    size_t M, K, N;                   // D is M x N, X is M x K, W is K x N
    float alpha, beta;                // epilogue scalars

    // Validates the operand descriptors and builds a LinearInfo.
    // bias/zero-point descriptors are not inspected here (they may be null
    // for the symmetric / bias-free variants).
    // Returns INFINI_STATUS_NULL_POINTER, INFINI_STATUS_BAD_TENSOR_DTYPE or
    // INFINI_STATUS_BAD_TENSOR_SHAPE on invalid input.
    static utils::Result<LinearInfo> createLinearInfo(
        infiniopTensorDescriptor_t d_desc,
        infiniopTensorDescriptor_t c_desc,
        infiniopTensorDescriptor_t bias_desc,
        infiniopTensorDescriptor_t x_desc,
        infiniopTensorDescriptor_t x_scale_desc,
        infiniopTensorDescriptor_t x_zero_desc,
        infiniopTensorDescriptor_t weights_desc,
        infiniopTensorDescriptor_t weights_scale_desc,
        infiniopTensorDescriptor_t weights_zero_desc,
        float alpha,
        float beta) {

        CHECK_OR_RETURN(
            d_desc != nullptr && c_desc != nullptr && x_desc != nullptr && x_scale_desc != nullptr && weights_desc != nullptr && weights_scale_desc != nullptr,
            INFINI_STATUS_NULL_POINTER);

        const infiniDtype_t dtype = d_desc->dtype();
        const infiniDtype_t packed_type = x_desc->dtype();
        CHECK_OR_RETURN(dtype == c_desc->dtype(),
                        INFINI_STATUS_BAD_TENSOR_DTYPE);
        CHECK_OR_RETURN(packed_type == weights_desc->dtype(),
                        INFINI_STATUS_BAD_TENSOR_DTYPE);
        CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32);
        CHECK_DTYPE(packed_type, INFINI_DTYPE_I8);

        CHECK_OR_RETURN(d_desc->ndim() == 2
                            && c_desc->ndim() == 2
                            && x_desc->ndim() == 2
                            && x_scale_desc->ndim() == 2
                            && weights_desc->ndim() == 2
                            && weights_scale_desc->ndim() == 2,
                        INFINI_STATUS_BAD_TENSOR_SHAPE);

        size_t M = d_desc->dim(0);
        size_t N = d_desc->dim(1);
        size_t K = x_desc->dim(1);

        // BUGFIX: these conditions were previously OR-ed together, so a single
        // matching dimension let every other mismatched shape pass validation.
        // All of them must hold: X is (M, K), x_scale is (M, 1), and
        // weights_scale is (1, N).
        CHECK_OR_RETURN(M == x_desc->dim(0)
                            && M == x_scale_desc->dim(0)
                            && 1 == x_scale_desc->dim(1)
                            && 1 == weights_scale_desc->dim(0)
                            && N == weights_scale_desc->dim(1),
                        INFINI_STATUS_BAD_TENSOR_SHAPE);
        // C must match the output shape, and W must be (K, N) — the kernels
        // index the packed weights as w_packed[k * N + col] (row-major K x N).
        CHECK_OR_RETURN(M == c_desc->dim(0)
                            && N == c_desc->dim(1)
                            && K == weights_desc->dim(0)
                            && N == weights_desc->dim(1),
                        INFINI_STATUS_BAD_TENSOR_SHAPE);

        return utils::Result<LinearInfo>(LinearInfo{
            dtype,
            packed_type,
            M,
            K,
            N,
            alpha,
            beta});
    }
};

} // namespace op::linear

#endif // __LINEAR_INFO_H__
54 changes: 54 additions & 0 deletions src/infiniop/ops/linear/linear.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#ifndef __LINEAR_H__
#define __LINEAR_H__

#include "../../operator.h"
#include "info.h"

// Stamps out a backend-specific linear Descriptor class inside namespace
// op::linear::<NAMESPACE> (e.g. op::linear::cuda). Kept as a macro so every
// backend exposes an identical interface:
//   - create():           validates the operand descriptors (see LinearInfo)
//                         and constructs the descriptor; alpha/beta are the
//                         GEMM epilogue scalars.
//   - minWorkspaceSize(): device workspace bytes that calculate() requires.
//   - launchKernel():     backend kernel dispatch templated on BLOCK_SIZE and
//                         the output element type Tdata.
//   - calculate():        runs the quantized linear op on the given stream;
//                         all data pointers are device pointers.
// Opaque holds backend-private state (handles, cached plans); defined in the
// backend's .cc/.cu file alongside ~Descriptor().
// NOTE(review): comments cannot be placed inside the macro body itself — a
// `//` comment would swallow the line-continuation backslash.
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::linear::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
LinearInfo _info; \
size_t _workspace_size; \
\
Descriptor(Opaque *opaque, LinearInfo info, \
size_t workspace_size, \
infiniDevice_t device_type, int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), _info(info), _workspace_size(workspace_size) {} \
\
public: \
~Descriptor(); \
\
size_t minWorkspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, Descriptor **desc_ptr, \
infiniopTensorDescriptor_t d_desc, \
infiniopTensorDescriptor_t c_desc, \
infiniopTensorDescriptor_t bias_desc, \
infiniopTensorDescriptor_t x_desc, \
infiniopTensorDescriptor_t x_scale_desc, \
infiniopTensorDescriptor_t x_zero_desc, \
infiniopTensorDescriptor_t weights_desc, \
infiniopTensorDescriptor_t weights_scale_desc, \
infiniopTensorDescriptor_t weights_zero_desc, \
float alpha, \
float beta); \
template <unsigned int BLOCK_SIZE, typename Tdata> \
infiniStatus_t launchKernel(const LinearInfo &info, Tdata *y, \
const Tdata *c, const Tdata *bias, const int8_t *x_packed, \
const float *x_scale, const float *x_zero, const int8_t *w_packed, \
const float *w_scale, const float *w_zero, void *stream, void *workspace) const; \
\
infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \
void *d, const void *c, const void *bias, const void *x, \
const void *x_scale, const void *x_zero, const void *weights, \
const void *weights_scale, const void *weights_zero, void *stream) const; \
}; \
}

#endif // __LINEAR_H__
Loading