|
2 | 2 | #define __PERCHANNEL_QUANTINT8_KERNEL_CUH__ |
3 | 3 |
|
4 | 4 | #include <cub/block/block_reduce.cuh> |
| 5 | +/** |
| 6 | + * Rounds a floating-point value to the nearest integer using |
| 7 | + * the "half away from zero" tie-breaking rule. |
| 8 | + * |
| 9 | + * This rounding mode rounds to the nearest whole number, with ties |
| 10 | + * (values exactly halfway between integers) rounded away from zero. |
| 11 | + * For positive numbers: 1.5 rounds to 2, 2.5 rounds to 3 |
| 12 | + * For negative numbers: -1.5 rounds to -2, -2.5 rounds to -3 |
| 13 | + * This differs from standard "round to nearest, ties to even" banking rounding. |
| 14 | + * |
| 15 | + * @param x The floating-point value to round. |
| 16 | + * @return The rounded integer value as an int. |
| 17 | + * |
| 18 | + * @note This is a CUDA device function designed to execute on GPU hardware. |
| 19 | + * @note Uses floorf() and fabsf() from the CUDA math library. |
| 20 | + */ |
5 | 21 | __device__ inline int round_half_away_from_zero(float x) { |
6 | 22 | float ax = fabsf(x); |
7 | 23 | float r = floorf(ax + 0.5f); |
8 | 24 | return (x >= 0.0f) ? (int)r : -(int)r; |
9 | 25 | } |
10 | | - |
| 26 | +/** |
| 27 | + * Performs per-channel asymmetric quantization to int8 precision for large matrices. |
| 28 | + * |
| 29 | + * This kernel quantizes input matrix x (M x K) to int8 using channel-wise (column-wise) |
| 30 | + * quantization parameters, optimized for cases where K >= 1024. Each channel (column) |
| 31 | + * has independently computed scale and zero point to minimize quantization error. |
| 32 | + * |
| 33 | + * The quantization follows: x_quantized = round((x - zero) / scale) |
| 34 | + * where zero points shift the range and scales normalize to int8 range [-128, 127]. |
| 35 | + * |
| 36 | + * @tparam Tdata Input data type (typically float or half) |
| 37 | + * @tparam BLOCK_SIZE CUDA block size for thread cooperation |
| 38 | + * |
| 39 | + * @param x_packed Output buffer for packed int8 quantized values |
| 40 | + * @param x_scale Output buffer for per-channel scale factors |
| 41 | + * @param x_zero Output buffer for per-channel zero points |
| 42 | + * @param x Input matrix in row-major layout (M rows, K columns) |
| 43 | + * @param M Number of rows in input matrix |
| 44 | + * @param K Number of columns in input matrix (channels) |
| 45 | + * |
| 46 | + * @note This is a CUDA device function optimized for GPU execution |
| 47 | + * @note Designed for large channel dimensions (K >= 1024) to maximize parallelization |
| 48 | + * @note Uses block-level reductions for efficient min/max computation per channel |
| 49 | + */ |
11 | 50 | template <typename Tdata, unsigned int BLOCK_SIZE> |
12 | 51 | __device__ void blockPerChannelQuantI8Kernel( |
13 | 52 | int8_t *x_packed, float *x_scale, float *x_zero, const Tdata *x, |
@@ -72,7 +111,10 @@ __device__ void blockPerChannelQuantI8Kernel( |
72 | 111 | x_packed[tid + ind] = (int8_t)q; |
73 | 112 | } |
74 | 113 | } |
75 | | - |
| 114 | +/** |
| 115 | + * Performs per-channel symmetric quantization to int8 for large matrices (K >= 1024). |
| 116 | + * Uses zero-centered scaling only, no zero point, and packs quantized data. |
| 117 | + */ |
76 | 118 | template <typename Tdata, unsigned int BLOCK_SIZE> |
77 | 119 | __device__ void blockPerChannelQuantI8SymKernel( |
78 | 120 | int8_t *x_packed, float *x_scale, const Tdata *x, |
@@ -145,7 +187,10 @@ __inline__ __device__ T WarpAllReduce(T val) { |
145 | 187 | } |
146 | 188 | return val; |
147 | 189 | } |
148 | | - |
| 190 | +/** |
| 191 | + * Performs per-channel asymmetric quantization to int8 for small matrices (K < 1024). |
| 192 | + * Computes scale/zero point per channel (column) and packs quantized data. |
| 193 | + */ |
149 | 194 | template <typename Tdata, unsigned int BLOCK_SIZE_x, unsigned int BLOCK_SIZE_y> |
150 | 195 | __device__ void warpPerChannelQuantI8Kernel( |
151 | 196 | int8_t *x_packed, float *x_scale, float *x_zero, const Tdata *x, |
@@ -208,7 +253,10 @@ __device__ void warpPerChannelQuantI8Kernel( |
208 | 253 | } |
209 | 254 | } |
210 | 255 | } |
211 | | - |
| 256 | +/** |
| 257 | + * Performs per-channel symmetric quantization to int8 for small matrices (K < 1024). |
| 258 | + * Uses zero-centered scaling only, no zero point, and packs quantized data. |
| 259 | + */ |
212 | 260 | template <typename Tdata, unsigned int BLOCK_SIZE_x, unsigned int BLOCK_SIZE_y> |
213 | 261 | __device__ void warpPerChannelQuantI8SymKernel( |
214 | 262 | int8_t *x_packed, float *x_scale, const Tdata *x, |
|
0 commit comments