Skip to content

Commit 7712471

Browse files
committed
Add NVIDIA GPU implementation for add_rms_norm and make residual_out required.
1 parent 2a432b3 commit 7712471

File tree

8 files changed

+355
-134
lines changed

8 files changed

+355
-134
lines changed

python/infinicore/ops/add_rms_norm.py

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,27 +5,43 @@
55
def add_rms_norm(a, b, weight, epsilon=1e-5, *, out=None):
    """
    Fused Add and RMS Normalization.

    Computes ``RMSNorm(a + b) * weight`` in a single fused operator, also
    returning the intermediate sum so it can serve as the residual input of
    the next layer.

    Args:
        a: First input tensor.
        b: Second input tensor.
        weight: Scale weights (one per feature along the last dimension).
        epsilon: Small constant for numerical stability, default is 1e-5.
        out: Optional output tuple ``(y, residual_out)`` for in-place
            operation; both tensors are required (residual_out is no longer
            optional).

    Returns:
        Tuple of (normalized_result, add_result):
        ``(RMSNorm(a + b) * weight, a + b)``.
        The add_result can be used as residual for subsequent layers.
    """
    if out is None:
        # Out-of-place path: the backend allocates both outputs and returns
        # them as a pair of underlying tensors.
        result = _infinicore.add_rms_norm(
            a._underlying, b._underlying, weight._underlying, epsilon
        )
        return (Tensor(result[0]), Tensor(result[1]))

    # In-place path: caller supplies both destination tensors.
    y, residual_out = out
    _infinicore.add_rms_norm_(
        y._underlying,
        residual_out._underlying,
        a._underlying,
        b._underlying,
        weight._underlying,
        epsilon,
    )
    return (y, residual_out)
2736

2837

2938
def add_rms_norm_(y, residual_out, a, b, weight, epsilon=1e-5):
    """In-place Fused Add and RMS Normalization.

    Writes ``RMSNorm(a + b) * weight`` into ``y`` and ``a + b`` into
    ``residual_out``. Both output tensors are required.

    Args:
        y: Destination tensor for the normalized result.
        residual_out: Destination tensor for the raw sum ``a + b``.
        a: First input tensor.
        b: Second input tensor.
        weight: Scale weights.
        epsilon: Small constant for numerical stability, default is 1e-5.
    """
    _infinicore.add_rms_norm_(
        y._underlying,
        residual_out._underlying,
        a._underlying,
        b._underlying,
        weight._underlying,
        epsilon,
    )

src/infiniop/ops/add_rms_norm/add_rms_norm.h

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@
66

77
#define DESCRIPTOR(NAMESPACE) \
88
\
9-
namespace op::add_rms_norm::NAMESPACE { \
10-
class Descriptor final : public InfiniopDescriptor { \
9+
namespace op::add_rms_norm::NAMESPACE { \
10+
class Descriptor final : public InfiniopDescriptor { \
1111
struct Opaque; \
1212
Opaque *_opaque; \
1313
AddRMSNormInfo _info; \
@@ -19,7 +19,7 @@
1919
size_t workspace_size, \
2020
infiniDevice_t device_type, \
2121
int device_id) \
22-
: InfiniopDescriptor{device_type, device_id}, \
22+
: InfiniopDescriptor{device_type, device_id}, \
2323
_opaque(opaque), \
2424
_info(info), \
2525
_workspace_size(workspace_size) {} \
@@ -29,24 +29,24 @@
2929
\
3030
size_t workspaceSize() const { return _workspace_size; } \
3131
\
32-
static infiniStatus_t create( \
32+
static infiniStatus_t create( \
3333
infiniopHandle_t handle, \
3434
Descriptor **desc_ptr, \
35-
infiniopTensorDescriptor_t y_desc, \
35+
infiniopTensorDescriptor_t y_desc, \
3636
infiniopTensorDescriptor_t a_desc, \
3737
infiniopTensorDescriptor_t b_desc, \
38-
infiniopTensorDescriptor_t weight_desc, \
39-
float epsilon, \
40-
infiniopTensorDescriptor_t residual_out_desc); \
38+
infiniopTensorDescriptor_t weight_desc, \
39+
float epsilon, \
40+
infiniopTensorDescriptor_t residual_out_desc); \
4141
\
42-
infiniStatus_t calculate( \
42+
infiniStatus_t calculate( \
4343
void *workspace, size_t workspace_size, \
4444
void *y, \
4545
const void *a, \
4646
const void *b, \
4747
const void *weight, \
4848
void *residual_out, \
49-
void *stream) const; \
49+
void *stream) const; \
5050
}; \
5151
}
5252

src/infiniop/ops/add_rms_norm/cpu/add_rms_norm_cpu.cc

Lines changed: 18 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -36,16 +36,13 @@ infiniStatus_t add_rmsnorm(const AddRMSNormInfo *info, T *y, const T *a, const T
3636
const T *a_ptr = a + i * info->a_strides[0] + j * info->a_strides[1];
3737
const T *b_ptr = b + i * info->b_strides[0] + j * info->b_strides[1];
3838
T *y_ptr = y + i * info->y_strides[0] + j * info->y_strides[1];
39-
T *residual_out_ptr = info->has_residual_out ?
40-
(residual_out + i * info->residual_out_strides[0] + j * info->residual_out_strides[1]) : nullptr;
39+
T *residual_out_ptr = residual_out + i * info->residual_out_strides[0] + j * info->residual_out_strides[1];
4140

4241
// Compute add(a, b) once and store it
4342
T sum_squared = (T)0;
4443
for (size_t k = 0; k < dim; k++) {
4544
T sum_val = a_ptr[k] + b_ptr[k];
46-
if (residual_out_ptr != nullptr) {
47-
residual_out_ptr[k] = sum_val; // Store add result
48-
}
45+
residual_out_ptr[k] = sum_val; // Store add result
4946
sum_squared += sum_val * sum_val;
5047
}
5148

@@ -54,18 +51,9 @@ infiniStatus_t add_rmsnorm(const AddRMSNormInfo *info, T *y, const T *a, const T
5451
T rms = (T)1 / std::sqrt(sum_squared / (T)(dim) + (T)(info->epsilon));
5552

5653
// Apply normalization: y = (a + b) * w * rms
57-
// Reuse the stored sum values if residual_out was computed, otherwise recompute
58-
if (residual_out_ptr != nullptr) {
59-
// Reuse stored values
60-
for (size_t k = 0; k < dim; k++) {
61-
y_ptr[k] = residual_out_ptr[k] * w[k] * rms;
62-
}
63-
} else {
64-
// Recompute sum
65-
for (size_t k = 0; k < dim; k++) {
66-
T sum_val = a_ptr[k] + b_ptr[k];
67-
y_ptr[k] = sum_val * w[k] * rms;
68-
}
54+
// Reuse stored values from residual_out
55+
for (size_t k = 0; k < dim; k++) {
56+
y_ptr[k] = residual_out_ptr[k] * w[k] * rms;
6957
}
7058
}
7159

@@ -90,52 +78,32 @@ infiniStatus_t add_rmsnormHalfPrecision(const AddRMSNormInfo *info, T *y, const
9078
const T *a_ptr = a + i * info->a_strides[0] + j * info->a_strides[1];
9179
const T *b_ptr = b + i * info->b_strides[0] + j * info->b_strides[1];
9280
T *y_ptr = y + i * info->y_strides[0] + j * info->y_strides[1];
93-
T *residual_out_ptr = info->has_residual_out ?
94-
(residual_out + i * info->residual_out_strides[0] + j * info->residual_out_strides[1]) : nullptr;
81+
T *residual_out_ptr = residual_out + i * info->residual_out_strides[0] + j * info->residual_out_strides[1];
9582

9683
// Compute sum of squares for RMS normalization and store add result
9784
float sum_squared = 0.0f;
9885
for (size_t k = 0; k < dim; k++) {
9986
float sum_val = utils::cast<float>(a_ptr[k]) + utils::cast<float>(b_ptr[k]);
100-
if (residual_out_ptr != nullptr) {
101-
residual_out_ptr[k] = utils::cast<T>(sum_val); // Store add result
102-
}
87+
residual_out_ptr[k] = utils::cast<T>(sum_val); // Store add result
10388
sum_squared += sum_val * sum_val;
10489
}
10590

10691
// Compute RMS: 1 / (sqrt(sum/dim + eps))
10792
float rms = 1.f / std::sqrt(sum_squared / (float)(dim) + info->epsilon);
10893

10994
// Apply normalization: y = (a + b) * w * rms
110-
// Reuse stored values if residual_out was computed, otherwise recompute
111-
if (residual_out_ptr != nullptr) {
112-
// Reuse stored values
113-
for (size_t k = 0; k < dim; k++) {
114-
float sum_val = utils::cast<float>(residual_out_ptr[k]);
115-
float val;
116-
if constexpr (std::is_same<Tw, float>::value) {
117-
val = sum_val * w[k] * rms;
118-
} else if constexpr (std::is_same<Tw, T>::value || std::is_same_v<Tw, fp16_t> || std::is_same_v<Tw, bf16_t>) {
119-
val = sum_val * utils::cast<float>(w[k]) * rms;
120-
} else {
121-
std::abort();
122-
}
123-
y_ptr[k] = utils::cast<T>(val);
124-
}
125-
} else {
126-
// Recompute sum
127-
for (size_t k = 0; k < dim; k++) {
128-
float sum_val = utils::cast<float>(a_ptr[k]) + utils::cast<float>(b_ptr[k]);
129-
float val;
130-
if constexpr (std::is_same<Tw, float>::value) {
131-
val = sum_val * w[k] * rms;
132-
} else if constexpr (std::is_same<Tw, T>::value || std::is_same_v<Tw, fp16_t> || std::is_same_v<Tw, bf16_t>) {
133-
val = sum_val * utils::cast<float>(w[k]) * rms;
134-
} else {
135-
std::abort();
136-
}
137-
y_ptr[k] = utils::cast<T>(val);
95+
// Reuse stored values from residual_out
96+
for (size_t k = 0; k < dim; k++) {
97+
float sum_val = utils::cast<float>(residual_out_ptr[k]);
98+
float val;
99+
if constexpr (std::is_same<Tw, float>::value) {
100+
val = sum_val * w[k] * rms;
101+
} else if constexpr (std::is_same<Tw, T>::value || std::is_same_v<Tw, fp16_t> || std::is_same_v<Tw, bf16_t>) {
102+
val = sum_val * utils::cast<float>(w[k]) * rms;
103+
} else {
104+
std::abort();
138105
}
106+
y_ptr[k] = utils::cast<T>(val);
139107
}
140108
}
141109

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
#ifndef __ADD_RMS_NORM_CUDA_KERNEL_H__
#define __ADD_RMS_NORM_CUDA_KERNEL_H__

#include <cub/block/block_reduce.cuh>

/// Fused Add + RMSNorm device routine.
///
/// Each CUDA block processes one (batch, head) row of length `dim`:
///   residual_out = a + b
///   y = (a + b) * w * rsqrt(mean((a + b)^2) + epsilon)
///
/// Template parameters:
///   BLOCK_SIZE - number of threads per block (must match launch config).
///   Tcompute   - accumulation type (e.g. float) for the reduction.
///   Tdata      - storage type of y/residual_out/a/b (e.g. half, bf16).
///   Tweight    - storage type of the weight vector.
template <unsigned int BLOCK_SIZE, typename Tcompute, typename Tdata, typename Tweight>
__device__ void add_rmsnormBlock(
    Tdata *__restrict__ y,
    Tdata *__restrict__ residual_out,
    ptrdiff_t stride_y_batch,
    ptrdiff_t stride_y_nhead,
    ptrdiff_t stride_residual_out_batch,
    ptrdiff_t stride_residual_out_nhead,
    const Tdata *__restrict__ a,
    ptrdiff_t stride_a_batch,
    ptrdiff_t stride_a_nhead,
    const Tdata *__restrict__ b,
    ptrdiff_t stride_b_batch,
    ptrdiff_t stride_b_nhead,
    const Tweight *__restrict__ w,
    size_t nhead,
    size_t dim,
    float epsilon) {
    // Each block takes care of one head in one batch; blockIdx.x encodes
    // both indices. Each thread strides over the row in steps of BLOCK_SIZE.
    size_t batch_idx = blockIdx.x / nhead;
    size_t head_idx = blockIdx.x % nhead;

    auto y_ptr = y + batch_idx * stride_y_batch + head_idx * stride_y_nhead;
    auto a_ptr = a + batch_idx * stride_a_batch + head_idx * stride_a_nhead;
    auto b_ptr = b + batch_idx * stride_b_batch + head_idx * stride_b_nhead;
    auto w_ptr = w;
    Tdata *residual_out_ptr = residual_out + batch_idx * stride_residual_out_batch + head_idx * stride_residual_out_nhead;

    // Pass 1: compute add(a, b), store it into residual_out, and accumulate
    // the per-thread sum of squares in the compute precision.
    Tcompute sum_squared = 0;
    for (size_t i = threadIdx.x; i < dim; i += BLOCK_SIZE) {
        Tcompute sum_val = Tcompute(a_ptr[i]) + Tcompute(b_ptr[i]);
        residual_out_ptr[i] = Tdata(sum_val); // Store add result
        sum_squared += sum_val * sum_val;
    }

    // Block-wide reduction of the sum of squares (result valid on thread 0).
    using BlockReduce = cub::BlockReduce<Tcompute, BLOCK_SIZE>;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    sum_squared = BlockReduce(temp_storage).Sum(sum_squared);

    // Thread 0 computes RMS = 1/sqrt(ss/dim + epsilon); broadcast through
    // shared memory, with __syncthreads() making it visible to all threads.
    __shared__ Tcompute rms;
    if (threadIdx.x == 0) {
        rms = Tcompute(rsqrtf(sum_squared / Tcompute(dim) + epsilon));
    }
    __syncthreads();

    // Pass 2: apply normalization y = (a + b) * w * rms, re-reading the sums
    // stored in residual_out instead of recomputing a + b.
    for (size_t i = threadIdx.x; i < dim; i += BLOCK_SIZE) {
        Tcompute sum_val = Tcompute(residual_out_ptr[i]); // Reuse stored value
        y_ptr[i] = Tdata(sum_val * Tcompute(w_ptr[i]) * rms);
    }
}

#endif

src/infiniop/ops/add_rms_norm/info.h

Lines changed: 25 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -34,12 +34,12 @@ class AddRMSNormInfo {
3434

3535
auto atype = y_desc->dtype();
3636
auto wtype = weight_desc->dtype();
37-
37+
3838
// Check that all input tensors have the same dtype
3939
if (a_desc->dtype() != atype || b_desc->dtype() != atype) {
4040
return INFINI_STATUS_BAD_TENSOR_DTYPE;
4141
}
42-
42+
4343
if (atype == INFINI_DTYPE_F16 || atype == INFINI_DTYPE_BF16) {
4444
// For half-precision types (FP16/BF16), weights can be the same half-precision type or FP32
4545
if (wtype != atype && wtype != INFINI_DTYPE_F32 && wtype != INFINI_DTYPE_BF16 && wtype != INFINI_DTYPE_F16) {
@@ -71,52 +71,46 @@ class AddRMSNormInfo {
7171
batch = y_desc->dim(0);
7272
dim = y_desc->dim(1);
7373

74-
if (a_desc->dim(0) != batch || a_desc->dim(1) != dim ||
75-
b_desc->dim(0) != batch || b_desc->dim(1) != dim ||
76-
weight_desc->dim(0) != dim) {
74+
if (a_desc->dim(0) != batch || a_desc->dim(1) != dim || b_desc->dim(0) != batch || b_desc->dim(1) != dim || weight_desc->dim(0) != dim) {
7775
return INFINI_STATUS_BAD_TENSOR_SHAPE;
7876
}
7977
} else if (y_ndim == 3) {
8078
batch = y_desc->dim(0);
8179
nhead = y_desc->dim(1);
8280
dim = y_desc->dim(2);
8381

84-
if (a_desc->dim(0) != batch || a_desc->dim(1) != nhead || a_desc->dim(2) != dim ||
85-
b_desc->dim(0) != batch || b_desc->dim(1) != nhead || b_desc->dim(2) != dim ||
86-
weight_desc->dim(0) != dim) {
82+
if (a_desc->dim(0) != batch || a_desc->dim(1) != nhead || a_desc->dim(2) != dim || b_desc->dim(0) != batch || b_desc->dim(1) != nhead || b_desc->dim(2) != dim || weight_desc->dim(0) != dim) {
8783
return INFINI_STATUS_BAD_TENSOR_SHAPE;
8884
}
8985
} else {
9086
return INFINI_STATUS_BAD_TENSOR_SHAPE;
9187
}
9288

9389
// Check contiguity of the last dimension
94-
if (y_desc->stride(y_ndim - 1) != 1 ||
95-
a_desc->stride(a_ndim - 1) != 1 ||
96-
b_desc->stride(b_ndim - 1) != 1 ||
97-
weight_desc->stride(w_ndim - 1) != 1) {
90+
if (y_desc->stride(y_ndim - 1) != 1 || a_desc->stride(a_ndim - 1) != 1 || b_desc->stride(b_ndim - 1) != 1 || weight_desc->stride(w_ndim - 1) != 1) {
9891
return INFINI_STATUS_BAD_TENSOR_STRIDES;
9992
}
10093

101-
// Check residual_out_desc if provided
102-
bool has_residual_out = (residual_out_desc != nullptr);
103-
if (has_residual_out) {
104-
const size_t residual_out_ndim = residual_out_desc->ndim();
105-
if (residual_out_ndim != y_ndim) {
94+
// residual_out_desc is required (always needed for fused operator)
95+
if (residual_out_desc == nullptr) {
96+
return INFINI_STATUS_BAD_PARAM;
97+
}
98+
99+
const size_t residual_out_ndim = residual_out_desc->ndim();
100+
if (residual_out_ndim != y_ndim) {
101+
return INFINI_STATUS_BAD_TENSOR_SHAPE;
102+
}
103+
if (residual_out_desc->dtype() != atype) {
104+
return INFINI_STATUS_BAD_TENSOR_DTYPE;
105+
}
106+
// Check shape matches
107+
for (size_t i = 0; i < y_ndim; i++) {
108+
if (residual_out_desc->dim(i) != y_desc->dim(i)) {
106109
return INFINI_STATUS_BAD_TENSOR_SHAPE;
107110
}
108-
if (residual_out_desc->dtype() != atype) {
109-
return INFINI_STATUS_BAD_TENSOR_DTYPE;
110-
}
111-
// Check shape matches
112-
for (size_t i = 0; i < y_ndim; i++) {
113-
if (residual_out_desc->dim(i) != y_desc->dim(i)) {
114-
return INFINI_STATUS_BAD_TENSOR_SHAPE;
115-
}
116-
}
117-
if (residual_out_desc->stride(residual_out_ndim - 1) != 1) {
118-
return INFINI_STATUS_BAD_TENSOR_STRIDES;
119-
}
111+
}
112+
if (residual_out_desc->stride(residual_out_ndim - 1) != 1) {
113+
return INFINI_STATUS_BAD_TENSOR_STRIDES;
120114
}
121115

122116
AddRMSNormInfo info;
@@ -127,10 +121,8 @@ class AddRMSNormInfo {
127121
info.y_strides = y_desc->strides();
128122
info.a_strides = a_desc->strides();
129123
info.b_strides = b_desc->strides();
130-
info.has_residual_out = has_residual_out;
131-
if (has_residual_out) {
132-
info.residual_out_strides = residual_out_desc->strides();
133-
}
124+
info.has_residual_out = true; // Always true now
125+
info.residual_out_strides = residual_out_desc->strides();
134126
return utils::Result<AddRMSNormInfo>(info);
135127
}
136128
};

0 commit comments

Comments
 (0)