InfiniTensor
diff --git a/‎include/infiniop/ops/dequantize_gptq.h‎
Lines changed: 18 additions & 18 deletions b/‎include/infiniop/ops/dequantize_gptq.h‎
Lines changed: 18 additions & 18 deletions
diff --git a/‎scripts/python_test.py‎
Lines changed: 2 additions & 2 deletions b/‎scripts/python_test.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/infiniop/ops/dequantize_awq/iluvatar/dequantize_w42f16_iluvatar.cu‎
Lines changed: 1 addition & 1 deletion b/‎src/infiniop/ops/dequantize_awq/iluvatar/dequantize_w42f16_iluvatar.cu‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_kernel.cuh‎
Lines changed: 1 addition & 1 deletion b/‎src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_kernel.cuh‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_nvidia.cu‎
Lines changed: 12 additions & 8 deletions b/‎src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_nvidia.cu‎
Lines changed: 12 additions & 8 deletions
diff --git a/‎src/infiniop/ops/dequantize_gptq/dequantize_gptq.h‎
Lines changed: 4 additions & 4 deletions b/‎src/infiniop/ops/dequantize_gptq/dequantize_gptq.h‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎src/infiniop/ops/dequantize_gptq/iluvatar/dequantize_w42f16_iluvatar.cu‎
Lines changed: 27 additions & 23 deletions b/‎src/infiniop/ops/dequantize_gptq/iluvatar/dequantize_w42f16_iluvatar.cu‎
Lines changed: 27 additions & 23 deletions
diff --git a/‎src/infiniop/ops/dequantize_gptq/iluvatar/dequantize_w42f16_kernel.cuh‎
Lines changed: 0 additions & 41 deletions b/‎src/infiniop/ops/dequantize_gptq/iluvatar/dequantize_w42f16_kernel.cuh‎
Lines changed: 0 additions & 41 deletions
diff --git a/‎src/infiniop/ops/dequantize_gptq/info.h‎
Lines changed: 5 additions & 5 deletions b/‎src/infiniop/ops/dequantize_gptq/info.h‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎src/infiniop/ops/dequantize_gptq/moore/dequantize_w42f16_kernel.h‎
Lines changed: 0 additions & 41 deletions b/‎src/infiniop/ops/dequantize_gptq/moore/dequantize_w42f16_kernel.h‎
Lines changed: 0 additions & 41 deletions
@@ -5,26 +5,26 @@
 
 typedef struct InfiniopDescriptor *infiniopDequantizeGPTQDescriptor_t;
 
-__C __export infiniStatus_t infiniopCreateDequantizeGPTQDescriptor(infiniopHandle_t handle,
-                                                                  infiniopDequantizeGPTQDescriptor_t *desc_ptr,
-                                                                  infiniopTensorDescriptor_t out_desc,
-                                                                  infiniopTensorDescriptor_t qweight_desc,
-                                                                  infiniopTensorDescriptor_t scales_desc,
-                                                                  infiniopTensorDescriptor_t zeros_desc,
-                                                                  infiniopTensorDescriptor_t g_idx_desc); // add g_idx
+__INFINI_C __export infiniStatus_t infiniopCreateDequantizeGPTQDescriptor(infiniopHandle_t handle,
+                                                                          infiniopDequantizeGPTQDescriptor_t *desc_ptr,
+                                                                          infiniopTensorDescriptor_t out_desc,
+                                                                          infiniopTensorDescriptor_t qweight_desc,
+                                                                          infiniopTensorDescriptor_t scales_desc,
+                                                                          infiniopTensorDescriptor_t zeros_desc,
+                                                                          infiniopTensorDescriptor_t g_idx_desc); // add g_idx
 
-__C __export infiniStatus_t infiniopGetDequantizeGPTQWorkspaceSize(infiniopDequantizeGPTQDescriptor_t desc, size_t *size);
+__INFINI_C __export infiniStatus_t infiniopGetDequantizeGPTQWorkspaceSize(infiniopDequantizeGPTQDescriptor_t desc, size_t *size);
 
-__C __export infiniStatus_t infiniopDequantizeGPTQ(infiniopDequantizeGPTQDescriptor_t desc,
-                                                  void *workspace,
-                                                  size_t workspace_size,
-                                                  void *out,
-                                                  const void *qweight,
-                                                  const void *scales,
-                                                  const void *zeros,
-                                                  const void *g_idx,  // add g_idx
-                                                  void *stream);
+__INFINI_C __export infiniStatus_t infiniopDequantizeGPTQ(infiniopDequantizeGPTQDescriptor_t desc,
+                                                          void *workspace,
+                                                          size_t workspace_size,
+                                                          void *out,
+                                                          const void *qweight,
+                                                          const void *scales,
+                                                          const void *zeros,
+                                                          const void *g_idx, // add g_idx
+                                                          void *stream);
 
-__C __export infiniStatus_t infiniopDestroyDequantizeGPTQDescriptor(infiniopDequantizeGPTQDescriptor_t desc);
+__INFINI_C __export infiniStatus_t infiniopDestroyDequantizeGPTQDescriptor(infiniopDequantizeGPTQDescriptor_t desc);
 
 #endif
@@ -17,8 +17,8 @@ def run_tests(args):
         "causal_softmax.py",
         "clip.py",
         "conv.py",
-        "dequantize_awq.py",
-        "dequantize_gptq.py",
+        # "dequantize_awq.py",
+        # "dequantize_gptq.py",
         "gelu.py",
         "gemm.py",
         # "layer_norm.py",
 
@@ -8,7 +8,7 @@
 
 __global__ void __launch_bounds__(64)
     dequantize_weights_awq(int *__restrict__ B, half *__restrict__ scaling_factors,
-                       int *__restrict__ zeros, half *__restrict__ C, int G) {
+                           int *__restrict__ zeros, half *__restrict__ C, int G) {
     // static constexpr uint32_t ZERO = 0x0;
     half B_shared[32 * (128 + 8)];
 
 
@@ -122,4 +122,4 @@ __device__ uint4 dequantize_s4_to_fp16x2_awq(uint32_t const &source) {
     return result;
 #endif
     __builtin_unreachable(); // Suppress missing return statement warning
-}
+}
@@ -11,15 +11,17 @@
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 750)
 __global__ void __launch_bounds__(64)
     dequantize_weights_awq(int *__restrict__ B, half *__restrict__ scaling_factors,
-                       int *__restrict__ zeros, half *__restrict__ C, int G,
-                       int out_features, int in_features) {
+                           int *__restrict__ zeros, half *__restrict__ C, int G,
+                           int out_features, int in_features) {
     // static constexpr uint32_t ZERO = 0x0;
 
     int col = (blockIdx.x * blockDim.x + threadIdx.x);
     int row = (blockIdx.y * blockDim.y + threadIdx.y);
 
     // 边界检查，防止越界访问
-    if (col >= out_features || row >= in_features) return;
+    if (col >= out_features || row >= in_features) {
+        return;
+    }
 
     // 每个元素在输出中的起始地址：行主序，连续 8 个 half
     int index1 = 8 * col + 8 * row * out_features;
@@ -60,23 +62,25 @@ __global__ void __launch_bounds__(64)
 
     // 直接写回全局内存输出
     half *out_vec = reinterpret_cast<half *>(&B_loaded_fp16);
-    #pragma unroll
+#pragma unroll
     for (int i = 0; i < 8; ++i) {
         C_ptr2[i] = out_vec[i];
     }
 }
 #else
 __global__ void __launch_bounds__(64)
     dequantize_weights_awq(int *__restrict__ B, half *__restrict__ scaling_factors,
-                       int *__restrict__ zeros, half *__restrict__ C, int group_size,
-                       int out_features, int in_features) {
+                           int *__restrict__ zeros, half *__restrict__ C, int group_size,
+                           int out_features, int in_features) {
     static constexpr uint32_t ZERO = 0x0;
 
     int col = (blockIdx.x * blockDim.x + threadIdx.x);
     int row = blockIdx.y * blockDim.y + threadIdx.y;
 
     // 边界检查，防止越界访问
-    if (col >= out_features || row >= in_features) return;
+    if (col >= out_features || row >= in_features) {
+        return;
+    }
 
     int index1 = 8 * col + 8 * row * out_features;
     half *C_ptr2 = C + index1;
@@ -122,7 +126,7 @@ __global__ void __launch_bounds__(64)
 
     // 直接写回全局内存输出
     half *out_vec = reinterpret_cast<half *>(&B_loaded_fp16);
-    #pragma unroll
+#pragma unroll
     for (int i = 0; i < 8; ++i) {
         C_ptr2[i] = out_vec[i];
     }
 
@@ -8,17 +8,17 @@
 
 #define DESCRIPTOR(NAMESPACE)                                    \
                                                                  \
-    namespace op::dequantize_gptq::NAMESPACE {                    \
+    namespace op::dequantize_gptq::NAMESPACE {                   \
     class Descriptor final : public InfiniopDescriptor {         \
         struct Opaque;                                           \
         Opaque *_opaque;                                         \
-        DequantizeGPTQInfo _info;                                 \
+        DequantizeGPTQInfo _info;                                \
         size_t _workspace_size;                                  \
                                                                  \
         Descriptor(                                              \
             size_t workspace_size_,                              \
             Opaque *opaque,                                      \
-            DequantizeGPTQInfo info,                              \
+            DequantizeGPTQInfo info,                             \
             infiniDevice_t device_type,                          \
             int device_id)                                       \
             : InfiniopDescriptor{device_type, device_id},        \
@@ -47,7 +47,7 @@
             const void *qweight,                                 \
             const void *scales,                                  \
             const void *zeros,                                   \
-            const void *g_idx,                                  \
+            const void *g_idx,                                   \
             void *stream) const;                                 \
     };                                                           \
     }
 
@@ -1,11 +1,10 @@
 #include "../../../devices/nvidia/nvidia_handle.cuh"
 #include "../../../devices/nvidia/nvidia_kernel_common.cuh"
 #include "dequantize_w42f16_iluvatar.cuh"
-#include "dequantize_w42f16_kernel.cuh"
 
 #include "../dequantize_gptq.h"
-#include <cuda_fp16.h>
 #include <cstdint>
+#include <cuda_fp16.h>
 
 namespace op::dequantize_gptq::iluvatar {
 
@@ -20,34 +19,38 @@ Descriptor::~Descriptor() { delete _opaque; }
 // zeros:   [num_groups, out_packed] packing 8 output channels per word
 // scales:  [num_groups, out_features], g_idx: [in_features]
 __global__ void __launch_bounds__(128)
-dequantize_weights_gptq(const uint32_t *__restrict__ qweight,
-                        const half     *__restrict__ scales,
-                        const uint32_t *__restrict__ zeros,
-                        const int      *__restrict__ g_idx,
-                        half           *__restrict__ out,
-                        int in_features,
-                        int out_features,
-                        int out_packed,   // ceil(out_features / 8)
-                        int num_groups) {
+    dequantize_weights_gptq(const uint32_t *__restrict__ qweight,
+                            const half *__restrict__ scales,
+                            const uint32_t *__restrict__ zeros,
+                            const int *__restrict__ g_idx,
+                            half *__restrict__ out,
+                            int in_features,
+                            int out_features,
+                            int out_packed, // ceil(out_features / 8)
+                            int num_groups) {
     const int col_pack = blockIdx.x * blockDim.x + threadIdx.x; // packed output column
-    const int row      = blockIdx.y * blockDim.y + threadIdx.y; // real input row
-    if (col_pack >= out_packed || row >= in_features) return;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;      // real input row
+    if (col_pack >= out_packed || row >= in_features) {
+        return;
+    }
 
     const int gid_raw = g_idx ? g_idx[row] : 0;
     const int gid = ((gid_raw % num_groups) + num_groups) % num_groups;
 
-    const int pack_row = row >> 3;          // packed input row (8 rows per pack)
-    const int q_shift  = (row & 7) * 4;     // nibble shift within uint32
+    const int pack_row = row >> 3;     // packed input row (8 rows per pack)
+    const int q_shift = (row & 7) * 4; // nibble shift within uint32
 
     const uint32_t zeros_loaded = zeros[gid * out_packed + col_pack];
 
-    const int col_base   = col_pack << 3;  // 8 real cols per pack
+    const int col_base = col_pack << 3; // 8 real cols per pack
     const int scale_base = gid * out_features + col_base;
 
-    #pragma unroll
+#pragma unroll
     for (int j = 0; j < 8; ++j) {
         const int col = col_base + j;
-        if (col >= out_features) break;
+        if (col >= out_features) {
+            break;
+        }
 
         const uint32_t q_loaded = qweight[pack_row * out_features + col];
         const int q_nib = (q_loaded >> q_shift) & 0xF;
@@ -96,14 +99,15 @@ infiniStatus_t Descriptor::calculate(
     (void)workspace;
     (void)workspace_size;
 
-    const int in_features  = _info.in_features();
+    const int in_features = _info.in_features();
     const int out_features = _info.out_features();
-    const int out_packed   = _info.out_packed();
-    const int in_packed    = _info.in_packed();
-    const int num_groups   = _info.num_groups();
+    const int out_packed = _info.out_packed();
+    const int in_packed = _info.in_packed();
+    const int num_groups = _info.num_groups();
 
-    if (num_groups <= 0 || in_features <= 0 || out_features <= 0 || out_packed <= 0 || in_packed <= 0)
+    if (num_groups <= 0 || in_features <= 0 || out_features <= 0 || out_packed <= 0 || in_packed <= 0) {
         return INFINI_STATUS_BAD_PARAM;
+    }
 
     constexpr int BLOCK_X = 16; // packed columns
     constexpr int BLOCK_Y = 4;  // rows
 
@@ -28,11 +28,11 @@ class DequantizeGPTQInfo {
         infiniopTensorDescriptor_t zeros_desc,
         infiniopTensorDescriptor_t g_idx_desc) {
 
-        const int _in_features   = g_idx_desc->dim(0);          // real input channels
-        const int _in_packed     = qweight_desc->dim(0);        // ceil(in_features / 8)
-        const int _out_features  = qweight_desc->dim(1);        // real output channels
-        const int _num_groups    = scales_desc->dim(0);         // should be in_features / group_size
-        const int _out_packed    = zeros_desc->dim(1);          // ceil(out_features / 8)
+        const int _in_features = g_idx_desc->dim(0);    // real input channels
+        const int _in_packed = qweight_desc->dim(0);    // ceil(in_features / 8)
+        const int _out_features = qweight_desc->dim(1); // real output channels
+        const int _num_groups = scales_desc->dim(0);    // should be in_features / group_size
+        const int _out_packed = zeros_desc->dim(1);     // ceil(out_features / 8)
 
         assert(out_desc->dim(0) == _in_features);
         assert(out_desc->dim(1) == _out_features);