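issue/1052: debug per tensor — stop the per-tensor int8 quant/dequant kernels from overwriting their grid-stride loop index inside the loop body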

xgqdut2016 · xgqdut2016 · commit 4a7a7593d5e7 · 2026-03-16T10:52:09.000+08:00
diff --git a/src/infiniop/ops/dequant/per_tensor_dequant_int8/cuda/kernel.cuh b/src/infiniop/ops/dequant/per_tensor_dequant_int8/cuda/kernel.cuh
@@ -12,7 +12,8 @@ __device__ void perTensorDequantI8SymKernel(
     unsigned int gid = blockIdx.x * blockDim.x + threadIdx.x;
     const int grid_size = blockDim.x * gridDim.x;
     float x_scale_val = x_scale[0];
-    for (int tid = gid; tid < num_elements; tid += grid_size) {
+    for (int ind = gid; ind < num_elements; ind += grid_size) {
+        int tid = ind;
         int w = tid % (int)width;
         tid = tid / (int)width;
 
diff --git a/src/infiniop/ops/quant/per_tensor_quant_int8/cuda/kernel.cuh b/src/infiniop/ops/quant/per_tensor_quant_int8/cuda/kernel.cuh
@@ -9,25 +9,21 @@
 
 #define FULL_MASK 0xffffffff
 
-
 // warp reduce max
-__device__ __forceinline__ float warpReduceMax(float val)
-{
-    for (int offset = WARP_SIZE/2; offset > 0; offset /= 2)
+__device__ __forceinline__ float warpReduceMax(float val) {
+    for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
         val = fmaxf(val, __shfl_xor_sync(FULL_MASK, val, offset));
+    }
     return val;
 }
 
-
 // float atomic max (safe version)
-__device__ __forceinline__ void atomicMaxFloat(float* addr, float val)
-{
-    int* addr_i = (int*)addr;
+__device__ __forceinline__ void atomicMaxFloat(float *addr, float val) {
+    int *addr_i = (int *)addr;
     int old = *addr_i;
     int assumed;
 
-    do
-    {
+    do {
         assumed = old;
         float old_f = __int_as_float(assumed);
         float new_f = fmaxf(val, old_f);
@@ -48,15 +44,15 @@ __device__ void perTensorAbsmaxSymKernel(float *x_scale, const Tdata *x,
                                          size_t batch_size, size_t channel, size_t hidden_dim, size_t width,
                                          ptrdiff_t strides_0, ptrdiff_t strides_1, ptrdiff_t strides_2, ptrdiff_t strides_3,
                                          int num_elements) {
-    int tid = threadIdx.x;
-    int gid = blockIdx.x * blockDim.x + tid;
+    int idx = threadIdx.x;
+    int gid = blockIdx.x * blockDim.x + idx;
     int grid_size = blockDim.x * gridDim.x;
 
     float local_max = 0.f;
 
     // grid-stride loop
-    for (int tid = gid; tid < num_elements; tid += grid_size)
-    {
+    for (int ind = gid; ind < num_elements; ind += grid_size) {
+        int tid = ind;
         int w = tid % (int)width;
         tid = tid / (int)width;
 
@@ -78,11 +74,9 @@ __device__ void perTensorAbsmaxSymKernel(float *x_scale, const Tdata *x,
     // warp reduction
     local_max = warpReduceMax(local_max);
     // 每个 warp 只 atomic 一次
-    if ((tid & (WARP_SIZE - 1)) == 0)
-    {
+    if ((idx & (WARP_SIZE - 1)) == 0) {
         atomicMaxFloat(x_scale, local_max / 127.0f);
     }
-
 }
 
 template <typename Tdata, unsigned int BLOCK_SIZE>
@@ -98,7 +92,8 @@ __device__ void perTensorQuantI8SymKernel(
 
     float scale_val = 1.0f / x_scale[0];
 
-    for (int tid = gid; tid < num_elements; tid += grid_size) {
+    for (int ind = gid; ind < num_elements; ind += grid_size) {
+        int tid = ind;
         int w = tid % (int)width;
         tid = tid / (int)width;
 
diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py
@@ -833,6 +833,7 @@ def per_tensor_dequant_int8_(lib):
         infiniopOperatorDescriptor_t,
     ]
 
+
 @OpRegister.operator
 def softplus_(lib):
     lib.infiniopCreateSoftplusDescriptor.restype = c_int32
@@ -1127,47 +1128,43 @@ def scaled_mm_int8_(lib):
     ]
 
 
-
 @OpRegister.operator
 def kv_caching_(lib):
     lib.infiniopCreateKVCachingDescriptor.restype = c_int32
     lib.infiniopCreateKVCachingDescriptor.argtypes = [
         infiniopHandle_t,
         POINTER(infiniopOperatorDescriptor_t),
-        infiniopTensorDescriptor_t,  
-        infiniopTensorDescriptor_t,  
-        infiniopTensorDescriptor_t,  
-        infiniopTensorDescriptor_t,  
-        infiniopTensorDescriptor_t, 
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
     ]
 
-    
     lib.infiniopGetKVCachingWorkspaceSize.restype = c_int32
     lib.infiniopGetKVCachingWorkspaceSize.argtypes = [
         infiniopOperatorDescriptor_t,
         POINTER(c_size_t),
     ]
 
-    
     lib.infiniopKVCaching.restype = c_int32
     lib.infiniopKVCaching.argtypes = [
         infiniopOperatorDescriptor_t,
-        c_void_p,  
-        c_size_t,  
-        c_void_p,  
-        c_void_p,  
-        c_void_p,  
-        c_void_p,  
-        c_void_p,  
-        c_void_p,  
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
     ]
 
-    
     lib.infiniopDestroyKVCachingDescriptor.restype = c_int32
     lib.infiniopDestroyKVCachingDescriptor.argtypes = [
         infiniopOperatorDescriptor_t,
     ]
-    
+
 
 @OpRegister.operator
 def paged_attention_(lib):
diff --git a/test/infiniop/per_channel_quant_int8.py b/test/infiniop/per_channel_quant_int8.py
@@ -78,6 +78,7 @@ def per_token_quant_int8_torch(x, symmetric):
 
         return w_packed, w_scale, w_zero
 
+
 def test(
     handle,
     device,
@@ -86,12 +87,12 @@ def test(
     dtype=InfiniDtype.F16,
     sync=None,
 ):
-    
+
     print(
         f"Testing Per Channel Quant Int8 on {InfiniDeviceNames[device]} with x_shape:{x_shape}, symmetric:{symmetric} , dtype:{InfiniDtypeNames[dtype]}"
     )
     M, K = x_shape
-   
+
     x = TestTensor(x_shape, None, dtype, device)
     x_p, x_s, x_z = per_token_quant_int8_torch(x.torch_tensor(), symmetric)
     x_packed = TestTensor(x_shape, None, InfiniDtype.I8, device, mode="zeros")
@@ -129,7 +130,7 @@ def test(
         )
     )
     workspace = TestWorkspace(workspace_size.value, x.device)
-    
+
     def lib_per_channel_quant_int8():
         check_error(
             LIBINFINIOP.infiniopPerChannelQuantI8(
@@ -145,7 +146,7 @@ def lib_per_channel_quant_int8():
         )
 
     lib_per_channel_quant_int8()
-    
+
     if sync is not None:
         sync()
 
@@ -157,12 +158,15 @@ def lib_per_channel_quant_int8():
             debug(x_zero.actual_tensor(), x_z, atol=atol, rtol=rtol)
 
     if symmetric:
-        assert (torch.allclose(x_packed.actual_tensor(), x_p, atol=2, rtol=0) and 
-                torch.allclose(x_scale.actual_tensor(), x_s, atol=atol, rtol=rtol))
+        assert torch.allclose(
+            x_packed.actual_tensor(), x_p, atol=2, rtol=0
+        ) and torch.allclose(x_scale.actual_tensor(), x_s, atol=atol, rtol=rtol)
     else:
-        assert (torch.allclose(x_packed.actual_tensor(), x_p, atol=2, rtol=0) and 
-                torch.allclose(x_scale.actual_tensor(), x_s, atol=atol, rtol=rtol) and
-                torch.allclose(x_zero.actual_tensor(), x_z, atol=atol, rtol=rtol))
+        assert (
+            torch.allclose(x_packed.actual_tensor(), x_p, atol=2, rtol=0)
+            and torch.allclose(x_scale.actual_tensor(), x_s, atol=atol, rtol=rtol)
+            and torch.allclose(x_zero.actual_tensor(), x_z, atol=atol, rtol=rtol)
+        )
 
     # Profiling workflow
     if PROFILE:
@@ -185,5 +189,5 @@ def lib_per_channel_quant_int8():
 
     for device in get_test_devices(args):
         test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
-    
+
     print("\033[92mTest passed!\033[0m")
diff --git a/test/infiniop/per_tensor_dequant_int8.py b/test/infiniop/per_tensor_dequant_int8.py
@@ -31,10 +31,8 @@
     ((16, 5632), (13312, 1), (13312, 1), True),
     ((4, 4, 5632), None, None, True),
     ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), True),
-    ((1, 1, 8, 1), None, None, True),
-    ((1, 8, 32, 32), None, None, True),
-    ((8, 16, 64, 128), (8388608, 524288, 8192, 1), None, True),
-    ((1, 2, 2304, 128), (589824, 294912, 128, 1), (589824, 294912, 128, 1), True),
+    ((1, 4, 132, 128), (67584, 16896, 128, 1), (67584, 16896, 128, 1), True),
+    ((1, 4, 132, 128), None, None, True),
 ]
 
 
diff --git a/test/infiniop/per_tensor_quant_int8.py b/test/infiniop/per_tensor_quant_int8.py
@@ -31,10 +31,8 @@
     ((16, 5632), (13312, 1), (13312, 1), True, True),
     ((4, 4, 5632), None, None, True, False),
     ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), True, True),
-    ((1, 1, 8, 1), None, None, True, False),
-    ((1, 8, 32, 32), None, None, True, True),
-    ((8, 16, 64, 128), (8388608, 524288, 8192, 1), None, True, False),
-    ((1, 2, 2304, 128), (589824, 294912, 128, 1), (589824, 294912, 128, 1), True, True),
+    ((1, 32, 4, 128), (147456, 4608, 128, 1), (147456, 4608, 128, 1), True, False),
+    ((1, 32, 4, 128), (16384, 512, 128, 1), (16384, 512, 128, 1), True, True),
 ]
 
 
@@ -61,8 +59,8 @@ def per_tensor_quant_int8_torch(x, x_scale, symmetric, is_static):
         x = x.float()
         if is_static:
             x_q = x.mul(1 / x_scale)
-            x_q = torch.round(x_q).to(torch.int8)
-            return x_q, x_scale, None
+            x_packed = torch.clamp(x_q, -127, 127).to(torch.int8)
+            return x_packed, x_scale, None
         else:
             absmax = x.flatten().abs().max()
             if absmax == 0:
@@ -71,9 +69,8 @@ def per_tensor_quant_int8_torch(x, x_scale, symmetric, is_static):
                 return q, scale, None
         scale = absmax / 127
         x_q = x.mul(127 / absmax)
-        x_q = torch.round(x_q).to(torch.int8)
-
-        return x_q, scale, None
+        x_packed = torch.clamp(x_q, -127, 127).to(torch.int8)
+        return x_packed, scale, None
 
 
 def test(
@@ -154,17 +151,17 @@ def lib_per_tensor_quant_int8():
         )
 
     lib_per_tensor_quant_int8()
-    
+
     if sync is not None:
         sync()
-    
+
     atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
     if DEBUG:
         debug(x_packed.actual_tensor(), x_p, atol=2, rtol=0)
         debug(x_scale.actual_tensor(), x_s, atol=atol, rtol=rtol)
         if symmetric == False:
             debug(x_zero.actual_tensor(), x_z, atol=atol, rtol=rtol)
-    
+
     if symmetric:
         assert torch.allclose(
             x_packed.actual_tensor(), x_p, atol=2, rtol=0