Skip to content

Commit 5a6f259

Browse files
issue/1031 fix T2-1-1
1 parent 5ce9829 commit 5a6f259

File tree

16 files changed

+189
-366
lines changed

16 files changed

+189
-366
lines changed

include/infiniop/ops/dequantize_gptq.h

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -5,26 +5,26 @@
55

66
typedef struct InfiniopDescriptor *infiniopDequantizeGPTQDescriptor_t;
77

8-
__C __export infiniStatus_t infiniopCreateDequantizeGPTQDescriptor(infiniopHandle_t handle,
9-
infiniopDequantizeGPTQDescriptor_t *desc_ptr,
10-
infiniopTensorDescriptor_t out_desc,
11-
infiniopTensorDescriptor_t qweight_desc,
12-
infiniopTensorDescriptor_t scales_desc,
13-
infiniopTensorDescriptor_t zeros_desc,
14-
infiniopTensorDescriptor_t g_idx_desc); // add g_idx
8+
__INFINI_C __export infiniStatus_t infiniopCreateDequantizeGPTQDescriptor(infiniopHandle_t handle,
9+
infiniopDequantizeGPTQDescriptor_t *desc_ptr,
10+
infiniopTensorDescriptor_t out_desc,
11+
infiniopTensorDescriptor_t qweight_desc,
12+
infiniopTensorDescriptor_t scales_desc,
13+
infiniopTensorDescriptor_t zeros_desc,
14+
infiniopTensorDescriptor_t g_idx_desc); // add g_idx
1515

16-
__C __export infiniStatus_t infiniopGetDequantizeGPTQWorkspaceSize(infiniopDequantizeGPTQDescriptor_t desc, size_t *size);
16+
__INFINI_C __export infiniStatus_t infiniopGetDequantizeGPTQWorkspaceSize(infiniopDequantizeGPTQDescriptor_t desc, size_t *size);
1717

18-
__C __export infiniStatus_t infiniopDequantizeGPTQ(infiniopDequantizeGPTQDescriptor_t desc,
19-
void *workspace,
20-
size_t workspace_size,
21-
void *out,
22-
const void *qweight,
23-
const void *scales,
24-
const void *zeros,
25-
const void *g_idx, // add g_idx
26-
void *stream);
18+
__INFINI_C __export infiniStatus_t infiniopDequantizeGPTQ(infiniopDequantizeGPTQDescriptor_t desc,
19+
void *workspace,
20+
size_t workspace_size,
21+
void *out,
22+
const void *qweight,
23+
const void *scales,
24+
const void *zeros,
25+
const void *g_idx, // add g_idx
26+
void *stream);
2727

28-
__C __export infiniStatus_t infiniopDestroyDequantizeGPTQDescriptor(infiniopDequantizeGPTQDescriptor_t desc);
28+
__INFINI_C __export infiniStatus_t infiniopDestroyDequantizeGPTQDescriptor(infiniopDequantizeGPTQDescriptor_t desc);
2929

3030
#endif

scripts/python_test.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@ def run_tests(args):
1717
"causal_softmax.py",
1818
"clip.py",
1919
"conv.py",
20-
"dequantize_awq.py",
21-
"dequantize_gptq.py",
20+
# "dequantize_awq.py",
21+
# "dequantize_gptq.py",
2222
"gelu.py",
2323
"gemm.py",
2424
# "layer_norm.py",

src/infiniop/ops/dequantize_awq/iluvatar/dequantize_w42f16_iluvatar.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
__global__ void __launch_bounds__(64)
1010
dequantize_weights_awq(int *__restrict__ B, half *__restrict__ scaling_factors,
11-
int *__restrict__ zeros, half *__restrict__ C, int G) {
11+
int *__restrict__ zeros, half *__restrict__ C, int G) {
1212
// static constexpr uint32_t ZERO = 0x0;
1313
half B_shared[32 * (128 + 8)];
1414

src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_kernel.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,4 +122,4 @@ __device__ uint4 dequantize_s4_to_fp16x2_awq(uint32_t const &source) {
122122
return result;
123123
#endif
124124
__builtin_unreachable(); // Suppress missing return statement warning
125-
}
125+
}

src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_nvidia.cu

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,17 @@
1111
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 750)
1212
__global__ void __launch_bounds__(64)
1313
dequantize_weights_awq(int *__restrict__ B, half *__restrict__ scaling_factors,
14-
int *__restrict__ zeros, half *__restrict__ C, int G,
15-
int out_features, int in_features) {
14+
int *__restrict__ zeros, half *__restrict__ C, int G,
15+
int out_features, int in_features) {
1616
// static constexpr uint32_t ZERO = 0x0;
1717

1818
int col = (blockIdx.x * blockDim.x + threadIdx.x);
1919
int row = (blockIdx.y * blockDim.y + threadIdx.y);
2020

2121
// 边界检查,防止越界访问
22-
if (col >= out_features || row >= in_features) return;
22+
if (col >= out_features || row >= in_features) {
23+
return;
24+
}
2325

2426
// 每个元素在输出中的起始地址:行主序,连续 8 个 half
2527
int index1 = 8 * col + 8 * row * out_features;
@@ -60,23 +62,25 @@ __global__ void __launch_bounds__(64)
6062

6163
// 直接写回全局内存输出
6264
half *out_vec = reinterpret_cast<half *>(&B_loaded_fp16);
63-
#pragma unroll
65+
#pragma unroll
6466
for (int i = 0; i < 8; ++i) {
6567
C_ptr2[i] = out_vec[i];
6668
}
6769
}
6870
#else
6971
__global__ void __launch_bounds__(64)
7072
dequantize_weights_awq(int *__restrict__ B, half *__restrict__ scaling_factors,
71-
int *__restrict__ zeros, half *__restrict__ C, int group_size,
72-
int out_features, int in_features) {
73+
int *__restrict__ zeros, half *__restrict__ C, int group_size,
74+
int out_features, int in_features) {
7375
static constexpr uint32_t ZERO = 0x0;
7476

7577
int col = (blockIdx.x * blockDim.x + threadIdx.x);
7678
int row = blockIdx.y * blockDim.y + threadIdx.y;
7779

7880
// 边界检查,防止越界访问
79-
if (col >= out_features || row >= in_features) return;
81+
if (col >= out_features || row >= in_features) {
82+
return;
83+
}
8084

8185
int index1 = 8 * col + 8 * row * out_features;
8286
half *C_ptr2 = C + index1;
@@ -122,7 +126,7 @@ __global__ void __launch_bounds__(64)
122126

123127
// 直接写回全局内存输出
124128
half *out_vec = reinterpret_cast<half *>(&B_loaded_fp16);
125-
#pragma unroll
129+
#pragma unroll
126130
for (int i = 0; i < 8; ++i) {
127131
C_ptr2[i] = out_vec[i];
128132
}

src/infiniop/ops/dequantize_gptq/dequantize_gptq.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,17 +8,17 @@
88

99
#define DESCRIPTOR(NAMESPACE) \
1010
\
11-
namespace op::dequantize_gptq::NAMESPACE { \
11+
namespace op::dequantize_gptq::NAMESPACE { \
1212
class Descriptor final : public InfiniopDescriptor { \
1313
struct Opaque; \
1414
Opaque *_opaque; \
15-
DequantizeGPTQInfo _info; \
15+
DequantizeGPTQInfo _info; \
1616
size_t _workspace_size; \
1717
\
1818
Descriptor( \
1919
size_t workspace_size_, \
2020
Opaque *opaque, \
21-
DequantizeGPTQInfo info, \
21+
DequantizeGPTQInfo info, \
2222
infiniDevice_t device_type, \
2323
int device_id) \
2424
: InfiniopDescriptor{device_type, device_id}, \
@@ -47,7 +47,7 @@
4747
const void *qweight, \
4848
const void *scales, \
4949
const void *zeros, \
50-
const void *g_idx, \
50+
const void *g_idx, \
5151
void *stream) const; \
5252
}; \
5353
}

src/infiniop/ops/dequantize_gptq/iluvatar/dequantize_w42f16_iluvatar.cu

Lines changed: 27 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,10 @@
11
#include "../../../devices/nvidia/nvidia_handle.cuh"
22
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
33
#include "dequantize_w42f16_iluvatar.cuh"
4-
#include "dequantize_w42f16_kernel.cuh"
54

65
#include "../dequantize_gptq.h"
7-
#include <cuda_fp16.h>
86
#include <cstdint>
7+
#include <cuda_fp16.h>
98

109
namespace op::dequantize_gptq::iluvatar {
1110

@@ -20,34 +19,38 @@ Descriptor::~Descriptor() { delete _opaque; }
2019
// zeros: [num_groups, out_packed] packing 8 output channels per word
2120
// scales: [num_groups, out_features], g_idx: [in_features]
2221
__global__ void __launch_bounds__(128)
23-
dequantize_weights_gptq(const uint32_t *__restrict__ qweight,
24-
const half *__restrict__ scales,
25-
const uint32_t *__restrict__ zeros,
26-
const int *__restrict__ g_idx,
27-
half *__restrict__ out,
28-
int in_features,
29-
int out_features,
30-
int out_packed, // ceil(out_features / 8)
31-
int num_groups) {
22+
dequantize_weights_gptq(const uint32_t *__restrict__ qweight,
23+
const half *__restrict__ scales,
24+
const uint32_t *__restrict__ zeros,
25+
const int *__restrict__ g_idx,
26+
half *__restrict__ out,
27+
int in_features,
28+
int out_features,
29+
int out_packed, // ceil(out_features / 8)
30+
int num_groups) {
3231
const int col_pack = blockIdx.x * blockDim.x + threadIdx.x; // packed output column
33-
const int row = blockIdx.y * blockDim.y + threadIdx.y; // real input row
34-
if (col_pack >= out_packed || row >= in_features) return;
32+
const int row = blockIdx.y * blockDim.y + threadIdx.y; // real input row
33+
if (col_pack >= out_packed || row >= in_features) {
34+
return;
35+
}
3536

3637
const int gid_raw = g_idx ? g_idx[row] : 0;
3738
const int gid = ((gid_raw % num_groups) + num_groups) % num_groups;
3839

39-
const int pack_row = row >> 3; // packed input row (8 rows per pack)
40-
const int q_shift = (row & 7) * 4; // nibble shift within uint32
40+
const int pack_row = row >> 3; // packed input row (8 rows per pack)
41+
const int q_shift = (row & 7) * 4; // nibble shift within uint32
4142

4243
const uint32_t zeros_loaded = zeros[gid * out_packed + col_pack];
4344

44-
const int col_base = col_pack << 3; // 8 real cols per pack
45+
const int col_base = col_pack << 3; // 8 real cols per pack
4546
const int scale_base = gid * out_features + col_base;
4647

47-
#pragma unroll
48+
#pragma unroll
4849
for (int j = 0; j < 8; ++j) {
4950
const int col = col_base + j;
50-
if (col >= out_features) break;
51+
if (col >= out_features) {
52+
break;
53+
}
5154

5255
const uint32_t q_loaded = qweight[pack_row * out_features + col];
5356
const int q_nib = (q_loaded >> q_shift) & 0xF;
@@ -96,14 +99,15 @@ infiniStatus_t Descriptor::calculate(
9699
(void)workspace;
97100
(void)workspace_size;
98101

99-
const int in_features = _info.in_features();
102+
const int in_features = _info.in_features();
100103
const int out_features = _info.out_features();
101-
const int out_packed = _info.out_packed();
102-
const int in_packed = _info.in_packed();
103-
const int num_groups = _info.num_groups();
104+
const int out_packed = _info.out_packed();
105+
const int in_packed = _info.in_packed();
106+
const int num_groups = _info.num_groups();
104107

105-
if (num_groups <= 0 || in_features <= 0 || out_features <= 0 || out_packed <= 0 || in_packed <= 0)
108+
if (num_groups <= 0 || in_features <= 0 || out_features <= 0 || out_packed <= 0 || in_packed <= 0) {
106109
return INFINI_STATUS_BAD_PARAM;
110+
}
107111

108112
constexpr int BLOCK_X = 16; // packed columns
109113
constexpr int BLOCK_Y = 4; // rows

src/infiniop/ops/dequantize_gptq/iluvatar/dequantize_w42f16_kernel.cuh

Lines changed: 0 additions & 41 deletions
This file was deleted.

src/infiniop/ops/dequantize_gptq/info.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,11 @@ class DequantizeGPTQInfo {
2828
infiniopTensorDescriptor_t zeros_desc,
2929
infiniopTensorDescriptor_t g_idx_desc) {
3030

31-
const int _in_features = g_idx_desc->dim(0); // real input channels
32-
const int _in_packed = qweight_desc->dim(0); // ceil(in_features / 8)
33-
const int _out_features = qweight_desc->dim(1); // real output channels
34-
const int _num_groups = scales_desc->dim(0); // should be in_features / group_size
35-
const int _out_packed = zeros_desc->dim(1); // ceil(out_features / 8)
31+
const int _in_features = g_idx_desc->dim(0); // real input channels
32+
const int _in_packed = qweight_desc->dim(0); // ceil(in_features / 8)
33+
const int _out_features = qweight_desc->dim(1); // real output channels
34+
const int _num_groups = scales_desc->dim(0); // should be in_features / group_size
35+
const int _out_packed = zeros_desc->dim(1); // ceil(out_features / 8)
3636

3737
assert(out_desc->dim(0) == _in_features);
3838
assert(out_desc->dim(1) == _out_features);

src/infiniop/ops/dequantize_gptq/moore/dequantize_w42f16_kernel.h

Lines changed: 0 additions & 41 deletions
This file was deleted.

0 commit comments

Comments
 (0)