Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
303 changes: 303 additions & 0 deletions 03_nf4_dequant/yuruwind/Summary_report.md

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions 03_nf4_dequant/yuruwind/config.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
blocksize=64
compute_type=fp16
target_gpu=4060
110 changes: 110 additions & 0 deletions 03_nf4_dequant/yuruwind/cpu_dequantize.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#include <stdint.h>

#include <algorithm>
#include <cmath>
#include <cstring>
#include <fstream>
#include <iostream>
#include <vector>

// NF4 lookup table: maps a 4-bit code (the array index) to its float value.
// The same 16 values, in the same order, are used by gen_data.py to build the
// ground truth, so the two tables must be kept in sync.
// NOTE(review): these entries do not appear to match the NF4 code values
// shipped in bitsandbytes (which, as I recall, place 0.0 at index 7) —
// confirm which table real quantized checkpoints are expected to use.
const float NF4_TABLE[16] = {
    -1.0f,         // 0x0
    -0.69487101f,  // 0x1
    -0.51209301f,  // 0x2
    -0.37391701f,  // 0x3
    -0.25611401f,  // 0x4
    -0.14725500f,  // 0x5
    -0.04162400f,  // 0x6
    0.06282201f,   // 0x7
    0.16859101f,   // 0x8
    0.28551400f,   // 0x9
    0.40619302f,   // 0xA
    0.53675699f,   // 0xB
    0.68502200f,   // 0xC
    0.87091398f,   // 0xD
    1.0f,          // 0xE
    0.0f           // 0xF
};

// Converts an IEEE 754 binary16 (FP16) bit pattern to a 32-bit float.
//
// @param h  Raw 16-bit pattern: 1 sign bit, 5 exponent bits, 10 mantissa bits.
// @return   The float with the same value. Signed zeros, subnormals,
//           infinities and NaNs (payload shifted into the float mantissa)
//           are all handled.
//
// The result bits are assembled in a uint32_t and copied into the float with
// std::memcpy; reading the inactive member of a union is undefined behaviour
// in C++, whereas memcpy is well-defined and compiles to the same code.
float half_to_float_cpu(uint16_t h) {
    // Widen first so every shift below happens in unsigned 32-bit arithmetic
    // instead of on a promoted (signed) int.
    const uint32_t bits = h;
    const uint32_t sign = (bits & 0x8000u) << 16;
    uint32_t exp = (bits & 0x7c00u) >> 10;
    uint32_t mant = (bits & 0x03ffu) << 13;

    uint32_t out = 0;
    if (exp == 0x1fu) {
        // Inf (mant == 0) or NaN (mant != 0): all-ones float exponent.
        out = sign | 0x7f800000u | mant;
    } else if (exp != 0) {
        // Normal number: rebias the exponent from 15 to 127.
        out = sign | ((exp + (127u - 15u)) << 23) | mant;
    } else if (mant == 0) {
        // Signed zero.
        out = sign;
    } else {
        // Subnormal half: normalise by shifting the mantissa up until the
        // implicit leading one reaches bit 23, lowering the exponent by one
        // for every shift, then drop that implicit bit.
        exp = 127u - 14u;
        while ((mant & 0x00800000u) == 0) {
            mant <<= 1;
            --exp;
        }
        out = sign | (exp << 23) | (mant & 0x007fffffu);
    }

    float result = 0.0f;
    static_assert(sizeof(result) == sizeof(out), "float must be 32 bits wide");
    std::memcpy(&result, &out, sizeof(result));
    return result;
}

// Header stored at the start of input.bin.
//
// The file stores these three fields back to back (8 + 8 + 4 bytes), so the
// struct must be filled field by field rather than with a single read of
// sizeof(WeightHeader), which may include padding.
//
// Every member is zero-initialised so that a failed or short read never
// leaves an indeterminate value behind.
struct WeightHeader {
    int64_t num_rows = 0;   // number of rows of the weight matrix
    int64_t num_cols = 0;   // number of columns of the weight matrix
    int32_t blocksize = 0;  // number of weights that share one absmax entry
};

// CPU reference for double-quantized NF4 dequantization.
//
// Reads input.bin (header, packed 4-bit weights, per-block 8-bit absmax
// codes, a 256-entry FP16 code book, per-group FP16 absmax values and a float
// offset), dequantizes every weight, then compares the result against the
// FP16 reference in gt_output.bin and prints the maximum absolute error.
//
// Returns 0 on success and -1 if a file cannot be opened, is truncated, or
// carries an invalid header.
int main() {
    // Reads exactly `bytes` bytes into `dst`; false on short read or error.
    const auto read_exact = [](std::istream& in, void* dst, std::size_t bytes) {
        in.read(static_cast<char*>(dst), static_cast<std::streamsize>(bytes));
        return static_cast<bool>(in);
    };

    std::ifstream ifs("input.bin", std::ios::binary);
    if (!ifs) {
        std::cerr << "Error: cannot open input.bin\n";
        return -1;
    }

    // 1. Read the header field by field: the on-disk layout has no padding,
    //    while the in-memory struct may.
    WeightHeader header;
    if (!read_exact(ifs, &header.num_rows, sizeof(header.num_rows)) ||
        !read_exact(ifs, &header.num_cols, sizeof(header.num_cols)) ||
        !read_exact(ifs, &header.blocksize, sizeof(header.blocksize))) {
        std::cerr << "Error: input.bin is too short to contain a header\n";
        return -1;
    }

    // Reject headers that would divide by zero or overflow the element count.
    if (header.num_rows < 0 || header.num_cols < 0 || header.blocksize <= 0) {
        std::cerr << "Error: invalid header in input.bin (rows=" << header.num_rows
                  << ", cols=" << header.num_cols
                  << ", blocksize=" << header.blocksize << ")\n";
        return -1;
    }
    if (header.num_cols != 0 && header.num_rows > INT64_MAX / header.num_cols) {
        std::cerr << "Error: rows * cols overflows a 64-bit integer\n";
        return -1;
    }

    // All counts are kept in 64 bits; 32-bit block indices would wrap for
    // very large matrices.
    const int64_t total_elements = header.num_rows * header.num_cols;
    const int64_t blocksize = header.blocksize;
    const int64_t num_blocks =
        total_elements / blocksize + (total_elements % blocksize != 0 ? 1 : 0);
    // The group size is not stored in the file; 256 is the QLoRA default and
    // the default of gen_data.py.
    const int64_t group_size = 256;
    const int64_t num_groups = (num_blocks + group_size - 1) / group_size;

    // 2. Allocate and read the payload sections in file order.
    std::vector<uint8_t> packed_weights(
        static_cast<std::size_t>(total_elements / 2 + total_elements % 2));
    std::vector<uint8_t> absmax_q(static_cast<std::size_t>(num_blocks));
    std::vector<uint16_t> code2(256);
    std::vector<uint16_t> absmax2(static_cast<std::size_t>(num_groups));
    float offset = 0.0f;

    if (!read_exact(ifs, packed_weights.data(), packed_weights.size()) ||
        !read_exact(ifs, absmax_q.data(), absmax_q.size()) ||
        !read_exact(ifs, code2.data(), code2.size() * sizeof(uint16_t)) ||
        !read_exact(ifs, absmax2.data(), absmax2.size() * sizeof(uint16_t)) ||
        !read_exact(ifs, &offset, sizeof(offset))) {
        std::cerr << "Error: input.bin is truncated\n";
        return -1;
    }

    // 3. CPU dequantization (reference implementation).
    std::vector<float> output(static_cast<std::size_t>(total_elements));
    for (int64_t elem = 0; elem < total_elements; ++elem) {
        // Two 4-bit codes per byte: even elements sit in the low nibble,
        // odd elements in the high nibble.
        const uint8_t byte = packed_weights[static_cast<std::size_t>(elem / 2)];
        const uint8_t code = (elem % 2 == 0) ? (byte & 0x0F) : ((byte >> 4) & 0x0F);

        const int64_t b_idx = elem / blocksize;
        const int64_t g_idx = b_idx / group_size;

        // Double dequantization: the per-block scale is itself quantized to
        // an 8-bit code-book index and rescaled by the per-group absmax.
        const float s1 =
            half_to_float_cpu(code2[absmax_q[static_cast<std::size_t>(b_idx)]]);
        const float s2 = half_to_float_cpu(absmax2[static_cast<std::size_t>(g_idx)]);
        const float scale = s1 * s2;

        output[static_cast<std::size_t>(elem)] = NF4_TABLE[code] * scale + offset;
    }

    // 4. Verify against the FP16 ground truth in gt_output.bin.
    std::ifstream gfs("gt_output.bin", std::ios::binary);
    if (!gfs) {
        std::cerr << "Error: cannot open gt_output.bin\n";
        return -1;
    }
    std::vector<uint16_t> gt(static_cast<std::size_t>(total_elements));
    if (!read_exact(gfs, gt.data(), gt.size() * sizeof(uint16_t))) {
        std::cerr << "Error: gt_output.bin is truncated\n";
        return -1;
    }

    float max_error = 0;
    for (std::size_t i = 0; i < output.size(); ++i) {
        max_error = std::max(max_error, std::abs(output[i] - half_to_float_cpu(gt[i])));
    }
    std::cout << "Max Absolute Error: " << max_error << std::endl;

    return 0;
}
85 changes: 85 additions & 0 deletions 03_nf4_dequant/yuruwind/gen_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#!/usr/bin/python3
import numpy as np
import struct
import argparse

# NF4 lookup table: entry k is the float32 value encoded by 4-bit index k.
# cpu_dequantize.cpp carries the same 16 values in the same order; keep the
# two tables in sync.
NF4_TABLE = np.array(
    [
        -1.0,
        -0.69487101,
        -0.51209301,
        -0.37391701,
        -0.25611401,
        -0.14725500,
        -0.04162400,
        0.06282201,
        0.16859101,
        0.28551400,
        0.40619302,
        0.53675699,
        0.68502200,
        0.87091398,
        1.0,
        0.0,
    ],
    dtype=np.float32,
)

def gen_test_data(rows, cols, blocksize=64, group_size=256):
    """Generate a random double-quantized NF4 test case and its ground truth.

    Writes two files into the current directory:

    * ``input.bin``: header (rows as int64, cols as int64, blocksize as
      int32), packed 4-bit weight indices, one uint8 absmax code per block,
      a 256-entry float16 code book, one float16 absmax per group, and a
      float32 offset (always 0.0).
    * ``gt_output.bin``: the dequantized weights as float16.

    Args:
        rows: Number of rows of the weight matrix (>= 0).
        cols: Number of columns of the weight matrix (>= 0).
        blocksize: Number of weights sharing one absmax code (> 0).
        group_size: Number of blocks sharing one second-level absmax (> 0).
            This value is NOT stored in ``input.bin``.

    Raises:
        ValueError: If ``rows`` or ``cols`` is negative, or ``blocksize`` or
            ``group_size`` is not positive.
    """
    if rows < 0 or cols < 0:
        raise ValueError(f"rows and cols must be non-negative, got {rows}x{cols}")
    if blocksize <= 0 or group_size <= 0:
        raise ValueError(
            f"blocksize and group_size must be positive, got {blocksize} and {group_size}"
        )
    if group_size != 256:
        # The file format has no field for group_size and cpu_dequantize.cpp
        # hard-codes 256, so any other value cannot be decoded by that reader.
        print(
            f"Warning: group_size={group_size} is not stored in input.bin; "
            "cpu_dequantize.cpp assumes 256."
        )

    num_elements = rows * cols
    num_blocks = (num_elements + blocksize - 1) // blocksize
    num_groups = (num_blocks + group_size - 1) // group_size

    # Random 4-bit indices (0-15), one per weight.
    indices = np.random.randint(0, 16, size=num_elements, dtype=np.uint8)

    # Pack two indices per byte: even elements in the low nibble, odd elements
    # in the high nibble. Padding to an even length leaves the high nibble of
    # the last byte at zero when the element count is odd.
    num_bytes = (num_elements + 1) // 2
    nibbles = np.zeros(num_bytes * 2, dtype=np.uint8)
    nibbles[:num_elements] = indices
    packed_weights = (nibbles[1::2] << 4) | (nibbles[0::2] & 0x0F)

    # Second-level scale data.
    # absmax_q: one code-book index per block.
    absmax_q = np.random.randint(0, 256, size=num_blocks, dtype=np.uint8)
    # code2: second-level code book, 256 float16 values.
    code2 = np.random.randn(256).astype(np.float16)
    # absmax2: one float16 value per group.
    absmax2 = np.random.randn(num_groups).astype(np.float16)
    offset = 0.0

    # Ground truth, computed in float32 exactly as the per-element formula
    # NF4_TABLE[index] * code2[absmax_q[block]] * absmax2[group] would, but
    # vectorised so large matrices do not need a Python-level loop.
    # The offset is not added here; it is always written as 0.0.
    group_scale = np.repeat(absmax2.astype(np.float32), group_size)[:num_blocks]
    block_scale = code2[absmax_q].astype(np.float32) * group_scale
    elem_scale = np.repeat(block_scale, blocksize)[:num_elements]
    gt_weights = NF4_TABLE[indices] * elem_scale

    with open("input.bin", "wb") as f:
        # Header: rows (i64), cols (i64), blocksize (i32).
        f.write(struct.pack("qqi", rows, cols, blocksize))
        f.write(packed_weights.tobytes())
        f.write(absmax_q.tobytes())
        f.write(code2.tobytes())
        f.write(absmax2.tobytes())
        f.write(struct.pack("f", offset))

    gt_weights.astype(np.float16).tofile("gt_output.bin")
    print(f"Successfully generated: {rows}x{cols} ({num_elements} elements)")
    print("Files saved: input.bin, gt_output.bin")

def f16_to_f32(val):
    """Round ``val`` to float16, then widen the result to float32.

    Returns a NumPy array (0-d for scalar input) of dtype float32.
    """
    as_half = np.array(val, dtype=np.float16)
    return as_half.astype(np.float32)

if __name__ == "__main__":
    # Command-line entry point: two positional matrix dimensions plus optional
    # block and group sizes, forwarded unchanged to gen_test_data.
    cli = argparse.ArgumentParser(description="Generate NF4 test data.")
    cli.add_argument("rows", type=int, help="Number of rows")
    cli.add_argument("cols", type=int, help="Number of columns")
    cli.add_argument("--blocksize", type=int, default=64, help="Block size (default: 64)")
    cli.add_argument("--group-size", type=int, default=256, help="Group size (default: 256)")

    opts = cli.parse_args()
    gen_test_data(opts.rows, opts.cols, opts.blocksize, opts.group_size)
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading