InfiniTensor · yuruwind · Mar 15, 2026 · Mar 15, 2026
diff --git a/03_nf4_dequant/yuruwind/Summary_report.md b/03_nf4_dequant/yuruwind/Summary_report.md
diff --git a/03_nf4_dequant/yuruwind/config.txt b/03_nf4_dequant/yuruwind/config.txt
@@ -0,0 +1,3 @@
+blocksize=64
+compute_type=fp16
+target_gpu=4060
diff --git a/03_nf4_dequant/yuruwind/cpu_dequantize.cpp b/03_nf4_dequant/yuruwind/cpu_dequantize.cpp
@@ -0,0 +1,110 @@
+#include <iostream>
+#include <vector>
+#include <fstream>
+#include <cmath>
+#include <stdint.h>
+#include <algorithm>
+
+// NF4 lookup table: indexed by a 4-bit code (0-15), yields the normalized float value for that code.
+const float NF4_TABLE[16] = {  // NOTE(review): values differ from the bitsandbytes NF4 code table (0.0 at index 7, 1.0 at index 15) - confirm against the task spec
+    -1.0f, -0.69487101f, -0.51209301f, -0.37391701f,
+    -0.25611401f, -0.14725500f, -0.04162400f, 0.06282201f,
+    0.16859101f, 0.28551400f, 0.40619302f, 0.53675699f,
+    0.68502200f, 0.87091398f, 1.0f, 0.0f
+};
+
+// FP16 conversion: expands a 16-bit half-precision bit pattern `h` into the equivalent 32-bit float.
+float half_to_float_cpu(uint16_t h) {
+    union { float f; uint32_t i; } res; // float bits are assembled in .i and read back through .f
+    uint32_t sign = (h & 0x8000) << 16; // sign bit moved from bit 15 to bit 31
+    uint32_t exp = (h & 0x7c00) >> 10; // 5-bit biased exponent
+    uint32_t mant = (h & 0x03ff) << 13; // 10-bit mantissa shifted to the top of the 23-bit float mantissa
+    if (exp == 0x1f) { // all-ones exponent: Inf when mant == 0, otherwise NaN
+        res.i = sign | 0x7f800000 | mant;
+    } else if (exp == 0) { // zero or subnormal half
+        if (mant == 0) res.i = sign; // signed zero
+        else {
+            exp = 127 - 14; // starting float exponent; decremented once per normalization shift below
+            while (!(mant & 0x00800000)) { mant <<= 1; exp--; } // shift until the leading 1 reaches bit 23 (the implicit-bit position)
+            res.i = sign | (exp << 23) | (mant & 0x007fffff); // mask off the now-implicit leading 1
+        }
+    } else { // normal number: re-bias the exponent from 15 to 127
+        res.i = sign | ((exp + (127 - 15)) << 23) | mant;
+    }
+    return res.f;
+}
+
+struct WeightHeader { // header fields that main() reads from the start of input.bin, in this order
+    int64_t num_rows; // num_rows * num_cols is the total number of quantized elements
+    int64_t num_cols;
+    int32_t blocksize; // number of consecutive elements that share one absmax_q entry
+};
+
+int main() { // reads input.bin, dequantizes on the CPU, and prints the max absolute error against gt_output.bin
+    std::ifstream ifs("input.bin", std::ios::binary);
+    if (!ifs) return -1;
+
+    // 1. Read the header (field by field, to avoid offset errors caused by struct alignment/padding)
+    WeightHeader header;
+    ifs.read(reinterpret_cast<char*>(&header.num_rows), sizeof(header.num_rows));
+    ifs.read(reinterpret_cast<char*>(&header.num_cols), sizeof(header.num_cols));
+    ifs.read(reinterpret_cast<char*>(&header.blocksize), sizeof(header.blocksize));
+
+    int64_t total_elements = header.num_rows * header.num_cols;
+    int32_t num_blocks = (total_elements + header.blocksize - 1) / header.blocksize; // ceiling division; NOTE(review): int64 result is narrowed to int32 and blocksize == 0 is not guarded
+    int32_t group_size = 256; // QLoRA default; hard-coded here, not read from the file
+    int32_t num_groups = (num_blocks + group_size - 1) / group_size;
+
+    // 2. Allocate buffers and read the data (NOTE(review): none of these reads is checked for success)
+    std::vector<uint8_t> packed_weights((total_elements + 1) / 2); // two 4-bit codes per byte, rounded up
+    std::vector<uint8_t> absmax_q(num_blocks); // one 8-bit index into code2 per block
+    std::vector<uint16_t> code2(256); // 256 raw fp16 bit patterns
+    std::vector<uint16_t> absmax2(num_groups); // one raw fp16 bit pattern per group
+    float offset;
+
+    ifs.read(reinterpret_cast<char*>(packed_weights.data()), packed_weights.size());
+    ifs.read(reinterpret_cast<char*>(absmax_q.data()), absmax_q.size());
+    ifs.read(reinterpret_cast<char*>(code2.data()), code2.size() * 2);
+    ifs.read(reinterpret_cast<char*>(absmax2.data()), absmax2.size() * 2);
+    ifs.read(reinterpret_cast<char*>(&offset), 4);
+
+    // 3. CPU dequantization logic (prototype)
+    std::vector<float> output(total_elements);
+    for (int64_t i = 0; i < static_cast<int64_t>(packed_weights.size()); ++i) {
+        uint8_t byte = packed_weights[i];
+
+        // Split the byte into two 4-bit indices
+        uint8_t idxs[2];
+        idxs[0] = byte & 0x0F;         // low 4 bits -> element 2*i
+        idxs[1] = (byte >> 4) & 0x0F;  // high 4 bits -> element 2*i + 1
+
+        for (int j = 0; j < 2; ++j) {
+            int64_t curr_idx = i * 2 + j;
+            if (curr_idx >= total_elements) { // skip the unused high nibble of the last byte when the element count is odd
+                continue;
+            }
+            int32_t b_idx = curr_idx / header.blocksize;
+            int32_t g_idx = b_idx / group_size;
+
+            // Double-dequantization formula: scale = code2[absmax_q[block]] * absmax2[group]
+            float s1 = half_to_float_cpu(code2[absmax_q[b_idx]]);
+            float s2 = half_to_float_cpu(absmax2[g_idx]);
+            float scale = s1 * s2;
+
+            output[curr_idx] = NF4_TABLE[idxs[j]] * scale + offset; // NOTE(review): offset is added to the final value; bitsandbytes nested dequantization adds it to the per-block absmax instead - confirm which the task spec intends
+        }
+    }
+
+    // 4. Verify the result (reads gt_output.bin; NOTE(review): the open and read are not checked for success)
+    std::vector<uint16_t> gt(total_elements);
+    std::ifstream gfs("gt_output.bin", std::ios::binary);
+    gfs.read(reinterpret_cast<char*>(gt.data()), total_elements * 2);
+
+    float max_error = 0;
+    for(int i=0; i<total_elements; ++i) { // NOTE(review): int index compared against an int64 bound
+        max_error = std::max(max_error, std::abs(output[i] - half_to_float_cpu(gt[i]))); // NOTE(review): a NaN difference leaves max_error unchanged, because std::max keeps its first argument when the comparison is false
+    }
+    std::cout << "Max Absolute Error: " << max_error << std::endl;
+
+    return 0;
+}
diff --git a/03_nf4_dequant/yuruwind/gen_data.py b/03_nf4_dequant/yuruwind/gen_data.py
@@ -0,0 +1,85 @@
+#!/usr/bin/python3
+import numpy as np
+import struct
+import argparse
+
+# Standard NF4 lookup table (same 16 values as NF4_TABLE in cpu_dequantize.cpp)
+NF4_TABLE = np.array([
+    -1.0, -0.69487101, -0.51209301, -0.37391701,
+    -0.25611401, -0.14725500, -0.04162400, 0.06282201,
+    0.16859101, 0.28551400, 0.40619302, 0.53675699,
+    0.68502200, 0.87091398, 1.0, 0.0
+], dtype=np.float32)
+
+def gen_test_data(rows, cols, blocksize=64, group_size=256):  # writes input.bin and gt_output.bin into the current working directory
+    num_elements = rows * cols
+    num_blocks = (num_elements + blocksize - 1) // blocksize
+    num_groups = (num_blocks + group_size - 1) // group_size
+
+    # Randomly generate 4-bit indices (0-15); the RNG is not seeded, so output differs between runs
+    indices = np.random.randint(0, 16, size=num_elements, dtype=np.uint8)
+
+    # Pack weights: handles the case of an odd number of elements
+    num_bytes = (num_elements + 1) // 2
+    packed_weights = np.zeros(num_bytes, dtype=np.uint8)
+
+    for i in range(num_elements // 2):
+        packed_weights[i] = (indices[2*i+1] << 4) | (indices[2*i] & 0x0F)  # even element -> low nibble, odd element -> high nibble
+
+    # If the total is odd, handle the last half byte separately (stored in the low 4 bits)
+    if num_elements % 2 != 0:
+        packed_weights[-1] = indices[-1] & 0x0F
+
+    # Generate the second-level scale factor data
+    # absmax_q (one index per block, pointing into code2)
+    absmax_q = np.random.randint(0, 256, size=num_blocks, dtype=np.uint8)
+    # code2 (second-level codebook, 256 fp16 values)
+    code2 = np.random.randn(256).astype(np.float16)
+    # absmax2 (one fp16 per group)
+    absmax2 = np.random.randn(num_groups).astype(np.float16)
+    offset = 0.0
+
+    # Compute the ground truth (used for verification)
+    gt_weights = np.zeros(num_elements, dtype=np.float32)
+    for i in range(num_elements):
+        b_idx = i // blocksize
+        g_idx = b_idx // group_size
+        scale = f16_to_f32(code2[absmax_q[b_idx]]) * f16_to_f32(absmax2[g_idx])
+        gt_weights[i] = NF4_TABLE[indices[i]] * scale  # offset is not applied here; it is fixed at 0.0 above
+
+    with open("input.bin", "wb") as f:
+        # Header: rows(i64), cols(i64), blocksize(i32); group_size is not written to the file
+        f.write(struct.pack("qqi", rows, cols, blocksize))
+        f.write(packed_weights.tobytes())
+        f.write(absmax_q.tobytes())
+        f.write(code2.tobytes())
+        f.write(absmax2.tobytes())
+        f.write(struct.pack("f", offset))
+
+    gt_weights.astype(np.float16).tofile("gt_output.bin")  # ground truth is stored as fp16
+    print(f"Successfully generated: {rows}x{cols} ({num_elements} elements)")
+    print("Files saved: input.bin, gt_output.bin")
+
+def f16_to_f32(val): # conversion helper: round val to float16, then widen it to float32
+    return np.array(val, dtype=np.float16).astype(np.float32)
+
+if __name__ == "__main__":  # command-line entry point: parse the arguments and generate the test files
+    parser = argparse.ArgumentParser(description="Generate NF4 test data.")
+    parser.add_argument("rows", type=int, help="Number of rows")
+    parser.add_argument("cols", type=int, help="Number of columns")
+    parser.add_argument(
+        "--blocksize",
+        type=int,
+        default=64,
+        help="Block size (default: 64)",
+    )
+    parser.add_argument(
+        "--group-size",
+        type=int,
+        default=256,
+        help="Group size (default: 256)",
+    )
+
+    args = parser.parse_args()
+
+    gen_test_data(args.rows, args.cols, args.blocksize, args.group_size)  # argparse exposes --group-size as args.group_size
diff --git a/03_nf4_dequant/yuruwind/img/image-20260315174840901.png b/03_nf4_dequant/yuruwind/img/image-20260315174840901.png
diff --git a/03_nf4_dequant/yuruwind/img/屏幕截图_20260315_181513.png b/03_nf4_dequant/yuruwind/img/屏幕截图_20260315_181513.png
diff --git a/03_nf4_dequant/yuruwind/img/屏幕截图_20260315_182006.png b/03_nf4_dequant/yuruwind/img/屏幕截图_20260315_182006.png
diff --git a/03_nf4_dequant/yuruwind/img/屏幕截图_20260315_182040.png b/03_nf4_dequant/yuruwind/img/屏幕截图_20260315_182040.png
diff --git a/03_nf4_dequant/yuruwind/img/屏幕截图_20260315_182110.png b/03_nf4_dequant/yuruwind/img/屏幕截图_20260315_182110.png
diff --git a/03_nf4_dequant/yuruwind/img/屏幕截图_20260315_182718.png b/03_nf4_dequant/yuruwind/img/屏幕截图_20260315_182718.png