diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b0cd528 --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +build/ +bilateral_filter +lib/ +*.raw +*.o +*.prof +*.nsys-rep +*.ncu-rep +.vscode/ \ No newline at end of file diff --git a/08_bilateral_filter/.clang-format b/08_bilateral_filter/.clang-format new file mode 100644 index 0000000..bd66366 --- /dev/null +++ b/08_bilateral_filter/.clang-format @@ -0,0 +1,87 @@ +BasedOnStyle: LLVM +Language: Cpp + +# Indentation +IndentWidth: 4 +TabWidth: 4 +UseTab: Never +ContinuationIndentWidth: 4 +IndentCaseLabels: false +IndentPPDirectives: None +NamespaceIndentation: None + +# Line length +ColumnLimit: 100 + +# Braces +BreakBeforeBraces: Attach +BraceWrapping: + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterStruct: false + BeforeElse: false + BeforeWhile: false + +# Alignment +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignConsecutiveMacros: true +AlignEscapedNewlines: Left +AlignOperands: true +AlignTrailingComments: true + +# Pointer and reference alignment +DerivePointerAlignment: false +PointerAlignment: Left + +# Includes +IncludeBlocks: Regroup +IncludeCategories: + - Regex: '^' + Priority: 1 + - Regex: '^<.*\.h>' + Priority: 2 + - Regex: '^<.*>' + Priority: 3 + - Regex: '".*"' + Priority: 4 +SortIncludes: true + +# Spaces +SpaceAfterCStyleCast: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeParens: ControlStatements +SpaceInEmptyParentheses: false +SpacesInAngles: false +SpacesInCStyleCastParentheses: false +SpacesInContainerLiterals: true +SpacesInParentheses: false +SpacesInSquareBrackets: false + +# Line breaks +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: Inline +AllowShortIfStatementsOnASingleLine: 
false +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterReturnType: None +AlwaysBreakTemplateDeclarations: Yes +BinPackArguments: true +BinPackParameters: true +BreakBeforeBinaryOperators: None +BreakBeforeTernaryOperators: true +BreakConstructorInitializers: BeforeColon +BreakStringLiterals: true + +# Others +Cpp11BracedListStyle: true +FixNamespaceComments: true +MaxEmptyLinesToKeep: 1 +ReflowComments: true +SortUsingDeclarations: true \ No newline at end of file diff --git a/08_bilateral_filter/Andy1314Chen/Makefile b/08_bilateral_filter/Andy1314Chen/Makefile new file mode 100644 index 0000000..aba81b4 --- /dev/null +++ b/08_bilateral_filter/Andy1314Chen/Makefile @@ -0,0 +1,53 @@ +CXX = g++ +NVCC = nvcc +CUDA_PATH ?= /usr/local/cuda +CXXFLAGS = -std=c++17 -O3 -Wall -Wextra -I./include -I$(CUDA_PATH)/include +NVCCFLAGS = -O3 -gencode arch=compute_110,code=sm_110 -I./include + +OPENCV_CFLAGS = $(shell pkg-config --cflags opencv4) +OPENCV_LIBS = $(shell pkg-config --libs opencv4) +CUDA_LIBS = -L$(CUDA_PATH)/lib64 -lcudart + +# Detect OpenCV CUDA module (cudaimgproc) +HAVE_OPENCV_CUDA := $(shell pkg-config --libs opencv4 2>/dev/null | grep -q cudaimgproc && echo 1) +ifeq ($(HAVE_OPENCV_CUDA),1) + CXXFLAGS += -DHAVE_OPENCV_CUDA + $(info OpenCV CUDA modules detected: enabling cv::cuda::bilateralFilter baseline) +else + $(info OpenCV CUDA modules not found: cv::cuda baseline disabled) +endif + +SRC_DIR = src +BUILD_DIR = build +BIN_DIR = . 
+ +SRCS_CPP = $(SRC_DIR)/main.cpp \ + $(SRC_DIR)/image_io.cpp \ + $(SRC_DIR)/bilateral_filter_cpu.cpp \ + $(SRC_DIR)/bilateral_filter_opencv.cpp + +SRCS_CUDA = $(SRC_DIR)/bilateral_filter_cuda.cu + +OBJS_CPP = $(patsubst $(SRC_DIR)/%.cpp,$(BUILD_DIR)/%.o,$(SRCS_CPP)) +OBJS_CUDA = $(patsubst $(SRC_DIR)/%.cu,$(BUILD_DIR)/%.o,$(SRCS_CUDA)) + +TARGET = $(BIN_DIR)/bilateral_filter + +.PHONY: all clean + +all: $(TARGET) + +$(BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp + @mkdir -p $(BUILD_DIR) + $(CXX) $(CXXFLAGS) $(OPENCV_CFLAGS) -c -o $@ $< + +$(BUILD_DIR)/%.o: $(SRC_DIR)/%.cu + @mkdir -p $(BUILD_DIR) + $(NVCC) $(NVCCFLAGS) -c -o $@ $< + +$(TARGET): $(OBJS_CPP) $(OBJS_CUDA) + @mkdir -p $(BUILD_DIR) + $(NVCC) -o $@ $^ $(OPENCV_LIBS) $(CUDA_LIBS) + +clean: + rm -rf $(BUILD_DIR) $(TARGET) diff --git a/08_bilateral_filter/README.md b/08_bilateral_filter/Andy1314Chen/README.md similarity index 100% rename from 08_bilateral_filter/README.md rename to 08_bilateral_filter/Andy1314Chen/README.md diff --git a/08_bilateral_filter/Andy1314Chen/REPORT_course.md b/08_bilateral_filter/Andy1314Chen/REPORT_course.md new file mode 100644 index 0000000..41ee5f8 --- /dev/null +++ b/08_bilateral_filter/Andy1314Chen/REPORT_course.md @@ -0,0 +1,923 @@ +# CUDA 双边滤波优化实验报告 + +**题目**:CUDA 双边滤波优化实验与性能分析 +**课程**:并行计算 / GPU 编程 +**作者**:Andy1314Chen +**日期**:2026-02-27 +**项目路径**:`08_bilateral_filter/Andy1314Chen` + +--- + +## 目录 + +1. [引言](#1-引言) +2. [算法原理与任务定义](#2-算法原理与任务定义) +3. [CUDA 实现模式设计](#3-cuda-实现模式设计) +4. [实验环境与评测方法](#4-实验环境与评测方法) +5. [基线实现与瓶颈分析](#5-基线实现与瓶颈分析) +6. [优化策略与逐步实验](#6-优化策略与逐步实验) +7. [性能结果与质量验证](#7-性能结果与质量验证) +8. [Profiler 深度分析](#8-profiler-深度分析) +9. [失败实验与优化边界](#9-失败实验与优化边界) +10. [跨平台对比分析](#10-跨平台对比分析) +11. [结论与展望](#11-结论与展望) +12. [附录:完整数据图表](#12-附录完整数据图表) + +--- + +## 1. 
引言 + +双边滤波是一种经典的非线性保边平滑滤波器,在 VR 头显弱光降噪、医学图像处理等领域有广泛应用。然而,其计算复杂度为 $O((2r+1)^2)$,当滤波半径 $r=5$ 时每个像素需进行 121 次邻域访问和权重计算,对 4K(3840×2160)@60fps 实时处理构成极大挑战。 + +本文目标:在 CUDA 平台上实现高性能双边滤波,满足以下指标: +- **吞吐量**:4K@60fps 即 ≥498 MP/s +- **正确性**:MAE < 1.0、PSNR > 40 dB(以 OpenCV CPU `bilateralFilter` 为参考) +- **性能**:相对 OpenCV CPU 显著加速,并超越 OpenCV CUDA + +实验经历了 12 个迭代版本和 15 个优化实验(含 5 个失败实验),最终在两个 GPU 平台上验证了优化效果。 + +--- + +## 2. 算法原理与任务定义 + +### 2.1 双边滤波公式 + +$$ +BF[I]_p = \frac{1}{W_p} \sum_{q \in \mathcal{S}} G_{\sigma_s}(\|p-q\|) \cdot G_{\sigma_r}(|I_p - I_q|) \cdot I_q +$$ + +其中: +- $G_{\sigma_s}(\|p-q\|) = \exp\left(-\frac{\|p-q\|^2}{2\sigma_s^2}\right)$ 为**空间高斯权重** +- $G_{\sigma_r}(|I_p-I_q|) = \exp\left(-\frac{|I_p-I_q|^2}{2\sigma_r^2}\right)$ 为**值域高斯权重** +- $W_p = \sum_{q} G_{\sigma_s} \cdot G_{\sigma_r}$ 为归一化因子 + +在平坦区域,颜色相似使权重近似空间高斯,实现充分平滑;在边缘处,颜色差异大使值域权重趋零,避免跨边缘模糊。 + +### 2.2 计算量分析 + +对于 4K RGB 图像(3840×2160×3),radius=5: +- 每个像素:$(2 \times 5+1)^2 = 121$ 次邻域访问 +- 每次访问:1 次空间权重 + 1 次值域权重(含 `expf`)+ 加权累加 +- 全图:$8.3M \times 121 \approx 10$ 亿次浮点运算 + +### 2.3 代码结构 + +``` +src/ + bilateral_filter_cuda.cu # 全部 CUDA kernel(6 种模式)、LUT、pipeline + bilateral_filter_cpu.cpp # CPU 参考实现 + bilateral_filter_opencv.cpp # OpenCV CPU + CUDA 封装 + main.cpp # CLI 入口(--bench/--cuda/--compare-all) + image_io.cpp # 二进制 raw 图像 I/O +include/ + bilateral_filter_cuda.cuh # CUDA 滤波声明 + bilateral_filter.h # CPU 滤波声明 + image_io.h # ImageData/FilterParams 结构体 +``` + +--- + +## 3. CUDA 实现模式设计 + +本项目实现了 6 种 CUDA 滤波模式,通过环境变量 `BILATERAL_MODE` 在运行时切换。各模式在算法复杂度、编译器优化能力、精度和适用场景上有本质差异。以下从代码层面逐一分析。 + +### 3.1 STANDARD 模式(MODE=0)—— 运行时半径 + +STANDARD 是最通用的 fallback 实现:半径作为**运行时参数**传入 kernel,shared memory 大小通过 `extern __shared__` 动态分配。 + +```cpp +// bilateral_filter_cuda.cu:1031-1034 — 运行时半径,动态 shared memory +__global__ void k_bilateral_filter_shared( + const InT* __restrict__ input, OutT* __restrict__ output, + int width, int height, int radius) { + extern __shared__ float smem[]; // 大小在 launch 时指定 + ... 
+ #pragma unroll 4 // 仅提示展开因子 4(循环界不确定) + for (int dy = -radius; dy <= radius; ++dy) { ... } +} +``` + +**核心特征**: +- 支持**任意半径值**,无需预编译 +- 循环界为运行时变量,编译器**无法完全展开**,仅按 `#pragma unroll 4` 部分展开 +- RGB 版本对三通道使用**独立权重**(每通道单独查 LUT、独立累加),精度最高但 LUT 访问量为 TEMPLATE 的 3 倍 + +```cpp +// bilateral_filter_cuda.cu:1160-1180 — STANDARD RGB: 三通道独立权重 +float cw_r = d_color_lut[diff_r]; // R 通道权重 +float cw_g = d_color_lut[diff_g]; // G 通道权重 +float cw_b = d_color_lut[diff_b]; // B 通道权重 +sum_r += nr * (spatial_weight * cw_r); // 各通道独立加权 +sum_g += ng * (spatial_weight * cw_g); +sum_b += nb * (spatial_weight * cw_b); +``` + +**适用场景**:半径不在预编译列表(3/5/7/9/10)中时的通用回退。 + +### 3.2 TEMPLATE 模式(MODE=1)—— 编译期半径 + +TEMPLATE 是性能最均衡的模式:将半径作为 **C++ 模板参数**,使编译器获得完整循环界信息。 + +```cpp +// bilateral_filter_cuda.cu:74-76 — 编译期半径 +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM) +k_bilateral_filter_gray_template(...) { + constexpr int TILE_W = BLOCK_X + 2 * RADIUS; // 编译期常量 + ... + #pragma unroll // 编译器可完全展开(RADIUS 已知) + for (int dy = -RADIUS; dy <= RADIUS; ++dy) { + #pragma unroll + for (int dx = -RADIUS; dx <= RADIUS; ++dx) { ... 
} + } +} +``` + +**与 STANDARD 的关键差异**: + +| 维度 | STANDARD | TEMPLATE | +|------|----------|----------| +| 半径 | 运行时参数 `int radius` | 编译期常量 `template` | +| Shared memory | `extern __shared__` 动态 | `__shared__ float smem[编译期大小]` | +| 循环展开 | `#pragma unroll 4`(部分) | `#pragma unroll`(**完全展开**) | +| 编译器优化 | 有限(循环界未知) | **DCE + 寄存器优化 + 指令调度** | +| RGB 权重 | 三通道独立(3 次 LUT) | **均值近似**(1 次 LUT) | + +TEMPLATE 的 RGB kernel 使用**单一色彩权重近似**——三通道差值的均值作为查表索引,LUT 查找从 3 次降为 1 次: + +```cpp +// bilateral_filter_cuda.cu:218-224 — TEMPLATE RGB: 单色权重近似 +int diff = static_cast( + (fabsf(nr - center_r) + fabsf(ng - center_g) + fabsf(nb - center_b)) + * (1.0f / 3.0f) + 0.5f); +float w = spatial_weight * d_color_lut[diff]; // 1 次查表,3 通道共享 +``` + +**圆形窗口 DCE**:由于 RADIUS 是编译期常量,`#pragma unroll` 完全展开 121 次迭代后,编译器在编译期确定哪 40 个位置的 `d_spatial_lut` 恒为 0,通过 Dead Code Elimination 直接删除这些迭代,等效于仅执行 81 次。 + +运行时通过 `switch(radius)` 分发到预编译的模板实例: + +```cpp +// 支持 radius = 3, 5, 7, 9, 10 五种预编译版本 +switch (radius) { + case 3: launch_u8_gray<3>(d_in, d_out, w, h, s); break; + case 5: launch_u8_gray<5>(d_in, d_out, w, h, s); break; + case 7: launch_u8_gray<7>(d_in, d_out, w, h, s); break; + ... +} +``` + +### 3.3 SEPARABLE 模式(MODE=2)—— 分离近似 + +SEPARABLE 是**算法级优化**:将 2D 双边滤波近似为水平 + 垂直两次 1D 滤波。 + +严格来说,双边滤波因值域权重依赖像素值而**不可分离**。但实践中近似分离的质量损失很小(MAE 0.45 vs STANDARD 0.48),且复杂度从 $O((2r+1)^2)$ 降至 $O(2 \times (2r+1))$——r=5 时从 121 次降到 22 次。 + +**水平 pass**:每行加载到 shared memory,沿行方向滤波: + +```cpp +// bilateral_filter_cuda.cu:246-296 — 水平 kernel +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM_SEP) +k_bilateral_horizontal_gray(...) 
{ + __shared__ float smem[BLOCK_Y][TILE_W_PAD]; // 每行独立 + // 加载行 + halo + for (int i = tx; i < TILE_W; i += BLOCK_X) { + smem[ty][i] = static_cast(input[gy * width + gx]); + } + __syncthreads(); + // 1D 滤波(仅沿 dx 方向) + for (int dx = -RADIUS; dx <= RADIUS; ++dx) { + float neighbor = smem[ty][lx + dx]; + float spatial_weight = d_spatial_lut[RADIUS * LUT_WIDTH + (dx + RADIUS)]; + ... + } +} +``` + +**垂直 pass**:每列加载到 shared memory,沿列方向滤波: + +```cpp +// bilateral_filter_cuda.cu:298-347 — 垂直 kernel +template +__global__ void k_bilateral_vertical_gray(...) { + __shared__ float smem[TILE_H][BLOCK_X]; // 每列独立 + // 1D 滤波(仅沿 dy 方向) + for (int dy = -RADIUS; dy <= RADIUS; ++dy) { + float neighbor = smem[ly + dy][tx]; + float spatial_weight = d_spatial_lut[(dy + RADIUS) * LUT_WIDTH + RADIUS]; + ... + } +} +``` + +**与 TEMPLATE 2D 的关键差异**: + +| 维度 | TEMPLATE (2D) | SEPARABLE (H+V) | +|------|---------------|-----------------| +| 复杂度 | $O((2r+1)^2) = 121$ 次 | $O(2 \times (2r+1)) = 22$ 次 | +| Kernel 数 | 1 个 | 2 个(H + V) | +| 中间缓冲区 | 无 | float 或 FP16(SoA 布局) | +| Shared memory | 2D tile (H×W) | 1D 行或列 | +| 精度 | 精确 2D 滤波 | 近似(可分离假设) | +| launch_bounds | `(256, 4)` | `(256, 6)`(更激进) | + +### 3.4 SEPARABLE_FP16 模式(MODE=3)—— FP16 中间缓冲 + +在 SEPARABLE 基础上,将两趟之间的中间缓冲区从 `float`(4B)改为 `__half`(2B),**全局内存带宽减半**。计算仍在 FP32 进行,精度无损。 + +```cpp +// bilateral_filter_cuda.cu:1332-1335 +// H kernel: uint8 → smem(float) → compute(float) → output(__half) +// V kernel: input(__half) → smem(float) → compute(float) → output(uint8) +// 仅 H↔V 传输使用 FP16,计算精度不受影响。 +``` + +ncu 验证:V kernel LD sectors/request 从 4.00 降至 2.00(**-50%**),Gray 模式端到端提速 **7-10%**。 + +### 3.5 ADAPTIVE 模式(MODE=4)—— 自适应半径 + +ADAPTIVE 是**质量优先**的模式:先用 Sobel 算子计算逐像素梯度,再将梯度映射为半径——平坦区域用大半径(充分平滑),边缘区域用小半径(保留细节)。 + +**第一步:梯度计算 → 半径映射** + +```cpp +// bilateral_filter_cuda.cu:1491-1506 — Sobel 梯度 → radius +float gx = -p00 + p02 - 2.0f*p10 + 2.0f*p12 - p20 + p22; +float gy = -p00 - 2.0f*p01 - p02 + p20 + 2.0f*p21 + p22; +float grad = sqrtf(gx*gx + 
gy*gy); +// 线性插值:grad=0 → r_max,grad≥threshold → r_min +float t = fminf(grad * inv_grad_threshold, 1.0f); +int r = __float2int_rn(r_max - t * (r_max - r_min)); +radius_map[y * width + x] = static_cast(r); +``` + +**第二步:使用 per-pixel radius 的双边滤波** + +```cpp +// bilateral_filter_cuda.cu:1551-1588 — 自适应 kernel +k_bilateral_adaptive_gray(..., const uint8_t* radius_map, ...) { + // shared memory 按 r_max 分配(保证所有线程可访问最大邻域) + const int tile_w = BLOCK_X + 2 * r_max; + ... + const int my_radius = static_cast(radius_map[y * width + x]); + for (int dy = -my_radius; dy <= my_radius; ++dy) { ... } +} +``` + +**与其他模式的关键差异**: + +| 维度 | TEMPLATE/STANDARD | ADAPTIVE | +|------|-------------------|----------| +| 半径 | 全图统一 | **逐像素不同** | +| 预处理 | 无 | 额外 Sobel 梯度 kernel | +| Warp divergence | 无(循环界一致) | **有**(同 warp 内不同线程循环次数不同) | +| Shared memory | 按实际 radius 分配 | 按 **r_max** 分配(最坏情况) | +| MAE | 0.48-0.60 | **0.40**(最优) | +| 性能 | 较快 | 较慢(梯度 pass + divergence 开销) | + +### 3.6 FUSED 模式(MODE=5)—— 融合 H+V + +实验性模式:将 SEPARABLE 的两个 kernel 融合为单 kernel,消除中间缓冲区的全局内存读写。 + +``` +单 kernel 三阶段: +Phase 0: 加载 2D halo → smem_raw +Phase 1: 对全部行做水平滤波 → smem_h(smem 内,无全局写入) +Phase 2: 从 smem_h 做垂直滤波 → 输出 +``` + +实测性能倒退 31-34%(见第 9 章失败实验),已标记为实验性。 + +### 3.7 模式对比总结 + +下表为 **Jetson AGX Thor** 实测数据(统一内存,无 PCIe 瓶颈,kernel 性能直接反映端到端)。OpenCV CPU 82 ms = 1.00x。 + +| 模式 | 环境变量 | 复杂度 | MAE | Avg (ms) | Min (ms) | vs OCV CPU | vs OCV CUDA | 核心特点 | +|------|---------|:---:|---:|---:|---:|---:|---:|---------| +| **SEP_FP16** | **`MODE=3`** | **$O(r)$** | **0.46** | **3.01** | **2.95** | **27.2x** | **4.00x** | **SEPARABLE + FP16 中间缓冲,最快** | +| SEPARABLE | `MODE=2` | $O(r)$ | 0.45 | 3.06 | 2.99 | 26.8x | 3.94x | 水平+垂直分离近似 | +| TEMPLATE | `MODE=1` | $O(r^2)$ | 0.60 | 5.48 | 5.44 | 15.0x | 2.23x | 编译期半径,完全展开 + DCE | +| ADAPTIVE | `MODE=4` | $O(r_{avg}^2)$ | **0.40** | 6.17 | 6.11 | 13.3x | 1.94x | Sobel 自适应半径,最高精度 | +| STANDARD | `MODE=0` | $O(r^2)$ | 0.48 | 9.41 | 9.26 | 8.70x | 1.29x | 运行时半径,三通道独立权重 | +| OpenCV CUDA | — 
| — | 0.00 | 12.07 | 11.79 | 6.79x | 1.00x | GPU 参考基线 | +| **OpenCV CPU** | — | — | — | **82** | — | **1.00x** | — | **CPU 基线** | + +--- + +## 4. 实验环境与评测方法 + +### 4.1 硬件平台 + +| 规格 | RTX 4060 (桌面) | Jetson AGX Thor (嵌入式) | +|------|:---:|:---:| +| GPU 架构 | Ada Lovelace (sm_89) | Blackwell (sm_110) | +| SM 数量 | 24 | 20 | +| 显存/统一内存 | 8 GB GDDR6 (独立) | 128 GB LPDDR5x (共享) | +| L2 Cache | 24 MB | 32 MB | +| Shared Memory/SM | 100 KB | 228 KB | +| CPU-GPU 互联 | PCIe 4.0 x8 (WSL2) | 统一内存 (无 PCIe) | +| CUDA | 13.1 | 13.0 | +| OpenCV | 4.13.0 (with CUDA) | 4.x (with CUDA) | + +### 4.2 评测方法 + +- **Benchmark 流程**:5 次 warmup + 50 次计时,报告 mean ± stddev +- **测试数据**:4K/1080p × RGB/Gray,参数 radius=5, σ_s=3.0, σ_c=30.0 +- **基线口径**: + - **主基线**:OpenCV CPU `bilateralFilter`(正确性参照 + 性能基准) + - **次基线**:OpenCV CUDA `cv::cuda::bilateralFilter`(GPU 性能参照) +- **工具**:`ncu --set full`(硬件计数器)、`nsys profile`(时间线) + +--- + +## 5. 基线实现与瓶颈分析 + +### 5.1 OpenCV CPU 基线 + +本项目以 OpenCV `cv::bilateralFilter`(CPU 版本)作为 **主基线**——它是工业级优化的参考实现,也是正确性验证的金标准。 + +4K RGB 耗时 **82 ms**(~101 MP/s),作为 1.00x 基线。 + +项目同时提供了一份朴素 CPU 实现(`bilateral_filter_cpu.cpp`),用于展示算法逻辑。其核心循环与 CUDA naive kernel 对应: + +```cpp +// bilateral_filter_cpu.cpp — 朴素三重循环(仅作算法参照,不作为性能基线) +for (int dy = -radius; dy <= radius; ++dy) { + for (int dx = -radius; dx <= radius; ++dx) { + float spatial_weight = expf((dx*dx + dy*dy) * spatial_coeff); + float color_dist = neighbor_val - center_val; + float color_weight = expf(color_dist * color_dist * color_coeff); + float weight = spatial_weight * color_weight; + sum += neighbor_val * weight; + weight_sum += weight; + } +} +output[idx] = sum / weight_sum; +``` + +> 该朴素 CPU 实现在 1080p RGB 上耗时约 6918 ms,比 OpenCV CPU(~27 ms)慢约 **256 倍**(OpenCV 内部使用 SIMD + 缓存优化),不作为性能对比基准。 + +### 5.2 Naive CUDA Kernel(v1) + +最初的 CUDA 实现将 CPU 逻辑直接迁移到 GPU:每个线程处理一个像素,从全局内存独立读取邻域数据,实时计算 `expf`。4K RGB 耗时 **250 ms**(33 MP/s),甚至慢于 CPU——原因在于: + +### 5.3 瓶颈分析 + +| 瓶颈 | 分析 | 量化 | +|------|------|------| +| **全局内存冗余** | 16×16 block + r=5 
时,256 线程发起 30,976 次访问,实际不重复仅 676 个像素,冗余率 **45x** | 带宽浪费 ~97% | +| **expf 调用代价** | 每像素 121 次 `expf`(~20 cycles/次),RGB 三通道 363 次 | 占 kernel 时间 ~60% | +| **warp divergence** | 边界像素的 `if (ny < 0 || ny >= height) continue` 导致同 warp 线程执行路径不一致 | 边界 block 效率下降 ~30% | +| **PCIe 传输** | 4K RGB 24.9 MB,PCIe 4.0 x8 (WSL2) 有效带宽 ~8 GB/s | H2D+D2H 约 6 ms | + +--- + +## 6. 优化策略与逐步实验 + +### 6.1 v2: Shared Memory 协作加载(+42%) + +**动机**:消除邻域的全局内存重复访问。 + +**实现**:block 内线程协作将 tile(含 halo 区域)加载到 shared memory,边界用 clamp 策略避免分支: + +```cpp +// bilateral_filter_cuda.cu:96-107 +#pragma unroll +for (int i = thread_id; i < TILE_SIZE_LOG; i += threads_per_block) { + int gx = blockIdx.x * BLOCK_X + sx - RADIUS; + gx = max(0, min(width - 1, gx)); // clamp 边界 + gy = max(0, min(height - 1, gy)); + smem[sy * TILE_W_PAD + sx] = static_cast(input[gy * width + gx]); +} +__syncthreads(); // 确保数据就绪 +``` + +shared memory 大小为 $(blockDim + 2r)^2$,访问延迟从 ~200-400 ns 降至 ~5 ns。 + +| 版本 | 4K RGB (ms) | 吞吐量 (MP/s) | vs OpenCV CPU | +|------|---:|---:|---:| +| v1 naive | 250 | 33 | 0.23x | +| **v2 shared** | **176** | **47** | **0.32x** | + +### 6.2 v3-v4: LUT 替代 expf(+26%, +154%) + +**动机**:消除内循环中代价最高的超越函数调用。 + +**空间权重 LUT**(v3):空间权重仅依赖偏移 $(dx,dy)$,预计算后存入 constant memory(warp 广播,单次事务): + +```cpp +// bilateral_filter_cuda.cu:52-53 +__constant__ float d_spatial_lut[LUT_SIZE]; // (2r+1)² 个 float +__constant__ float d_color_lut[COLOR_LUT_SIZE]; // 256 个 float + +// bilateral_filter_cuda.cu:1194-1214 — LUT 初始化 +static void init_spatial_lut(int radius, float sigma_spatial) { + float coeff = -0.5f / (sigma_spatial * sigma_spatial); + for (int dy = -radius; dy <= radius; ++dy) + for (int dx = -radius; dx <= radius; ++dx) + lut[(dy+radius)*w + (dx+radius)] = expf((dx*dx+dy*dy) * coeff); + cudaMemcpyToSymbol(d_spatial_lut, lut.data(), ...); +} +``` + +**值域权重 LUT**(v4-v5):8-bit 图像差值范围 [0,255],预计算 256 项查表**完全消除 expf**: + +```cpp +// kernel 内:查表替代 expf +int diff = static_cast(fabsf(neighbor - center) + 0.5f); +diff = min(diff, 
255); +float color_weight = d_color_lut[diff]; // ~4 cycles vs expf ~20 cycles +``` + +ncu 实测 constant cache 命中率 **99.99%**。 + +| 版本 | 4K RGB (ms) | 吞吐量 (MP/s) | vs OpenCV CPU | +|------|---:|---:|---:| +| v3 spatial LUT | 140 | 59 | 0.40x | +| v4 fast math | 55 | 150 | 1.03x | +| **v5 color LUT** | **18** | **460** | **3.14x** | + +> Color LUT 是单项收益最大的优化(**~3x**),每像素消除了 3 通道 × 81 邻域 = 243 次 `expf` 调用。 + +### 6.3 v6: Template 编译期半径(+7%) + +**动机**:使编译器完全展开双重循环,消除循环控制开销,提升 ILP。 + +```cpp +// bilateral_filter_cuda.cu:74-75 — 模板参数化 +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM) +k_bilateral_filter_gray_template(...) { + #pragma unroll + for (int dy = -RADIUS; dy <= RADIUS; ++dy) { + #pragma unroll + for (int dx = -RADIUS; dx <= RADIUS; ++dx) { ... } + } +} +``` + +运行时通过 `switch(radius)` 分发到对应模板实例化版本(支持 r=3/5/7/9/10)。 + +### 6.4 v7-v10: 工程级优化(持久缓冲 +71%,u8 I/O +11%,页锁定 +7%) + +| 优化 | 机制 | 收益 | +|------|------|---:| +| **持久 GPU 缓冲** | 静态 `g_bufs` 结构体,仅尺寸变化时 `cudaMalloc`,LUT 参数不变则跳过上传 | **+71%** | +| **uint8 直接 I/O** | kernel 模板参数 ``,省去 host 端 u8→float→u8 转换 | **+11%** | +| **cudaHostRegister** | 将调用方堆内存注册为 page-locked,使 H2D/D2H 走 DMA 通道 | **+7%** | +| **Block 16×16** | 较好的 L1/smem 缓存利用率 | **+1%** | + +### 6.5 v11: RGB 单一色彩权重(+16%) + +**动机**:原始实现每邻域需 3 次 LUT 查找(R/G/B 各一次)。改用三通道均值距离,仅 1 次查找: + +```cpp +// bilateral_filter_cuda.cu:218-222 +int diff = static_cast( + (fabsf(nr - center_r) + fabsf(ng - center_g) + fabsf(nb - center_b)) + * (1.0f / 3.0f) + 0.5f); +float w = spatial_weight * d_color_lut[diff]; // 1 次 vs 3 次 +``` + +MAE 从 0.65 略升至 0.80,仍满足 < 1.0 要求。 + +### 6.6 v12: 圆形窗口 Early-Continue(+13% RGB, +65% Gray) + +**动机**:r=5 的方形窗口 121 个位置中,40 个角落点超出半径圆,空间权重为零。 + +**Phase 1**:在 `init_spatial_lut` 中预置零圆外项(零性能成本,MAE 改善 0.15~0.20): + +```cpp +// bilateral_filter_cuda.cu:1206-1207 +if (dx * dx + dy * dy > r2) + lut[...] 
= 0.0f; // 33% 的位置置零 +``` + +**Phase 2**:kernel 内 `if (spatial_weight == 0.0f) continue;`(`bilateral_filter_cuda.cu:124`)。TEMPLATE 模式下 RADIUS 为编译期常量,`#pragma unroll` 完全展开后编译器通过 **Dead Code Elimination** 直接删除这 40 次迭代的全部指令。 + +| 模式 | Before (ms) | After (ms) | 提升 | +|------|---:|---:|---:| +| TEMPLATE RGB | 7.36 | **6.53** | **+13%** | +| TEMPLATE Gray | 4.97 | **3.01** | **+65%** | + +### 6.7 SEPARABLE 分离近似(算法级优化) + +**核心思想**:将 2D 双边滤波近似为水平 + 垂直两次 1D 滤波,复杂度从 $O(r^2)$ 降至 $O(r)$。r=5 时从 121 次降到 22 次邻域访问。 + +水平 pass 从 shared memory 沿行方向滤波,垂直 pass 沿列方向滤波。两个 pass 之间通过中间缓冲区(float 或 FP16)传递数据。 + +```cpp +// 水平 kernel: bilateral_filter_cuda.cu:281-293 +#pragma unroll +for (int dx = -RADIUS; dx <= RADIUS; ++dx) { + float neighbor = smem[ty][lx + dx]; + float spatial_weight = d_spatial_lut[RADIUS * LUT_WIDTH + (dx + RADIUS)]; + float w = spatial_weight * color_weight; + sum = fmaf(neighbor, w, sum); // 显式 FMA +} +``` + +**中间缓冲区 SoA 布局**(Opt H):水平输出改为 R|G|B 平面格式,垂直 kernel 读取单通道连续 float,合并效率从 ~33% 提升至 ~71%。 + +### 6.8 版本迭代汇总 + +> 以下 v1-v12 数据为早期开发阶段在 RTX 4060 (WSL2) 上的历史测量,vs OpenCV CPU 按当时基线(~57 ms)。最终行 SEPARABLE / SEP_FP16 为 Thor 实测。 + +| 版本 | 优化手段 | 4K RGB (ms) | 吞吐量 (MP/s) | vs OpenCV CPU | +|------|---------|---:|---:|---:| +| v1 | Naive global memory | 250 | 33 | 0.23x | +| v2 | Shared memory | 176 | 47 | 0.32x | +| v3 | + Spatial LUT | 140 | 59 | 0.40x | +| v4 | + `__expf` fast math | 55 | 150 | 1.03x | +| v5 | + Color LUT + unroll | 18 | 460 | 3.14x | +| v6 | + Template radius | 16.9 | 492 | 3.35x | +| v7 | + 持久缓冲 + LUT cache | 9.86 | 841 | 5.74x | +| v8 | + uint8 I/O kernels | 8.91 | 930 | 6.35x | +| v9 | + cudaHostRegister | 8.65 | 959 | 6.54x | +| v10 | + Block 16×16 | 8.64 | 960 | 6.55x | +| **v11** | **+ 单一色彩权重** | **7.45** | **1113** | **7.59x** | +| **v12** | **+ 圆形窗口 DCE** | **6.53** | **1271** | **8.67x** | +| — | SEPARABLE (Thor) | **3.06** | **2713** | **26.8x** | +| — | **SEP_FP16 (Thor)** | **3.01** | **2753** | **27.2x** | + +> 从 naive 250 ms 到 TEMPLATE 
6.53 ms 实现 **38x** 加速;Thor 上 SEP_FP16 3.01 ms 实现 **83x** 加速(vs naive)、**27.2x**(vs OpenCV CPU)。 + +--- + +## 7. 性能结果与质量验证 + +> 以下数据均在 **Jetson AGX Thor** 上实测(2026-03-16),统一内存架构,无 PCIe 传输瓶颈。 + +### 7.1 Thor — 4K RGB(3840×2160×3) + +| 实现 | Time (ms) | Min (ms) | 吞吐量 (MP/s) | MAE | PSNR (dB) | vs OCV CPU | vs OCV CUDA | +|------|---:|---:|---:|---:|---:|---:|---:| +| **CUDA SEP_FP16** | **3.01** | **2.95** | **2753** | **0.46** | **48.39** | **27.2x** | **4.00x** | +| CUDA SEPARABLE | 3.06 | 2.99 | 2713 | 0.45 | 48.49 | 26.8x | 3.94x | +| CUDA TEMPLATE | 5.48 | 5.44 | 1515 | 0.60 | 48.28 | 15.0x | 2.23x | +| CUDA ADAPTIVE | 6.17 | 6.11 | 1344 | 0.40 | 49.42 | 13.3x | 1.94x | +| CUDA STANDARD | 9.41 | 9.26 | 882 | 0.48 | 48.61 | 8.70x | 1.29x | +| OpenCV CUDA | 12.07 | 11.79 | 687 | 0.00 | — | 6.79x | 1.00x | +| **OpenCV CPU** | **81.87** | **80.79** | **101** | — | — | **1.00x** | — | + +### 7.2 Thor — 4K Grayscale(3840×2160×1) + +| 实现 | Time (ms) | Min (ms) | 吞吐量 (MP/s) | MAE | PSNR (dB) | vs OCV CPU | vs OCV CUDA | +|------|---:|---:|---:|---:|---:|---:|---:| +| **CUDA SEP_FP16** | **1.32** | **1.27** | **6268** | **0.12** | **57.00** | **38.9x** | **5.88x** | +| CUDA SEPARABLE | 1.42 | 1.39 | 5832 | 0.15 | 56.18 | 36.4x | 5.53x | +| CUDA TEMPLATE | 3.52 | 3.46 | 2358 | 0.61 | 50.23 | 14.7x | 2.22x | +| CUDA STANDARD | 4.36 | 4.32 | 1901 | 0.61 | 50.23 | 11.9x | 1.76x | +| CUDA ADAPTIVE | 5.77 | 5.75 | 1438 | 0.61 | 50.23 | 8.94x | 1.34x | +| OpenCV CUDA | 7.78 | 7.44 | 1066 | 0.00 | — | 6.63x | 1.00x | +| **OpenCV CPU** | **51.60** | **50.57** | **161** | — | — | **1.00x** | — | + +### 7.3 性能目标达成 + +| 目标 | 要求 | 最佳实测 | 模式 | 状态 | +|------|------|---:|------|:---:| +| 4K RGB @60fps | ≥498 MP/s | 2753 MP/s | SEP_FP16 | **5.5x 余量** | +| 4K Gray @60fps | ≥498 MP/s | 6268 MP/s | SEP_FP16 | **12.6x 余量** | +| vs OpenCV CPU | 显著加速 | **27.2x** (4K RGB) | SEP_FP16 | 达标 | +| vs OpenCV CUDA | > 1.0x | **4.00x** (4K RGB) | SEP_FP16 | 达标 | +| MAE | < 1.0 | 0.12~0.61 | 所有模式 | 达标 | 
+| PSNR | > 40 dB | 48.28~57.00 dB | 所有模式 | 达标 | + +### 7.4 质量验证 + +| 模式 | MAE (RGB) | MAE (Gray) | PSNR (RGB) | PSNR (Gray) | +|------|---:|---:|---:|---:| +| STANDARD | 0.48 | 0.61 | 48.61 dB | 50.23 dB | +| TEMPLATE | 0.60 | 0.61 | 48.28 dB | 50.23 dB | +| SEPARABLE | 0.45 | 0.15 | 48.49 dB | 56.18 dB | +| SEP_FP16 | 0.46 | 0.12 | 48.39 dB | 57.00 dB | +| ADAPTIVE | 0.40 | 0.61 | 49.42 dB | 50.23 dB | + +> SEPARABLE 在 Gray 下 MAE 仅 0.15,是所有模式中最接近 OpenCV 的结果。 + +--- + +## 8. Profiler 深度分析 + +### 8.1 nsys 时间线分析(Thor 平台) + +#### TEMPLATE 模式 + +| 阶段 | 耗时 | 占比 | +|------|---:|---:| +| `k_bilateral_filter_rgb_template<5,u8,u8>` | 4.98 ms | **91%** | +| cudaMemcpy H2D (24.9 MB) | 0.20 ms | 3.6% | +| cudaMemcpy D2H (24.9 MB) | 0.21 ms | 3.8% | +| cudaLaunchKernel | 0.008 ms | <0.1% | + +统一内存传输带宽:H2D **122 GB/s**,D2H **117 GB/s**,远高于 RTX 4060 的 PCIe(~8 GB/s)。 + +#### SEPARABLE 模式 + +| 阶段 | 耗时 | 占比 | +|------|---:|---:| +| Horizontal pass | 1.52 ms | **50.7%** | +| Vertical pass | 1.48 ms | **49.3%** | +| cudaMemcpy 合计 | 0.42 ms | ~12% | + +**关键发现**:统一内存消除了 PCIe 传输瓶颈——Thor 上 memcpy 占比仅 ~5%(RTX 4060 上占 46-68%)。 + +### 8.2 ncu 硬件计数器 — TEMPLATE 模式 + +#### Speed-of-Light 概览 + +| 指标 | 值 | 说明 | +|------|---:|------| +| **SM Throughput** | **88.83%** | 接近计算峰值 | +| L1/TEX Throughput | 87.28% | Shared memory 密集 | +| L2 Throughput | 3.50% | 数据集 fit in L2 (32 MB) | +| DRAM Throughput | ~0% | 统一内存由 L2 服务 | +| **瓶颈诊断** | **Compute-bound** | SM 88.8% >> DRAM ~0% | + +#### Occupancy 与寄存器 + +| 指标 | 值 | +|------|---:| +| 理论 Occupancy | 100% | +| **实测 Occupancy** | **97.76%** | +| 寄存器/线程 | 23 (sm_110) vs 64 (sm_89) | + +> Blackwell 编译器将同一 kernel 的寄存器从 64 降至 23(-64%),occupancy 从 67% 提升至 97.8%。 + +#### 流水线利用率 + +| 流水线 | 利用率 | 说明 | +|--------|---:|------| +| **LSU** (Load/Store) | **43.84%** | Shared memory 读写 | +| **FMA** (浮点乘加) | **40.99%** | 权重计算 | +| ALU (整数) | 30.41% | 索引计算 | +| XU (超越函数) | 28.17% | 残余 exp 路径 | + +> LSU 与 FMA **负载均衡**(43.8% vs 41.0%),计算与访存交织良好。 + +#### 缓存命中率 + +| 缓存层 | 
命中率 | +|--------|---:| +| **Constant Cache** (LUT) | **99.99%** | +| Instruction Cache | 100.0% | +| L1/TEX | 68.53% | +| L2 | 73.31% | + +### 8.3 ncu 驱动的关键优化(Thor 平台) + +#### Opt G: Block 32×8 消除 Bank Conflict + +**ncu 发现**:16×16 block 下 smem load 存在 50% bank conflict(2-way)。根因:warp 跨两行,行间 stride=27 mod 32=27,导致 11/16 bank 重叠。 + +**方案**:改为 32×8 block,warp 全在一行内(stride=1)。 + +| 指标 | 16×16 | 32×8 | 变化 | +|------|---:|---:|---:| +| Shared excessive wavefronts | 50% | **2.3%** | **-97.6%** | +| SM Throughput | 87.49% | 88.36% | +0.87pp | + +#### Opt H: SoA 中间缓冲区改善合并访问 + +**ncu 发现**:SEPARABLE vertical kernel 全局内存合并效率仅 33%,68% sector 冗余。 + +| 指标 | AoS | SoA | 变化 | +|------|---:|---:|---:| +| Vertical uncoalesced | 68% | **29%** | **-39pp** | +| Horizontal uncoalesced | 69% | **47%** | **-22pp** | + +#### Opt K: 激进 Launch Bounds 提升 Occupancy + +**ncu 发现**:SEPARABLE RGB kernel 使用 61-63 regs,occupancy 仅 62%。 + +**方案**:`__launch_bounds__(256, 6)`,强制编译器将 regs 压到 ≤42(65536/256/6=42.67)。 + +| 指标 | 优化前 | 优化后 | 变化 | +|------|---:|---:|---:| +| Registers/thread | 63 | **40** | **-35%** | +| Achieved Occupancy | 62% | **97.5%** | **+35pp** | +| SM Throughput | 64.5% | **79.4%** | **+14.9pp** | +| 4K RGB min | 3.39 ms | **3.02 ms** | **-10.9%** | + +> 这是单一改动(一个宏常量 `MIN_BLOCKS_PER_SM_SEP=6`)带来最大端到端收益的优化。关键:**零 spill**。 + +#### Opt N: FP16 中间缓冲区 + +**方案**:H↔V 中间缓冲区从 float 改为 `__half`,带宽减半。 + +| 指标 | FP32 | FP16 | 变化 | +|------|---:|---:|---:| +| V kernel LD sectors/req | 4.00 | **2.00** | **-50%** | +| H kernel ST sectors/req | 4.00 | **2.00** | **-50%** | +| 4K Gray min | 1.40 ms | **1.30 ms** | **-7.1%** | + +### 8.4 Warp Stall 分析 + +| Stall 原因 | 比率 | 说明 | +|-----------|---:|------| +| **short_scoreboard** | **3.92** | Shared memory / L1 依赖 | +| not_selected | 3.21 | warp 调度竞争 | +| wait | 2.79 | 固定延迟依赖 | +| long_scoreboard | 0.66 | L2/DRAM 依赖 | +| math_pipe_throttle | 0.40 | 计算流水线满 | +| barrier | 0.33 | `__syncthreads()` | + +### 8.5 Roofline 定位 + +``` +SM Throughput (%) + 100 ┤ + 
89 ┤ ··[TEMPLATE]················· (97.8% occ, 3.55 IPC) + │ + 67 ┤ ·········[SEP-V] (63→97.5% occ after Opt K) + 65 ┤ ········[SEP-H] + │ + └───────────────────────────── + 所有 kernel 均为 Compute-bound (SM >> Memory) +``` + +--- + +## 9. 失败实验与优化边界 + +### 9.1 Opt B: Warp Shuffle 水平 pass(-2.4x,已回退) + +为 separable 灰度水平 kernel 实现 `__shfl_sync` 版本。 + +| 指标 | 基线 | Shuffle | +|------|---:|---:| +| 4K Gray | 2.07 ms | 5.02 ms | +| MAE | 0.15 | 0.50 | + +**失败原因**:radius=5 时仅 10/32 lane 需要 halo 数据,if/else 分支导致严重 warp divergence。Shared memory 方案的加载延迟已被计算充分隐藏,shuffle 无法取得优势。 + +### 9.2 Opt C: Host 端 SoA 全链路(-36%,已回退) + +将 RGB separable 改为 AoS→SoA 转换 + 3×灰度 separable + SoA→AoS 转换。 + +**失败原因**:(1) AoS↔SoA 转换引入 2 次额外全局内存遍历(47 MB);(2) 8 个 kernel launch 替代原来 2 个;(3) 丧失三通道共享权重优势。 + +### 9.3 Opt D: cudaFuncCachePreferL1(-36%,已回退) + +L1 偏好将 shared memory 从 64 KB 压缩到 32 KB/SM,导致 occupancy 下降,SEPARABLE 从 5.42 ms 恶化到 7.36 ms。 + +### 9.4 Opt M: Fused H+V 单 Kernel(-31~34%) + +单 kernel 内三阶段(加载 2D halo → 水平滤波 → 垂直滤波),消除中间 buffer。 + +| 测试 | SEPARABLE | FUSED | 变化 | +|------|---:|---:|---:| +| 4K RGB | 3.02 ms | 3.96 ms | **-31%** | +| 1080p RGB | 0.77 ms | 1.03 ms | **-34%** | + +**失败原因**:每 block 需处理 576 个水平滤波点(vs SEPARABLE 的 256),计算膨胀远超带宽节省。Thor 统一内存下中间 buffer 成本仅占 ~10%。 + +### 9.5 Opt N2: FP16 全量计算(-8.4%) + +将 smem 和内循环累加全部改为 `__half`。 + +**失败原因**:(1) sm_110 的 2x FP16 优势仅在 `__half2` 打包操作上,标量 `__hfma` 与 FP32 `fmaf` 吞吐相同;(2) LUT 返回 float,每次迭代需 6+ 次 float↔half 转换。 + +### 9.6 实验总结 + +| 优化 | 状态 | 实际效果 | 教训 | +|------|:---:|---:|------| +| Warp Shuffle | 回退 | -2.4x | 需 halo 的滤波不适合 shuffle | +| Host SoA | 回退 | -36% | 格式转换开销 > 合并收益 | +| PreferL1 | 回退 | -36% | smem 容量比 L1 缓存更重要 | +| Fused H+V | 失败 | -31~34% | 统一内存下中间 buffer 成本低 | +| FP16 计算 | 失败 | -8.4% | 标量 half 无 2x,转换开销高 | +| fmaf 显式 | 保留 | 0% | nvcc -O3 已自动最优融合 | +| Strip Pipeline | 无收益 | 0~-8% | WSL2 阻止 copy+compute 并行 | + +> 失败实验同样有价值——它们通过实测验证了优化边界,避免在无效方向继续投入。 + +--- + +## 10. 
跨平台对比分析 + +### 10.1 Jetson AGX Thor 最终性能(4K RGB,实测 2026-03-16) + +| 实现 | Avg (ms) | Min (ms) | 吞吐量 (MP/s) | vs OCV CPU | vs OCV CUDA | +|------|---:|---:|---:|---:|---:| +| **CUDA SEP_FP16** | **3.01** | **2.95** | **2753** | **27.2x** | **4.00x** | +| CUDA SEPARABLE | 3.06 | 2.99 | 2713 | 26.8x | 3.94x | +| CUDA TEMPLATE | 5.48 | 5.44 | 1515 | 15.0x | 2.23x | +| CUDA ADAPTIVE | 6.17 | 6.11 | 1344 | 13.3x | 1.94x | +| CUDA STANDARD | 9.41 | 9.26 | 882 | 8.70x | 1.29x | +| OpenCV CUDA | 12.07 | 11.79 | 687 | 6.79x | 1.00x | +| **OpenCV CPU** | **81.87** | **80.79** | **101** | **1.00x** | — | + +### 10.2 跨平台 Kernel 耗时对比 + +| Kernel | Thor (ms) | RTX 4060 (ms) | cudaMemcpy | +|--------|---:|---:|---:| +| rgb_template<5> | 4.98 | 3.36 | — | +| horizontal_rgb<5> | 1.52 | 0.79 | — | +| vertical_rgb<5> | 1.48 | 0.72 | — | +| **H2D+D2H 合计** | **0.41** | **3.94** | **Thor 快 10x** | + +### 10.3 编译器差异 + +| Kernel | sm_110 Regs | sm_89 Regs | sm_110 Occupancy | sm_89 Occupancy | +|--------|---:|---:|---:|---:| +| rgb_template | **23** | 64 | **100%** | 67% | +| gray_template | **21** | 63 | **100%** | ~67% | +| horizontal_rgb (Opt K) | **40** | 62 | **100%** | 66% | + +### 10.4 平台特异性总结 + +| 因素 | 独显 (PCIe) | 统一内存 (Jetson) | +|------|:---:|:---:| +| 主瓶颈 | H2D/D2H (46-68%) | Kernel 计算 (90-95%) | +| 传输优化价值 | 极高 | 无意义 | +| Kernel 优化边际收益 | 中等(被传输稀释) | **极高**(直接反映端到端) | +| Fused kernel 价值 | 可能有效 | 无效(中间 buffer 成本低) | + +--- + +## 11. 
结论与展望 + +### 11.1 结论 + +本项目实现了从 naive CUDA kernel(250 ms)到高度优化的多模式双边滤波器,在 **Jetson AGX Thor** 上最终 SEP_FP16 模式 4K RGB 仅需 **3.01 ms**,总加速比 **83x**(vs naive)、**27.2x**(vs OpenCV CPU)、**4.00x**(vs OpenCV CUDA)。核心优化手段及其贡献: + +| 排名 | 优化手段 | 加速贡献 | 代码位置 | +|:---:|---------|---:|------| +| 1 | Color Weight LUT | **~3x** | `d_color_lut[256]`, L52-53 | +| 2 | Shared Memory | **3-5x** | smem 协作加载, L96-107 | +| 3 | SEPARABLE 近似 | **O(r)** | H/V kernel, L241-522 | +| 4 | 持久 GPU 缓冲 | **+71%** | `g_bufs` 结构体, L1391-1467 | +| 5 | 圆形窗口 DCE | **+13~65%** | `spatial_weight==0` continue, L124 | +| 6 | RGB 单色权重 | **+16%** | 均值距离, L218-222 | +| 7 | launch_bounds(256,6) | **+10.9%** | `MIN_BLOCKS_PER_SM_SEP=6`, L41 | +| 8 | Template 展开 | **+7%** | `template`, L74 | + +所有模式均满足:MAE < 1.0,PSNR > 48 dB,4K@60fps 吞吐余量 **5.5x** 以上。 + +### 11.2 展望 + +| 方向 | 可行性 | 预期收益 | +|------|:---:|:---:| +| `__half2` 向量化(打包两邻域像素) | 中 | FP16 吞吐翻倍 | +| Spatial LUT 搬到 smem(避免 constant cache 序列化) | 高 | 5-10% | +| L2 Cache 持久化(视频流场景) | 高 | 5-15% | +| 多 Stream Pipeline(原生 Linux,非 WSL2) | 高 | 传输与计算重叠 | +| Bilateral Grid(大半径 r≥10) | 低 | O(N) 复杂度 | + +--- + +## 12. 
附录:完整数据图表 + +### 表 A1: TEMPLATE 版本迭代(4K RGB, RTX 4060) + +| 版本 | 优化手段 | Time (ms) | 吞吐量 (MP/s) | vs 上一版 | vs OCV CPU | +|------|---------|---:|---:|---:|---:| +| v1 | Naive | 250 | 33 | — | 0.23x | +| v2 | Shared memory | 176 | 47 | +42% | 0.32x | +| v3 | + Spatial LUT | 140 | 59 | +26% | 0.40x | +| v4 | + fast math | 55 | 150 | +154% | 1.03x | +| v5 | + Color LUT | 18 | 460 | +207% | 3.14x | +| v6 | + Template | 16.9 | 492 | +7% | 3.35x | +| v7 | + 持久缓冲 | 9.86 | 841 | +71% | 5.74x | +| v8 | + u8 I/O | 8.91 | 930 | +11% | 6.35x | +| v9 | + page-lock | 8.65 | 959 | +3% | 6.54x | +| v10 | + Block 16×16 | 8.64 | 960 | +1% | 6.55x | +| v11 | + 单色权重 | 7.45 | 1113 | +16% | 7.59x | +| v12 | + 圆形窗口 | 6.53 | 1271 | +13% | 8.67x | + +### 表 A2: Thor 平台 4K RGB 全模式对比(实测 2026-03-16) + +| 实现 | Avg (ms) | Min (ms) | 吞吐量 (MP/s) | MAE | vs OCV CPU | vs OCV CUDA | +|------|---:|---:|---:|---:|---:|---:| +| **SEP_FP16** | **3.01** | **2.95** | **2753** | **0.46** | **27.2x** | **4.00x** | +| SEPARABLE | 3.06 | 2.99 | 2713 | 0.45 | 26.8x | 3.94x | +| TEMPLATE | 5.48 | 5.44 | 1515 | 0.60 | 15.0x | 2.23x | +| ADAPTIVE | 6.17 | 6.11 | 1344 | 0.40 | 13.3x | 1.94x | +| STANDARD | 9.41 | 9.26 | 882 | 0.48 | 8.70x | 1.29x | +| OpenCV CUDA | 12.07 | 11.79 | 687 | 0.00 | 6.79x | 1.00x | +| OpenCV CPU | 81.87 | 80.79 | 101 | — | 1.00x | — | + +### 表 A3: ncu 优化指标汇总 + +| 优化 | 指标 | Before | After | 变化 | +|------|------|---:|---:|---:| +| Opt G (32×8) | Smem bank conflict | 50% | 2.3% | -97.6% | +| Opt H (SoA) | Global uncoalesced (V) | 68% | 29% | -39pp | +| Opt H (SoA) | Global uncoalesced (H) | 69% | 47% | -22pp | +| Opt K (launch_bounds) | Registers/thread | 63 | 40 | -35% | +| Opt K (launch_bounds) | Achieved occupancy | 62% | 97.5% | +35pp | +| Opt K (launch_bounds) | SM throughput | 64.5% | 79.4% | +14.9pp | +| Opt N (FP16 temp) | V LD sectors/req | 4.00 | 2.00 | -50% | +| Opt N (FP16 temp) | H ST sectors/req | 4.00 | 2.00 | -50% | + +### 表 A4: 编译期寄存器对比(sm_110 vs sm_89) + +| Kernel | sm_110 
Regs | sm_89 Regs | Spill | Occupancy (sm_110) | +|--------|---:|---:|:---:|---:| +| rgb_template<5,u8,u8> | 28 | 64 | 0 | 100% | +| gray_template<5,u8,u8> | 21 | 63 | 0 | 100% | +| horizontal_rgb<5,u8> | 40 | 62 | 0 | 100% | +| vertical_rgb<5,u8> | 40 | 62 | 0 | 100% | +| horizontal_gray<5,u8> | 33 | 35 | 0 | 100% | diff --git a/08_bilateral_filter/Andy1314Chen/include/bilateral_filter.h b/08_bilateral_filter/Andy1314Chen/include/bilateral_filter.h new file mode 100644 index 0000000..7a97bad --- /dev/null +++ b/08_bilateral_filter/Andy1314Chen/include/bilateral_filter.h @@ -0,0 +1,20 @@ +#ifndef BILATERAL_FILTER_H_ +#define BILATERAL_FILTER_H_ + +#include "image_io.h" +#include + +void bilateral_filter_cpu(const float* input, + float* output, + int width, + int height, + int channels, + int radius, + float sigma_spatial, + float sigma_color); + +void apply_bilateral_filter_cpu(const ImageData& input, + ImageData& output, + const FilterParams& params); + +#endif // BILATERAL_FILTER_H_ diff --git a/08_bilateral_filter/Andy1314Chen/include/bilateral_filter_cuda.cuh b/08_bilateral_filter/Andy1314Chen/include/bilateral_filter_cuda.cuh new file mode 100644 index 0000000..9cd9b3e --- /dev/null +++ b/08_bilateral_filter/Andy1314Chen/include/bilateral_filter_cuda.cuh @@ -0,0 +1,32 @@ +#ifndef BILATERAL_FILTER_CUDA_CUH_ +#define BILATERAL_FILTER_CUDA_CUH_ + +#include +#include + +// Filter modes for benchmarking different implementations +// 0 = STANDARD (shared memory + LUT, runtime radius) +// 1 = TEMPLATE (compile-time radius, full unroll) +// 2 = SEPARABLE (horizontal + vertical passes, O(r) complexity) +void set_bilateral_filter_mode(int mode); + +void bilateral_filter_cuda(const float* d_input, + float* d_output, + int width, + int height, + int channels, + int radius, + float sigma_spatial, + float sigma_color, + cudaStream_t stream = 0); + +void apply_bilateral_filter_cuda(const uint8_t* h_input, + uint8_t* h_output, + int width, + int height, + int channels, + int 
radius, + float sigma_spatial, + float sigma_color); + +#endif // BILATERAL_FILTER_CUDA_CUH_ diff --git a/08_bilateral_filter/Andy1314Chen/include/bilateral_filter_opencv.h b/08_bilateral_filter/Andy1314Chen/include/bilateral_filter_opencv.h new file mode 100644 index 0000000..92b5a77 --- /dev/null +++ b/08_bilateral_filter/Andy1314Chen/include/bilateral_filter_opencv.h @@ -0,0 +1,19 @@ +#ifndef BILATERAL_FILTER_OPENCV_H_ +#define BILATERAL_FILTER_OPENCV_H_ + +#include "image_io.h" + +void apply_bilateral_filter_opencv(const ImageData& input, ImageData& output, + const FilterParams& params); + +double compute_mae(const ImageData& img1, const ImageData& img2); +double compute_psnr(const ImageData& img1, const ImageData& img2); + +#ifdef HAVE_OPENCV_CUDA +// OpenCV CUDA bilateral filter (cv::cuda::bilateralFilter). +// Returns false if CUDA device is unavailable at runtime. +bool apply_bilateral_filter_opencv_cuda(const ImageData& input, ImageData& output, + const FilterParams& params); +#endif + +#endif // BILATERAL_FILTER_OPENCV_H_ diff --git a/08_bilateral_filter/Andy1314Chen/include/image_io.h b/08_bilateral_filter/Andy1314Chen/include/image_io.h new file mode 100644 index 0000000..4bc5178 --- /dev/null +++ b/08_bilateral_filter/Andy1314Chen/include/image_io.h @@ -0,0 +1,29 @@ +#ifndef IMAGE_IO_H_ +#define IMAGE_IO_H_ + +#include +#include +#include +#include + +struct ImageData { + int width; + int height; + int channels; + std::vector data; + + size_t size() const { return static_cast(width) * height * channels; } + bool valid() const { return width > 0 && height > 0 && channels > 0 && !data.empty(); } +}; + +struct FilterParams { + int radius = 0; + float sigma_spatial = 0.0f; + float sigma_color = 0.0f; +}; + +bool load_raw_image(const char* path, ImageData* img); +bool save_raw_image(const char* path, const ImageData& img); +bool load_params(const char* path, FilterParams* params); + +#endif // IMAGE_IO_H_ diff --git 
a/08_bilateral_filter/Andy1314Chen/src/bilateral_filter_cpu.cpp b/08_bilateral_filter/Andy1314Chen/src/bilateral_filter_cpu.cpp new file mode 100644 index 0000000..03e4ed1 --- /dev/null +++ b/08_bilateral_filter/Andy1314Chen/src/bilateral_filter_cpu.cpp @@ -0,0 +1,71 @@ +#include +#include + +#include "bilateral_filter.h" + +void bilateral_filter_cpu(const float* input, float* output, int width, int height, int channels, + int radius, float sigma_spatial, float sigma_color) { + float spatial_coeff = -0.5f / (sigma_spatial * sigma_spatial); + float color_coeff = -0.5f / (sigma_color * sigma_color); + + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + int center_idx = (y * width + x) * channels; + + for (int c = 0; c < channels; ++c) { + float sum = 0.0f; + float weight_sum = 0.0f; + float center_val = input[center_idx + c]; + + for (int dy = -radius; dy <= radius; ++dy) { + int ny = y + dy; + if (ny < 0 || ny >= height) + continue; + + for (int dx = -radius; dx <= radius; ++dx) { + int nx = x + dx; + if (nx < 0 || nx >= width) + continue; + + int neighbor_idx = (ny * width + nx) * channels; + float neighbor_val = input[neighbor_idx + c]; + + float spatial_dist = static_cast(dx * dx + dy * dy); + float spatial_weight = expf(spatial_dist * spatial_coeff); + + float color_dist = neighbor_val - center_val; + float color_weight = expf(color_dist * color_dist * color_coeff); + + float weight = spatial_weight * color_weight; + sum += neighbor_val * weight; + weight_sum += weight; + } + } + + output[center_idx + c] = sum / weight_sum; + } + } + } +} + +void apply_bilateral_filter_cpu(const ImageData& input, ImageData& output, + const FilterParams& params) { + output.width = input.width; + output.height = input.height; + output.channels = input.channels; + output.data.resize(input.size()); + + std::vector input_float(input.size()); + std::vector output_float(input.size()); + + for (size_t i = 0; i < input.size(); ++i) { + input_float[i] = 
static_cast(input.data[i]); + } + + bilateral_filter_cpu(input_float.data(), output_float.data(), input.width, input.height, + input.channels, params.radius, params.sigma_spatial, params.sigma_color); + + for (size_t i = 0; i < output.size(); ++i) { + output.data[i] = static_cast(std::clamp(output_float[i], 0.0f, 255.0f)); + } +} diff --git a/08_bilateral_filter/Andy1314Chen/src/bilateral_filter_cuda.cu b/08_bilateral_filter/Andy1314Chen/src/bilateral_filter_cuda.cu new file mode 100644 index 0000000..9ad4b6e --- /dev/null +++ b/08_bilateral_filter/Andy1314Chen/src/bilateral_filter_cuda.cu @@ -0,0 +1,2046 @@ +// Standard library +#include +#include +#include +#include +#include + +// CUDA FP16 (half-precision) support +#include + +// Project headers (includes via bilateral_filter_cuda.cuh) +#include "bilateral_filter_cuda.cuh" + +// clang-format off +#define CUDA_CHECK(call) \ + do { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + fprintf(stderr, "CUDA error at %s:%d: %s\n", \ + __FILE__, __LINE__, cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) +// clang-format on + +#ifndef BLOCK_X +// Opt G: 32x8 ensures each warp spans a single row in shared memory, +// eliminating cross-row bank conflicts (16x16 caused 2-way conflicts +// because warp straddles 2 rows with stride 27 mod 32 = -5 overlap). +#define BLOCK_X 32 +#endif +#ifndef BLOCK_Y +#define BLOCK_Y 8 +#endif +// Opt A: launch bounds hints for register allocation +#define THREADS_PER_BLOCK (BLOCK_X * BLOCK_Y) +#define MIN_BLOCKS_PER_SM 4 +// Opt K: more aggressive launch bounds for SEPARABLE kernels to reduce +// register pressure. With 256 threads and 6 blocks/SM: max regs = 65536/(256*6) = 42. +// This forces compiler to use ≤42 registers, boosting occupancy from 66% to 100%. 
+#define MIN_BLOCKS_PER_SM_SEP 6 +#define MAX_RADIUS 16 +#define LUT_SIZE ((2 * MAX_RADIUS + 1) * (2 * MAX_RADIUS + 1)) +#define COLOR_LUT_SIZE 256 + +// Opt G: Shared memory bank conflict mitigation via row padding. +// 32 banks × 4B = 128B per bank cycle. If tile row width is even, threads +// in a warp accessing consecutive rows at the same column hit the same bank. +// Padding to an odd width ensures consecutive rows map to different banks. +#define SMEM_PAD(w) ((w) | 1) // ensure odd: if even, add 1; if odd, keep + +__constant__ float d_spatial_lut[LUT_SIZE]; +__constant__ float d_color_lut[COLOR_LUT_SIZE]; + +// ============================================================================ +// Opt2: type-safe output helper for uint8/float kernel output +// ============================================================================ + +template +__device__ inline T to_output(float v); +template <> +__device__ inline float to_output(float v) { + return v; +} +template <> +__device__ inline uint8_t to_output(float v) { + return static_cast(fminf(255.0f, fmaxf(0.0f, v))); +} + +// ============================================================================ +// Template-based grayscale bilateral filter with compile-time radius +// ============================================================================ + +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM) +k_bilateral_filter_gray_template(const InT* __restrict__ input, + OutT* __restrict__ output, int width, int height) { + + constexpr int TILE_W = BLOCK_X + 2 * RADIUS; + constexpr int TILE_H = BLOCK_Y + 2 * RADIUS; + constexpr int TILE_W_PAD = SMEM_PAD(TILE_W); // Opt G: odd stride to avoid bank conflicts + constexpr int TILE_SIZE_LOG = TILE_W * TILE_H; // logical elements to load + constexpr int LUT_WIDTH = 2 * RADIUS + 1; + + __shared__ float smem[TILE_H * TILE_W_PAD]; // Opt G: padded rows + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int x = blockIdx.x 
* BLOCK_X + tx; + const int y = blockIdx.y * BLOCK_Y + ty; + + const int thread_id = ty * BLOCK_X + tx; + const int threads_per_block = BLOCK_X * BLOCK_Y; + +// Collaborative loading: iterate over logical tile, store into padded layout +#pragma unroll + for (int i = thread_id; i < TILE_SIZE_LOG; i += threads_per_block) { + int sy = i / TILE_W; + int sx = i % TILE_W; + int gx = blockIdx.x * BLOCK_X + sx - RADIUS; + int gy = blockIdx.y * BLOCK_Y + sy - RADIUS; + gx = max(0, min(width - 1, gx)); + gy = max(0, min(height - 1, gy)); + smem[sy * TILE_W_PAD + sx] = static_cast(input[gy * width + gx]); + } + + __syncthreads(); + + if (x >= width || y >= height) + return; + + const int lx = tx + RADIUS; + const int ly = ty + RADIUS; + const float center = smem[ly * TILE_W_PAD + lx]; + + float sum = 0.0f; + float weight_sum = 0.0f; + +#pragma unroll + for (int dy = -RADIUS; dy <= RADIUS; ++dy) { +#pragma unroll + for (int dx = -RADIUS; dx <= RADIUS; ++dx) { + float spatial_weight = d_spatial_lut[(dy + RADIUS) * LUT_WIDTH + (dx + RADIUS)]; + if (spatial_weight == 0.0f) continue; // Opt C/F: skip circular-window corners + + float neighbor = smem[(ly + dy) * TILE_W_PAD + (lx + dx)]; + + int diff = static_cast(fabsf(neighbor - center) + 0.5f); + diff = min(diff, COLOR_LUT_SIZE - 1); + float color_weight = d_color_lut[diff]; + + float w = spatial_weight * color_weight; + sum += neighbor * w; + weight_sum += w; + } + } + + output[y * width + x] = to_output(sum / weight_sum); +} + +// ============================================================================ +// Template-based RGB bilateral filter with compile-time radius +// Full loop unrolling for known radius values +// ============================================================================ + +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM) +k_bilateral_filter_rgb_template(const InT* __restrict__ input, + OutT* __restrict__ output, int width, int height) { + + constexpr int TILE_W = 
BLOCK_X + 2 * RADIUS; + constexpr int TILE_H = BLOCK_Y + 2 * RADIUS; + constexpr int TILE_W_PAD = SMEM_PAD(TILE_W); // Opt G: odd stride to avoid bank conflicts + constexpr int TILE_SIZE_LOG = TILE_W * TILE_H; // logical elements to load + constexpr int LUT_WIDTH = 2 * RADIUS + 1; + + __shared__ float smem_r[TILE_H * TILE_W_PAD]; + __shared__ float smem_g[TILE_H * TILE_W_PAD]; + __shared__ float smem_b[TILE_H * TILE_W_PAD]; + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int x = blockIdx.x * BLOCK_X + tx; + const int y = blockIdx.y * BLOCK_Y + ty; + + const int thread_id = ty * BLOCK_X + tx; + const int threads_per_block = BLOCK_X * BLOCK_Y; + +// Collaborative loading: iterate over logical tile, store into padded layout +#pragma unroll + for (int i = thread_id; i < TILE_SIZE_LOG; i += threads_per_block) { + int sy = i / TILE_W; + int sx = i % TILE_W; + int gx = blockIdx.x * BLOCK_X + sx - RADIUS; + int gy = blockIdx.y * BLOCK_Y + sy - RADIUS; + gx = max(0, min(width - 1, gx)); + gy = max(0, min(height - 1, gy)); + int gidx = (gy * width + gx) * 3; + int smem_idx = sy * TILE_W_PAD + sx; + smem_r[smem_idx] = static_cast(input[gidx]); + smem_g[smem_idx] = static_cast(input[gidx + 1]); + smem_b[smem_idx] = static_cast(input[gidx + 2]); + } + + __syncthreads(); + + if (x >= width || y >= height) + return; + + const int lx = tx + RADIUS; + const int ly = ty + RADIUS; + const int lidx = ly * TILE_W_PAD + lx; + + const float center_r = smem_r[lidx]; + const float center_g = smem_g[lidx]; + const float center_b = smem_b[lidx]; + + float sum_r = 0.0f, sum_g = 0.0f, sum_b = 0.0f; + float wsum = 0.0f; + +// Opt5: single shared color weight per neighbor, computed from mean absolute channel diff. +// Reduces 3 LUT lookups + 3 wsum accumulators to 1 each. Tradeoff: MAE rises ~0.65→0.80, +// which is acceptable (< 1.0). OpenCV actually uses Euclidean distance across channels, +// so this is a different (simpler) approximation. 
+#pragma unroll + for (int dy = -RADIUS; dy <= RADIUS; ++dy) { +#pragma unroll + for (int dx = -RADIUS; dx <= RADIUS; ++dx) { + float spatial_weight = d_spatial_lut[(dy + RADIUS) * LUT_WIDTH + (dx + RADIUS)]; + if (spatial_weight == 0.0f) continue; // Opt C/F: skip circular-window corners + + int nidx = (ly + dy) * TILE_W_PAD + (lx + dx); + float nr = smem_r[nidx]; + float ng = smem_g[nidx]; + float nb = smem_b[nidx]; + + // Single color distance: mean absolute channel difference + int diff = static_cast( + (fabsf(nr - center_r) + fabsf(ng - center_g) + fabsf(nb - center_b)) * + (1.0f / 3.0f) + + 0.5f); + diff = min(diff, COLOR_LUT_SIZE - 1); + + float w = spatial_weight * d_color_lut[diff]; + + sum_r += nr * w; + sum_g += ng * w; + sum_b += nb * w; + wsum += w; + } + } + + // Opt6b: replace 3 divisions with 1 reciprocal + 3 multiplications + float rcp_wsum = __frcp_rn(wsum); + int out_idx = (y * width + x) * 3; + output[out_idx] = to_output(sum_r * rcp_wsum); + output[out_idx + 1] = to_output(sum_g * rcp_wsum); + output[out_idx + 2] = to_output(sum_b * rcp_wsum); +} + +// ============================================================================ +// Separable approximation for grayscale: horizontal + vertical passes +// O(2r) complexity instead of O(r^2) +// ============================================================================ + +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM_SEP) +k_bilateral_horizontal_gray(const InT* __restrict__ input, + TmpT* __restrict__ output, int width, int height) { + + constexpr int TILE_W = BLOCK_X + 2 * RADIUS; + constexpr int TILE_W_PAD = SMEM_PAD(TILE_W); // Opt G: odd stride + constexpr int LUT_WIDTH = 2 * RADIUS + 1; + + __shared__ float smem[BLOCK_Y][TILE_W_PAD]; + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int x = blockIdx.x * BLOCK_X + tx; + const int y = blockIdx.y * BLOCK_Y + ty; + + // Load row with halo + for (int i = tx; i < TILE_W; i += BLOCK_X) { + int 
gx = blockIdx.x * BLOCK_X + i - RADIUS; + gx = max(0, min(width - 1, gx)); + int gy = min(y, height - 1); + smem[ty][i] = static_cast(input[gy * width + gx]); + } + + __syncthreads(); + + if (x >= width || y >= height) + return; + + const int lx = tx + RADIUS; + const float center = smem[ty][lx]; + + float sum = 0.0f; + float weight_sum = 0.0f; + +#pragma unroll + for (int dx = -RADIUS; dx <= RADIUS; ++dx) { + float neighbor = smem[ty][lx + dx]; + float spatial_weight = d_spatial_lut[RADIUS * LUT_WIDTH + (dx + RADIUS)]; + + int diff = static_cast(fabsf(neighbor - center) + 0.5f); + diff = min(diff, COLOR_LUT_SIZE - 1); + float color_weight = d_color_lut[diff]; + + float w = spatial_weight * color_weight; + sum = fmaf(neighbor, w, sum); // Opt I: explicit FMA + weight_sum += w; + } + + output[y * width + x] = static_cast(sum / weight_sum); +} + +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM_SEP) +k_bilateral_vertical_gray(const TmpT* __restrict__ input, + OutT* __restrict__ output, int width, int height) { + + constexpr int TILE_H = BLOCK_Y + 2 * RADIUS; + constexpr int LUT_WIDTH = 2 * RADIUS + 1; + + __shared__ float smem[TILE_H][BLOCK_X]; + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int x = blockIdx.x * BLOCK_X + tx; + const int y = blockIdx.y * BLOCK_Y + ty; + + // Load column with halo (cast TmpT -> float for FP16 intermediate support) + for (int i = ty; i < TILE_H; i += BLOCK_Y) { + int gy = blockIdx.y * BLOCK_Y + i - RADIUS; + gy = max(0, min(height - 1, gy)); + int gx = min(x, width - 1); + smem[i][tx] = static_cast(input[gy * width + gx]); + } + + __syncthreads(); + + if (x >= width || y >= height) + return; + + const int ly = ty + RADIUS; + const float center = smem[ly][tx]; + + float sum = 0.0f; + float weight_sum = 0.0f; + +#pragma unroll + for (int dy = -RADIUS; dy <= RADIUS; ++dy) { + float neighbor = smem[ly + dy][tx]; + float spatial_weight = d_spatial_lut[(dy + RADIUS) * LUT_WIDTH + RADIUS]; 
+ + int diff = static_cast(fabsf(neighbor - center) + 0.5f); + diff = min(diff, COLOR_LUT_SIZE - 1); + float color_weight = d_color_lut[diff]; + + float w = spatial_weight * color_weight; + sum = fmaf(neighbor, w, sum); // Opt I: explicit FMA + weight_sum += w; + } + + output[y * width + x] = to_output(sum / weight_sum); +} + +// ============================================================================ +// Separable approximation for RGB: horizontal + vertical passes +// O(2r) complexity instead of O(r^2) +// ============================================================================ + +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM_SEP) +k_bilateral_horizontal_rgb(const InT* __restrict__ input, + TmpT* __restrict__ output, // intermediate (float or __half) + int width, int height) { + + constexpr int TILE_W = BLOCK_X + 2 * RADIUS; + constexpr int TILE_W_PAD = SMEM_PAD(TILE_W); // Opt G: odd stride + constexpr int LUT_WIDTH = 2 * RADIUS + 1; + + __shared__ float smem_r[BLOCK_Y][TILE_W_PAD]; + __shared__ float smem_g[BLOCK_Y][TILE_W_PAD]; + __shared__ float smem_b[BLOCK_Y][TILE_W_PAD]; + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int x = blockIdx.x * BLOCK_X + tx; + const int y = blockIdx.y * BLOCK_Y + ty; + + // Load row with halo + for (int i = tx; i < TILE_W; i += BLOCK_X) { + int gx = blockIdx.x * BLOCK_X + i - RADIUS; + gx = max(0, min(width - 1, gx)); + int gy = min(y, height - 1); + int gidx = (gy * width + gx) * 3; + smem_r[ty][i] = static_cast(input[gidx]); + smem_g[ty][i] = static_cast(input[gidx + 1]); + smem_b[ty][i] = static_cast(input[gidx + 2]); + } + + __syncthreads(); + + if (x >= width || y >= height) + return; + + const int lx = tx + RADIUS; + const float center_r = smem_r[ty][lx]; + const float center_g = smem_g[ty][lx]; + const float center_b = smem_b[ty][lx]; + + float sum_r = 0.0f, sum_g = 0.0f, sum_b = 0.0f; + float wsum_r = 0.0f, wsum_g = 0.0f, wsum_b = 0.0f; + +#pragma unroll + 
for (int dx = -RADIUS; dx <= RADIUS; ++dx) { + float nr = smem_r[ty][lx + dx]; + float ng = smem_g[ty][lx + dx]; + float nb = smem_b[ty][lx + dx]; + + // Use 1D spatial weight (center row of 2D LUT) + float spatial_weight = d_spatial_lut[RADIUS * LUT_WIDTH + (dx + RADIUS)]; + + int diff_r = static_cast(fabsf(nr - center_r) + 0.5f); + int diff_g = static_cast(fabsf(ng - center_g) + 0.5f); + int diff_b = static_cast(fabsf(nb - center_b) + 0.5f); + diff_r = min(diff_r, COLOR_LUT_SIZE - 1); + diff_g = min(diff_g, COLOR_LUT_SIZE - 1); + diff_b = min(diff_b, COLOR_LUT_SIZE - 1); + + float cw_r = d_color_lut[diff_r]; + float cw_g = d_color_lut[diff_g]; + float cw_b = d_color_lut[diff_b]; + + float w_r = spatial_weight * cw_r; + float w_g = spatial_weight * cw_g; + float w_b = spatial_weight * cw_b; + + // Opt I: explicit FMA for accumulation + sum_r = fmaf(nr, w_r, sum_r); + sum_g = fmaf(ng, w_g, sum_g); + sum_b = fmaf(nb, w_b, sum_b); + wsum_r += w_r; + wsum_g += w_g; + wsum_b += w_b; + } + + // Opt H: SoA intermediate output — write R/G/B to separate planes for coalesced + // vertical reads. Layout: plane_r[0..W*H-1], plane_g[W*H..2*W*H-1], plane_b[2*W*H..]. 
+ const int pixel_idx = y * width + x; + const int plane_size = width * height; + output[pixel_idx] = static_cast(sum_r / wsum_r); + output[pixel_idx + plane_size] = static_cast(sum_g / wsum_g); + output[pixel_idx + 2*plane_size] = static_cast(sum_b / wsum_b); +} + +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM_SEP) +k_bilateral_vertical_rgb(const TmpT* __restrict__ input, // intermediate SoA: R|G|B planes + OutT* __restrict__ output, int width, int height) { + + constexpr int TILE_H = BLOCK_Y + 2 * RADIUS; + constexpr int LUT_WIDTH = 2 * RADIUS + 1; + + __shared__ float smem_r[TILE_H][BLOCK_X]; + __shared__ float smem_g[TILE_H][BLOCK_X]; + __shared__ float smem_b[TILE_H][BLOCK_X]; + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int x = blockIdx.x * BLOCK_X + tx; + const int y = blockIdx.y * BLOCK_Y + ty; + + const int plane_size = width * height; + + // Opt H: Load from SoA planes — fully coalesced: consecutive threads read + // consecutive addresses within each plane. 
+ for (int i = ty; i < TILE_H; i += BLOCK_Y) { + int gy = blockIdx.y * BLOCK_Y + i - RADIUS; + gy = max(0, min(height - 1, gy)); + int gx = min(x, width - 1); + int pidx = gy * width + gx; + smem_r[i][tx] = static_cast(input[pidx]); + smem_g[i][tx] = static_cast(input[pidx + plane_size]); + smem_b[i][tx] = static_cast(input[pidx + 2*plane_size]); + } + + __syncthreads(); + + if (x >= width || y >= height) + return; + + const int ly = ty + RADIUS; + const float center_r = smem_r[ly][tx]; + const float center_g = smem_g[ly][tx]; + const float center_b = smem_b[ly][tx]; + + float sum_r = 0.0f, sum_g = 0.0f, sum_b = 0.0f; + float wsum_r = 0.0f, wsum_g = 0.0f, wsum_b = 0.0f; + +#pragma unroll + for (int dy = -RADIUS; dy <= RADIUS; ++dy) { + float nr = smem_r[ly + dy][tx]; + float ng = smem_g[ly + dy][tx]; + float nb = smem_b[ly + dy][tx]; + + // Use 1D spatial weight (center column of 2D LUT) + float spatial_weight = d_spatial_lut[(dy + RADIUS) * LUT_WIDTH + RADIUS]; + + // Opt I: Use fmaf() intrinsic to encourage FMA fusion. + // Each diff + 0.5f rounding and w*neighbor accumulation benefits from FMA. 
+ int diff_r = static_cast(fabsf(nr - center_r) + 0.5f); + int diff_g = static_cast(fabsf(ng - center_g) + 0.5f); + int diff_b = static_cast(fabsf(nb - center_b) + 0.5f); + diff_r = min(diff_r, COLOR_LUT_SIZE - 1); + diff_g = min(diff_g, COLOR_LUT_SIZE - 1); + diff_b = min(diff_b, COLOR_LUT_SIZE - 1); + + float cw_r = d_color_lut[diff_r]; + float cw_g = d_color_lut[diff_g]; + float cw_b = d_color_lut[diff_b]; + + float w_r = spatial_weight * cw_r; + float w_g = spatial_weight * cw_g; + float w_b = spatial_weight * cw_b; + + // Opt I: explicit FMA for accumulation + sum_r = fmaf(nr, w_r, sum_r); + sum_g = fmaf(ng, w_g, sum_g); + sum_b = fmaf(nb, w_b, sum_b); + wsum_r += w_r; + wsum_g += w_g; + wsum_b += w_b; + } + + int out_idx = (y * width + x) * 3; + output[out_idx] = to_output(sum_r / wsum_r); + output[out_idx + 1] = to_output(sum_g / wsum_g); + output[out_idx + 2] = to_output(sum_b / wsum_b); +} + +// ============================================================================ +// Opt N2: Full FP16 compute separable kernels +// smem uses __half (halved bandwidth), inner loop uses __half arithmetic +// (2x throughput on sm_110 Blackwell). LUT lookups stay float, converted to +// __half before FMA accumulation. 
+// ============================================================================ + +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM_SEP) +k_bilateral_horizontal_gray_fp16(const uint8_t* __restrict__ input, + __half* __restrict__ output, int width, int height) { + + constexpr int TILE_W = BLOCK_X + 2 * RADIUS; + constexpr int TILE_W_PAD = SMEM_PAD(TILE_W); + constexpr int LUT_WIDTH = 2 * RADIUS + 1; + + __shared__ __half smem[BLOCK_Y][TILE_W_PAD]; + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int x = blockIdx.x * BLOCK_X + tx; + const int y = blockIdx.y * BLOCK_Y + ty; + + for (int i = tx; i < TILE_W; i += BLOCK_X) { + int gx = blockIdx.x * BLOCK_X + i - RADIUS; + gx = max(0, min(width - 1, gx)); + int gy = min(y, height - 1); + smem[ty][i] = __float2half(static_cast(input[gy * width + gx])); + } + + __syncthreads(); + + if (x >= width || y >= height) + return; + + const int lx = tx + RADIUS; + const __half center = smem[ty][lx]; + + __half sum = __float2half(0.0f); + __half weight_sum = __float2half(0.0f); + +#pragma unroll + for (int dx = -RADIUS; dx <= RADIUS; ++dx) { + __half neighbor = smem[ty][lx + dx]; + float spatial_weight = d_spatial_lut[RADIUS * LUT_WIDTH + (dx + RADIUS)]; + + // Color distance in float for LUT indexing precision + float fn = __half2float(neighbor); + float fc = __half2float(center); + int diff = static_cast(fabsf(fn - fc) + 0.5f); + diff = min(diff, COLOR_LUT_SIZE - 1); + float color_weight = d_color_lut[diff]; + + __half w = __float2half(spatial_weight * color_weight); + sum = __hfma(neighbor, w, sum); + weight_sum = __hadd(weight_sum, w); + } + + output[y * width + x] = __hdiv(sum, weight_sum); +} + +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM_SEP) +k_bilateral_vertical_gray_fp16(const __half* __restrict__ input, + uint8_t* __restrict__ output, int width, int height) { + + constexpr int TILE_H = BLOCK_Y + 2 * RADIUS; + constexpr int LUT_WIDTH 
= 2 * RADIUS + 1; + + __shared__ __half smem[TILE_H][BLOCK_X]; + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int x = blockIdx.x * BLOCK_X + tx; + const int y = blockIdx.y * BLOCK_Y + ty; + + for (int i = ty; i < TILE_H; i += BLOCK_Y) { + int gy = blockIdx.y * BLOCK_Y + i - RADIUS; + gy = max(0, min(height - 1, gy)); + int gx = min(x, width - 1); + smem[i][tx] = input[gy * width + gx]; + } + + __syncthreads(); + + if (x >= width || y >= height) + return; + + const int ly = ty + RADIUS; + const __half center = smem[ly][tx]; + + __half sum = __float2half(0.0f); + __half weight_sum = __float2half(0.0f); + +#pragma unroll + for (int dy = -RADIUS; dy <= RADIUS; ++dy) { + __half neighbor = smem[ly + dy][tx]; + float spatial_weight = d_spatial_lut[(dy + RADIUS) * LUT_WIDTH + RADIUS]; + + float fn = __half2float(neighbor); + float fc = __half2float(center); + int diff = static_cast(fabsf(fn - fc) + 0.5f); + diff = min(diff, COLOR_LUT_SIZE - 1); + float color_weight = d_color_lut[diff]; + + __half w = __float2half(spatial_weight * color_weight); + sum = __hfma(neighbor, w, sum); + weight_sum = __hadd(weight_sum, w); + } + + float result = __half2float(sum) / __half2float(weight_sum); + output[y * width + x] = static_cast(fminf(255.0f, fmaxf(0.0f, result))); +} + +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM_SEP) +k_bilateral_horizontal_rgb_fp16(const uint8_t* __restrict__ input, + __half* __restrict__ output, int width, int height) { + + constexpr int TILE_W = BLOCK_X + 2 * RADIUS; + constexpr int TILE_W_PAD = SMEM_PAD(TILE_W); + constexpr int LUT_WIDTH = 2 * RADIUS + 1; + + __shared__ __half smem_r[BLOCK_Y][TILE_W_PAD]; + __shared__ __half smem_g[BLOCK_Y][TILE_W_PAD]; + __shared__ __half smem_b[BLOCK_Y][TILE_W_PAD]; + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int x = blockIdx.x * BLOCK_X + tx; + const int y = blockIdx.y * BLOCK_Y + ty; + + for (int i = tx; i < TILE_W; i += BLOCK_X) { + int 
gx = blockIdx.x * BLOCK_X + i - RADIUS; + gx = max(0, min(width - 1, gx)); + int gy = min(y, height - 1); + int gidx = (gy * width + gx) * 3; + smem_r[ty][i] = __float2half(static_cast(input[gidx])); + smem_g[ty][i] = __float2half(static_cast(input[gidx + 1])); + smem_b[ty][i] = __float2half(static_cast(input[gidx + 2])); + } + + __syncthreads(); + + if (x >= width || y >= height) + return; + + const int lx = tx + RADIUS; + const __half center_r = smem_r[ty][lx]; + const __half center_g = smem_g[ty][lx]; + const __half center_b = smem_b[ty][lx]; + + __half sum_r = __float2half(0.0f), sum_g = __float2half(0.0f), sum_b = __float2half(0.0f); + __half wsum_r = __float2half(0.0f), wsum_g = __float2half(0.0f), wsum_b = __float2half(0.0f); + +#pragma unroll + for (int dx = -RADIUS; dx <= RADIUS; ++dx) { + __half nr = smem_r[ty][lx + dx]; + __half ng = smem_g[ty][lx + dx]; + __half nb = smem_b[ty][lx + dx]; + + float spatial_weight = d_spatial_lut[RADIUS * LUT_WIDTH + (dx + RADIUS)]; + + // Color distance in float for LUT precision + float fr = __half2float(nr), fg = __half2float(ng), fb = __half2float(nb); + float cr = __half2float(center_r), cg = __half2float(center_g), cb = __half2float(center_b); + int diff_r = static_cast(fabsf(fr - cr) + 0.5f); + int diff_g = static_cast(fabsf(fg - cg) + 0.5f); + int diff_b = static_cast(fabsf(fb - cb) + 0.5f); + diff_r = min(diff_r, COLOR_LUT_SIZE - 1); + diff_g = min(diff_g, COLOR_LUT_SIZE - 1); + diff_b = min(diff_b, COLOR_LUT_SIZE - 1); + + __half w_r = __float2half(spatial_weight * d_color_lut[diff_r]); + __half w_g = __float2half(spatial_weight * d_color_lut[diff_g]); + __half w_b = __float2half(spatial_weight * d_color_lut[diff_b]); + + sum_r = __hfma(nr, w_r, sum_r); + sum_g = __hfma(ng, w_g, sum_g); + sum_b = __hfma(nb, w_b, sum_b); + wsum_r = __hadd(wsum_r, w_r); + wsum_g = __hadd(wsum_g, w_g); + wsum_b = __hadd(wsum_b, w_b); + } + + // SoA output + const int pixel_idx = y * width + x; + const int plane_size = width * 
height; + output[pixel_idx] = __hdiv(sum_r, wsum_r); + output[pixel_idx + plane_size] = __hdiv(sum_g, wsum_g); + output[pixel_idx + 2*plane_size] = __hdiv(sum_b, wsum_b); +} + +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM_SEP) +k_bilateral_vertical_rgb_fp16(const __half* __restrict__ input, + uint8_t* __restrict__ output, int width, int height) { + + constexpr int TILE_H = BLOCK_Y + 2 * RADIUS; + constexpr int LUT_WIDTH = 2 * RADIUS + 1; + + __shared__ __half smem_r[TILE_H][BLOCK_X]; + __shared__ __half smem_g[TILE_H][BLOCK_X]; + __shared__ __half smem_b[TILE_H][BLOCK_X]; + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int x = blockIdx.x * BLOCK_X + tx; + const int y = blockIdx.y * BLOCK_Y + ty; + + const int plane_size = width * height; + + for (int i = ty; i < TILE_H; i += BLOCK_Y) { + int gy = blockIdx.y * BLOCK_Y + i - RADIUS; + gy = max(0, min(height - 1, gy)); + int gx = min(x, width - 1); + int pidx = gy * width + gx; + smem_r[i][tx] = input[pidx]; + smem_g[i][tx] = input[pidx + plane_size]; + smem_b[i][tx] = input[pidx + 2*plane_size]; + } + + __syncthreads(); + + if (x >= width || y >= height) + return; + + const int ly = ty + RADIUS; + const __half center_r = smem_r[ly][tx]; + const __half center_g = smem_g[ly][tx]; + const __half center_b = smem_b[ly][tx]; + + __half sum_r = __float2half(0.0f), sum_g = __float2half(0.0f), sum_b = __float2half(0.0f); + __half wsum_r = __float2half(0.0f), wsum_g = __float2half(0.0f), wsum_b = __float2half(0.0f); + +#pragma unroll + for (int dy = -RADIUS; dy <= RADIUS; ++dy) { + __half nr = smem_r[ly + dy][tx]; + __half ng = smem_g[ly + dy][tx]; + __half nb = smem_b[ly + dy][tx]; + + float spatial_weight = d_spatial_lut[(dy + RADIUS) * LUT_WIDTH + RADIUS]; + + float fr = __half2float(nr), fg = __half2float(ng), fb = __half2float(nb); + float cr = __half2float(center_r), cg = __half2float(center_g), cb = __half2float(center_b); + int diff_r = static_cast(fabsf(fr - 
cr) + 0.5f); + int diff_g = static_cast(fabsf(fg - cg) + 0.5f); + int diff_b = static_cast(fabsf(fb - cb) + 0.5f); + diff_r = min(diff_r, COLOR_LUT_SIZE - 1); + diff_g = min(diff_g, COLOR_LUT_SIZE - 1); + diff_b = min(diff_b, COLOR_LUT_SIZE - 1); + + __half w_r = __float2half(spatial_weight * d_color_lut[diff_r]); + __half w_g = __float2half(spatial_weight * d_color_lut[diff_g]); + __half w_b = __float2half(spatial_weight * d_color_lut[diff_b]); + + sum_r = __hfma(nr, w_r, sum_r); + sum_g = __hfma(ng, w_g, sum_g); + sum_b = __hfma(nb, w_b, sum_b); + wsum_r = __hadd(wsum_r, w_r); + wsum_g = __hadd(wsum_g, w_g); + wsum_b = __hadd(wsum_b, w_b); + } + + // Final division in float for output precision + int out_idx = (y * width + x) * 3; + float rr = __half2float(sum_r) / __half2float(wsum_r); + float rg = __half2float(sum_g) / __half2float(wsum_g); + float rb = __half2float(sum_b) / __half2float(wsum_b); + output[out_idx] = static_cast(fminf(255.0f, fmaxf(0.0f, rr))); + output[out_idx + 1] = static_cast(fminf(255.0f, fmaxf(0.0f, rg))); + output[out_idx + 2] = static_cast(fminf(255.0f, fmaxf(0.0f, rb))); +} + +// ============================================================================ +// Opt M: Fused H+V separable kernel — single kernel, zero intermediate buffer. +// Phase 1: load raw pixels with 2D halo → smem_raw, do horizontal filtering +// for ALL rows (BLOCK_Y + 2*RADIUS) → smem_h. +// Phase 2: __syncthreads(), do vertical filtering from smem_h → global output. +// Saves: 1 kernel launch + entire intermediate buffer read/write. 
+// ============================================================================ + +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM_SEP) +k_bilateral_fused_gray(const uint8_t* __restrict__ input, + uint8_t* __restrict__ output, int width, int height) { + + constexpr int TILE_W = BLOCK_X + 2 * RADIUS; // raw data columns + constexpr int FUSED_H = BLOCK_Y + 2 * RADIUS; // rows to horizontal-filter + constexpr int TILE_W_PAD = SMEM_PAD(TILE_W); + constexpr int RAW_SIZE = FUSED_H * TILE_W; // total raw pixels to load + constexpr int LUT_WIDTH = 2 * RADIUS + 1; + + // smem_raw: original pixel data with 2D halo for horizontal filtering + __shared__ float smem_raw[FUSED_H][TILE_W_PAD]; + // smem_h: horizontal filtering results (only center BLOCK_X columns, all FUSED_H rows) + __shared__ float smem_h[FUSED_H][BLOCK_X]; + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int thread_id = ty * BLOCK_X + tx; + const int x = blockIdx.x * BLOCK_X + tx; + const int y = blockIdx.y * BLOCK_Y + ty; + + // ---- Phase 0: Collaborative load of raw data with 2D halo ---- + for (int i = thread_id; i < RAW_SIZE; i += THREADS_PER_BLOCK) { + int sy = i / TILE_W; + int sx = i % TILE_W; + int gx = blockIdx.x * BLOCK_X + sx - RADIUS; + int gy = blockIdx.y * BLOCK_Y + sy - RADIUS; + gx = max(0, min(width - 1, gx)); + gy = max(0, min(height - 1, gy)); + smem_raw[sy][sx] = static_cast(input[gy * width + gx]); + } + + __syncthreads(); + + // ---- Phase 1: Horizontal filtering for ALL FUSED_H rows ---- + // Each thread handles multiple rows to cover all FUSED_H rows + for (int row = thread_id; row < FUSED_H * BLOCK_X; row += THREADS_PER_BLOCK) { + int hy = row / BLOCK_X; // which of the FUSED_H rows + int hx = row % BLOCK_X; // which column (0..BLOCK_X-1) + + int lx = hx + RADIUS; // center position in smem_raw + float center = smem_raw[hy][lx]; + float sum = 0.0f; + float wsum = 0.0f; + +#pragma unroll + for (int dx = -RADIUS; dx <= RADIUS; ++dx) 
{ + float neighbor = smem_raw[hy][lx + dx]; + float sw = d_spatial_lut[RADIUS * LUT_WIDTH + (dx + RADIUS)]; + int diff = static_cast(fabsf(neighbor - center) + 0.5f); + diff = min(diff, COLOR_LUT_SIZE - 1); + float cw = d_color_lut[diff]; + float w = sw * cw; + sum = fmaf(neighbor, w, sum); + wsum += w; + } + smem_h[hy][hx] = sum / wsum; + } + + __syncthreads(); + + // ---- Phase 2: Vertical filtering from smem_h → global output ---- + if (x >= width || y >= height) + return; + + const int ly = ty + RADIUS; // center row in smem_h + float center = smem_h[ly][tx]; + + float sum = 0.0f; + float wsum = 0.0f; + +#pragma unroll + for (int dy = -RADIUS; dy <= RADIUS; ++dy) { + float neighbor = smem_h[ly + dy][tx]; + float sw = d_spatial_lut[(dy + RADIUS) * LUT_WIDTH + RADIUS]; + int diff = static_cast(fabsf(neighbor - center) + 0.5f); + diff = min(diff, COLOR_LUT_SIZE - 1); + float cw = d_color_lut[diff]; + float w = sw * cw; + sum = fmaf(neighbor, w, sum); + wsum += w; + } + + output[y * width + x] = static_cast(fminf(255.0f, fmaxf(0.0f, sum / wsum))); +} + +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM_SEP) +k_bilateral_fused_rgb(const uint8_t* __restrict__ input, + uint8_t* __restrict__ output, int width, int height) { + + constexpr int TILE_W = BLOCK_X + 2 * RADIUS; + constexpr int FUSED_H = BLOCK_Y + 2 * RADIUS; + constexpr int TILE_W_PAD = SMEM_PAD(TILE_W); + constexpr int RAW_SIZE = FUSED_H * TILE_W; + constexpr int LUT_WIDTH = 2 * RADIUS + 1; + + __shared__ float smem_raw_r[FUSED_H][TILE_W_PAD]; + __shared__ float smem_raw_g[FUSED_H][TILE_W_PAD]; + __shared__ float smem_raw_b[FUSED_H][TILE_W_PAD]; + __shared__ float smem_h_r[FUSED_H][BLOCK_X]; + __shared__ float smem_h_g[FUSED_H][BLOCK_X]; + __shared__ float smem_h_b[FUSED_H][BLOCK_X]; + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int thread_id = ty * BLOCK_X + tx; + const int x = blockIdx.x * BLOCK_X + tx; + const int y = blockIdx.y * BLOCK_Y + ty; + + 
// ---- Phase 0: Collaborative load ---- + for (int i = thread_id; i < RAW_SIZE; i += THREADS_PER_BLOCK) { + int sy = i / TILE_W; + int sx = i % TILE_W; + int gx = blockIdx.x * BLOCK_X + sx - RADIUS; + int gy = blockIdx.y * BLOCK_Y + sy - RADIUS; + gx = max(0, min(width - 1, gx)); + gy = max(0, min(height - 1, gy)); + int gidx = (gy * width + gx) * 3; + smem_raw_r[sy][sx] = static_cast(input[gidx]); + smem_raw_g[sy][sx] = static_cast(input[gidx + 1]); + smem_raw_b[sy][sx] = static_cast(input[gidx + 2]); + } + + __syncthreads(); + + // ---- Phase 1: Horizontal filtering for ALL FUSED_H rows ---- + for (int row = thread_id; row < FUSED_H * BLOCK_X; row += THREADS_PER_BLOCK) { + int hy = row / BLOCK_X; + int hx = row % BLOCK_X; + int lx = hx + RADIUS; + + float cr = smem_raw_r[hy][lx]; + float cg = smem_raw_g[hy][lx]; + float cb = smem_raw_b[hy][lx]; + + float sr = 0.0f, sg = 0.0f, sb = 0.0f; + float wr = 0.0f, wg = 0.0f, wb = 0.0f; + +#pragma unroll + for (int dx = -RADIUS; dx <= RADIUS; ++dx) { + float nr = smem_raw_r[hy][lx + dx]; + float ng = smem_raw_g[hy][lx + dx]; + float nb = smem_raw_b[hy][lx + dx]; + + float sw = d_spatial_lut[RADIUS * LUT_WIDTH + (dx + RADIUS)]; + + int diff_r = static_cast(fabsf(nr - cr) + 0.5f); + int diff_g = static_cast(fabsf(ng - cg) + 0.5f); + int diff_b = static_cast(fabsf(nb - cb) + 0.5f); + diff_r = min(diff_r, COLOR_LUT_SIZE - 1); + diff_g = min(diff_g, COLOR_LUT_SIZE - 1); + diff_b = min(diff_b, COLOR_LUT_SIZE - 1); + + float w_r = sw * d_color_lut[diff_r]; + float w_g = sw * d_color_lut[diff_g]; + float w_b = sw * d_color_lut[diff_b]; + + sr = fmaf(nr, w_r, sr); + sg = fmaf(ng, w_g, sg); + sb = fmaf(nb, w_b, sb); + wr += w_r; + wg += w_g; + wb += w_b; + } + smem_h_r[hy][hx] = sr / wr; + smem_h_g[hy][hx] = sg / wg; + smem_h_b[hy][hx] = sb / wb; + } + + __syncthreads(); + + // ---- Phase 2: Vertical filtering from smem_h → global output ---- + if (x >= width || y >= height) + return; + + const int ly = ty + RADIUS; + float cr = 
smem_h_r[ly][tx]; + float cg = smem_h_g[ly][tx]; + float cb = smem_h_b[ly][tx]; + + float sr = 0.0f, sg = 0.0f, sb = 0.0f; + float wr = 0.0f, wg = 0.0f, wb = 0.0f; + +#pragma unroll + for (int dy = -RADIUS; dy <= RADIUS; ++dy) { + float nr = smem_h_r[ly + dy][tx]; + float ng = smem_h_g[ly + dy][tx]; + float nb = smem_h_b[ly + dy][tx]; + + float sw = d_spatial_lut[(dy + RADIUS) * LUT_WIDTH + RADIUS]; + + int diff_r = static_cast(fabsf(nr - cr) + 0.5f); + int diff_g = static_cast(fabsf(ng - cg) + 0.5f); + int diff_b = static_cast(fabsf(nb - cb) + 0.5f); + diff_r = min(diff_r, COLOR_LUT_SIZE - 1); + diff_g = min(diff_g, COLOR_LUT_SIZE - 1); + diff_b = min(diff_b, COLOR_LUT_SIZE - 1); + + float w_r = sw * d_color_lut[diff_r]; + float w_g = sw * d_color_lut[diff_g]; + float w_b = sw * d_color_lut[diff_b]; + + sr = fmaf(nr, w_r, sr); + sg = fmaf(ng, w_g, sg); + sb = fmaf(nb, w_b, sb); + wr += w_r; + wg += w_g; + wb += w_b; + } + + int out_idx = (y * width + x) * 3; + output[out_idx] = static_cast(fminf(255.0f, fmaxf(0.0f, sr / wr))); + output[out_idx + 1] = static_cast(fminf(255.0f, fmaxf(0.0f, sg / wg))); + output[out_idx + 2] = static_cast(fminf(255.0f, fmaxf(0.0f, sb / wb))); +} + +// ============================================================================ +// Runtime-radius version (fallback) +// ============================================================================ + +// Grayscale runtime version +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM) +k_bilateral_filter_shared(const InT* __restrict__ input, OutT* __restrict__ output, + int width, int height, int radius) { + + extern __shared__ float smem[]; + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int x = blockIdx.x * BLOCK_X + tx; + const int y = blockIdx.y * BLOCK_Y + ty; + + const int tile_w = BLOCK_X + 2 * radius; + const int tile_h = BLOCK_Y + 2 * radius; + const int smem_size = tile_w * tile_h; + const int thread_id = ty * BLOCK_X + tx; + const 
int threads_per_block = BLOCK_X * BLOCK_Y; + + // Collaborative loading + for (int i = thread_id; i < smem_size; i += threads_per_block) { + int sy = i / tile_w; + int sx = i % tile_w; + int gx = blockIdx.x * BLOCK_X + sx - radius; + int gy = blockIdx.y * BLOCK_Y + sy - radius; + gx = max(0, min(width - 1, gx)); + gy = max(0, min(height - 1, gy)); + smem[i] = static_cast(input[gy * width + gx]); + } + + __syncthreads(); + + if (x >= width || y >= height) + return; + + const int lx = tx + radius; + const int ly = ty + radius; + const float center = smem[ly * tile_w + lx]; + + float sum = 0.0f; + float weight_sum = 0.0f; + const int lut_width = 2 * radius + 1; + +#pragma unroll 4 + for (int dy = -radius; dy <= radius; ++dy) { +#pragma unroll 4 + for (int dx = -radius; dx <= radius; ++dx) { + float spatial_weight = d_spatial_lut[(dy + radius) * lut_width + (dx + radius)]; + if (spatial_weight == 0.0f) continue; + + float neighbor = smem[(ly + dy) * tile_w + (lx + dx)]; + + int diff = static_cast(fabsf(neighbor - center) + 0.5f); + diff = min(diff, COLOR_LUT_SIZE - 1); + float color_weight = d_color_lut[diff]; + + float w = spatial_weight * color_weight; + sum += neighbor * w; + weight_sum += w; + } + } + + output[y * width + x] = to_output(sum / weight_sum); +} + +// RGB runtime version +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM) +k_bilateral_filter_rgb_shared(const InT* __restrict__ input, + OutT* __restrict__ output, int width, int height, + int radius) { + + extern __shared__ float smem[]; + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int x = blockIdx.x * BLOCK_X + tx; + const int y = blockIdx.y * BLOCK_Y + ty; + + const int tile_w = BLOCK_X + 2 * radius; + const int tile_h = BLOCK_Y + 2 * radius; + const int tile_size = tile_w * tile_h; + + float* smem_r = smem; + float* smem_g = smem + tile_size; + float* smem_b = smem + 2 * tile_size; + + const int thread_id = ty * BLOCK_X + tx; + const int 
threads_per_block = BLOCK_X * BLOCK_Y; + + for (int i = thread_id; i < tile_size; i += threads_per_block) { + int sy = i / tile_w; + int sx = i % tile_w; + int gx = blockIdx.x * BLOCK_X + sx - radius; + int gy = blockIdx.y * BLOCK_Y + sy - radius; + gx = max(0, min(width - 1, gx)); + gy = max(0, min(height - 1, gy)); + int gidx = (gy * width + gx) * 3; + smem_r[i] = static_cast(input[gidx]); + smem_g[i] = static_cast(input[gidx + 1]); + smem_b[i] = static_cast(input[gidx + 2]); + } + + __syncthreads(); + + if (x >= width || y >= height) + return; + + const int lx = tx + radius; + const int ly = ty + radius; + const int lidx = ly * tile_w + lx; + + const float center_r = smem_r[lidx]; + const float center_g = smem_g[lidx]; + const float center_b = smem_b[lidx]; + + float sum_r = 0.0f, sum_g = 0.0f, sum_b = 0.0f; + float wsum_r = 0.0f, wsum_g = 0.0f, wsum_b = 0.0f; + const int lut_width = 2 * radius + 1; + +#pragma unroll 4 + for (int dy = -radius; dy <= radius; ++dy) { +#pragma unroll 4 + for (int dx = -radius; dx <= radius; ++dx) { + float spatial_weight = d_spatial_lut[(dy + radius) * lut_width + (dx + radius)]; + if (spatial_weight == 0.0f) continue; + + int nidx = (ly + dy) * tile_w + (lx + dx); + float nr = smem_r[nidx]; + float ng = smem_g[nidx]; + float nb = smem_b[nidx]; + + int diff_r = static_cast(fabsf(nr - center_r) + 0.5f); + int diff_g = static_cast(fabsf(ng - center_g) + 0.5f); + int diff_b = static_cast(fabsf(nb - center_b) + 0.5f); + diff_r = min(diff_r, COLOR_LUT_SIZE - 1); + diff_g = min(diff_g, COLOR_LUT_SIZE - 1); + diff_b = min(diff_b, COLOR_LUT_SIZE - 1); + + float cw_r = d_color_lut[diff_r]; + float cw_g = d_color_lut[diff_g]; + float cw_b = d_color_lut[diff_b]; + + float w_r = spatial_weight * cw_r; + float w_g = spatial_weight * cw_g; + float w_b = spatial_weight * cw_b; + + sum_r += nr * w_r; + sum_g += ng * w_g; + sum_b += nb * w_b; + wsum_r += w_r; + wsum_g += w_g; + wsum_b += w_b; + } + } + + int out_idx = (y * width + x) * 3; + 
output[out_idx] = to_output(sum_r / wsum_r); + output[out_idx + 1] = to_output(sum_g / wsum_g); + output[out_idx + 2] = to_output(sum_b / wsum_b); +} + +// ============================================================================ +// LUT initialization +// ============================================================================ + +static void init_spatial_lut(int radius, float sigma_spatial) { + float coeff = -0.5f / (sigma_spatial * sigma_spatial); + int w = 2 * radius + 1; + std::vector lut(LUT_SIZE, 0.0f); + + const int r2 = radius * radius; + for (int dy = -radius; dy <= radius; ++dy) { + for (int dx = -radius; dx <= radius; ++dx) { + // Opt C/F: circular window - zero out corners outside inscribed circle. + // For r=5: 121 positions → 81 inside, 40 outside (33%). + // TEMPLATE kernels with #pragma unroll: compiler eliminates dead iterations + // at compile time (dead code elimination), achieving +13%~+65% speedup. + if (dx * dx + dy * dy > r2) { + lut[(dy + radius) * w + (dx + radius)] = 0.0f; + } else { + lut[(dy + radius) * w + (dx + radius)] = expf((dx * dx + dy * dy) * coeff); + } + } + } + + CUDA_CHECK(cudaMemcpyToSymbol(d_spatial_lut, lut.data(), w * w * sizeof(float))); +} + +static void init_color_lut(float sigma_color) { + float coeff = -0.5f / (sigma_color * sigma_color); + std::vector lut(COLOR_LUT_SIZE); + + for (int i = 0; i < COLOR_LUT_SIZE; ++i) { + // i ∈ [0,255], i*i ∈ [0,65025]: no overflow for int + lut[i] = expf(static_cast(i * i) * coeff); + } + + CUDA_CHECK(cudaMemcpyToSymbol(d_color_lut, lut.data(), COLOR_LUT_SIZE * sizeof(float))); +} + +// Opt B (disabled): cudaFuncSetCacheConfig was tested and showed no benefit on sm_89. +// Moreover, calling it on many template instantiations triggers a JIT crash in the +// CUDA 13.1 PTX compiler under WSL2. The function has been removed entirely. 
+ +// Opt1: LUT cache - only re-upload when params change +static void ensure_luts(int radius, float sigma_spatial, float sigma_color) { + static int cached_radius = -1; + static float cached_sigma_s = -1.f; + static float cached_sigma_c = -1.f; + + if (radius == cached_radius && sigma_spatial == cached_sigma_s && + sigma_color == cached_sigma_c) { + return; + } + + init_spatial_lut(radius, sigma_spatial); + init_color_lut(sigma_color); + + cached_radius = radius; + cached_sigma_s = sigma_spatial; + cached_sigma_c = sigma_color; +} + +// ============================================================================ +// Kernel dispatch with template specialization +// ============================================================================ + +enum class FilterMode { + STANDARD, // Shared memory + LUT (runtime radius) + TEMPLATE, // Template-based (compile-time radius) + SEPARABLE, // Separable approximation (float intermediate) + SEPARABLE_FP16, // Separable approximation (__half intermediate, half bandwidth) + ADAPTIVE, // Adaptive radius: per-pixel radius from local gradient + FUSED // Opt M: Fused H+V single kernel, zero intermediate buffer +}; + +static FilterMode g_filter_mode = FilterMode::TEMPLATE; + +void set_bilateral_filter_mode(int mode) { + g_filter_mode = static_cast(mode); +} + +static FilterMode get_filter_mode() { + static bool initialized = false; + if (!initialized) { + const char* env = getenv("BILATERAL_MODE"); + if (env) { + int mode = atoi(env); + if (mode >= 0 && mode <= 5) { + g_filter_mode = static_cast(mode); + } + } + initialized = true; + } + return g_filter_mode; +} + +// Float-path template launchers removed: the float bilateral_filter_cuda() now uses +// only runtime-radius shared-memory kernels. All template specializations are +// reserved for the uint8 direct path (dispatch_u8_kernel). 
+ +// ============================================================================ +// Opt2: uint8 I/O launchers - kernel reads/writes uint8 directly from GPU +// ============================================================================ + +// Grayscale template uint8 launcher +template +static void launch_u8_gray(const uint8_t* d_in, uint8_t* d_out, int w, int h, cudaStream_t s) { + dim3 block(BLOCK_X, BLOCK_Y); + dim3 grid((w + BLOCK_X - 1) / BLOCK_X, (h + BLOCK_Y - 1) / BLOCK_Y); + k_bilateral_filter_gray_template + <<>>(d_in, d_out, w, h); +} + +// RGB template uint8 launcher +template +static void launch_u8_rgb(const uint8_t* d_in, uint8_t* d_out, int w, int h, cudaStream_t s) { + dim3 block(BLOCK_X, BLOCK_Y); + dim3 grid((w + BLOCK_X - 1) / BLOCK_X, (h + BLOCK_Y - 1) / BLOCK_Y); + k_bilateral_filter_rgb_template + <<>>(d_in, d_out, w, h); +} + +// Grayscale separable uint8 launcher (uint8→float→uint8 via d_temp) +template +static void launch_u8_sep_gray(const uint8_t* d_in, uint8_t* d_out, float* d_temp, int w, int h, + cudaStream_t s) { + dim3 block(BLOCK_X, BLOCK_Y); + dim3 grid((w + BLOCK_X - 1) / BLOCK_X, (h + BLOCK_Y - 1) / BLOCK_Y); + k_bilateral_horizontal_gray<<>>(d_in, d_temp, w, h); + k_bilateral_vertical_gray<<>>(d_temp, d_out, w, h); +} + +// RGB separable uint8 launcher (uint8→float→uint8 via d_temp) +template +static void launch_u8_sep_rgb(const uint8_t* d_in, uint8_t* d_out, float* d_temp, int w, int h, + cudaStream_t s) { + dim3 block(BLOCK_X, BLOCK_Y); + dim3 grid((w + BLOCK_X - 1) / BLOCK_X, (h + BLOCK_Y - 1) / BLOCK_Y); + k_bilateral_horizontal_rgb<<>>(d_in, d_temp, w, h); + k_bilateral_vertical_rgb<<>>(d_temp, d_out, w, h); +} + +// Opt N: FP16 intermediate separable launchers — half-bandwidth intermediate buffer. +// H kernel: uint8 → smem(float) → compute(float) → output(__half) +// V kernel: input(__half) → smem(float) → compute(float) → output(uint8) +// Compute stays FP32 for precision; only the H↔V transfer uses FP16. 
+ +template +static void launch_u8_sep_gray_fp16(const uint8_t* d_in, uint8_t* d_out, __half* d_temp, + int w, int h, cudaStream_t s) { + dim3 block(BLOCK_X, BLOCK_Y); + dim3 grid((w + BLOCK_X - 1) / BLOCK_X, (h + BLOCK_Y - 1) / BLOCK_Y); + k_bilateral_horizontal_gray<<>>(d_in, d_temp, w, h); + k_bilateral_vertical_gray<<>>(d_temp, d_out, w, h); +} + +template +static void launch_u8_sep_rgb_fp16(const uint8_t* d_in, uint8_t* d_out, __half* d_temp, + int w, int h, cudaStream_t s) { + dim3 block(BLOCK_X, BLOCK_Y); + dim3 grid((w + BLOCK_X - 1) / BLOCK_X, (h + BLOCK_Y - 1) / BLOCK_Y); + k_bilateral_horizontal_rgb<<>>(d_in, d_temp, w, h); + k_bilateral_vertical_rgb<<>>(d_temp, d_out, w, h); +} + +// Opt N2: Full FP16 compute launchers — smem + arithmetic in __half +template +static void launch_u8_sep_gray_fp16_compute(const uint8_t* d_in, uint8_t* d_out, __half* d_temp, + int w, int h, cudaStream_t s) { + dim3 block(BLOCK_X, BLOCK_Y); + dim3 grid((w + BLOCK_X - 1) / BLOCK_X, (h + BLOCK_Y - 1) / BLOCK_Y); + k_bilateral_horizontal_gray_fp16<<>>(d_in, d_temp, w, h); + k_bilateral_vertical_gray_fp16<<>>(d_temp, d_out, w, h); +} + +template +static void launch_u8_sep_rgb_fp16_compute(const uint8_t* d_in, uint8_t* d_out, __half* d_temp, + int w, int h, cudaStream_t s) { + dim3 block(BLOCK_X, BLOCK_Y); + dim3 grid((w + BLOCK_X - 1) / BLOCK_X, (h + BLOCK_Y - 1) / BLOCK_Y); + k_bilateral_horizontal_rgb_fp16<<>>(d_in, d_temp, w, h); + k_bilateral_vertical_rgb_fp16<<>>(d_temp, d_out, w, h); +} + +// Opt M: Fused H+V launchers — single kernel, no intermediate buffer +template +static void launch_u8_fused_gray(const uint8_t* d_in, uint8_t* d_out, int w, int h, + cudaStream_t s) { + dim3 block(BLOCK_X, BLOCK_Y); + dim3 grid((w + BLOCK_X - 1) / BLOCK_X, (h + BLOCK_Y - 1) / BLOCK_Y); + k_bilateral_fused_gray<<>>(d_in, d_out, w, h); +} + +template +static void launch_u8_fused_rgb(const uint8_t* d_in, uint8_t* d_out, int w, int h, + cudaStream_t s) { + dim3 block(BLOCK_X, BLOCK_Y); + 
dim3 grid((w + BLOCK_X - 1) / BLOCK_X, (h + BLOCK_Y - 1) / BLOCK_Y); + k_bilateral_fused_rgb<<>>(d_in, d_out, w, h); +} + +// ============================================================================ +// Opt1: persistent GPU buffers - allocated once, reused across calls +// Opt3: cudaHostRegister cache - page-lock caller's memory for DMA transfers +// ============================================================================ + +static struct { + uint8_t* d_in_u8 = nullptr; + uint8_t* d_out_u8 = nullptr; + float* d_temp = nullptr; // separable float intermediate + __half* d_temp_h16 = nullptr; // separable FP16 intermediate + uint8_t* d_radius_map = nullptr; // adaptive mode: per-pixel radius + size_t n_u8 = 0; + size_t n_temp = 0; + size_t n_temp_h16 = 0; + size_t n_radius_map = 0; + // Opt3: cached registered host pointers + const uint8_t* h_in_reg = nullptr; + uint8_t* h_out_reg = nullptr; + size_t n_reg = 0; +} g_bufs; + +// Opt3: register caller's heap memory as page-locked so cudaMemcpy uses DMA. +// Called once per unique (h_in, h_out, n) triple; re-registers when they change. 
+static void ensure_registered(const uint8_t* h_in, uint8_t* h_out, size_t n) { + if (h_in == g_bufs.h_in_reg && h_out == g_bufs.h_out_reg && n == g_bufs.n_reg) + return; + // Unregister previous pointers (ignore errors on first call / already-unregistered) + if (g_bufs.h_in_reg) + cudaHostUnregister(const_cast(g_bufs.h_in_reg)); + if (g_bufs.h_out_reg) + cudaHostUnregister(g_bufs.h_out_reg); + g_bufs.h_in_reg = nullptr; + g_bufs.h_out_reg = nullptr; + g_bufs.n_reg = 0; + // Register new pointers; if either fails, CUDA_CHECK will abort + CUDA_CHECK(cudaHostRegister(const_cast(h_in), n, cudaHostRegisterDefault)); + CUDA_CHECK(cudaHostRegister(h_out, n, cudaHostRegisterDefault)); + g_bufs.h_in_reg = h_in; + g_bufs.h_out_reg = h_out; + g_bufs.n_reg = n; +} + +static void ensure_io_buffers(size_t n_u8) { + if (n_u8 == g_bufs.n_u8) + return; + cudaFree(g_bufs.d_in_u8); + cudaFree(g_bufs.d_out_u8); + CUDA_CHECK(cudaMalloc(&g_bufs.d_in_u8, n_u8)); + CUDA_CHECK(cudaMalloc(&g_bufs.d_out_u8, n_u8)); + g_bufs.n_u8 = n_u8; +} + +static void ensure_temp_buffer(size_t n_bytes) { + if (n_bytes == g_bufs.n_temp) + return; + cudaFree(g_bufs.d_temp); + CUDA_CHECK(cudaMalloc(&g_bufs.d_temp, n_bytes)); + g_bufs.n_temp = n_bytes; +} + +// Opt N: FP16 intermediate buffer for SEPARABLE_FP16 mode +static void ensure_temp_h16_buffer(size_t n_bytes) { + if (n_bytes == g_bufs.n_temp_h16) + return; + cudaFree(g_bufs.d_temp_h16); + CUDA_CHECK(cudaMalloc(&g_bufs.d_temp_h16, n_bytes)); + g_bufs.n_temp_h16 = n_bytes; +} + +// Adaptive mode: per-pixel radius map buffer +static void ensure_radius_map_buffer(size_t n_pixels) { + if (n_pixels == g_bufs.n_radius_map) + return; + cudaFree(g_bufs.d_radius_map); + CUDA_CHECK(cudaMalloc(&g_bufs.d_radius_map, n_pixels)); + g_bufs.n_radius_map = n_pixels; +} + +// ============================================================================ +// Adaptive radius: compute per-pixel radius from local gradient magnitude +// 
============================================================================ + +// Sobel-based gradient magnitude → radius map. +// High gradient (edge) → small radius; low gradient (flat) → large radius. +// r_min, r_max are the allowed radius bounds. grad_threshold controls the +// gradient value at which radius reaches r_min (fully edge). +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM) +k_compute_radius_map_gray(const uint8_t* __restrict__ input, + uint8_t* __restrict__ radius_map, + int width, int height, + int r_min, int r_max, float inv_grad_threshold) { + const int x = blockIdx.x * BLOCK_X + threadIdx.x; + const int y = blockIdx.y * BLOCK_Y + threadIdx.y; + if (x >= width || y >= height) + return; + + // Clamp coords for border pixels + int x0 = max(x - 1, 0), x2 = min(x + 1, width - 1); + int y0 = max(y - 1, 0), y2 = min(y + 1, height - 1); + + // Sobel gradient: Gx and Gy using 3x3 Sobel operator + float p00 = input[y0 * width + x0], p01 = input[y0 * width + x], p02 = input[y0 * width + x2]; + float p10 = input[y * width + x0], p12 = input[y * width + x2]; + float p20 = input[y2 * width + x0], p21 = input[y2 * width + x], p22 = input[y2 * width + x2]; + + float gx = -p00 + p02 - 2.0f * p10 + 2.0f * p12 - p20 + p22; + float gy = -p00 - 2.0f * p01 - p02 + p20 + 2.0f * p21 + p22; + float grad = sqrtf(gx * gx + gy * gy); + + // Map gradient to radius: linear interpolation + // grad=0 → r_max (flat area, smooth more), grad>=threshold → r_min (edge, smooth less) + float t = fminf(grad * inv_grad_threshold, 1.0f); + int r = __float2int_rn(static_cast(r_max) - t * static_cast(r_max - r_min)); + r = max(r_min, min(r_max, r)); + + radius_map[y * width + x] = static_cast(r); +} + +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM) +k_compute_radius_map_rgb(const uint8_t* __restrict__ input, + uint8_t* __restrict__ radius_map, + int width, int height, + int r_min, int r_max, float inv_grad_threshold) { + const int x = 
blockIdx.x * BLOCK_X + threadIdx.x; + const int y = blockIdx.y * BLOCK_Y + threadIdx.y; + if (x >= width || y >= height) + return; + + int x0 = max(x - 1, 0), x2 = min(x + 1, width - 1); + int y0 = max(y - 1, 0), y2 = min(y + 1, height - 1); + + // Compute gradient on luminance approximation: (R + G + B) / 3 + auto lum = [&](int py, int px) -> float { + int idx = (py * width + px) * 3; + return (static_cast(input[idx]) + + static_cast(input[idx + 1]) + + static_cast(input[idx + 2])) * (1.0f / 3.0f); + }; + + float p00 = lum(y0, x0), p01 = lum(y0, x), p02 = lum(y0, x2); + float p10 = lum(y, x0), p12 = lum(y, x2); + float p20 = lum(y2, x0), p21 = lum(y2, x), p22 = lum(y2, x2); + + float gx = -p00 + p02 - 2.0f * p10 + 2.0f * p12 - p20 + p22; + float gy = -p00 - 2.0f * p01 - p02 + p20 + 2.0f * p21 + p22; + float grad = sqrtf(gx * gx + gy * gy); + + float t = fminf(grad * inv_grad_threshold, 1.0f); + int r = __float2int_rn(static_cast(r_max) - t * static_cast(r_max - r_min)); + r = max(r_min, min(r_max, r)); + + radius_map[y * width + x] = static_cast(r); +} + +// ============================================================================ +// Adaptive bilateral filter kernels: read per-pixel radius from radius_map +// Shared memory is allocated for r_max halo (worst case) so all threads +// in a block can access the full neighborhood regardless of their radius. 
+// ============================================================================ + +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM) +k_bilateral_adaptive_gray(const InT* __restrict__ input, + OutT* __restrict__ output, + const uint8_t* __restrict__ radius_map, + int width, int height, int r_max) { + + extern __shared__ float smem[]; + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int x = blockIdx.x * BLOCK_X + tx; + const int y = blockIdx.y * BLOCK_Y + ty; + + const int tile_w = BLOCK_X + 2 * r_max; + const int tile_h = BLOCK_Y + 2 * r_max; + const int tile_size = tile_w * tile_h; + const int thread_id = ty * BLOCK_X + tx; + const int threads_per_block = BLOCK_X * BLOCK_Y; + + // Collaborative loading with r_max halo + for (int i = thread_id; i < tile_size; i += threads_per_block) { + int sy = i / tile_w; + int sx = i % tile_w; + int gx = blockIdx.x * BLOCK_X + sx - r_max; + int gy = blockIdx.y * BLOCK_Y + sy - r_max; + gx = max(0, min(width - 1, gx)); + gy = max(0, min(height - 1, gy)); + smem[i] = static_cast(input[gy * width + gx]); + } + + __syncthreads(); + + if (x >= width || y >= height) + return; + + // Read per-pixel radius + const int my_radius = static_cast(radius_map[y * width + x]); + const int lx = tx + r_max; + const int ly = ty + r_max; + const float center = smem[ly * tile_w + lx]; + // Spatial LUT was built with r_max, so use r_max-based width and offset + const int lut_width = 2 * r_max + 1; + const int lut_center = r_max; // center offset in the LUT + + float sum = 0.0f; + float weight_sum = 0.0f; + + for (int dy = -my_radius; dy <= my_radius; ++dy) { + for (int dx = -my_radius; dx <= my_radius; ++dx) { + float neighbor = smem[(ly + dy) * tile_w + (lx + dx)]; + + float spatial_weight = d_spatial_lut[(dy + lut_center) * lut_width + (dx + lut_center)]; + + int diff = static_cast(fabsf(neighbor - center) + 0.5f); + diff = min(diff, COLOR_LUT_SIZE - 1); + float color_weight = d_color_lut[diff]; 
+ + float w = spatial_weight * color_weight; + sum += neighbor * w; + weight_sum += w; + } + } + + output[y * width + x] = to_output(sum / weight_sum); +} + +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM) +k_bilateral_adaptive_rgb(const InT* __restrict__ input, + OutT* __restrict__ output, + const uint8_t* __restrict__ radius_map, + int width, int height, int r_max) { + + extern __shared__ float smem[]; + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int x = blockIdx.x * BLOCK_X + tx; + const int y = blockIdx.y * BLOCK_Y + ty; + + const int tile_w = BLOCK_X + 2 * r_max; + const int tile_h = BLOCK_Y + 2 * r_max; + const int tile_size = tile_w * tile_h; + const int thread_id = ty * BLOCK_X + tx; + const int threads_per_block = BLOCK_X * BLOCK_Y; + + float* smem_r = smem; + float* smem_g = smem + tile_size; + float* smem_b = smem + 2 * tile_size; + + // Collaborative loading with r_max halo + for (int i = thread_id; i < tile_size; i += threads_per_block) { + int sy = i / tile_w; + int sx = i % tile_w; + int gx = blockIdx.x * BLOCK_X + sx - r_max; + int gy = blockIdx.y * BLOCK_Y + sy - r_max; + gx = max(0, min(width - 1, gx)); + gy = max(0, min(height - 1, gy)); + int gidx = (gy * width + gx) * 3; + smem_r[i] = static_cast(input[gidx]); + smem_g[i] = static_cast(input[gidx + 1]); + smem_b[i] = static_cast(input[gidx + 2]); + } + + __syncthreads(); + + if (x >= width || y >= height) + return; + + const int my_radius = static_cast(radius_map[y * width + x]); + const int lx = tx + r_max; + const int ly = ty + r_max; + const int lidx = ly * tile_w + lx; + // Spatial LUT was built with r_max, so use r_max-based width and offset + const int lut_width = 2 * r_max + 1; + const int lut_center = r_max; + + const float center_r = smem_r[lidx]; + const float center_g = smem_g[lidx]; + const float center_b = smem_b[lidx]; + + float sum_r = 0.0f, sum_g = 0.0f, sum_b = 0.0f; + float wsum = 0.0f; + + for (int dy = -my_radius; dy 
<= my_radius; ++dy) { + for (int dx = -my_radius; dx <= my_radius; ++dx) { + int nidx = (ly + dy) * tile_w + (lx + dx); + float nr = smem_r[nidx]; + float ng = smem_g[nidx]; + float nb = smem_b[nidx]; + + float spatial_weight = d_spatial_lut[(dy + lut_center) * lut_width + (dx + lut_center)]; + + // Mean absolute channel difference (approximation of OpenCV's L2 distance) + int diff = static_cast( + (fabsf(nr - center_r) + fabsf(ng - center_g) + fabsf(nb - center_b)) * + (1.0f / 3.0f) + 0.5f); + diff = min(diff, COLOR_LUT_SIZE - 1); + + float w = spatial_weight * d_color_lut[diff]; + sum_r += nr * w; + sum_g += ng * w; + sum_b += nb * w; + wsum += w; + } + } + + float rcp_wsum = __frcp_rn(wsum); + int out_idx = (y * width + x) * 3; + output[out_idx] = to_output(sum_r * rcp_wsum); + output[out_idx + 1] = to_output(sum_g * rcp_wsum); + output[out_idx + 2] = to_output(sum_b * rcp_wsum); +} + +// Float-path bilateral_filter_cuda: simplified to use only runtime-radius shared-memory +// kernels (no template specializations). Template kernels are only used in the uint8 +// direct path (dispatch_u8_kernel) which is the primary performance path. 
+void bilateral_filter_cuda(const float* d_input, float* d_output, int width, int height, + int channels, int radius, float sigma_spatial, float sigma_color, + cudaStream_t stream) { + + radius = min(radius, MAX_RADIUS); + ensure_luts(radius, sigma_spatial, sigma_color); + + dim3 block(BLOCK_X, BLOCK_Y); + dim3 grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y); + + if (channels == 1) { + int tile_w = BLOCK_X + 2 * radius; + int tile_h = BLOCK_Y + 2 * radius; + size_t smem = tile_w * tile_h * sizeof(float); + k_bilateral_filter_shared<<>>( + d_input, d_output, width, height, radius); + } else { + int tile_w = BLOCK_X + 2 * radius; + int tile_h = BLOCK_Y + 2 * radius; + size_t smem = 3 * tile_w * tile_h * sizeof(float); + k_bilateral_filter_rgb_shared<<>>( + d_input, d_output, width, height, radius); + } + + CUDA_CHECK(cudaGetLastError()); + if (stream == 0) { + CUDA_CHECK(cudaDeviceSynchronize()); + } +} + +// ============================================================================ +// Opt E: Strip pipelining - internal kernel-only dispatcher (no H2D/D2H) +// Called per-strip or once for the full image. +// All GPU buffers (d_in, d_out, d_temp, d_temp_h16, d_rmap) are pre-allocated +// by the caller. ensure_luts() must also be called before this function. 
// ============================================================================
// Kernel-only dispatcher for the uint8 path: selects the kernel family by
// FilterMode, with <5>-radius template specializations where available and a
// runtime-radius shared-memory fallback otherwise. Performs no host<->device
// copies and no synchronization; everything is enqueued on `stream`.
// NOTE(review): all <<<...>>> launch configurations were stripped in the mangled
// source; reconstructed from the grid/block/smem/stream locals — confirm.
static void dispatch_u8_kernel(
    const uint8_t* d_in, uint8_t* d_out,
    float* d_temp, __half* d_temp_h16, uint8_t* d_rmap,
    int width, int height, int channels,
    int radius, float sigma_spatial, float sigma_color,
    FilterMode mode, cudaStream_t stream) {

    const dim3 block(BLOCK_X, BLOCK_Y);
    const dim3 grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y);

    if (channels == 1) {
        if (mode == FilterMode::FUSED) {
            // Opt M: Fused H+V in single kernel — no intermediate buffer
            if (radius == 5) {
                launch_u8_fused_gray<5>(d_in, d_out, width, height, stream);
            } else {
                // Fallback to 2-pass separable
                size_t smem = (BLOCK_X + 2 * radius) * (BLOCK_Y + 2 * radius) * sizeof(float);
                k_bilateral_filter_shared<<<grid, block, smem, stream>>>(
                    d_in, d_out, width, height, radius);
            }
        } else if (mode == FilterMode::ADAPTIVE) {
            int r_max = radius;
            int r_min = max(2, radius - 1);
            // Flat regions (low gradient) get the small radius; edges keep r_max.
            float grad_threshold = fmaxf(sigma_color * 1.5f, 20.0f);
            float inv_grad_threshold = 1.0f / grad_threshold;
            k_compute_radius_map_gray<<<grid, block, 0, stream>>>(
                d_in, d_rmap, width, height, r_min, r_max, inv_grad_threshold);
            ensure_luts(r_max, sigma_spatial, sigma_color);
            int tile_w = BLOCK_X + 2 * r_max;
            int tile_h = BLOCK_Y + 2 * r_max;
            size_t smem_bytes = static_cast<size_t>(tile_w) * tile_h * sizeof(float);
            k_bilateral_adaptive_gray<<<grid, block, smem_bytes, stream>>>(
                d_in, d_out, d_rmap, width, height, r_max);
        } else if (mode == FilterMode::SEPARABLE_FP16) {
            // Opt N: FP16 intermediate buffer — FP32 compute, __half H↔V transfer.
            // Opt N2 (full FP16 compute) was tested but ~8% slower due to scalar __half
            // having no throughput advantage over FP32 + float↔half conversion overhead.
            if (radius == 5) {
                launch_u8_sep_gray_fp16<5>(d_in, d_out, d_temp_h16, width, height, stream);
            } else {
                size_t smem = (BLOCK_X + 2 * radius) * (BLOCK_Y + 2 * radius) * sizeof(float);
                k_bilateral_filter_shared<<<grid, block, smem, stream>>>(
                    d_in, d_out, width, height, radius);
            }
        } else if (mode == FilterMode::SEPARABLE) {
            // Only radius=5 specialization; others fallback to runtime shared-memory
            if (radius == 5) {
                launch_u8_sep_gray<5>(d_in, d_out, d_temp, width, height, stream);
            } else {
                size_t smem = (BLOCK_X + 2 * radius) * (BLOCK_Y + 2 * radius) * sizeof(float);
                k_bilateral_filter_shared<<<grid, block, smem, stream>>>(
                    d_in, d_out, width, height, radius);
            }
        } else if (mode == FilterMode::TEMPLATE) {
            if (radius == 5) {
                launch_u8_gray<5>(d_in, d_out, width, height, stream);
            } else {
                size_t smem = (BLOCK_X + 2 * radius) * (BLOCK_Y + 2 * radius) * sizeof(float);
                k_bilateral_filter_shared<<<grid, block, smem, stream>>>(
                    d_in, d_out, width, height, radius);
            }
        } else {
            // STANDARD
            size_t smem = (BLOCK_X + 2 * radius) * (BLOCK_Y + 2 * radius) * sizeof(float);
            k_bilateral_filter_shared<<<grid, block, smem, stream>>>(
                d_in, d_out, width, height, radius);
        }
    } else {
        // RGB — same mode ladder; shared-memory sizes carry the 3x plane factor.
        if (mode == FilterMode::FUSED) {
            // Opt M: Fused H+V in single kernel — no intermediate buffer
            if (radius == 5) {
                launch_u8_fused_rgb<5>(d_in, d_out, width, height, stream);
            } else {
                size_t smem = 3 * (BLOCK_X + 2 * radius) * (BLOCK_Y + 2 * radius) * sizeof(float);
                k_bilateral_filter_rgb_shared<<<grid, block, smem, stream>>>(
                    d_in, d_out, width, height, radius);
            }
        } else if (mode == FilterMode::ADAPTIVE) {
            int r_max = radius;
            int r_min = max(2, radius - 1);
            float grad_threshold = fmaxf(sigma_color * 1.5f, 20.0f);
            float inv_grad_threshold = 1.0f / grad_threshold;
            k_compute_radius_map_rgb<<<grid, block, 0, stream>>>(
                d_in, d_rmap, width, height, r_min, r_max, inv_grad_threshold);
            ensure_luts(r_max, sigma_spatial, sigma_color);
            int tile_w = BLOCK_X + 2 * r_max;
            int tile_h = BLOCK_Y + 2 * r_max;
            size_t smem_bytes = 3 * static_cast<size_t>(tile_w) * tile_h * sizeof(float);
            k_bilateral_adaptive_rgb<<<grid, block, smem_bytes, stream>>>(
                d_in, d_out, d_rmap, width, height, r_max);
        } else if (mode == FilterMode::SEPARABLE_FP16) {
            // Opt N: FP16 intermediate buffer — FP32 compute, __half H↔V transfer.
            if (radius == 5) {
                launch_u8_sep_rgb_fp16<5>(d_in, d_out, d_temp_h16, width, height, stream);
            } else {
                size_t smem = 3 * (BLOCK_X + 2 * radius) * (BLOCK_Y + 2 * radius) * sizeof(float);
                k_bilateral_filter_rgb_shared<<<grid, block, smem, stream>>>(
                    d_in, d_out, width, height, radius);
            }
        } else if (mode == FilterMode::SEPARABLE) {
            if (radius == 5) {
                launch_u8_sep_rgb<5>(d_in, d_out, d_temp, width, height, stream);
            } else {
                size_t smem = 3 * (BLOCK_X + 2 * radius) * (BLOCK_Y + 2 * radius) * sizeof(float);
                k_bilateral_filter_rgb_shared<<<grid, block, smem, stream>>>(
                    d_in, d_out, width, height, radius);
            }
        } else if (mode == FilterMode::TEMPLATE) {
            if (radius == 5) {
                launch_u8_rgb<5>(d_in, d_out, width, height, stream);
            } else {
                size_t smem = 3 * (BLOCK_X + 2 * radius) * (BLOCK_Y + 2 * radius) * sizeof(float);
                k_bilateral_filter_rgb_shared<<<grid, block, smem, stream>>>(
                    d_in, d_out, width, height, radius);
            }
        } else {
            // STANDARD
            size_t smem = 3 * (BLOCK_X + 2 * radius) * (BLOCK_Y + 2 * radius) * sizeof(float);
            k_bilateral_filter_rgb_shared<<<grid, block, smem, stream>>>(
                d_in, d_out, width, height, radius);
        }
    }
}

// ============================================================================
// Opt E: Strip pipeline state - per-strip GPU buffers + CUDA streams.
// Enables H2D / kernel / D2H overlap across strips, hiding PCIe latency.
+// ============================================================================ + +static constexpr int MAX_STRIPS = 8; + +static struct { + int count = 0; + cudaStream_t streams[MAX_STRIPS]; + uint8_t* d_in [MAX_STRIPS]; + uint8_t* d_out [MAX_STRIPS]; + float* d_temp[MAX_STRIPS]; // separable float intermediate + __half* d_temph[MAX_STRIPS]; // separable FP16 intermediate + uint8_t* d_rmap[MAX_STRIPS]; // adaptive radius map + size_t cap_in [MAX_STRIPS]; // current allocated bytes + size_t cap_temp[MAX_STRIPS]; + size_t cap_rmap[MAX_STRIPS]; +} g_strip; + +// Lazily allocate / resize per-strip buffers. +static void ensure_strip_buffers(int s, size_t in_bytes, size_t temp_bytes, + size_t temph_bytes, size_t rmap_bytes) { + if (in_bytes > g_strip.cap_in[s]) { + cudaFree(g_strip.d_in[s]); + cudaFree(g_strip.d_out[s]); + CUDA_CHECK(cudaMalloc(&g_strip.d_in[s], in_bytes)); + CUDA_CHECK(cudaMalloc(&g_strip.d_out[s], in_bytes)); + g_strip.cap_in[s] = in_bytes; + } + if (temp_bytes > g_strip.cap_temp[s]) { + cudaFree(g_strip.d_temp[s]); + cudaFree(g_strip.d_temph[s]); + CUDA_CHECK(cudaMalloc(&g_strip.d_temp[s], temp_bytes)); + CUDA_CHECK(cudaMalloc(&g_strip.d_temph[s], temph_bytes)); + g_strip.cap_temp[s] = temp_bytes; + } + if (rmap_bytes > g_strip.cap_rmap[s]) { + cudaFree(g_strip.d_rmap[s]); + CUDA_CHECK(cudaMalloc(&g_strip.d_rmap[s], rmap_bytes)); + g_strip.cap_rmap[s] = rmap_bytes; + } +} + +// Create streams on first use. +static void ensure_strip_streams(int num_strips) { + if (g_strip.count >= num_strips) return; + for (int s = g_strip.count; s < num_strips; ++s) { + CUDA_CHECK(cudaStreamCreateWithFlags(&g_strip.streams[s], cudaStreamNonBlocking)); + g_strip.d_in[s] = g_strip.d_out[s] = nullptr; + g_strip.d_temp[s] = nullptr; + g_strip.d_temph[s] = nullptr; + g_strip.d_rmap[s] = nullptr; + g_strip.cap_in[s] = g_strip.cap_temp[s] = g_strip.cap_rmap[s] = 0; + } + g_strip.count = num_strips; +} + +// Strip pipeline entry point. 
// Splits the image into num_strips horizontal strips and pipelines:
//   H2D(i) → kernel(i) → D2H(i) on stream[i]
// Streams run concurrently; GPU copy engine overlaps with compute engine.
//
// Each strip input includes halo rows (±radius) so border pixels are correct.
// Only the non-halo rows are copied back to the host output.
static void apply_strip_pipeline(
    const uint8_t* h_input, uint8_t* h_output,
    int width, int height, int channels,
    int radius, float sigma_spatial, float sigma_color,
    FilterMode mode, int num_strips) {

    ensure_strip_streams(num_strips);

    const int rows_per_strip = (height + num_strips - 1) / num_strips;
    const size_t row_bytes = static_cast<size_t>(width) * channels;

    for (int s = 0; s < num_strips; ++s) {
        const int row_start = s * rows_per_strip;
        const int row_end = std::min(row_start + rows_per_strip, height);
        if (row_start >= height) break;

        // Halo rows are clamped at the image borders.
        const int halo_top = std::min(radius, row_start);
        const int halo_bot = std::min(radius, height - row_end);
        const int strip_h = halo_top + (row_end - row_start) + halo_bot;
        const size_t in_bytes = static_cast<size_t>(strip_h) * row_bytes;

        // Temp buffer for separable float intermediate
        const size_t temp_bytes = static_cast<size_t>(strip_h) * width * channels * sizeof(float);
        const size_t temph_bytes = static_cast<size_t>(strip_h) * width * channels * sizeof(__half);
        const size_t rmap_bytes = static_cast<size_t>(strip_h) * width;  // 1 byte/pixel

        ensure_strip_buffers(s, in_bytes, temp_bytes, temph_bytes, rmap_bytes);

        // Async H2D: copy strip input with halo
        const uint8_t* src = h_input + (row_start - halo_top) * row_bytes;
        CUDA_CHECK(cudaMemcpyAsync(g_strip.d_in[s], src, in_bytes,
                                   cudaMemcpyHostToDevice, g_strip.streams[s]));

        // Kernel on this strip (local dimensions: width × strip_h)
        dispatch_u8_kernel(
            g_strip.d_in[s], g_strip.d_out[s],
            g_strip.d_temp[s], g_strip.d_temph[s], g_strip.d_rmap[s],
            width, strip_h, channels,
            radius, sigma_spatial, sigma_color, mode, g_strip.streams[s]);

        // Async D2H: copy back only the non-halo rows
        const size_t out_offset = static_cast<size_t>(halo_top) * row_bytes;
        const size_t out_bytes = static_cast<size_t>(row_end - row_start) * row_bytes;
        uint8_t* dst = h_output + static_cast<size_t>(row_start) * row_bytes;
        CUDA_CHECK(cudaMemcpyAsync(dst, g_strip.d_out[s] + out_offset, out_bytes,
                                   cudaMemcpyDeviceToHost, g_strip.streams[s]));
    }

    // Wait for all strips to complete
    CUDA_CHECK(cudaDeviceSynchronize());
}

// Public uint8 entry point: host-buffer in/out, mode selected via env + get_filter_mode().
// Either routes through the strip pipeline (BILATERAL_STRIP > 1) or does one
// synchronous H2D → kernel → D2H round trip.
void apply_bilateral_filter_cuda(const uint8_t* h_input, uint8_t* h_output, int width, int height,
                                 int channels, int radius, float sigma_spatial, float sigma_color) {
    // Opt B: ensure_l1_cache_prefer() was disabled - triggers JIT crash on WSL2
    // (cudaFuncSetCacheConfig on all template instantiations forces simultaneous JIT,
    // which crashes libnvidia-ptxjitcompiler.so in WSL2 environment.
    // The optimization itself also showed no benefit on sm_89 Ada architecture.)

    const size_t n_u8 = static_cast<size_t>(width) * height * channels;

    // (Re)allocate only when image size changes
    ensure_io_buffers(n_u8);

    // Opt3: page-lock caller's buffers for faster H2D/D2H transfers
    ensure_registered(h_input, h_output, n_u8);

    radius = min(radius, MAX_RADIUS);
    ensure_luts(radius, sigma_spatial, sigma_color);

    // Opt E: check BILATERAL_STRIP env var for strip pipeline (read once, cached)
    static int num_strips = -1;
    if (num_strips < 0) {
        const char* env = getenv("BILATERAL_STRIP");
        num_strips = (env && atoi(env) > 1) ? std::min(atoi(env), MAX_STRIPS) : 1;
    }

    // Opt2: dispatch uint8 I/O kernels directly - no float conversion pipeline
    const FilterMode mode = get_filter_mode();

    // Opt E: strip pipeline - H2D/kernel/D2H overlap across num_strips streams
    if (num_strips > 1) {
        apply_strip_pipeline(h_input, h_output, width, height, channels,
                             radius, sigma_spatial, sigma_color, mode, num_strips);
        return;
    }

    // Single-shot path: one synchronous H2D → kernel → D2H
    CUDA_CHECK(cudaMemcpy(g_bufs.d_in_u8, h_input, n_u8, cudaMemcpyHostToDevice));

    // Pre-allocate intermediate buffers required by the selected mode
    if (mode == FilterMode::SEPARABLE_FP16) {
        // Opt N: FP16 intermediate buffer — half the bandwidth between H and V passes
        ensure_temp_h16_buffer(static_cast<size_t>(width) * height * channels * sizeof(__half));
    } else if (mode == FilterMode::SEPARABLE) {
        ensure_temp_buffer(static_cast<size_t>(width) * height * channels * sizeof(float));
    } else if (mode == FilterMode::ADAPTIVE) {
        ensure_radius_map_buffer(static_cast<size_t>(width) * height);
    }

    dispatch_u8_kernel(g_bufs.d_in_u8, g_bufs.d_out_u8,
                       g_bufs.d_temp, g_bufs.d_temp_h16, g_bufs.d_radius_map,
                       width, height, channels,
                       radius, sigma_spatial, sigma_color, mode, 0);

    // Check kernel launch errors, then sync
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    CUDA_CHECK(cudaMemcpy(h_output, g_bufs.d_out_u8, n_u8, cudaMemcpyDeviceToHost));
}
diff --git a/08_bilateral_filter/Andy1314Chen/src/bilateral_filter_opencv.cpp b/08_bilateral_filter/Andy1314Chen/src/bilateral_filter_opencv.cpp
new file mode 100644
index 0000000..6ecd942
--- /dev/null
+++ b/08_bilateral_filter/Andy1314Chen/src/bilateral_filter_opencv.cpp
@@ -0,0 +1,93 @@
#include "bilateral_filter_opencv.h"

// NOTE(review): the two system includes here were stripped in the mangled paste;
// reconstructed from usage (cv::bilateralFilter → imgproc, std::memcpy → cstring) —
// confirm against the original file.
#include <opencv2/imgproc.hpp>

#include <cstring>

#ifdef HAVE_OPENCV_CUDA
#include <opencv2/cudaimgproc.hpp>
#endif

// OpenCV CPU baseline: wraps cv::bilateralFilter over the raw ImageData buffers.
void apply_bilateral_filter_opencv(const ImageData& input, ImageData& output,
                                   const FilterParams& params) {
    int cv_type = input.channels == 1 ?
CV_8UC1 : CV_8UC3; + cv::Mat src(input.height, input.width, cv_type, const_cast(input.data.data())); + cv::Mat dst; + + int d = 2 * params.radius + 1; + cv::bilateralFilter(src, dst, d, params.sigma_color, params.sigma_spatial); + + output.width = input.width; + output.height = input.height; + output.channels = input.channels; + output.data.resize(output.size()); + std::memcpy(output.data.data(), dst.data, output.size()); +} + +#ifdef HAVE_OPENCV_CUDA +bool apply_bilateral_filter_opencv_cuda(const ImageData& input, ImageData& output, + const FilterParams& params) { + // Check CUDA device availability at runtime + if (cv::cuda::getCudaEnabledDeviceCount() <= 0) { + return false; + } + + int cv_type = input.channels == 1 ? CV_8UC1 : CV_8UC3; + cv::Mat src(input.height, input.width, cv_type, const_cast(input.data.data())); + + // Upload to GPU + cv::cuda::GpuMat d_src, d_dst; + d_src.upload(src); + + int kernel_size = 2 * params.radius + 1; + cv::cuda::bilateralFilter(d_src, d_dst, kernel_size, params.sigma_color, params.sigma_spatial); + + // Download result + cv::Mat dst; + d_dst.download(dst); + + output.width = input.width; + output.height = input.height; + output.channels = input.channels; + output.data.resize(output.size()); + std::memcpy(output.data.data(), dst.data, output.size()); + return true; +} +#endif + +double compute_mae(const ImageData& img1, const ImageData& img2) { + if (img1.width != img2.width || img1.height != img2.height || img1.channels != img2.channels) { + return -1.0; + } + + double sum = 0.0; + size_t count = img1.size(); + + for (size_t i = 0; i < count; ++i) { + double diff = + std::abs(static_cast(img1.data[i]) - static_cast(img2.data[i])); + sum += diff; + } + + return sum / count; +} + +double compute_psnr(const ImageData& img1, const ImageData& img2) { + if (img1.width != img2.width || img1.height != img2.height || img1.channels != img2.channels) { + return -1.0; + } + + double sum_sq = 0.0; + size_t count = img1.size(); + + for 
(size_t i = 0; i < count; ++i) { + double diff = static_cast(img1.data[i]) - static_cast(img2.data[i]); + sum_sq += diff * diff; + } + + double mse = sum_sq / count; + if (mse < 1e-10) { + return 999.99; // practically identical + } + // MAX = 255 for uint8 + return 10.0 * std::log10(255.0 * 255.0 / mse); +} diff --git a/08_bilateral_filter/Andy1314Chen/src/image_io.cpp b/08_bilateral_filter/Andy1314Chen/src/image_io.cpp new file mode 100644 index 0000000..9df03d5 --- /dev/null +++ b/08_bilateral_filter/Andy1314Chen/src/image_io.cpp @@ -0,0 +1,102 @@ +#include "image_io.h" +#include +#include +#include +#include +#include + +bool load_raw_image(const char* path, ImageData* img) { + FILE* fp = fopen(path, "rb"); + if (!fp) { + fprintf(stderr, "Failed to open image file: %s\n", path); + return false; + } + + if (fread(&img->width, sizeof(int), 1, fp) != 1 || + fread(&img->height, sizeof(int), 1, fp) != 1 || + fread(&img->channels, sizeof(int), 1, fp) != 1) { + fprintf(stderr, "Failed to read image header\n"); + fclose(fp); + return false; + } + + if (img->width <= 0 || img->height <= 0 || img->channels <= 0 || img->channels > 4) { + fprintf(stderr, "Invalid image dimensions: %dx%d, channels=%d\n", + img->width, img->height, img->channels); + fclose(fp); + return false; + } + + size_t data_size = img->size(); + img->data.resize(data_size); + + if (fread(img->data.data(), sizeof(uint8_t), data_size, fp) != data_size) { + fprintf(stderr, "Failed to read image data\n"); + fclose(fp); + return false; + } + + fclose(fp); + return true; +} + +bool save_raw_image(const char* path, const ImageData& img) { + FILE* fp = fopen(path, "wb"); + if (!fp) { + fprintf(stderr, "Failed to create output file: %s\n", path); + return false; + } + + fwrite(&img.width, sizeof(int), 1, fp); + fwrite(&img.height, sizeof(int), 1, fp); + fwrite(&img.channels, sizeof(int), 1, fp); + fwrite(img.data.data(), sizeof(uint8_t), img.size(), fp); + + fclose(fp); + return true; +} + +bool 
load_params(const char* path, FilterParams* params) { + std::ifstream file(path); + if (!file.is_open()) { + fprintf(stderr, "Failed to open params file: %s\n", path); + return false; + } + + std::string line; + while (std::getline(file, line)) { + if (line.empty() || line[0] == '#') continue; + + size_t eq_pos = line.find('='); + if (eq_pos == std::string::npos) continue; + + std::string key = line.substr(0, eq_pos); + std::string value = line.substr(eq_pos + 1); + + auto trim = [](std::string& s) { + size_t start = s.find_first_not_of(" \t\r\n"); + size_t end = s.find_last_not_of(" \t\r\n#"); + if (start != std::string::npos && end != std::string::npos) { + s = s.substr(start, end - start + 1); + } + }; + trim(key); + trim(value); + + if (key == "radius") { + params->radius = std::stoi(value); + } else if (key == "sigma_spatial") { + params->sigma_spatial = std::stof(value); + } else if (key == "sigma_color") { + params->sigma_color = std::stof(value); + } + } + + if (params->radius <= 0 || params->sigma_spatial <= 0 || params->sigma_color <= 0) { + fprintf(stderr, "Invalid parameters: radius=%d, sigma_spatial=%.2f, sigma_color=%.2f\n", + params->radius, params->sigma_spatial, params->sigma_color); + return false; + } + + return true; +} diff --git a/08_bilateral_filter/Andy1314Chen/src/main.cpp b/08_bilateral_filter/Andy1314Chen/src/main.cpp new file mode 100644 index 0000000..dba30a5 --- /dev/null +++ b/08_bilateral_filter/Andy1314Chen/src/main.cpp @@ -0,0 +1,472 @@ +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "bilateral_filter.h" +#include "bilateral_filter_cuda.cuh" +#include "bilateral_filter_opencv.h" +#include "image_io.h" + +constexpr int WARMUP_RUNS = 5; +constexpr int BENCHMARK_RUNS = 50; + +void print_usage(const char* prog) { + fprintf(stderr, "Usage:\n"); + fprintf(stderr, " %s input.raw params.txt output.raw # CPU only\n", prog); + fprintf(stderr, " %s --cuda input.raw params.txt output.raw # CUDA 
only\n", prog); + fprintf(stderr, " %s --opencv input.raw params.txt output.raw # OpenCV only\n", prog); + fprintf(stderr, + " %s --bench input.raw params.txt # Benchmark CUDA vs OpenCV\n", prog); + fprintf(stderr, " %s --compare input.raw params.txt # Compare CPU vs OpenCV\n", + prog); + fprintf(stderr, + " %s --compare-all input.raw params.txt # Compare CPU vs CUDA vs OpenCV\n", + prog); +} + +struct BenchmarkResult { + double mean_ms; + double min_ms; + double max_ms; + double stddev_ms; +}; + +BenchmarkResult compute_stats(const std::vector& times) { + BenchmarkResult result = {0.0, 0.0, 0.0, 0.0}; + if (times.empty()) { + return result; + } + + double sum = std::accumulate(times.begin(), times.end(), 0.0); + result.mean_ms = sum / times.size(); + + result.min_ms = *std::min_element(times.begin(), times.end()); + result.max_ms = *std::max_element(times.begin(), times.end()); + + double sq_sum = 0.0; + for (double t : times) { + sq_sum += (t - result.mean_ms) * (t - result.mean_ms); + } + result.stddev_ms = sqrt(sq_sum / times.size()); + + return result; +} + +// Helper: benchmark OpenCV CUDA if available. Returns true if benchmarked. 
+#ifdef HAVE_OPENCV_CUDA +static bool bench_opencv_cuda(const ImageData& input, const FilterParams& params, + ImageData& output_cv_cuda, BenchmarkResult& cv_cuda_stats, + double /*megapixels*/) { + // Probe: check if OpenCV CUDA bilateral filter works + if (!apply_bilateral_filter_opencv_cuda(input, output_cv_cuda, params)) { + fprintf(stderr, "OpenCV CUDA: no CUDA device available, skipping\n"); + return false; + } + + // Warmup + for (int i = 0; i < WARMUP_RUNS; ++i) { + apply_bilateral_filter_opencv_cuda(input, output_cv_cuda, params); + } + + // Benchmark + std::vector times; + for (int i = 0; i < BENCHMARK_RUNS; ++i) { + auto start = std::chrono::high_resolution_clock::now(); + apply_bilateral_filter_opencv_cuda(input, output_cv_cuda, params); + auto end = std::chrono::high_resolution_clock::now(); + times.push_back(std::chrono::duration(end - start).count()); + } + + cv_cuda_stats = compute_stats(times); + return true; +} +#endif + +int main(int argc, char* argv[]) { + if (argc < 4) { + print_usage(argv[0]); + return 1; + } + + bool compare_mode = (strcmp(argv[1], "--compare") == 0); + bool compare_all_mode = (strcmp(argv[1], "--compare-all") == 0); + bool bench_mode = (strcmp(argv[1], "--bench") == 0); + bool cuda_only = (strcmp(argv[1], "--cuda") == 0); + bool opencv_only = (strcmp(argv[1], "--opencv") == 0); + + const char* input_path; + const char* params_path; + const char* output_path = nullptr; + + if (compare_mode || compare_all_mode || bench_mode) { + if (argc != 4) { + print_usage(argv[0]); + return 1; + } + input_path = argv[2]; + params_path = argv[3]; + } else if (cuda_only || opencv_only) { + if (argc != 5) { + print_usage(argv[0]); + return 1; + } + input_path = argv[2]; + params_path = argv[3]; + output_path = argv[4]; + } else { + if (argc != 4) { + print_usage(argv[0]); + return 1; + } + input_path = argv[1]; + params_path = argv[2]; + output_path = argv[3]; + } + + ImageData input; + if (!load_raw_image(input_path, &input)) { + return 1; + 
} + printf("Loaded image: %dx%d, channels=%d\n", input.width, input.height, input.channels); + + FilterParams params; + if (!load_params(params_path, ¶ms)) { + return 1; + } + printf("Parameters: radius=%d, sigma_spatial=%.2f, sigma_color=%.2f\n", params.radius, + params.sigma_spatial, params.sigma_color); + + double megapixels = static_cast(input.width) * input.height / 1e6; + + if (compare_all_mode) { + ImageData output_cpu, output_cuda, output_opencv; + + output_cuda.width = input.width; + output_cuda.height = input.height; + output_cuda.channels = input.channels; + output_cuda.data.resize(input.size()); + + printf("\nBenchmarking (warmup=%d, runs=%d)...\n", WARMUP_RUNS, BENCHMARK_RUNS); + + // Warmup runs + for (int i = 0; i < WARMUP_RUNS; ++i) { + apply_bilateral_filter_cpu(input, output_cpu, params); + apply_bilateral_filter_cuda(input.data.data(), output_cuda.data.data(), input.width, + input.height, input.channels, params.radius, + params.sigma_spatial, params.sigma_color); + apply_bilateral_filter_opencv(input, output_opencv, params); + } + + // Benchmark CPU + std::vector cpu_times; + for (int i = 0; i < BENCHMARK_RUNS; ++i) { + auto start = std::chrono::high_resolution_clock::now(); + apply_bilateral_filter_cpu(input, output_cpu, params); + auto end = std::chrono::high_resolution_clock::now(); + cpu_times.push_back(std::chrono::duration(end - start).count()); + } + + // Benchmark CUDA + std::vector cuda_times; + for (int i = 0; i < BENCHMARK_RUNS; ++i) { + auto start = std::chrono::high_resolution_clock::now(); + apply_bilateral_filter_cuda(input.data.data(), output_cuda.data.data(), input.width, + input.height, input.channels, params.radius, + params.sigma_spatial, params.sigma_color); + auto end = std::chrono::high_resolution_clock::now(); + cuda_times.push_back(std::chrono::duration(end - start).count()); + } + + // Benchmark OpenCV CPU + std::vector cv_times; + for (int i = 0; i < BENCHMARK_RUNS; ++i) { + auto start = 
std::chrono::high_resolution_clock::now(); + apply_bilateral_filter_opencv(input, output_opencv, params); + auto end = std::chrono::high_resolution_clock::now(); + cv_times.push_back(std::chrono::duration(end - start).count()); + } + + auto cpu_stats = compute_stats(cpu_times); + auto cuda_stats = compute_stats(cuda_times); + auto cv_stats = compute_stats(cv_times); + + double mae_cpu_cv = compute_mae(output_cpu, output_opencv); + double mae_cuda_cv = compute_mae(output_cuda, output_opencv); + double psnr_cpu_cv = compute_psnr(output_cpu, output_opencv); + double psnr_cuda_cv = compute_psnr(output_cuda, output_opencv); + + // OpenCV CUDA baseline +#ifdef HAVE_OPENCV_CUDA + ImageData output_cv_cuda; + BenchmarkResult cv_cuda_stats; + bool have_cv_cuda = + bench_opencv_cuda(input, params, output_cv_cuda, cv_cuda_stats, megapixels); + double mae_cv_cuda = -1.0, psnr_cv_cuda = -1.0; + if (have_cv_cuda) { + mae_cv_cuda = compute_mae(output_cv_cuda, output_opencv); + psnr_cv_cuda = compute_psnr(output_cv_cuda, output_opencv); + } +#endif + + printf("\n=== Benchmark Results (mean ± stddev) ===\n"); + printf("CPU : %.2f ± %.2f ms [min=%.2f, max=%.2f] (%.2f MP/s)\n", cpu_stats.mean_ms, + cpu_stats.stddev_ms, cpu_stats.min_ms, cpu_stats.max_ms, + megapixels / (cpu_stats.mean_ms / 1000.0)); + printf("CUDA : %.2f ± %.2f ms [min=%.2f, max=%.2f] (%.2f MP/s)\n", cuda_stats.mean_ms, + cuda_stats.stddev_ms, cuda_stats.min_ms, cuda_stats.max_ms, + megapixels / (cuda_stats.mean_ms / 1000.0)); + printf("OpenCV : %.2f ± %.2f ms [min=%.2f, max=%.2f] (%.2f MP/s)\n", cv_stats.mean_ms, + cv_stats.stddev_ms, cv_stats.min_ms, cv_stats.max_ms, + megapixels / (cv_stats.mean_ms / 1000.0)); +#ifdef HAVE_OPENCV_CUDA + if (have_cv_cuda) { + printf("OpenCVCUDA: %.2f ± %.2f ms [min=%.2f, max=%.2f] (%.2f MP/s)\n", + cv_cuda_stats.mean_ms, cv_cuda_stats.stddev_ms, cv_cuda_stats.min_ms, + cv_cuda_stats.max_ms, megapixels / (cv_cuda_stats.mean_ms / 1000.0)); + } +#endif + + printf("\nSpeedup (based on 
mean):\n"); + printf(" CUDA vs CPU: %.2fx\n", cpu_stats.mean_ms / cuda_stats.mean_ms); + printf(" CUDA vs OpenCV: %.2fx\n", cv_stats.mean_ms / cuda_stats.mean_ms); + printf(" OpenCV vs CPU: %.2fx\n", cpu_stats.mean_ms / cv_stats.mean_ms); +#ifdef HAVE_OPENCV_CUDA + if (have_cv_cuda) { + printf(" CUDA vs OpenCVCUDA: %.2fx\n", cv_cuda_stats.mean_ms / cuda_stats.mean_ms); + } +#endif + + printf("\nQuality (vs OpenCV CPU):\n"); + printf(" CPU: MAE=%.4f PSNR=%.2f dB %s\n", mae_cpu_cv, psnr_cpu_cv, + mae_cpu_cv < 1.0 ? "✓" : "✗"); + printf(" CUDA: MAE=%.4f PSNR=%.2f dB %s\n", mae_cuda_cv, psnr_cuda_cv, + mae_cuda_cv < 1.0 ? "✓" : "✗"); +#ifdef HAVE_OPENCV_CUDA + if (have_cv_cuda) { + printf(" OpenCVCUDA: MAE=%.4f PSNR=%.2f dB %s\n", mae_cv_cuda, psnr_cv_cuda, + mae_cv_cuda < 1.0 ? "✓" : "✗"); + } +#endif + + } else if (bench_mode) { + // Benchmark CUDA vs OpenCV (skip slow CPU) + ImageData output_cuda, output_opencv; + + output_cuda.width = input.width; + output_cuda.height = input.height; + output_cuda.channels = input.channels; + output_cuda.data.resize(input.size()); + + printf("\nBenchmarking CUDA vs OpenCV (warmup=%d, runs=%d)...\n", WARMUP_RUNS, + BENCHMARK_RUNS); + + // Warmup + for (int i = 0; i < WARMUP_RUNS; ++i) { + apply_bilateral_filter_cuda(input.data.data(), output_cuda.data.data(), input.width, + input.height, input.channels, params.radius, + params.sigma_spatial, params.sigma_color); + } + + // Benchmark CUDA + std::vector cuda_times; + for (int i = 0; i < BENCHMARK_RUNS; ++i) { + auto start = std::chrono::high_resolution_clock::now(); + apply_bilateral_filter_cuda(input.data.data(), output_cuda.data.data(), input.width, + input.height, input.channels, params.radius, + params.sigma_spatial, params.sigma_color); + auto end = std::chrono::high_resolution_clock::now(); + cuda_times.push_back(std::chrono::duration(end - start).count()); + } + + // Warmup OpenCV CPU + for (int i = 0; i < WARMUP_RUNS; ++i) { + apply_bilateral_filter_opencv(input, 
output_opencv, params); + } + + // Benchmark OpenCV CPU + std::vector cv_times; + for (int i = 0; i < BENCHMARK_RUNS; ++i) { + auto start = std::chrono::high_resolution_clock::now(); + apply_bilateral_filter_opencv(input, output_opencv, params); + auto end = std::chrono::high_resolution_clock::now(); + cv_times.push_back(std::chrono::duration(end - start).count()); + } + + auto cuda_stats = compute_stats(cuda_times); + auto cv_stats = compute_stats(cv_times); + double mae = compute_mae(output_cuda, output_opencv); + double psnr = compute_psnr(output_cuda, output_opencv); + + // OpenCV CUDA baseline +#ifdef HAVE_OPENCV_CUDA + ImageData output_cv_cuda; + BenchmarkResult cv_cuda_stats; + bool have_cv_cuda = + bench_opencv_cuda(input, params, output_cv_cuda, cv_cuda_stats, megapixels); + double mae_cv_cuda = -1.0, psnr_cv_cuda = -1.0; + if (have_cv_cuda) { + mae_cv_cuda = compute_mae(output_cv_cuda, output_opencv); + psnr_cv_cuda = compute_psnr(output_cv_cuda, output_opencv); + } +#endif + + printf("\n=== Benchmark Results ===\n"); + printf("CUDA : %.2f ± %.2f ms [min=%.2f, max=%.2f] (%.2f MP/s)\n", cuda_stats.mean_ms, + cuda_stats.stddev_ms, cuda_stats.min_ms, cuda_stats.max_ms, + megapixels / (cuda_stats.mean_ms / 1000.0)); + printf("OpenCV : %.2f ± %.2f ms [min=%.2f, max=%.2f] (%.2f MP/s)\n", cv_stats.mean_ms, + cv_stats.stddev_ms, cv_stats.min_ms, cv_stats.max_ms, + megapixels / (cv_stats.mean_ms / 1000.0)); +#ifdef HAVE_OPENCV_CUDA + if (have_cv_cuda) { + printf("OpenCVCUDA: %.2f ± %.2f ms [min=%.2f, max=%.2f] (%.2f MP/s)\n", + cv_cuda_stats.mean_ms, cv_cuda_stats.stddev_ms, cv_cuda_stats.min_ms, + cv_cuda_stats.max_ms, megapixels / (cv_cuda_stats.mean_ms / 1000.0)); + } +#endif + + printf("\nSpeedup: CUDA is %.2fx %s than OpenCV\n", + cv_stats.mean_ms > cuda_stats.mean_ms ? cv_stats.mean_ms / cuda_stats.mean_ms + : cuda_stats.mean_ms / cv_stats.mean_ms, + cv_stats.mean_ms > cuda_stats.mean_ms ? 
"faster" : "slower"); +#ifdef HAVE_OPENCV_CUDA + if (have_cv_cuda) { + printf("Speedup: CUDA is %.2fx %s than OpenCVCUDA\n", + cv_cuda_stats.mean_ms > cuda_stats.mean_ms + ? cv_cuda_stats.mean_ms / cuda_stats.mean_ms + : cuda_stats.mean_ms / cv_cuda_stats.mean_ms, + cv_cuda_stats.mean_ms > cuda_stats.mean_ms ? "faster" : "slower"); + } +#endif + + printf("MAE: %.4f PSNR: %.2f dB %s\n", mae, psnr, mae < 1.0 ? "✓" : "✗"); +#ifdef HAVE_OPENCV_CUDA + if (have_cv_cuda) { + printf("OpenCVCUDA vs OpenCV: MAE=%.4f PSNR=%.2f dB %s\n", mae_cv_cuda, psnr_cv_cuda, + mae_cv_cuda < 1.0 ? "✓" : "✗"); + } +#endif + + } else if (compare_mode) { + ImageData output_cpu, output_opencv; + + printf("\nBenchmarking (warmup=%d, runs=%d)...\n", WARMUP_RUNS, BENCHMARK_RUNS); + + // Warmup + for (int i = 0; i < WARMUP_RUNS; ++i) { + apply_bilateral_filter_cpu(input, output_cpu, params); + apply_bilateral_filter_opencv(input, output_opencv, params); + } + + // Benchmark + std::vector<double> cpu_times, cv_times; + for (int i = 0; i < BENCHMARK_RUNS; ++i) { + auto start = std::chrono::high_resolution_clock::now(); + apply_bilateral_filter_cpu(input, output_cpu, params); + auto end = std::chrono::high_resolution_clock::now(); + cpu_times.push_back(std::chrono::duration<double, std::milli>(end - start).count()); + } + for (int i = 0; i < BENCHMARK_RUNS; ++i) { + auto start = std::chrono::high_resolution_clock::now(); + apply_bilateral_filter_opencv(input, output_opencv, params); + auto end = std::chrono::high_resolution_clock::now(); + cv_times.push_back(std::chrono::duration<double, std::milli>(end - start).count()); + } + + auto cpu_stats = compute_stats(cpu_times); + auto cv_stats = compute_stats(cv_times); + double mae = compute_mae(output_cpu, output_opencv); + double psnr = compute_psnr(output_cpu, output_opencv); + + printf("\n=== Benchmark Results ===\n"); + printf("CPU : %.2f ± %.2f ms (%.2f MP/s)\n", cpu_stats.mean_ms, cpu_stats.stddev_ms, + megapixels / (cpu_stats.mean_ms / 1000.0)); + printf("OpenCV : %.2f ± %.2f ms (%.2f 
MP/s)\n", cv_stats.mean_ms, cv_stats.stddev_ms, + megapixels / (cv_stats.mean_ms / 1000.0)); + printf("Speedup: %.2fx\n", cpu_stats.mean_ms / cv_stats.mean_ms); + printf("MAE: %.4f PSNR: %.2f dB %s\n", mae, psnr, mae < 1.0 ? "✓" : "✗"); + + } else if (cuda_only) { + ImageData output; + output.width = input.width; + output.height = input.height; + output.channels = input.channels; + output.data.resize(input.size()); + + // Warmup + for (int i = 0; i < WARMUP_RUNS; ++i) { + apply_bilateral_filter_cuda(input.data.data(), output.data.data(), input.width, + input.height, input.channels, params.radius, + params.sigma_spatial, params.sigma_color); + } + + // Benchmark + std::vector<double> times; + for (int i = 0; i < BENCHMARK_RUNS; ++i) { + auto start = std::chrono::high_resolution_clock::now(); + apply_bilateral_filter_cuda(input.data.data(), output.data.data(), input.width, + input.height, input.channels, params.radius, + params.sigma_spatial, params.sigma_color); + auto end = std::chrono::high_resolution_clock::now(); + times.push_back(std::chrono::duration<double, std::milli>(end - start).count()); + } + + auto stats = compute_stats(times); + printf("CUDA: %.2f ± %.2f ms [min=%.2f, max=%.2f] (%.2f MP/s)\n", stats.mean_ms, + stats.stddev_ms, stats.min_ms, stats.max_ms, megapixels / (stats.mean_ms / 1000.0)); + + if (!save_raw_image(output_path, output)) { + return 1; + } + printf("Output saved to: %s\n", output_path); + + } else if (opencv_only) { + ImageData output; + + // Warmup + for (int i = 0; i < WARMUP_RUNS; ++i) { + apply_bilateral_filter_opencv(input, output, params); + } + + // Benchmark + std::vector<double> times; + for (int i = 0; i < BENCHMARK_RUNS; ++i) { + auto start = std::chrono::high_resolution_clock::now(); + apply_bilateral_filter_opencv(input, output, params); + auto end = std::chrono::high_resolution_clock::now(); + times.push_back(std::chrono::duration<double, std::milli>(end - start).count()); + } + + auto stats = compute_stats(times); + printf("OpenCV: %.2f ± %.2f ms (%.2f MP/s)\n", 
stats.mean_ms, stats.stddev_ms, + megapixels / (stats.mean_ms / 1000.0)); + + if (!save_raw_image(output_path, output)) { + return 1; + } + printf("Output saved to: %s\n", output_path); + + } else { + ImageData output; + + auto start = std::chrono::high_resolution_clock::now(); + apply_bilateral_filter_cpu(input, output, params); + auto end = std::chrono::high_resolution_clock::now(); + + double elapsed_ms = std::chrono::duration<double, std::milli>(end - start).count(); + printf("CPU time: %.2f ms (%.2f MP/s)\n", elapsed_ms, megapixels / (elapsed_ms / 1000.0)); + + if (!save_raw_image(output_path, output)) { + return 1; + } + printf("Output saved to: %s\n", output_path); + } + + return 0; +} diff --git a/08_bilateral_filter/Andy1314Chen/tests/test_data/params.txt b/08_bilateral_filter/Andy1314Chen/tests/test_data/params.txt new file mode 100644 index 0000000..6fa2360 --- /dev/null +++ b/08_bilateral_filter/Andy1314Chen/tests/test_data/params.txt @@ -0,0 +1,3 @@ +radius = 5 +sigma_spatial = 3.0 +sigma_color = 30.0