diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b0cd528 --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +build/ +bilateral_filter +lib/ +*.raw +*.o +*.prof +*.nsys-rep +*.ncu-rep +.vscode/ \ No newline at end of file diff --git a/08_bilateral_filter/.clang-format b/08_bilateral_filter/.clang-format new file mode 100644 index 0000000..bd66366 --- /dev/null +++ b/08_bilateral_filter/.clang-format @@ -0,0 +1,87 @@ +BasedOnStyle: LLVM +Language: Cpp + +# Indentation +IndentWidth: 4 +TabWidth: 4 +UseTab: Never +ContinuationIndentWidth: 4 +IndentCaseLabels: false +IndentPPDirectives: None +NamespaceIndentation: None + +# Line length +ColumnLimit: 100 + +# Braces +BreakBeforeBraces: Attach +BraceWrapping: + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterStruct: false + BeforeElse: false + BeforeWhile: false + +# Alignment +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignConsecutiveMacros: true +AlignEscapedNewlines: Left +AlignOperands: true +AlignTrailingComments: true + +# Pointer and reference alignment +DerivePointerAlignment: false +PointerAlignment: Left + +# Includes +IncludeBlocks: Regroup +IncludeCategories: + - Regex: '^' + Priority: 1 + - Regex: '^<.*\.h>' + Priority: 2 + - Regex: '^<.*>' + Priority: 3 + - Regex: '".*"' + Priority: 4 +SortIncludes: true + +# Spaces +SpaceAfterCStyleCast: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeParens: ControlStatements +SpaceInEmptyParentheses: false +SpacesInAngles: false +SpacesInCStyleCastParentheses: false +SpacesInContainerLiterals: true +SpacesInParentheses: false +SpacesInSquareBrackets: false + +# Line breaks +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: Inline +AllowShortIfStatementsOnASingleLine: 
false +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterReturnType: None +AlwaysBreakTemplateDeclarations: Yes +BinPackArguments: true +BinPackParameters: true +BreakBeforeBinaryOperators: None +BreakBeforeTernaryOperators: true +BreakConstructorInitializers: BeforeColon +BreakStringLiterals: true + +# Others +Cpp11BracedListStyle: true +FixNamespaceComments: true +MaxEmptyLinesToKeep: 1 +ReflowComments: true +SortUsingDeclarations: true \ No newline at end of file diff --git a/08_bilateral_filter/Andy1314Chen/Makefile b/08_bilateral_filter/Andy1314Chen/Makefile new file mode 100644 index 0000000..aba81b4 --- /dev/null +++ b/08_bilateral_filter/Andy1314Chen/Makefile @@ -0,0 +1,53 @@ +CXX = g++ +NVCC = nvcc +CUDA_PATH ?= /usr/local/cuda +CXXFLAGS = -std=c++17 -O3 -Wall -Wextra -I./include -I$(CUDA_PATH)/include +NVCCFLAGS = -O3 -gencode arch=compute_110,code=sm_110 -I./include + +OPENCV_CFLAGS = $(shell pkg-config --cflags opencv4) +OPENCV_LIBS = $(shell pkg-config --libs opencv4) +CUDA_LIBS = -L$(CUDA_PATH)/lib64 -lcudart + +# Detect OpenCV CUDA module (cudaimgproc) +HAVE_OPENCV_CUDA := $(shell pkg-config --libs opencv4 2>/dev/null | grep -q cudaimgproc && echo 1) +ifeq ($(HAVE_OPENCV_CUDA),1) + CXXFLAGS += -DHAVE_OPENCV_CUDA + $(info OpenCV CUDA modules detected: enabling cv::cuda::bilateralFilter baseline) +else + $(info OpenCV CUDA modules not found: cv::cuda baseline disabled) +endif + +SRC_DIR = src +BUILD_DIR = build +BIN_DIR = . 
+ +SRCS_CPP = $(SRC_DIR)/main.cpp \ + $(SRC_DIR)/image_io.cpp \ + $(SRC_DIR)/bilateral_filter_cpu.cpp \ + $(SRC_DIR)/bilateral_filter_opencv.cpp + +SRCS_CUDA = $(SRC_DIR)/bilateral_filter_cuda.cu + +OBJS_CPP = $(patsubst $(SRC_DIR)/%.cpp,$(BUILD_DIR)/%.o,$(SRCS_CPP)) +OBJS_CUDA = $(patsubst $(SRC_DIR)/%.cu,$(BUILD_DIR)/%.o,$(SRCS_CUDA)) + +TARGET = $(BIN_DIR)/bilateral_filter + +.PHONY: all clean + +all: $(TARGET) + +$(BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp + @mkdir -p $(BUILD_DIR) + $(CXX) $(CXXFLAGS) $(OPENCV_CFLAGS) -c -o $@ $< + +$(BUILD_DIR)/%.o: $(SRC_DIR)/%.cu + @mkdir -p $(BUILD_DIR) + $(NVCC) $(NVCCFLAGS) -c -o $@ $< + +$(TARGET): $(OBJS_CPP) $(OBJS_CUDA) + @mkdir -p $(BUILD_DIR) + $(NVCC) -o $@ $^ $(OPENCV_LIBS) $(CUDA_LIBS) + +clean: + rm -rf $(BUILD_DIR) $(TARGET) diff --git a/08_bilateral_filter/README.md b/08_bilateral_filter/Andy1314Chen/README.md similarity index 100% rename from 08_bilateral_filter/README.md rename to 08_bilateral_filter/Andy1314Chen/README.md diff --git a/08_bilateral_filter/Andy1314Chen/REPORT_course.md b/08_bilateral_filter/Andy1314Chen/REPORT_course.md new file mode 100644 index 0000000..41ee5f8 --- /dev/null +++ b/08_bilateral_filter/Andy1314Chen/REPORT_course.md @@ -0,0 +1,923 @@ +# CUDA 双边滤波优化实验报告 + +**题目**:CUDA 双边滤波优化实验与性能分析 +**课程**:并行计算 / GPU 编程 +**作者**:Andy1314Chen +**日期**:2026-02-27 +**项目路径**:`08_bilateral_filter/Andy1314Chen` + +--- + +## 目录 + +1. [引言](#1-引言) +2. [算法原理与任务定义](#2-算法原理与任务定义) +3. [CUDA 实现模式设计](#3-cuda-实现模式设计) +4. [实验环境与评测方法](#4-实验环境与评测方法) +5. [基线实现与瓶颈分析](#5-基线实现与瓶颈分析) +6. [优化策略与逐步实验](#6-优化策略与逐步实验) +7. [性能结果与质量验证](#7-性能结果与质量验证) +8. [Profiler 深度分析](#8-profiler-深度分析) +9. [失败实验与优化边界](#9-失败实验与优化边界) +10. [跨平台对比分析](#10-跨平台对比分析) +11. [结论与展望](#11-结论与展望) +12. [附录:完整数据图表](#12-附录完整数据图表) + +--- + +## 1. 
引言 + +双边滤波是一种经典的非线性保边平滑滤波器,在 VR 头显弱光降噪、医学图像处理等领域有广泛应用。然而,其计算复杂度为 $O((2r+1)^2)$,当滤波半径 $r=5$ 时每个像素需进行 121 次邻域访问和权重计算,对 4K(3840×2160)@60fps 实时处理构成极大挑战。 + +本文目标:在 CUDA 平台上实现高性能双边滤波,满足以下指标: +- **吞吐量**:4K@60fps 即 ≥498 MP/s +- **正确性**:MAE < 1.0、PSNR > 40 dB(以 OpenCV CPU `bilateralFilter` 为参考) +- **性能**:相对 OpenCV CPU 显著加速,并超越 OpenCV CUDA + +实验经历了 12 个迭代版本和 15 个优化实验(含 5 个失败实验),最终在两个 GPU 平台上验证了优化效果。 + +--- + +## 2. 算法原理与任务定义 + +### 2.1 双边滤波公式 + +$$ +BF[I]_p = \frac{1}{W_p} \sum_{q \in \mathcal{S}} G_{\sigma_s}(\|p-q\|) \cdot G_{\sigma_r}(|I_p - I_q|) \cdot I_q +$$ + +其中: +- $G_{\sigma_s}(\|p-q\|) = \exp\left(-\frac{\|p-q\|^2}{2\sigma_s^2}\right)$ 为**空间高斯权重** +- $G_{\sigma_r}(|I_p-I_q|) = \exp\left(-\frac{|I_p-I_q|^2}{2\sigma_r^2}\right)$ 为**值域高斯权重** +- $W_p = \sum_{q} G_{\sigma_s} \cdot G_{\sigma_r}$ 为归一化因子 + +在平坦区域,颜色相似使权重近似空间高斯,实现充分平滑;在边缘处,颜色差异大使值域权重趋零,避免跨边缘模糊。 + +### 2.2 计算量分析 + +对于 4K RGB 图像(3840×2160×3),radius=5: +- 每个像素:$(2 \times 5+1)^2 = 121$ 次邻域访问 +- 每次访问:1 次空间权重 + 1 次值域权重(含 `expf`)+ 加权累加 +- 全图:$8.3M \times 121 \approx 10$ 亿次浮点运算 + +### 2.3 代码结构 + +``` +src/ + bilateral_filter_cuda.cu # 全部 CUDA kernel(6 种模式)、LUT、pipeline + bilateral_filter_cpu.cpp # CPU 参考实现 + bilateral_filter_opencv.cpp # OpenCV CPU + CUDA 封装 + main.cpp # CLI 入口(--bench/--cuda/--compare-all) + image_io.cpp # 二进制 raw 图像 I/O +include/ + bilateral_filter_cuda.cuh # CUDA 滤波声明 + bilateral_filter.h # CPU 滤波声明 + image_io.h # ImageData/FilterParams 结构体 +``` + +--- + +## 3. CUDA 实现模式设计 + +本项目实现了 6 种 CUDA 滤波模式,通过环境变量 `BILATERAL_MODE` 在运行时切换。各模式在算法复杂度、编译器优化能力、精度和适用场景上有本质差异。以下从代码层面逐一分析。 + +### 3.1 STANDARD 模式(MODE=0)—— 运行时半径 + +STANDARD 是最通用的 fallback 实现:半径作为**运行时参数**传入 kernel,shared memory 大小通过 `extern __shared__` 动态分配。 + +```cpp +// bilateral_filter_cuda.cu:1031-1034 — 运行时半径,动态 shared memory +__global__ void k_bilateral_filter_shared( + const InT* __restrict__ input, OutT* __restrict__ output, + int width, int height, int radius) { + extern __shared__ float smem[]; // 大小在 launch 时指定 + ... 
+ #pragma unroll 4 // 仅提示展开因子 4(循环界不确定) + for (int dy = -radius; dy <= radius; ++dy) { ... } +} +``` + +**核心特征**: +- 支持**任意半径值**,无需预编译 +- 循环界为运行时变量,编译器**无法完全展开**,仅按 `#pragma unroll 4` 部分展开 +- RGB 版本对三通道使用**独立权重**(每通道单独查 LUT、独立累加),精度最高但 LUT 访问量为 TEMPLATE 的 3 倍 + +```cpp +// bilateral_filter_cuda.cu:1160-1180 — STANDARD RGB: 三通道独立权重 +float cw_r = d_color_lut[diff_r]; // R 通道权重 +float cw_g = d_color_lut[diff_g]; // G 通道权重 +float cw_b = d_color_lut[diff_b]; // B 通道权重 +sum_r += nr * (spatial_weight * cw_r); // 各通道独立加权 +sum_g += ng * (spatial_weight * cw_g); +sum_b += nb * (spatial_weight * cw_b); +``` + +**适用场景**:半径不在预编译列表(3/5/7/9/10)中时的通用回退。 + +### 3.2 TEMPLATE 模式(MODE=1)—— 编译期半径 + +TEMPLATE 是性能最均衡的模式:将半径作为 **C++ 模板参数**,使编译器获得完整循环界信息。 + +```cpp +// bilateral_filter_cuda.cu:74-76 — 编译期半径 +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM) +k_bilateral_filter_gray_template(...) { + constexpr int TILE_W = BLOCK_X + 2 * RADIUS; // 编译期常量 + ... + #pragma unroll // 编译器可完全展开(RADIUS 已知) + for (int dy = -RADIUS; dy <= RADIUS; ++dy) { + #pragma unroll + for (int dx = -RADIUS; dx <= RADIUS; ++dx) { ... 
} + } +} +``` + +**与 STANDARD 的关键差异**: + +| 维度 | STANDARD | TEMPLATE | +|------|----------|----------| +| 半径 | 运行时参数 `int radius` | 编译期常量 `template` | +| Shared memory | `extern __shared__` 动态 | `__shared__ float smem[编译期大小]` | +| 循环展开 | `#pragma unroll 4`(部分) | `#pragma unroll`(**完全展开**) | +| 编译器优化 | 有限(循环界未知) | **DCE + 寄存器优化 + 指令调度** | +| RGB 权重 | 三通道独立(3 次 LUT) | **均值近似**(1 次 LUT) | + +TEMPLATE 的 RGB kernel 使用**单一色彩权重近似**——三通道差值的均值作为查表索引,LUT 查找从 3 次降为 1 次: + +```cpp +// bilateral_filter_cuda.cu:218-224 — TEMPLATE RGB: 单色权重近似 +int diff = static_cast( + (fabsf(nr - center_r) + fabsf(ng - center_g) + fabsf(nb - center_b)) + * (1.0f / 3.0f) + 0.5f); +float w = spatial_weight * d_color_lut[diff]; // 1 次查表,3 通道共享 +``` + +**圆形窗口 DCE**:由于 RADIUS 是编译期常量,`#pragma unroll` 完全展开 121 次迭代后,编译器在编译期确定哪 40 个位置的 `d_spatial_lut` 恒为 0,通过 Dead Code Elimination 直接删除这些迭代,等效于仅执行 81 次。 + +运行时通过 `switch(radius)` 分发到预编译的模板实例: + +```cpp +// 支持 radius = 3, 5, 7, 9, 10 五种预编译版本 +switch (radius) { + case 3: launch_u8_gray<3>(d_in, d_out, w, h, s); break; + case 5: launch_u8_gray<5>(d_in, d_out, w, h, s); break; + case 7: launch_u8_gray<7>(d_in, d_out, w, h, s); break; + ... +} +``` + +### 3.3 SEPARABLE 模式(MODE=2)—— 分离近似 + +SEPARABLE 是**算法级优化**:将 2D 双边滤波近似为水平 + 垂直两次 1D 滤波。 + +严格来说,双边滤波因值域权重依赖像素值而**不可分离**。但实践中近似分离的质量损失很小(MAE 0.45 vs STANDARD 0.48),且复杂度从 $O((2r+1)^2)$ 降至 $O(2 \times (2r+1))$——r=5 时从 121 次降到 22 次。 + +**水平 pass**:每行加载到 shared memory,沿行方向滤波: + +```cpp +// bilateral_filter_cuda.cu:246-296 — 水平 kernel +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM_SEP) +k_bilateral_horizontal_gray(...) 
{ + __shared__ float smem[BLOCK_Y][TILE_W_PAD]; // 每行独立 + // 加载行 + halo + for (int i = tx; i < TILE_W; i += BLOCK_X) { + smem[ty][i] = static_cast(input[gy * width + gx]); + } + __syncthreads(); + // 1D 滤波(仅沿 dx 方向) + for (int dx = -RADIUS; dx <= RADIUS; ++dx) { + float neighbor = smem[ty][lx + dx]; + float spatial_weight = d_spatial_lut[RADIUS * LUT_WIDTH + (dx + RADIUS)]; + ... + } +} +``` + +**垂直 pass**:每列加载到 shared memory,沿列方向滤波: + +```cpp +// bilateral_filter_cuda.cu:298-347 — 垂直 kernel +template +__global__ void k_bilateral_vertical_gray(...) { + __shared__ float smem[TILE_H][BLOCK_X]; // 每列独立 + // 1D 滤波(仅沿 dy 方向) + for (int dy = -RADIUS; dy <= RADIUS; ++dy) { + float neighbor = smem[ly + dy][tx]; + float spatial_weight = d_spatial_lut[(dy + RADIUS) * LUT_WIDTH + RADIUS]; + ... + } +} +``` + +**与 TEMPLATE 2D 的关键差异**: + +| 维度 | TEMPLATE (2D) | SEPARABLE (H+V) | +|------|---------------|-----------------| +| 复杂度 | $O((2r+1)^2) = 121$ 次 | $O(2 \times (2r+1)) = 22$ 次 | +| Kernel 数 | 1 个 | 2 个(H + V) | +| 中间缓冲区 | 无 | float 或 FP16(SoA 布局) | +| Shared memory | 2D tile (H×W) | 1D 行或列 | +| 精度 | 精确 2D 滤波 | 近似(可分离假设) | +| launch_bounds | `(256, 4)` | `(256, 6)`(更激进) | + +### 3.4 SEPARABLE_FP16 模式(MODE=3)—— FP16 中间缓冲 + +在 SEPARABLE 基础上,将两趟之间的中间缓冲区从 `float`(4B)改为 `__half`(2B),**全局内存带宽减半**。计算仍在 FP32 进行,精度无损。 + +```cpp +// bilateral_filter_cuda.cu:1332-1335 +// H kernel: uint8 → smem(float) → compute(float) → output(__half) +// V kernel: input(__half) → smem(float) → compute(float) → output(uint8) +// 仅 H↔V 传输使用 FP16,计算精度不受影响。 +``` + +ncu 验证:V kernel LD sectors/request 从 4.00 降至 2.00(**-50%**),Gray 模式端到端提速 **7-10%**。 + +### 3.5 ADAPTIVE 模式(MODE=4)—— 自适应半径 + +ADAPTIVE 是**质量优先**的模式:先用 Sobel 算子计算逐像素梯度,再将梯度映射为半径——平坦区域用大半径(充分平滑),边缘区域用小半径(保留细节)。 + +**第一步:梯度计算 → 半径映射** + +```cpp +// bilateral_filter_cuda.cu:1491-1506 — Sobel 梯度 → radius +float gx = -p00 + p02 - 2.0f*p10 + 2.0f*p12 - p20 + p22; +float gy = -p00 - 2.0f*p01 - p02 + p20 + 2.0f*p21 + p22; +float grad = sqrtf(gx*gx + 
gy*gy); +// 线性插值:grad=0 → r_max,grad≥threshold → r_min +float t = fminf(grad * inv_grad_threshold, 1.0f); +int r = __float2int_rn(r_max - t * (r_max - r_min)); +radius_map[y * width + x] = static_cast(r); +``` + +**第二步:使用 per-pixel radius 的双边滤波** + +```cpp +// bilateral_filter_cuda.cu:1551-1588 — 自适应 kernel +k_bilateral_adaptive_gray(..., const uint8_t* radius_map, ...) { + // shared memory 按 r_max 分配(保证所有线程可访问最大邻域) + const int tile_w = BLOCK_X + 2 * r_max; + ... + const int my_radius = static_cast(radius_map[y * width + x]); + for (int dy = -my_radius; dy <= my_radius; ++dy) { ... } +} +``` + +**与其他模式的关键差异**: + +| 维度 | TEMPLATE/STANDARD | ADAPTIVE | +|------|-------------------|----------| +| 半径 | 全图统一 | **逐像素不同** | +| 预处理 | 无 | 额外 Sobel 梯度 kernel | +| Warp divergence | 无(循环界一致) | **有**(同 warp 内不同线程循环次数不同) | +| Shared memory | 按实际 radius 分配 | 按 **r_max** 分配(最坏情况) | +| MAE | 0.48-0.60 | **0.40**(最优) | +| 性能 | 较快 | 较慢(梯度 pass + divergence 开销) | + +### 3.6 FUSED 模式(MODE=5)—— 融合 H+V + +实验性模式:将 SEPARABLE 的两个 kernel 融合为单 kernel,消除中间缓冲区的全局内存读写。 + +``` +单 kernel 三阶段: +Phase 0: 加载 2D halo → smem_raw +Phase 1: 对全部行做水平滤波 → smem_h(smem 内,无全局写入) +Phase 2: 从 smem_h 做垂直滤波 → 输出 +``` + +实测性能倒退 31-34%(见第 9 章失败实验),已标记为实验性。 + +### 3.7 模式对比总结 + +下表为 **Jetson AGX Thor** 实测数据(统一内存,无 PCIe 瓶颈,kernel 性能直接反映端到端)。OpenCV CPU 82 ms = 1.00x。 + +| 模式 | 环境变量 | 复杂度 | MAE | Avg (ms) | Min (ms) | vs OCV CPU | vs OCV CUDA | 核心特点 | +|------|---------|:---:|---:|---:|---:|---:|---:|---------| +| **SEP_FP16** | **`MODE=3`** | **$O(r)$** | **0.46** | **3.01** | **2.95** | **27.2x** | **4.00x** | **SEPARABLE + FP16 中间缓冲,最快** | +| SEPARABLE | `MODE=2` | $O(r)$ | 0.45 | 3.06 | 2.99 | 26.8x | 3.94x | 水平+垂直分离近似 | +| TEMPLATE | `MODE=1` | $O(r^2)$ | 0.60 | 5.48 | 5.44 | 15.0x | 2.23x | 编译期半径,完全展开 + DCE | +| ADAPTIVE | `MODE=4` | $O(r_{avg}^2)$ | **0.40** | 6.17 | 6.11 | 13.3x | 1.94x | Sobel 自适应半径,最高精度 | +| STANDARD | `MODE=0` | $O(r^2)$ | 0.48 | 9.41 | 9.26 | 8.70x | 1.29x | 运行时半径,三通道独立权重 | +| OpenCV CUDA | — 
| — | 0.00 | 12.07 | 11.79 | 6.79x | 1.00x | GPU 参考基线 | +| **OpenCV CPU** | — | — | — | **82** | — | **1.00x** | — | **CPU 基线** | + +--- + +## 4. 实验环境与评测方法 + +### 4.1 硬件平台 + +| 规格 | RTX 4060 (桌面) | Jetson AGX Thor (嵌入式) | +|------|:---:|:---:| +| GPU 架构 | Ada Lovelace (sm_89) | Blackwell (sm_110) | +| SM 数量 | 24 | 20 | +| 显存/统一内存 | 8 GB GDDR6 (独立) | 128 GB LPDDR5x (共享) | +| L2 Cache | 24 MB | 32 MB | +| Shared Memory/SM | 100 KB | 228 KB | +| CPU-GPU 互联 | PCIe 4.0 x8 (WSL2) | 统一内存 (无 PCIe) | +| CUDA | 13.1 | 13.0 | +| OpenCV | 4.13.0 (with CUDA) | 4.x (with CUDA) | + +### 4.2 评测方法 + +- **Benchmark 流程**:5 次 warmup + 50 次计时,报告 mean ± stddev +- **测试数据**:4K/1080p × RGB/Gray,参数 radius=5, σ_s=3.0, σ_c=30.0 +- **基线口径**: + - **主基线**:OpenCV CPU `bilateralFilter`(正确性参照 + 性能基准) + - **次基线**:OpenCV CUDA `cv::cuda::bilateralFilter`(GPU 性能参照) +- **工具**:`ncu --set full`(硬件计数器)、`nsys profile`(时间线) + +--- + +## 5. 基线实现与瓶颈分析 + +### 5.1 OpenCV CPU 基线 + +本项目以 OpenCV `cv::bilateralFilter`(CPU 版本)作为 **主基线**——它是工业级优化的参考实现,也是正确性验证的金标准。 + +4K RGB 耗时 **82 ms**(~101 MP/s),作为 1.00x 基线。 + +项目同时提供了一份朴素 CPU 实现(`bilateral_filter_cpu.cpp`),用于展示算法逻辑。其核心循环与 CUDA naive kernel 对应: + +```cpp +// bilateral_filter_cpu.cpp — 朴素三重循环(仅作算法参照,不作为性能基线) +for (int dy = -radius; dy <= radius; ++dy) { + for (int dx = -radius; dx <= radius; ++dx) { + float spatial_weight = expf((dx*dx + dy*dy) * spatial_coeff); + float color_dist = neighbor_val - center_val; + float color_weight = expf(color_dist * color_dist * color_coeff); + float weight = spatial_weight * color_weight; + sum += neighbor_val * weight; + weight_sum += weight; + } +} +output[idx] = sum / weight_sum; +``` + +> 该朴素 CPU 实现在 1080p RGB 上耗时约 6918 ms,比 OpenCV CPU(~27 ms)慢约 **256 倍**(OpenCV 内部使用 SIMD + 缓存优化),不作为性能对比基准。 + +### 5.2 Naive CUDA Kernel(v1) + +最初的 CUDA 实现将 CPU 逻辑直接迁移到 GPU:每个线程处理一个像素,从全局内存独立读取邻域数据,实时计算 `expf`。4K RGB 耗时 **250 ms**(33 MP/s),甚至慢于 CPU——原因在于: + +### 5.3 瓶颈分析 + +| 瓶颈 | 分析 | 量化 | +|------|------|------| +| **全局内存冗余** | 16×16 block + r=5 
时,256 线程发起 30,976 次访问,实际不重复仅 676 个像素,冗余率 **45x** | 带宽浪费 ~97% | +| **expf 调用代价** | 每像素 121 次 `expf`(~20 cycles/次),RGB 三通道 363 次 | 占 kernel 时间 ~60% | +| **warp divergence** | 边界像素的 `if (ny < 0 || ny >= height) continue` 导致同 warp 线程执行路径不一致 | 边界 block 效率下降 ~30% | +| **PCIe 传输** | 4K RGB 24.9 MB,PCIe 4.0 x8 (WSL2) 有效带宽 ~8 GB/s | H2D+D2H 约 6 ms | + +--- + +## 6. 优化策略与逐步实验 + +### 6.1 v2: Shared Memory 协作加载(+42%) + +**动机**:消除邻域的全局内存重复访问。 + +**实现**:block 内线程协作将 tile(含 halo 区域)加载到 shared memory,边界用 clamp 策略避免分支: + +```cpp +// bilateral_filter_cuda.cu:96-107 +#pragma unroll +for (int i = thread_id; i < TILE_SIZE_LOG; i += threads_per_block) { + int gx = blockIdx.x * BLOCK_X + sx - RADIUS; + gx = max(0, min(width - 1, gx)); // clamp 边界 + gy = max(0, min(height - 1, gy)); + smem[sy * TILE_W_PAD + sx] = static_cast(input[gy * width + gx]); +} +__syncthreads(); // 确保数据就绪 +``` + +shared memory 大小为 $(blockDim + 2r)^2$,访问延迟从 ~200-400 ns 降至 ~5 ns。 + +| 版本 | 4K RGB (ms) | 吞吐量 (MP/s) | vs OpenCV CPU | +|------|---:|---:|---:| +| v1 naive | 250 | 33 | 0.23x | +| **v2 shared** | **176** | **47** | **0.32x** | + +### 6.2 v3-v4: LUT 替代 expf(+26%, +154%) + +**动机**:消除内循环中代价最高的超越函数调用。 + +**空间权重 LUT**(v3):空间权重仅依赖偏移 $(dx,dy)$,预计算后存入 constant memory(warp 广播,单次事务): + +```cpp +// bilateral_filter_cuda.cu:52-53 +__constant__ float d_spatial_lut[LUT_SIZE]; // (2r+1)² 个 float +__constant__ float d_color_lut[COLOR_LUT_SIZE]; // 256 个 float + +// bilateral_filter_cuda.cu:1194-1214 — LUT 初始化 +static void init_spatial_lut(int radius, float sigma_spatial) { + float coeff = -0.5f / (sigma_spatial * sigma_spatial); + for (int dy = -radius; dy <= radius; ++dy) + for (int dx = -radius; dx <= radius; ++dx) + lut[(dy+radius)*w + (dx+radius)] = expf((dx*dx+dy*dy) * coeff); + cudaMemcpyToSymbol(d_spatial_lut, lut.data(), ...); +} +``` + +**值域权重 LUT**(v4-v5):8-bit 图像差值范围 [0,255],预计算 256 项查表**完全消除 expf**: + +```cpp +// kernel 内:查表替代 expf +int diff = static_cast(fabsf(neighbor - center) + 0.5f); +diff = min(diff, 
255); +float color_weight = d_color_lut[diff]; // ~4 cycles vs expf ~20 cycles +``` + +ncu 实测 constant cache 命中率 **99.99%**。 + +| 版本 | 4K RGB (ms) | 吞吐量 (MP/s) | vs OpenCV CPU | +|------|---:|---:|---:| +| v3 spatial LUT | 140 | 59 | 0.40x | +| v4 fast math | 55 | 150 | 1.03x | +| **v5 color LUT** | **18** | **460** | **3.14x** | + +> Color LUT 是单项收益最大的优化(**~3x**),每像素消除了 3 通道 × 81 邻域 = 243 次 `expf` 调用。 + +### 6.3 v6: Template 编译期半径(+7%) + +**动机**:使编译器完全展开双重循环,消除循环控制开销,提升 ILP。 + +```cpp +// bilateral_filter_cuda.cu:74-75 — 模板参数化 +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM) +k_bilateral_filter_gray_template(...) { + #pragma unroll + for (int dy = -RADIUS; dy <= RADIUS; ++dy) { + #pragma unroll + for (int dx = -RADIUS; dx <= RADIUS; ++dx) { ... } + } +} +``` + +运行时通过 `switch(radius)` 分发到对应模板实例化版本(支持 r=3/5/7/9/10)。 + +### 6.4 v7-v10: 工程级优化(持久缓冲 +71%,u8 I/O +11%,页锁定 +7%) + +| 优化 | 机制 | 收益 | +|------|------|---:| +| **持久 GPU 缓冲** | 静态 `g_bufs` 结构体,仅尺寸变化时 `cudaMalloc`,LUT 参数不变则跳过上传 | **+71%** | +| **uint8 直接 I/O** | kernel 模板参数 ``,省去 host 端 u8→float→u8 转换 | **+11%** | +| **cudaHostRegister** | 将调用方堆内存注册为 page-locked,使 H2D/D2H 走 DMA 通道 | **+7%** | +| **Block 16×16** | 较好的 L1/smem 缓存利用率 | **+1%** | + +### 6.5 v11: RGB 单一色彩权重(+16%) + +**动机**:原始实现每邻域需 3 次 LUT 查找(R/G/B 各一次)。改用三通道均值距离,仅 1 次查找: + +```cpp +// bilateral_filter_cuda.cu:218-222 +int diff = static_cast( + (fabsf(nr - center_r) + fabsf(ng - center_g) + fabsf(nb - center_b)) + * (1.0f / 3.0f) + 0.5f); +float w = spatial_weight * d_color_lut[diff]; // 1 次 vs 3 次 +``` + +MAE 从 0.65 略升至 0.80,仍满足 < 1.0 要求。 + +### 6.6 v12: 圆形窗口 Early-Continue(+13% RGB, +65% Gray) + +**动机**:r=5 的方形窗口 121 个位置中,40 个角落点超出半径圆,空间权重为零。 + +**Phase 1**:在 `init_spatial_lut` 中预置零圆外项(零性能成本,MAE 改善 0.15~0.20): + +```cpp +// bilateral_filter_cuda.cu:1206-1207 +if (dx * dx + dy * dy > r2) + lut[...] 
= 0.0f; // 33% 的位置置零 +``` + +**Phase 2**:kernel 内 `if (spatial_weight == 0.0f) continue;`(`bilateral_filter_cuda.cu:124`)。TEMPLATE 模式下 RADIUS 为编译期常量,`#pragma unroll` 完全展开后编译器通过 **Dead Code Elimination** 直接删除这 40 次迭代的全部指令。 + +| 模式 | Before (ms) | After (ms) | 提升 | +|------|---:|---:|---:| +| TEMPLATE RGB | 7.36 | **6.53** | **+13%** | +| TEMPLATE Gray | 4.97 | **3.01** | **+65%** | + +### 6.7 SEPARABLE 分离近似(算法级优化) + +**核心思想**:将 2D 双边滤波近似为水平 + 垂直两次 1D 滤波,复杂度从 $O(r^2)$ 降至 $O(r)$。r=5 时从 121 次降到 22 次邻域访问。 + +水平 pass 从 shared memory 沿行方向滤波,垂直 pass 沿列方向滤波。两个 pass 之间通过中间缓冲区(float 或 FP16)传递数据。 + +```cpp +// 水平 kernel: bilateral_filter_cuda.cu:281-293 +#pragma unroll +for (int dx = -RADIUS; dx <= RADIUS; ++dx) { + float neighbor = smem[ty][lx + dx]; + float spatial_weight = d_spatial_lut[RADIUS * LUT_WIDTH + (dx + RADIUS)]; + float w = spatial_weight * color_weight; + sum = fmaf(neighbor, w, sum); // 显式 FMA +} +``` + +**中间缓冲区 SoA 布局**(Opt H):水平输出改为 R|G|B 平面格式,垂直 kernel 读取单通道连续 float,合并效率从 ~33% 提升至 ~71%。 + +### 6.8 版本迭代汇总 + +> 以下 v1-v12 数据为早期开发阶段在 RTX 4060 (WSL2) 上的历史测量,vs OpenCV CPU 按当时基线(~57 ms)。最终行 SEPARABLE / SEP_FP16 为 Thor 实测。 + +| 版本 | 优化手段 | 4K RGB (ms) | 吞吐量 (MP/s) | vs OpenCV CPU | +|------|---------|---:|---:|---:| +| v1 | Naive global memory | 250 | 33 | 0.23x | +| v2 | Shared memory | 176 | 47 | 0.32x | +| v3 | + Spatial LUT | 140 | 59 | 0.40x | +| v4 | + `__expf` fast math | 55 | 150 | 1.03x | +| v5 | + Color LUT + unroll | 18 | 460 | 3.14x | +| v6 | + Template radius | 16.9 | 492 | 3.35x | +| v7 | + 持久缓冲 + LUT cache | 9.86 | 841 | 5.74x | +| v8 | + uint8 I/O kernels | 8.91 | 930 | 6.35x | +| v9 | + cudaHostRegister | 8.65 | 959 | 6.54x | +| v10 | + Block 16×16 | 8.64 | 960 | 6.55x | +| **v11** | **+ 单一色彩权重** | **7.45** | **1113** | **7.59x** | +| **v12** | **+ 圆形窗口 DCE** | **6.53** | **1271** | **8.67x** | +| — | SEPARABLE (Thor) | **3.06** | **2713** | **26.8x** | +| — | **SEP_FP16 (Thor)** | **3.01** | **2753** | **27.2x** | + +> 从 naive 250 ms 到 TEMPLATE 
6.53 ms 实现 **38x** 加速;Thor 上 SEP_FP16 3.01 ms 实现 **83x** 加速(vs naive)、**27.2x**(vs OpenCV CPU)。 + +--- + +## 7. 性能结果与质量验证 + +> 以下数据均在 **Jetson AGX Thor** 上实测(2026-03-16),统一内存架构,无 PCIe 传输瓶颈。 + +### 7.1 Thor — 4K RGB(3840×2160×3) + +| 实现 | Time (ms) | Min (ms) | 吞吐量 (MP/s) | MAE | PSNR (dB) | vs OCV CPU | vs OCV CUDA | +|------|---:|---:|---:|---:|---:|---:|---:| +| **CUDA SEP_FP16** | **3.01** | **2.95** | **2753** | **0.46** | **48.39** | **27.2x** | **4.00x** | +| CUDA SEPARABLE | 3.06 | 2.99 | 2713 | 0.45 | 48.49 | 26.8x | 3.94x | +| CUDA TEMPLATE | 5.48 | 5.44 | 1515 | 0.60 | 48.28 | 15.0x | 2.23x | +| CUDA ADAPTIVE | 6.17 | 6.11 | 1344 | 0.40 | 49.42 | 13.3x | 1.94x | +| CUDA STANDARD | 9.41 | 9.26 | 882 | 0.48 | 48.61 | 8.70x | 1.29x | +| OpenCV CUDA | 12.07 | 11.79 | 687 | 0.00 | — | 6.79x | 1.00x | +| **OpenCV CPU** | **81.87** | **80.79** | **101** | — | — | **1.00x** | — | + +### 7.2 Thor — 4K Grayscale(3840×2160×1) + +| 实现 | Time (ms) | Min (ms) | 吞吐量 (MP/s) | MAE | PSNR (dB) | vs OCV CPU | vs OCV CUDA | +|------|---:|---:|---:|---:|---:|---:|---:| +| **CUDA SEP_FP16** | **1.32** | **1.27** | **6268** | **0.12** | **57.00** | **38.9x** | **5.88x** | +| CUDA SEPARABLE | 1.42 | 1.39 | 5832 | 0.15 | 56.18 | 36.4x | 5.53x | +| CUDA TEMPLATE | 3.52 | 3.46 | 2358 | 0.61 | 50.23 | 14.7x | 2.22x | +| CUDA STANDARD | 4.36 | 4.32 | 1901 | 0.61 | 50.23 | 11.9x | 1.76x | +| CUDA ADAPTIVE | 5.77 | 5.75 | 1438 | 0.61 | 50.23 | 8.94x | 1.34x | +| OpenCV CUDA | 7.78 | 7.44 | 1066 | 0.00 | — | 6.63x | 1.00x | +| **OpenCV CPU** | **51.60** | **50.57** | **161** | — | — | **1.00x** | — | + +### 7.3 性能目标达成 + +| 目标 | 要求 | 最佳实测 | 模式 | 状态 | +|------|------|---:|------|:---:| +| 4K RGB @60fps | ≥498 MP/s | 2753 MP/s | SEP_FP16 | **5.5x 余量** | +| 4K Gray @60fps | ≥498 MP/s | 6268 MP/s | SEP_FP16 | **12.6x 余量** | +| vs OpenCV CPU | 显著加速 | **27.2x** (4K RGB) | SEP_FP16 | 达标 | +| vs OpenCV CUDA | > 1.0x | **4.00x** (4K RGB) | SEP_FP16 | 达标 | +| MAE | < 1.0 | 0.12~0.61 | 所有模式 | 达标 | 
+| PSNR | > 40 dB | 48.28~57.00 dB | 所有模式 | 达标 | + +### 7.4 质量验证 + +| 模式 | MAE (RGB) | MAE (Gray) | PSNR (RGB) | PSNR (Gray) | +|------|---:|---:|---:|---:| +| STANDARD | 0.48 | 0.61 | 48.61 dB | 50.23 dB | +| TEMPLATE | 0.60 | 0.61 | 48.28 dB | 50.23 dB | +| SEPARABLE | 0.45 | 0.15 | 48.49 dB | 56.18 dB | +| SEP_FP16 | 0.46 | 0.12 | 48.39 dB | 57.00 dB | +| ADAPTIVE | 0.40 | 0.61 | 49.42 dB | 50.23 dB | + +> SEPARABLE 在 Gray 下 MAE 仅 0.15,是所有模式中最接近 OpenCV 的结果。 + +--- + +## 8. Profiler 深度分析 + +### 8.1 nsys 时间线分析(Thor 平台) + +#### TEMPLATE 模式 + +| 阶段 | 耗时 | 占比 | +|------|---:|---:| +| `k_bilateral_filter_rgb_template<5,u8,u8>` | 4.98 ms | **91%** | +| cudaMemcpy H2D (24.9 MB) | 0.20 ms | 3.6% | +| cudaMemcpy D2H (24.9 MB) | 0.21 ms | 3.8% | +| cudaLaunchKernel | 0.008 ms | <0.1% | + +统一内存传输带宽:H2D **122 GB/s**,D2H **117 GB/s**,远高于 RTX 4060 的 PCIe(~8 GB/s)。 + +#### SEPARABLE 模式 + +| 阶段 | 耗时 | 占比 | +|------|---:|---:| +| Horizontal pass | 1.52 ms | **50.7%** | +| Vertical pass | 1.48 ms | **49.3%** | +| cudaMemcpy 合计 | 0.42 ms | ~12% | + +**关键发现**:统一内存消除了 PCIe 传输瓶颈——Thor 上 memcpy 占比仅 ~5%(RTX 4060 上占 46-68%)。 + +### 8.2 ncu 硬件计数器 — TEMPLATE 模式 + +#### Speed-of-Light 概览 + +| 指标 | 值 | 说明 | +|------|---:|------| +| **SM Throughput** | **88.83%** | 接近计算峰值 | +| L1/TEX Throughput | 87.28% | Shared memory 密集 | +| L2 Throughput | 3.50% | 数据集 fit in L2 (32 MB) | +| DRAM Throughput | ~0% | 统一内存由 L2 服务 | +| **瓶颈诊断** | **Compute-bound** | SM 88.8% >> DRAM ~0% | + +#### Occupancy 与寄存器 + +| 指标 | 值 | +|------|---:| +| 理论 Occupancy | 100% | +| **实测 Occupancy** | **97.76%** | +| 寄存器/线程 | 23 (sm_110) vs 64 (sm_89) | + +> Blackwell 编译器将同一 kernel 的寄存器从 64 降至 23(-64%),occupancy 从 67% 提升至 97.8%。 + +#### 流水线利用率 + +| 流水线 | 利用率 | 说明 | +|--------|---:|------| +| **LSU** (Load/Store) | **43.84%** | Shared memory 读写 | +| **FMA** (浮点乘加) | **40.99%** | 权重计算 | +| ALU (整数) | 30.41% | 索引计算 | +| XU (超越函数) | 28.17% | 残余 exp 路径 | + +> LSU 与 FMA **负载均衡**(43.8% vs 41.0%),计算与访存交织良好。 + +#### 缓存命中率 + +| 缓存层 | 
命中率 | +|--------|---:| +| **Constant Cache** (LUT) | **99.99%** | +| Instruction Cache | 100.0% | +| L1/TEX | 68.53% | +| L2 | 73.31% | + +### 8.3 ncu 驱动的关键优化(Thor 平台) + +#### Opt G: Block 32×8 消除 Bank Conflict + +**ncu 发现**:16×16 block 下 smem load 存在 50% bank conflict(2-way)。根因:warp 跨两行,行间 stride=27 mod 32=27,导致 11/16 bank 重叠。 + +**方案**:改为 32×8 block,warp 全在一行内(stride=1)。 + +| 指标 | 16×16 | 32×8 | 变化 | +|------|---:|---:|---:| +| Shared excessive wavefronts | 50% | **2.3%** | **-97.6%** | +| SM Throughput | 87.49% | 88.36% | +0.87pp | + +#### Opt H: SoA 中间缓冲区改善合并访问 + +**ncu 发现**:SEPARABLE vertical kernel 全局内存合并效率仅 33%,68% sector 冗余。 + +| 指标 | AoS | SoA | 变化 | +|------|---:|---:|---:| +| Vertical uncoalesced | 68% | **29%** | **-39pp** | +| Horizontal uncoalesced | 69% | **47%** | **-22pp** | + +#### Opt K: 激进 Launch Bounds 提升 Occupancy + +**ncu 发现**:SEPARABLE RGB kernel 使用 61-63 regs,occupancy 仅 62%。 + +**方案**:`__launch_bounds__(256, 6)`,强制编译器将 regs 压到 ≤42(65536/256/6=42.67)。 + +| 指标 | 优化前 | 优化后 | 变化 | +|------|---:|---:|---:| +| Registers/thread | 63 | **40** | **-35%** | +| Achieved Occupancy | 62% | **97.5%** | **+35pp** | +| SM Throughput | 64.5% | **79.4%** | **+14.9pp** | +| 4K RGB min | 3.39 ms | **3.02 ms** | **-10.9%** | + +> 这是单一改动(一个宏常量 `MIN_BLOCKS_PER_SM_SEP=6`)带来最大端到端收益的优化。关键:**零 spill**。 + +#### Opt N: FP16 中间缓冲区 + +**方案**:H↔V 中间缓冲区从 float 改为 `__half`,带宽减半。 + +| 指标 | FP32 | FP16 | 变化 | +|------|---:|---:|---:| +| V kernel LD sectors/req | 4.00 | **2.00** | **-50%** | +| H kernel ST sectors/req | 4.00 | **2.00** | **-50%** | +| 4K Gray min | 1.40 ms | **1.30 ms** | **-7.1%** | + +### 8.4 Warp Stall 分析 + +| Stall 原因 | 比率 | 说明 | +|-----------|---:|------| +| **short_scoreboard** | **3.92** | Shared memory / L1 依赖 | +| not_selected | 3.21 | warp 调度竞争 | +| wait | 2.79 | 固定延迟依赖 | +| long_scoreboard | 0.66 | L2/DRAM 依赖 | +| math_pipe_throttle | 0.40 | 计算流水线满 | +| barrier | 0.33 | `__syncthreads()` | + +### 8.5 Roofline 定位 + +``` +SM Throughput (%) + 100 ┤ + 
89 ┤ ··[TEMPLATE]················· (97.8% occ, 3.55 IPC) + │ + 67 ┤ ·········[SEP-V] (63→97.5% occ after Opt K) + 65 ┤ ········[SEP-H] + │ + └───────────────────────────── + 所有 kernel 均为 Compute-bound (SM >> Memory) +``` + +--- + +## 9. 失败实验与优化边界 + +### 9.1 Opt B: Warp Shuffle 水平 pass(-2.4x,已回退) + +为 separable 灰度水平 kernel 实现 `__shfl_sync` 版本。 + +| 指标 | 基线 | Shuffle | +|------|---:|---:| +| 4K Gray | 2.07 ms | 5.02 ms | +| MAE | 0.15 | 0.50 | + +**失败原因**:radius=5 时仅 10/32 lane 需要 halo 数据,if/else 分支导致严重 warp divergence。Shared memory 方案的加载延迟已被计算充分隐藏,shuffle 无法取得优势。 + +### 9.2 Opt C: Host 端 SoA 全链路(-36%,已回退) + +将 RGB separable 改为 AoS→SoA 转换 + 3×灰度 separable + SoA→AoS 转换。 + +**失败原因**:(1) AoS↔SoA 转换引入 2 次额外全局内存遍历(47 MB);(2) 8 个 kernel launch 替代原来 2 个;(3) 丧失三通道共享权重优势。 + +### 9.3 Opt D: cudaFuncCachePreferL1(-36%,已回退) + +L1 偏好将 shared memory 从 64 KB 压缩到 32 KB/SM,导致 occupancy 下降,SEPARABLE 从 5.42 ms 恶化到 7.36 ms。 + +### 9.4 Opt M: Fused H+V 单 Kernel(-31~34%) + +单 kernel 内三阶段(加载 2D halo → 水平滤波 → 垂直滤波),消除中间 buffer。 + +| 测试 | SEPARABLE | FUSED | 变化 | +|------|---:|---:|---:| +| 4K RGB | 3.02 ms | 3.96 ms | **-31%** | +| 1080p RGB | 0.77 ms | 1.03 ms | **-34%** | + +**失败原因**:每 block 需处理 576 个水平滤波点(vs SEPARABLE 的 256),计算膨胀远超带宽节省。Thor 统一内存下中间 buffer 成本仅占 ~10%。 + +### 9.5 Opt N2: FP16 全量计算(-8.4%) + +将 smem 和内循环累加全部改为 `__half`。 + +**失败原因**:(1) sm_110 的 2x FP16 优势仅在 `__half2` 打包操作上,标量 `__hfma` 与 FP32 `fmaf` 吞吐相同;(2) LUT 返回 float,每次迭代需 6+ 次 float↔half 转换。 + +### 9.6 实验总结 + +| 优化 | 状态 | 实际效果 | 教训 | +|------|:---:|---:|------| +| Warp Shuffle | 回退 | -2.4x | 需 halo 的滤波不适合 shuffle | +| Host SoA | 回退 | -36% | 格式转换开销 > 合并收益 | +| PreferL1 | 回退 | -36% | smem 容量比 L1 缓存更重要 | +| Fused H+V | 失败 | -31~34% | 统一内存下中间 buffer 成本低 | +| FP16 计算 | 失败 | -8.4% | 标量 half 无 2x,转换开销高 | +| fmaf 显式 | 保留 | 0% | nvcc -O3 已自动最优融合 | +| Strip Pipeline | 无收益 | 0~-8% | WSL2 阻止 copy+compute 并行 | + +> 失败实验同样有价值——它们通过实测验证了优化边界,避免在无效方向继续投入。 + +--- + +## 10. 
跨平台对比分析 + +### 10.1 Jetson AGX Thor 最终性能(4K RGB,实测 2026-03-16) + +| 实现 | Avg (ms) | Min (ms) | 吞吐量 (MP/s) | vs OCV CPU | vs OCV CUDA | +|------|---:|---:|---:|---:|---:| +| **CUDA SEP_FP16** | **3.01** | **2.95** | **2753** | **27.2x** | **4.00x** | +| CUDA SEPARABLE | 3.06 | 2.99 | 2713 | 26.8x | 3.94x | +| CUDA TEMPLATE | 5.48 | 5.44 | 1515 | 15.0x | 2.23x | +| CUDA ADAPTIVE | 6.17 | 6.11 | 1344 | 13.3x | 1.94x | +| CUDA STANDARD | 9.41 | 9.26 | 882 | 8.70x | 1.29x | +| OpenCV CUDA | 12.07 | 11.79 | 687 | 6.79x | 1.00x | +| **OpenCV CPU** | **81.87** | **80.79** | **101** | **1.00x** | — | + +### 10.2 跨平台 Kernel 耗时对比 + +| Kernel | Thor (ms) | RTX 4060 (ms) | cudaMemcpy | +|--------|---:|---:|---:| +| rgb_template<5> | 4.98 | 3.36 | — | +| horizontal_rgb<5> | 1.52 | 0.79 | — | +| vertical_rgb<5> | 1.48 | 0.72 | — | +| **H2D+D2H 合计** | **0.41** | **3.94** | **Thor 快 10x** | + +### 10.3 编译器差异 + +| Kernel | sm_110 Regs | sm_89 Regs | sm_110 Occupancy | sm_89 Occupancy | +|--------|---:|---:|---:|---:| +| rgb_template | **23** | 64 | **100%** | 67% | +| gray_template | **21** | 63 | **100%** | ~67% | +| horizontal_rgb (Opt K) | **40** | 62 | **100%** | 66% | + +### 10.4 平台特异性总结 + +| 因素 | 独显 (PCIe) | 统一内存 (Jetson) | +|------|:---:|:---:| +| 主瓶颈 | H2D/D2H (46-68%) | Kernel 计算 (90-95%) | +| 传输优化价值 | 极高 | 无意义 | +| Kernel 优化边际收益 | 中等(被传输稀释) | **极高**(直接反映端到端) | +| Fused kernel 价值 | 可能有效 | 无效(中间 buffer 成本低) | + +--- + +## 11. 
结论与展望 + +### 11.1 结论 + +本项目实现了从 naive CUDA kernel(250 ms)到高度优化的多模式双边滤波器,在 **Jetson AGX Thor** 上最终 SEP_FP16 模式 4K RGB 仅需 **3.01 ms**,总加速比 **83x**(vs naive)、**27.2x**(vs OpenCV CPU)、**4.00x**(vs OpenCV CUDA)。核心优化手段及其贡献: + +| 排名 | 优化手段 | 加速贡献 | 代码位置 | +|:---:|---------|---:|------| +| 1 | Color Weight LUT | **~3x** | `d_color_lut[256]`, L52-53 | +| 2 | Shared Memory | **3-5x** | smem 协作加载, L96-107 | +| 3 | SEPARABLE 近似 | **O(r)** | H/V kernel, L241-522 | +| 4 | 持久 GPU 缓冲 | **+71%** | `g_bufs` 结构体, L1391-1467 | +| 5 | 圆形窗口 DCE | **+13~65%** | `spatial_weight==0` continue, L124 | +| 6 | RGB 单色权重 | **+16%** | 均值距离, L218-222 | +| 7 | launch_bounds(256,6) | **+10.9%** | `MIN_BLOCKS_PER_SM_SEP=6`, L41 | +| 8 | Template 展开 | **+7%** | `template`, L74 | + +所有模式均满足:MAE < 1.0,PSNR > 48 dB,4K@60fps 吞吐余量 **5.5x** 以上。 + +### 11.2 展望 + +| 方向 | 可行性 | 预期收益 | +|------|:---:|:---:| +| `__half2` 向量化(打包两邻域像素) | 中 | FP16 吞吐翻倍 | +| Spatial LUT 搬到 smem(避免 constant cache 序列化) | 高 | 5-10% | +| L2 Cache 持久化(视频流场景) | 高 | 5-15% | +| 多 Stream Pipeline(原生 Linux,非 WSL2) | 高 | 传输与计算重叠 | +| Bilateral Grid(大半径 r≥10) | 低 | O(N) 复杂度 | + +--- + +## 12. 
附录:完整数据图表 + +### 表 A1: TEMPLATE 版本迭代(4K RGB, RTX 4060) + +| 版本 | 优化手段 | Time (ms) | 吞吐量 (MP/s) | vs 上一版 | vs OCV CPU | +|------|---------|---:|---:|---:|---:| +| v1 | Naive | 250 | 33 | — | 0.23x | +| v2 | Shared memory | 176 | 47 | +42% | 0.32x | +| v3 | + Spatial LUT | 140 | 59 | +26% | 0.40x | +| v4 | + fast math | 55 | 150 | +154% | 1.03x | +| v5 | + Color LUT | 18 | 460 | +207% | 3.14x | +| v6 | + Template | 16.9 | 492 | +7% | 3.35x | +| v7 | + 持久缓冲 | 9.86 | 841 | +71% | 5.74x | +| v8 | + u8 I/O | 8.91 | 930 | +11% | 6.35x | +| v9 | + page-lock | 8.65 | 959 | +3% | 6.54x | +| v10 | + Block 16×16 | 8.64 | 960 | +1% | 6.55x | +| v11 | + 单色权重 | 7.45 | 1113 | +16% | 7.59x | +| v12 | + 圆形窗口 | 6.53 | 1271 | +13% | 8.67x | + +### 表 A2: Thor 平台 4K RGB 全模式对比(实测 2026-03-16) + +| 实现 | Avg (ms) | Min (ms) | 吞吐量 (MP/s) | MAE | vs OCV CPU | vs OCV CUDA | +|------|---:|---:|---:|---:|---:|---:| +| **SEP_FP16** | **3.01** | **2.95** | **2753** | **0.46** | **27.2x** | **4.00x** | +| SEPARABLE | 3.06 | 2.99 | 2713 | 0.45 | 26.8x | 3.94x | +| TEMPLATE | 5.48 | 5.44 | 1515 | 0.60 | 15.0x | 2.23x | +| ADAPTIVE | 6.17 | 6.11 | 1344 | 0.40 | 13.3x | 1.94x | +| STANDARD | 9.41 | 9.26 | 882 | 0.48 | 8.70x | 1.29x | +| OpenCV CUDA | 12.07 | 11.79 | 687 | 0.00 | 6.79x | 1.00x | +| OpenCV CPU | 81.87 | 80.79 | 101 | — | 1.00x | — | + +### 表 A3: ncu 优化指标汇总 + +| 优化 | 指标 | Before | After | 变化 | +|------|------|---:|---:|---:| +| Opt G (32×8) | Smem bank conflict | 50% | 2.3% | -97.6% | +| Opt H (SoA) | Global uncoalesced (V) | 68% | 29% | -39pp | +| Opt H (SoA) | Global uncoalesced (H) | 69% | 47% | -22pp | +| Opt K (launch_bounds) | Registers/thread | 63 | 40 | -35% | +| Opt K (launch_bounds) | Achieved occupancy | 62% | 97.5% | +35pp | +| Opt K (launch_bounds) | SM throughput | 64.5% | 79.4% | +14.9pp | +| Opt N (FP16 temp) | V LD sectors/req | 4.00 | 2.00 | -50% | +| Opt N (FP16 temp) | H ST sectors/req | 4.00 | 2.00 | -50% | + +### 表 A4: 编译期寄存器对比(sm_110 vs sm_89) + +| Kernel | sm_110 
Regs | sm_89 Regs | Spill | Occupancy (sm_110) | +|--------|---:|---:|:---:|---:| +| rgb_template<5,u8,u8> | 28 | 64 | 0 | 100% | +| gray_template<5,u8,u8> | 21 | 63 | 0 | 100% | +| horizontal_rgb<5,u8> | 40 | 62 | 0 | 100% | +| vertical_rgb<5,u8> | 40 | 62 | 0 | 100% | +| horizontal_gray<5,u8> | 33 | 35 | 0 | 100% | diff --git a/08_bilateral_filter/Andy1314Chen/include/bilateral_filter.h b/08_bilateral_filter/Andy1314Chen/include/bilateral_filter.h new file mode 100644 index 0000000..7a97bad --- /dev/null +++ b/08_bilateral_filter/Andy1314Chen/include/bilateral_filter.h @@ -0,0 +1,20 @@ +#ifndef BILATERAL_FILTER_H_ +#define BILATERAL_FILTER_H_ + +#include "image_io.h" +#include + +void bilateral_filter_cpu(const float* input, + float* output, + int width, + int height, + int channels, + int radius, + float sigma_spatial, + float sigma_color); + +void apply_bilateral_filter_cpu(const ImageData& input, + ImageData& output, + const FilterParams& params); + +#endif // BILATERAL_FILTER_H_ diff --git a/08_bilateral_filter/Andy1314Chen/include/bilateral_filter_cuda.cuh b/08_bilateral_filter/Andy1314Chen/include/bilateral_filter_cuda.cuh new file mode 100644 index 0000000..9cd9b3e --- /dev/null +++ b/08_bilateral_filter/Andy1314Chen/include/bilateral_filter_cuda.cuh @@ -0,0 +1,32 @@ +#ifndef BILATERAL_FILTER_CUDA_CUH_ +#define BILATERAL_FILTER_CUDA_CUH_ + +#include +#include + +// Filter modes for benchmarking different implementations +// 0 = STANDARD (shared memory + LUT, runtime radius) +// 1 = TEMPLATE (compile-time radius, full unroll) +// 2 = SEPARABLE (horizontal + vertical passes, O(r) complexity) +void set_bilateral_filter_mode(int mode); + +void bilateral_filter_cuda(const float* d_input, + float* d_output, + int width, + int height, + int channels, + int radius, + float sigma_spatial, + float sigma_color, + cudaStream_t stream = 0); + +void apply_bilateral_filter_cuda(const uint8_t* h_input, + uint8_t* h_output, + int width, + int height, + int channels, + int 
radius, + float sigma_spatial, + float sigma_color); + +#endif // BILATERAL_FILTER_CUDA_CUH_ diff --git a/08_bilateral_filter/Andy1314Chen/include/bilateral_filter_opencv.h b/08_bilateral_filter/Andy1314Chen/include/bilateral_filter_opencv.h new file mode 100644 index 0000000..92b5a77 --- /dev/null +++ b/08_bilateral_filter/Andy1314Chen/include/bilateral_filter_opencv.h @@ -0,0 +1,19 @@ +#ifndef BILATERAL_FILTER_OPENCV_H_ +#define BILATERAL_FILTER_OPENCV_H_ + +#include "image_io.h" + +void apply_bilateral_filter_opencv(const ImageData& input, ImageData& output, + const FilterParams& params); + +double compute_mae(const ImageData& img1, const ImageData& img2); +double compute_psnr(const ImageData& img1, const ImageData& img2); + +#ifdef HAVE_OPENCV_CUDA +// OpenCV CUDA bilateral filter (cv::cuda::bilateralFilter). +// Returns false if CUDA device is unavailable at runtime. +bool apply_bilateral_filter_opencv_cuda(const ImageData& input, ImageData& output, + const FilterParams& params); +#endif + +#endif // BILATERAL_FILTER_OPENCV_H_ diff --git a/08_bilateral_filter/Andy1314Chen/include/image_io.h b/08_bilateral_filter/Andy1314Chen/include/image_io.h new file mode 100644 index 0000000..4bc5178 --- /dev/null +++ b/08_bilateral_filter/Andy1314Chen/include/image_io.h @@ -0,0 +1,29 @@ +#ifndef IMAGE_IO_H_ +#define IMAGE_IO_H_ + +#include +#include +#include +#include + +struct ImageData { + int width; + int height; + int channels; + std::vector data; + + size_t size() const { return static_cast(width) * height * channels; } + bool valid() const { return width > 0 && height > 0 && channels > 0 && !data.empty(); } +}; + +struct FilterParams { + int radius = 0; + float sigma_spatial = 0.0f; + float sigma_color = 0.0f; +}; + +bool load_raw_image(const char* path, ImageData* img); +bool save_raw_image(const char* path, const ImageData& img); +bool load_params(const char* path, FilterParams* params); + +#endif // IMAGE_IO_H_ diff --git 
a/08_bilateral_filter/Andy1314Chen/src/bilateral_filter_cpu.cpp b/08_bilateral_filter/Andy1314Chen/src/bilateral_filter_cpu.cpp new file mode 100644 index 0000000..03e4ed1 --- /dev/null +++ b/08_bilateral_filter/Andy1314Chen/src/bilateral_filter_cpu.cpp @@ -0,0 +1,71 @@ +#include +#include + +#include "bilateral_filter.h" + +void bilateral_filter_cpu(const float* input, float* output, int width, int height, int channels, + int radius, float sigma_spatial, float sigma_color) { + float spatial_coeff = -0.5f / (sigma_spatial * sigma_spatial); + float color_coeff = -0.5f / (sigma_color * sigma_color); + + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + int center_idx = (y * width + x) * channels; + + for (int c = 0; c < channels; ++c) { + float sum = 0.0f; + float weight_sum = 0.0f; + float center_val = input[center_idx + c]; + + for (int dy = -radius; dy <= radius; ++dy) { + int ny = y + dy; + if (ny < 0 || ny >= height) + continue; + + for (int dx = -radius; dx <= radius; ++dx) { + int nx = x + dx; + if (nx < 0 || nx >= width) + continue; + + int neighbor_idx = (ny * width + nx) * channels; + float neighbor_val = input[neighbor_idx + c]; + + float spatial_dist = static_cast(dx * dx + dy * dy); + float spatial_weight = expf(spatial_dist * spatial_coeff); + + float color_dist = neighbor_val - center_val; + float color_weight = expf(color_dist * color_dist * color_coeff); + + float weight = spatial_weight * color_weight; + sum += neighbor_val * weight; + weight_sum += weight; + } + } + + output[center_idx + c] = sum / weight_sum; + } + } + } +} + +void apply_bilateral_filter_cpu(const ImageData& input, ImageData& output, + const FilterParams& params) { + output.width = input.width; + output.height = input.height; + output.channels = input.channels; + output.data.resize(input.size()); + + std::vector input_float(input.size()); + std::vector output_float(input.size()); + + for (size_t i = 0; i < input.size(); ++i) { + input_float[i] = 
static_cast(input.data[i]); + } + + bilateral_filter_cpu(input_float.data(), output_float.data(), input.width, input.height, + input.channels, params.radius, params.sigma_spatial, params.sigma_color); + + for (size_t i = 0; i < output.size(); ++i) { + output.data[i] = static_cast(std::clamp(output_float[i], 0.0f, 255.0f)); + } +} diff --git a/08_bilateral_filter/Andy1314Chen/src/bilateral_filter_cuda.cu b/08_bilateral_filter/Andy1314Chen/src/bilateral_filter_cuda.cu new file mode 100644 index 0000000..9ad4b6e --- /dev/null +++ b/08_bilateral_filter/Andy1314Chen/src/bilateral_filter_cuda.cu @@ -0,0 +1,2046 @@ +// Standard library +#include +#include +#include +#include +#include + +// CUDA FP16 (half-precision) support +#include + +// Project headers (includes via bilateral_filter_cuda.cuh) +#include "bilateral_filter_cuda.cuh" + +// clang-format off +#define CUDA_CHECK(call) \ + do { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + fprintf(stderr, "CUDA error at %s:%d: %s\n", \ + __FILE__, __LINE__, cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) +// clang-format on + +#ifndef BLOCK_X +// Opt G: 32x8 ensures each warp spans a single row in shared memory, +// eliminating cross-row bank conflicts (16x16 caused 2-way conflicts +// because warp straddles 2 rows with stride 27 mod 32 = -5 overlap). +#define BLOCK_X 32 +#endif +#ifndef BLOCK_Y +#define BLOCK_Y 8 +#endif +// Opt A: launch bounds hints for register allocation +#define THREADS_PER_BLOCK (BLOCK_X * BLOCK_Y) +#define MIN_BLOCKS_PER_SM 4 +// Opt K: more aggressive launch bounds for SEPARABLE kernels to reduce +// register pressure. With 256 threads and 6 blocks/SM: max regs = 65536/(256*6) = 42. +// This forces compiler to use ≤42 registers, boosting occupancy from 66% to 100%. 
+#define MIN_BLOCKS_PER_SM_SEP 6 +#define MAX_RADIUS 16 +#define LUT_SIZE ((2 * MAX_RADIUS + 1) * (2 * MAX_RADIUS + 1)) +#define COLOR_LUT_SIZE 256 + +// Opt G: Shared memory bank conflict mitigation via row padding. +// 32 banks × 4B = 128B per bank cycle. If tile row width is even, threads +// in a warp accessing consecutive rows at the same column hit the same bank. +// Padding to an odd width ensures consecutive rows map to different banks. +#define SMEM_PAD(w) ((w) | 1) // ensure odd: if even, add 1; if odd, keep + +__constant__ float d_spatial_lut[LUT_SIZE]; +__constant__ float d_color_lut[COLOR_LUT_SIZE]; + +// ============================================================================ +// Opt2: type-safe output helper for uint8/float kernel output +// ============================================================================ + +template +__device__ inline T to_output(float v); +template <> +__device__ inline float to_output(float v) { + return v; +} +template <> +__device__ inline uint8_t to_output(float v) { + return static_cast(fminf(255.0f, fmaxf(0.0f, v))); +} + +// ============================================================================ +// Template-based grayscale bilateral filter with compile-time radius +// ============================================================================ + +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM) +k_bilateral_filter_gray_template(const InT* __restrict__ input, + OutT* __restrict__ output, int width, int height) { + + constexpr int TILE_W = BLOCK_X + 2 * RADIUS; + constexpr int TILE_H = BLOCK_Y + 2 * RADIUS; + constexpr int TILE_W_PAD = SMEM_PAD(TILE_W); // Opt G: odd stride to avoid bank conflicts + constexpr int TILE_SIZE_LOG = TILE_W * TILE_H; // logical elements to load + constexpr int LUT_WIDTH = 2 * RADIUS + 1; + + __shared__ float smem[TILE_H * TILE_W_PAD]; // Opt G: padded rows + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int x = blockIdx.x 
* BLOCK_X + tx; + const int y = blockIdx.y * BLOCK_Y + ty; + + const int thread_id = ty * BLOCK_X + tx; + const int threads_per_block = BLOCK_X * BLOCK_Y; + +// Collaborative loading: iterate over logical tile, store into padded layout +#pragma unroll + for (int i = thread_id; i < TILE_SIZE_LOG; i += threads_per_block) { + int sy = i / TILE_W; + int sx = i % TILE_W; + int gx = blockIdx.x * BLOCK_X + sx - RADIUS; + int gy = blockIdx.y * BLOCK_Y + sy - RADIUS; + gx = max(0, min(width - 1, gx)); + gy = max(0, min(height - 1, gy)); + smem[sy * TILE_W_PAD + sx] = static_cast(input[gy * width + gx]); + } + + __syncthreads(); + + if (x >= width || y >= height) + return; + + const int lx = tx + RADIUS; + const int ly = ty + RADIUS; + const float center = smem[ly * TILE_W_PAD + lx]; + + float sum = 0.0f; + float weight_sum = 0.0f; + +#pragma unroll + for (int dy = -RADIUS; dy <= RADIUS; ++dy) { +#pragma unroll + for (int dx = -RADIUS; dx <= RADIUS; ++dx) { + float spatial_weight = d_spatial_lut[(dy + RADIUS) * LUT_WIDTH + (dx + RADIUS)]; + if (spatial_weight == 0.0f) continue; // Opt C/F: skip circular-window corners + + float neighbor = smem[(ly + dy) * TILE_W_PAD + (lx + dx)]; + + int diff = static_cast(fabsf(neighbor - center) + 0.5f); + diff = min(diff, COLOR_LUT_SIZE - 1); + float color_weight = d_color_lut[diff]; + + float w = spatial_weight * color_weight; + sum += neighbor * w; + weight_sum += w; + } + } + + output[y * width + x] = to_output(sum / weight_sum); +} + +// ============================================================================ +// Template-based RGB bilateral filter with compile-time radius +// Full loop unrolling for known radius values +// ============================================================================ + +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM) +k_bilateral_filter_rgb_template(const InT* __restrict__ input, + OutT* __restrict__ output, int width, int height) { + + constexpr int TILE_W = 
BLOCK_X + 2 * RADIUS; + constexpr int TILE_H = BLOCK_Y + 2 * RADIUS; + constexpr int TILE_W_PAD = SMEM_PAD(TILE_W); // Opt G: odd stride to avoid bank conflicts + constexpr int TILE_SIZE_LOG = TILE_W * TILE_H; // logical elements to load + constexpr int LUT_WIDTH = 2 * RADIUS + 1; + + __shared__ float smem_r[TILE_H * TILE_W_PAD]; + __shared__ float smem_g[TILE_H * TILE_W_PAD]; + __shared__ float smem_b[TILE_H * TILE_W_PAD]; + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int x = blockIdx.x * BLOCK_X + tx; + const int y = blockIdx.y * BLOCK_Y + ty; + + const int thread_id = ty * BLOCK_X + tx; + const int threads_per_block = BLOCK_X * BLOCK_Y; + +// Collaborative loading: iterate over logical tile, store into padded layout +#pragma unroll + for (int i = thread_id; i < TILE_SIZE_LOG; i += threads_per_block) { + int sy = i / TILE_W; + int sx = i % TILE_W; + int gx = blockIdx.x * BLOCK_X + sx - RADIUS; + int gy = blockIdx.y * BLOCK_Y + sy - RADIUS; + gx = max(0, min(width - 1, gx)); + gy = max(0, min(height - 1, gy)); + int gidx = (gy * width + gx) * 3; + int smem_idx = sy * TILE_W_PAD + sx; + smem_r[smem_idx] = static_cast(input[gidx]); + smem_g[smem_idx] = static_cast(input[gidx + 1]); + smem_b[smem_idx] = static_cast(input[gidx + 2]); + } + + __syncthreads(); + + if (x >= width || y >= height) + return; + + const int lx = tx + RADIUS; + const int ly = ty + RADIUS; + const int lidx = ly * TILE_W_PAD + lx; + + const float center_r = smem_r[lidx]; + const float center_g = smem_g[lidx]; + const float center_b = smem_b[lidx]; + + float sum_r = 0.0f, sum_g = 0.0f, sum_b = 0.0f; + float wsum = 0.0f; + +// Opt5: single shared color weight per neighbor, computed from mean absolute channel diff. +// Reduces 3 LUT lookups + 3 wsum accumulators to 1 each. Tradeoff: MAE rises ~0.65→0.80, +// which is acceptable (< 1.0). OpenCV actually uses Euclidean distance across channels, +// so this is a different (simpler) approximation. 
+#pragma unroll + for (int dy = -RADIUS; dy <= RADIUS; ++dy) { +#pragma unroll + for (int dx = -RADIUS; dx <= RADIUS; ++dx) { + float spatial_weight = d_spatial_lut[(dy + RADIUS) * LUT_WIDTH + (dx + RADIUS)]; + if (spatial_weight == 0.0f) continue; // Opt C/F: skip circular-window corners + + int nidx = (ly + dy) * TILE_W_PAD + (lx + dx); + float nr = smem_r[nidx]; + float ng = smem_g[nidx]; + float nb = smem_b[nidx]; + + // Single color distance: mean absolute channel difference + int diff = static_cast( + (fabsf(nr - center_r) + fabsf(ng - center_g) + fabsf(nb - center_b)) * + (1.0f / 3.0f) + + 0.5f); + diff = min(diff, COLOR_LUT_SIZE - 1); + + float w = spatial_weight * d_color_lut[diff]; + + sum_r += nr * w; + sum_g += ng * w; + sum_b += nb * w; + wsum += w; + } + } + + // Opt6b: replace 3 divisions with 1 reciprocal + 3 multiplications + float rcp_wsum = __frcp_rn(wsum); + int out_idx = (y * width + x) * 3; + output[out_idx] = to_output(sum_r * rcp_wsum); + output[out_idx + 1] = to_output(sum_g * rcp_wsum); + output[out_idx + 2] = to_output(sum_b * rcp_wsum); +} + +// ============================================================================ +// Separable approximation for grayscale: horizontal + vertical passes +// O(2r) complexity instead of O(r^2) +// ============================================================================ + +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM_SEP) +k_bilateral_horizontal_gray(const InT* __restrict__ input, + TmpT* __restrict__ output, int width, int height) { + + constexpr int TILE_W = BLOCK_X + 2 * RADIUS; + constexpr int TILE_W_PAD = SMEM_PAD(TILE_W); // Opt G: odd stride + constexpr int LUT_WIDTH = 2 * RADIUS + 1; + + __shared__ float smem[BLOCK_Y][TILE_W_PAD]; + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int x = blockIdx.x * BLOCK_X + tx; + const int y = blockIdx.y * BLOCK_Y + ty; + + // Load row with halo + for (int i = tx; i < TILE_W; i += BLOCK_X) { + int 
gx = blockIdx.x * BLOCK_X + i - RADIUS; + gx = max(0, min(width - 1, gx)); + int gy = min(y, height - 1); + smem[ty][i] = static_cast(input[gy * width + gx]); + } + + __syncthreads(); + + if (x >= width || y >= height) + return; + + const int lx = tx + RADIUS; + const float center = smem[ty][lx]; + + float sum = 0.0f; + float weight_sum = 0.0f; + +#pragma unroll + for (int dx = -RADIUS; dx <= RADIUS; ++dx) { + float neighbor = smem[ty][lx + dx]; + float spatial_weight = d_spatial_lut[RADIUS * LUT_WIDTH + (dx + RADIUS)]; + + int diff = static_cast(fabsf(neighbor - center) + 0.5f); + diff = min(diff, COLOR_LUT_SIZE - 1); + float color_weight = d_color_lut[diff]; + + float w = spatial_weight * color_weight; + sum = fmaf(neighbor, w, sum); // Opt I: explicit FMA + weight_sum += w; + } + + output[y * width + x] = static_cast(sum / weight_sum); +} + +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM_SEP) +k_bilateral_vertical_gray(const TmpT* __restrict__ input, + OutT* __restrict__ output, int width, int height) { + + constexpr int TILE_H = BLOCK_Y + 2 * RADIUS; + constexpr int LUT_WIDTH = 2 * RADIUS + 1; + + __shared__ float smem[TILE_H][BLOCK_X]; + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int x = blockIdx.x * BLOCK_X + tx; + const int y = blockIdx.y * BLOCK_Y + ty; + + // Load column with halo (cast TmpT -> float for FP16 intermediate support) + for (int i = ty; i < TILE_H; i += BLOCK_Y) { + int gy = blockIdx.y * BLOCK_Y + i - RADIUS; + gy = max(0, min(height - 1, gy)); + int gx = min(x, width - 1); + smem[i][tx] = static_cast(input[gy * width + gx]); + } + + __syncthreads(); + + if (x >= width || y >= height) + return; + + const int ly = ty + RADIUS; + const float center = smem[ly][tx]; + + float sum = 0.0f; + float weight_sum = 0.0f; + +#pragma unroll + for (int dy = -RADIUS; dy <= RADIUS; ++dy) { + float neighbor = smem[ly + dy][tx]; + float spatial_weight = d_spatial_lut[(dy + RADIUS) * LUT_WIDTH + RADIUS]; 
+ + int diff = static_cast(fabsf(neighbor - center) + 0.5f); + diff = min(diff, COLOR_LUT_SIZE - 1); + float color_weight = d_color_lut[diff]; + + float w = spatial_weight * color_weight; + sum = fmaf(neighbor, w, sum); // Opt I: explicit FMA + weight_sum += w; + } + + output[y * width + x] = to_output(sum / weight_sum); +} + +// ============================================================================ +// Separable approximation for RGB: horizontal + vertical passes +// O(2r) complexity instead of O(r^2) +// ============================================================================ + +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM_SEP) +k_bilateral_horizontal_rgb(const InT* __restrict__ input, + TmpT* __restrict__ output, // intermediate (float or __half) + int width, int height) { + + constexpr int TILE_W = BLOCK_X + 2 * RADIUS; + constexpr int TILE_W_PAD = SMEM_PAD(TILE_W); // Opt G: odd stride + constexpr int LUT_WIDTH = 2 * RADIUS + 1; + + __shared__ float smem_r[BLOCK_Y][TILE_W_PAD]; + __shared__ float smem_g[BLOCK_Y][TILE_W_PAD]; + __shared__ float smem_b[BLOCK_Y][TILE_W_PAD]; + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int x = blockIdx.x * BLOCK_X + tx; + const int y = blockIdx.y * BLOCK_Y + ty; + + // Load row with halo + for (int i = tx; i < TILE_W; i += BLOCK_X) { + int gx = blockIdx.x * BLOCK_X + i - RADIUS; + gx = max(0, min(width - 1, gx)); + int gy = min(y, height - 1); + int gidx = (gy * width + gx) * 3; + smem_r[ty][i] = static_cast(input[gidx]); + smem_g[ty][i] = static_cast(input[gidx + 1]); + smem_b[ty][i] = static_cast(input[gidx + 2]); + } + + __syncthreads(); + + if (x >= width || y >= height) + return; + + const int lx = tx + RADIUS; + const float center_r = smem_r[ty][lx]; + const float center_g = smem_g[ty][lx]; + const float center_b = smem_b[ty][lx]; + + float sum_r = 0.0f, sum_g = 0.0f, sum_b = 0.0f; + float wsum_r = 0.0f, wsum_g = 0.0f, wsum_b = 0.0f; + +#pragma unroll + 
for (int dx = -RADIUS; dx <= RADIUS; ++dx) { + float nr = smem_r[ty][lx + dx]; + float ng = smem_g[ty][lx + dx]; + float nb = smem_b[ty][lx + dx]; + + // Use 1D spatial weight (center row of 2D LUT) + float spatial_weight = d_spatial_lut[RADIUS * LUT_WIDTH + (dx + RADIUS)]; + + int diff_r = static_cast(fabsf(nr - center_r) + 0.5f); + int diff_g = static_cast(fabsf(ng - center_g) + 0.5f); + int diff_b = static_cast(fabsf(nb - center_b) + 0.5f); + diff_r = min(diff_r, COLOR_LUT_SIZE - 1); + diff_g = min(diff_g, COLOR_LUT_SIZE - 1); + diff_b = min(diff_b, COLOR_LUT_SIZE - 1); + + float cw_r = d_color_lut[diff_r]; + float cw_g = d_color_lut[diff_g]; + float cw_b = d_color_lut[diff_b]; + + float w_r = spatial_weight * cw_r; + float w_g = spatial_weight * cw_g; + float w_b = spatial_weight * cw_b; + + // Opt I: explicit FMA for accumulation + sum_r = fmaf(nr, w_r, sum_r); + sum_g = fmaf(ng, w_g, sum_g); + sum_b = fmaf(nb, w_b, sum_b); + wsum_r += w_r; + wsum_g += w_g; + wsum_b += w_b; + } + + // Opt H: SoA intermediate output — write R/G/B to separate planes for coalesced + // vertical reads. Layout: plane_r[0..W*H-1], plane_g[W*H..2*W*H-1], plane_b[2*W*H..]. 
+ const int pixel_idx = y * width + x; + const int plane_size = width * height; + output[pixel_idx] = static_cast(sum_r / wsum_r); + output[pixel_idx + plane_size] = static_cast(sum_g / wsum_g); + output[pixel_idx + 2*plane_size] = static_cast(sum_b / wsum_b); +} + +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM_SEP) +k_bilateral_vertical_rgb(const TmpT* __restrict__ input, // intermediate SoA: R|G|B planes + OutT* __restrict__ output, int width, int height) { + + constexpr int TILE_H = BLOCK_Y + 2 * RADIUS; + constexpr int LUT_WIDTH = 2 * RADIUS + 1; + + __shared__ float smem_r[TILE_H][BLOCK_X]; + __shared__ float smem_g[TILE_H][BLOCK_X]; + __shared__ float smem_b[TILE_H][BLOCK_X]; + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int x = blockIdx.x * BLOCK_X + tx; + const int y = blockIdx.y * BLOCK_Y + ty; + + const int plane_size = width * height; + + // Opt H: Load from SoA planes — fully coalesced: consecutive threads read + // consecutive addresses within each plane. 
+ for (int i = ty; i < TILE_H; i += BLOCK_Y) { + int gy = blockIdx.y * BLOCK_Y + i - RADIUS; + gy = max(0, min(height - 1, gy)); + int gx = min(x, width - 1); + int pidx = gy * width + gx; + smem_r[i][tx] = static_cast(input[pidx]); + smem_g[i][tx] = static_cast(input[pidx + plane_size]); + smem_b[i][tx] = static_cast(input[pidx + 2*plane_size]); + } + + __syncthreads(); + + if (x >= width || y >= height) + return; + + const int ly = ty + RADIUS; + const float center_r = smem_r[ly][tx]; + const float center_g = smem_g[ly][tx]; + const float center_b = smem_b[ly][tx]; + + float sum_r = 0.0f, sum_g = 0.0f, sum_b = 0.0f; + float wsum_r = 0.0f, wsum_g = 0.0f, wsum_b = 0.0f; + +#pragma unroll + for (int dy = -RADIUS; dy <= RADIUS; ++dy) { + float nr = smem_r[ly + dy][tx]; + float ng = smem_g[ly + dy][tx]; + float nb = smem_b[ly + dy][tx]; + + // Use 1D spatial weight (center column of 2D LUT) + float spatial_weight = d_spatial_lut[(dy + RADIUS) * LUT_WIDTH + RADIUS]; + + // Opt I: Use fmaf() intrinsic to encourage FMA fusion. + // Each diff + 0.5f rounding and w*neighbor accumulation benefits from FMA. 
+ int diff_r = static_cast(fabsf(nr - center_r) + 0.5f); + int diff_g = static_cast(fabsf(ng - center_g) + 0.5f); + int diff_b = static_cast(fabsf(nb - center_b) + 0.5f); + diff_r = min(diff_r, COLOR_LUT_SIZE - 1); + diff_g = min(diff_g, COLOR_LUT_SIZE - 1); + diff_b = min(diff_b, COLOR_LUT_SIZE - 1); + + float cw_r = d_color_lut[diff_r]; + float cw_g = d_color_lut[diff_g]; + float cw_b = d_color_lut[diff_b]; + + float w_r = spatial_weight * cw_r; + float w_g = spatial_weight * cw_g; + float w_b = spatial_weight * cw_b; + + // Opt I: explicit FMA for accumulation + sum_r = fmaf(nr, w_r, sum_r); + sum_g = fmaf(ng, w_g, sum_g); + sum_b = fmaf(nb, w_b, sum_b); + wsum_r += w_r; + wsum_g += w_g; + wsum_b += w_b; + } + + int out_idx = (y * width + x) * 3; + output[out_idx] = to_output(sum_r / wsum_r); + output[out_idx + 1] = to_output(sum_g / wsum_g); + output[out_idx + 2] = to_output(sum_b / wsum_b); +} + +// ============================================================================ +// Opt N2: Full FP16 compute separable kernels +// smem uses __half (halved bandwidth), inner loop uses __half arithmetic +// (2x throughput on sm_110 Blackwell). LUT lookups stay float, converted to +// __half before FMA accumulation. 
+// ============================================================================ + +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM_SEP) +k_bilateral_horizontal_gray_fp16(const uint8_t* __restrict__ input, + __half* __restrict__ output, int width, int height) { + + constexpr int TILE_W = BLOCK_X + 2 * RADIUS; + constexpr int TILE_W_PAD = SMEM_PAD(TILE_W); + constexpr int LUT_WIDTH = 2 * RADIUS + 1; + + __shared__ __half smem[BLOCK_Y][TILE_W_PAD]; + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int x = blockIdx.x * BLOCK_X + tx; + const int y = blockIdx.y * BLOCK_Y + ty; + + for (int i = tx; i < TILE_W; i += BLOCK_X) { + int gx = blockIdx.x * BLOCK_X + i - RADIUS; + gx = max(0, min(width - 1, gx)); + int gy = min(y, height - 1); + smem[ty][i] = __float2half(static_cast(input[gy * width + gx])); + } + + __syncthreads(); + + if (x >= width || y >= height) + return; + + const int lx = tx + RADIUS; + const __half center = smem[ty][lx]; + + __half sum = __float2half(0.0f); + __half weight_sum = __float2half(0.0f); + +#pragma unroll + for (int dx = -RADIUS; dx <= RADIUS; ++dx) { + __half neighbor = smem[ty][lx + dx]; + float spatial_weight = d_spatial_lut[RADIUS * LUT_WIDTH + (dx + RADIUS)]; + + // Color distance in float for LUT indexing precision + float fn = __half2float(neighbor); + float fc = __half2float(center); + int diff = static_cast(fabsf(fn - fc) + 0.5f); + diff = min(diff, COLOR_LUT_SIZE - 1); + float color_weight = d_color_lut[diff]; + + __half w = __float2half(spatial_weight * color_weight); + sum = __hfma(neighbor, w, sum); + weight_sum = __hadd(weight_sum, w); + } + + output[y * width + x] = __hdiv(sum, weight_sum); +} + +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM_SEP) +k_bilateral_vertical_gray_fp16(const __half* __restrict__ input, + uint8_t* __restrict__ output, int width, int height) { + + constexpr int TILE_H = BLOCK_Y + 2 * RADIUS; + constexpr int LUT_WIDTH 
= 2 * RADIUS + 1; + + __shared__ __half smem[TILE_H][BLOCK_X]; + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int x = blockIdx.x * BLOCK_X + tx; + const int y = blockIdx.y * BLOCK_Y + ty; + + for (int i = ty; i < TILE_H; i += BLOCK_Y) { + int gy = blockIdx.y * BLOCK_Y + i - RADIUS; + gy = max(0, min(height - 1, gy)); + int gx = min(x, width - 1); + smem[i][tx] = input[gy * width + gx]; + } + + __syncthreads(); + + if (x >= width || y >= height) + return; + + const int ly = ty + RADIUS; + const __half center = smem[ly][tx]; + + __half sum = __float2half(0.0f); + __half weight_sum = __float2half(0.0f); + +#pragma unroll + for (int dy = -RADIUS; dy <= RADIUS; ++dy) { + __half neighbor = smem[ly + dy][tx]; + float spatial_weight = d_spatial_lut[(dy + RADIUS) * LUT_WIDTH + RADIUS]; + + float fn = __half2float(neighbor); + float fc = __half2float(center); + int diff = static_cast(fabsf(fn - fc) + 0.5f); + diff = min(diff, COLOR_LUT_SIZE - 1); + float color_weight = d_color_lut[diff]; + + __half w = __float2half(spatial_weight * color_weight); + sum = __hfma(neighbor, w, sum); + weight_sum = __hadd(weight_sum, w); + } + + float result = __half2float(sum) / __half2float(weight_sum); + output[y * width + x] = static_cast(fminf(255.0f, fmaxf(0.0f, result))); +} + +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM_SEP) +k_bilateral_horizontal_rgb_fp16(const uint8_t* __restrict__ input, + __half* __restrict__ output, int width, int height) { + + constexpr int TILE_W = BLOCK_X + 2 * RADIUS; + constexpr int TILE_W_PAD = SMEM_PAD(TILE_W); + constexpr int LUT_WIDTH = 2 * RADIUS + 1; + + __shared__ __half smem_r[BLOCK_Y][TILE_W_PAD]; + __shared__ __half smem_g[BLOCK_Y][TILE_W_PAD]; + __shared__ __half smem_b[BLOCK_Y][TILE_W_PAD]; + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int x = blockIdx.x * BLOCK_X + tx; + const int y = blockIdx.y * BLOCK_Y + ty; + + for (int i = tx; i < TILE_W; i += BLOCK_X) { + int 
gx = blockIdx.x * BLOCK_X + i - RADIUS; + gx = max(0, min(width - 1, gx)); + int gy = min(y, height - 1); + int gidx = (gy * width + gx) * 3; + smem_r[ty][i] = __float2half(static_cast(input[gidx])); + smem_g[ty][i] = __float2half(static_cast(input[gidx + 1])); + smem_b[ty][i] = __float2half(static_cast(input[gidx + 2])); + } + + __syncthreads(); + + if (x >= width || y >= height) + return; + + const int lx = tx + RADIUS; + const __half center_r = smem_r[ty][lx]; + const __half center_g = smem_g[ty][lx]; + const __half center_b = smem_b[ty][lx]; + + __half sum_r = __float2half(0.0f), sum_g = __float2half(0.0f), sum_b = __float2half(0.0f); + __half wsum_r = __float2half(0.0f), wsum_g = __float2half(0.0f), wsum_b = __float2half(0.0f); + +#pragma unroll + for (int dx = -RADIUS; dx <= RADIUS; ++dx) { + __half nr = smem_r[ty][lx + dx]; + __half ng = smem_g[ty][lx + dx]; + __half nb = smem_b[ty][lx + dx]; + + float spatial_weight = d_spatial_lut[RADIUS * LUT_WIDTH + (dx + RADIUS)]; + + // Color distance in float for LUT precision + float fr = __half2float(nr), fg = __half2float(ng), fb = __half2float(nb); + float cr = __half2float(center_r), cg = __half2float(center_g), cb = __half2float(center_b); + int diff_r = static_cast(fabsf(fr - cr) + 0.5f); + int diff_g = static_cast(fabsf(fg - cg) + 0.5f); + int diff_b = static_cast(fabsf(fb - cb) + 0.5f); + diff_r = min(diff_r, COLOR_LUT_SIZE - 1); + diff_g = min(diff_g, COLOR_LUT_SIZE - 1); + diff_b = min(diff_b, COLOR_LUT_SIZE - 1); + + __half w_r = __float2half(spatial_weight * d_color_lut[diff_r]); + __half w_g = __float2half(spatial_weight * d_color_lut[diff_g]); + __half w_b = __float2half(spatial_weight * d_color_lut[diff_b]); + + sum_r = __hfma(nr, w_r, sum_r); + sum_g = __hfma(ng, w_g, sum_g); + sum_b = __hfma(nb, w_b, sum_b); + wsum_r = __hadd(wsum_r, w_r); + wsum_g = __hadd(wsum_g, w_g); + wsum_b = __hadd(wsum_b, w_b); + } + + // SoA output + const int pixel_idx = y * width + x; + const int plane_size = width * 
height; + output[pixel_idx] = __hdiv(sum_r, wsum_r); + output[pixel_idx + plane_size] = __hdiv(sum_g, wsum_g); + output[pixel_idx + 2*plane_size] = __hdiv(sum_b, wsum_b); +} + +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM_SEP) +k_bilateral_vertical_rgb_fp16(const __half* __restrict__ input, + uint8_t* __restrict__ output, int width, int height) { + + constexpr int TILE_H = BLOCK_Y + 2 * RADIUS; + constexpr int LUT_WIDTH = 2 * RADIUS + 1; + + __shared__ __half smem_r[TILE_H][BLOCK_X]; + __shared__ __half smem_g[TILE_H][BLOCK_X]; + __shared__ __half smem_b[TILE_H][BLOCK_X]; + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int x = blockIdx.x * BLOCK_X + tx; + const int y = blockIdx.y * BLOCK_Y + ty; + + const int plane_size = width * height; + + for (int i = ty; i < TILE_H; i += BLOCK_Y) { + int gy = blockIdx.y * BLOCK_Y + i - RADIUS; + gy = max(0, min(height - 1, gy)); + int gx = min(x, width - 1); + int pidx = gy * width + gx; + smem_r[i][tx] = input[pidx]; + smem_g[i][tx] = input[pidx + plane_size]; + smem_b[i][tx] = input[pidx + 2*plane_size]; + } + + __syncthreads(); + + if (x >= width || y >= height) + return; + + const int ly = ty + RADIUS; + const __half center_r = smem_r[ly][tx]; + const __half center_g = smem_g[ly][tx]; + const __half center_b = smem_b[ly][tx]; + + __half sum_r = __float2half(0.0f), sum_g = __float2half(0.0f), sum_b = __float2half(0.0f); + __half wsum_r = __float2half(0.0f), wsum_g = __float2half(0.0f), wsum_b = __float2half(0.0f); + +#pragma unroll + for (int dy = -RADIUS; dy <= RADIUS; ++dy) { + __half nr = smem_r[ly + dy][tx]; + __half ng = smem_g[ly + dy][tx]; + __half nb = smem_b[ly + dy][tx]; + + float spatial_weight = d_spatial_lut[(dy + RADIUS) * LUT_WIDTH + RADIUS]; + + float fr = __half2float(nr), fg = __half2float(ng), fb = __half2float(nb); + float cr = __half2float(center_r), cg = __half2float(center_g), cb = __half2float(center_b); + int diff_r = static_cast(fabsf(fr - 
cr) + 0.5f); + int diff_g = static_cast(fabsf(fg - cg) + 0.5f); + int diff_b = static_cast(fabsf(fb - cb) + 0.5f); + diff_r = min(diff_r, COLOR_LUT_SIZE - 1); + diff_g = min(diff_g, COLOR_LUT_SIZE - 1); + diff_b = min(diff_b, COLOR_LUT_SIZE - 1); + + __half w_r = __float2half(spatial_weight * d_color_lut[diff_r]); + __half w_g = __float2half(spatial_weight * d_color_lut[diff_g]); + __half w_b = __float2half(spatial_weight * d_color_lut[diff_b]); + + sum_r = __hfma(nr, w_r, sum_r); + sum_g = __hfma(ng, w_g, sum_g); + sum_b = __hfma(nb, w_b, sum_b); + wsum_r = __hadd(wsum_r, w_r); + wsum_g = __hadd(wsum_g, w_g); + wsum_b = __hadd(wsum_b, w_b); + } + + // Final division in float for output precision + int out_idx = (y * width + x) * 3; + float rr = __half2float(sum_r) / __half2float(wsum_r); + float rg = __half2float(sum_g) / __half2float(wsum_g); + float rb = __half2float(sum_b) / __half2float(wsum_b); + output[out_idx] = static_cast(fminf(255.0f, fmaxf(0.0f, rr))); + output[out_idx + 1] = static_cast(fminf(255.0f, fmaxf(0.0f, rg))); + output[out_idx + 2] = static_cast(fminf(255.0f, fmaxf(0.0f, rb))); +} + +// ============================================================================ +// Opt M: Fused H+V separable kernel — single kernel, zero intermediate buffer. +// Phase 1: load raw pixels with 2D halo → smem_raw, do horizontal filtering +// for ALL rows (BLOCK_Y + 2*RADIUS) → smem_h. +// Phase 2: __syncthreads(), do vertical filtering from smem_h → global output. +// Saves: 1 kernel launch + entire intermediate buffer read/write. 
+// ============================================================================ + +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM_SEP) +k_bilateral_fused_gray(const uint8_t* __restrict__ input, + uint8_t* __restrict__ output, int width, int height) { + + constexpr int TILE_W = BLOCK_X + 2 * RADIUS; // raw data columns + constexpr int FUSED_H = BLOCK_Y + 2 * RADIUS; // rows to horizontal-filter + constexpr int TILE_W_PAD = SMEM_PAD(TILE_W); + constexpr int RAW_SIZE = FUSED_H * TILE_W; // total raw pixels to load + constexpr int LUT_WIDTH = 2 * RADIUS + 1; + + // smem_raw: original pixel data with 2D halo for horizontal filtering + __shared__ float smem_raw[FUSED_H][TILE_W_PAD]; + // smem_h: horizontal filtering results (only center BLOCK_X columns, all FUSED_H rows) + __shared__ float smem_h[FUSED_H][BLOCK_X]; + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int thread_id = ty * BLOCK_X + tx; + const int x = blockIdx.x * BLOCK_X + tx; + const int y = blockIdx.y * BLOCK_Y + ty; + + // ---- Phase 0: Collaborative load of raw data with 2D halo ---- + for (int i = thread_id; i < RAW_SIZE; i += THREADS_PER_BLOCK) { + int sy = i / TILE_W; + int sx = i % TILE_W; + int gx = blockIdx.x * BLOCK_X + sx - RADIUS; + int gy = blockIdx.y * BLOCK_Y + sy - RADIUS; + gx = max(0, min(width - 1, gx)); + gy = max(0, min(height - 1, gy)); + smem_raw[sy][sx] = static_cast(input[gy * width + gx]); + } + + __syncthreads(); + + // ---- Phase 1: Horizontal filtering for ALL FUSED_H rows ---- + // Each thread handles multiple rows to cover all FUSED_H rows + for (int row = thread_id; row < FUSED_H * BLOCK_X; row += THREADS_PER_BLOCK) { + int hy = row / BLOCK_X; // which of the FUSED_H rows + int hx = row % BLOCK_X; // which column (0..BLOCK_X-1) + + int lx = hx + RADIUS; // center position in smem_raw + float center = smem_raw[hy][lx]; + float sum = 0.0f; + float wsum = 0.0f; + +#pragma unroll + for (int dx = -RADIUS; dx <= RADIUS; ++dx) 
{ + float neighbor = smem_raw[hy][lx + dx]; + float sw = d_spatial_lut[RADIUS * LUT_WIDTH + (dx + RADIUS)]; + int diff = static_cast(fabsf(neighbor - center) + 0.5f); + diff = min(diff, COLOR_LUT_SIZE - 1); + float cw = d_color_lut[diff]; + float w = sw * cw; + sum = fmaf(neighbor, w, sum); + wsum += w; + } + smem_h[hy][hx] = sum / wsum; + } + + __syncthreads(); + + // ---- Phase 2: Vertical filtering from smem_h → global output ---- + if (x >= width || y >= height) + return; + + const int ly = ty + RADIUS; // center row in smem_h + float center = smem_h[ly][tx]; + + float sum = 0.0f; + float wsum = 0.0f; + +#pragma unroll + for (int dy = -RADIUS; dy <= RADIUS; ++dy) { + float neighbor = smem_h[ly + dy][tx]; + float sw = d_spatial_lut[(dy + RADIUS) * LUT_WIDTH + RADIUS]; + int diff = static_cast(fabsf(neighbor - center) + 0.5f); + diff = min(diff, COLOR_LUT_SIZE - 1); + float cw = d_color_lut[diff]; + float w = sw * cw; + sum = fmaf(neighbor, w, sum); + wsum += w; + } + + output[y * width + x] = static_cast(fminf(255.0f, fmaxf(0.0f, sum / wsum))); +} + +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM_SEP) +k_bilateral_fused_rgb(const uint8_t* __restrict__ input, + uint8_t* __restrict__ output, int width, int height) { + + constexpr int TILE_W = BLOCK_X + 2 * RADIUS; + constexpr int FUSED_H = BLOCK_Y + 2 * RADIUS; + constexpr int TILE_W_PAD = SMEM_PAD(TILE_W); + constexpr int RAW_SIZE = FUSED_H * TILE_W; + constexpr int LUT_WIDTH = 2 * RADIUS + 1; + + __shared__ float smem_raw_r[FUSED_H][TILE_W_PAD]; + __shared__ float smem_raw_g[FUSED_H][TILE_W_PAD]; + __shared__ float smem_raw_b[FUSED_H][TILE_W_PAD]; + __shared__ float smem_h_r[FUSED_H][BLOCK_X]; + __shared__ float smem_h_g[FUSED_H][BLOCK_X]; + __shared__ float smem_h_b[FUSED_H][BLOCK_X]; + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int thread_id = ty * BLOCK_X + tx; + const int x = blockIdx.x * BLOCK_X + tx; + const int y = blockIdx.y * BLOCK_Y + ty; + + 
// ---- Phase 0: Collaborative load ---- + for (int i = thread_id; i < RAW_SIZE; i += THREADS_PER_BLOCK) { + int sy = i / TILE_W; + int sx = i % TILE_W; + int gx = blockIdx.x * BLOCK_X + sx - RADIUS; + int gy = blockIdx.y * BLOCK_Y + sy - RADIUS; + gx = max(0, min(width - 1, gx)); + gy = max(0, min(height - 1, gy)); + int gidx = (gy * width + gx) * 3; + smem_raw_r[sy][sx] = static_cast(input[gidx]); + smem_raw_g[sy][sx] = static_cast(input[gidx + 1]); + smem_raw_b[sy][sx] = static_cast(input[gidx + 2]); + } + + __syncthreads(); + + // ---- Phase 1: Horizontal filtering for ALL FUSED_H rows ---- + for (int row = thread_id; row < FUSED_H * BLOCK_X; row += THREADS_PER_BLOCK) { + int hy = row / BLOCK_X; + int hx = row % BLOCK_X; + int lx = hx + RADIUS; + + float cr = smem_raw_r[hy][lx]; + float cg = smem_raw_g[hy][lx]; + float cb = smem_raw_b[hy][lx]; + + float sr = 0.0f, sg = 0.0f, sb = 0.0f; + float wr = 0.0f, wg = 0.0f, wb = 0.0f; + +#pragma unroll + for (int dx = -RADIUS; dx <= RADIUS; ++dx) { + float nr = smem_raw_r[hy][lx + dx]; + float ng = smem_raw_g[hy][lx + dx]; + float nb = smem_raw_b[hy][lx + dx]; + + float sw = d_spatial_lut[RADIUS * LUT_WIDTH + (dx + RADIUS)]; + + int diff_r = static_cast(fabsf(nr - cr) + 0.5f); + int diff_g = static_cast(fabsf(ng - cg) + 0.5f); + int diff_b = static_cast(fabsf(nb - cb) + 0.5f); + diff_r = min(diff_r, COLOR_LUT_SIZE - 1); + diff_g = min(diff_g, COLOR_LUT_SIZE - 1); + diff_b = min(diff_b, COLOR_LUT_SIZE - 1); + + float w_r = sw * d_color_lut[diff_r]; + float w_g = sw * d_color_lut[diff_g]; + float w_b = sw * d_color_lut[diff_b]; + + sr = fmaf(nr, w_r, sr); + sg = fmaf(ng, w_g, sg); + sb = fmaf(nb, w_b, sb); + wr += w_r; + wg += w_g; + wb += w_b; + } + smem_h_r[hy][hx] = sr / wr; + smem_h_g[hy][hx] = sg / wg; + smem_h_b[hy][hx] = sb / wb; + } + + __syncthreads(); + + // ---- Phase 2: Vertical filtering from smem_h → global output ---- + if (x >= width || y >= height) + return; + + const int ly = ty + RADIUS; + float cr = 
smem_h_r[ly][tx]; + float cg = smem_h_g[ly][tx]; + float cb = smem_h_b[ly][tx]; + + float sr = 0.0f, sg = 0.0f, sb = 0.0f; + float wr = 0.0f, wg = 0.0f, wb = 0.0f; + +#pragma unroll + for (int dy = -RADIUS; dy <= RADIUS; ++dy) { + float nr = smem_h_r[ly + dy][tx]; + float ng = smem_h_g[ly + dy][tx]; + float nb = smem_h_b[ly + dy][tx]; + + float sw = d_spatial_lut[(dy + RADIUS) * LUT_WIDTH + RADIUS]; + + int diff_r = static_cast(fabsf(nr - cr) + 0.5f); + int diff_g = static_cast(fabsf(ng - cg) + 0.5f); + int diff_b = static_cast(fabsf(nb - cb) + 0.5f); + diff_r = min(diff_r, COLOR_LUT_SIZE - 1); + diff_g = min(diff_g, COLOR_LUT_SIZE - 1); + diff_b = min(diff_b, COLOR_LUT_SIZE - 1); + + float w_r = sw * d_color_lut[diff_r]; + float w_g = sw * d_color_lut[diff_g]; + float w_b = sw * d_color_lut[diff_b]; + + sr = fmaf(nr, w_r, sr); + sg = fmaf(ng, w_g, sg); + sb = fmaf(nb, w_b, sb); + wr += w_r; + wg += w_g; + wb += w_b; + } + + int out_idx = (y * width + x) * 3; + output[out_idx] = static_cast(fminf(255.0f, fmaxf(0.0f, sr / wr))); + output[out_idx + 1] = static_cast(fminf(255.0f, fmaxf(0.0f, sg / wg))); + output[out_idx + 2] = static_cast(fminf(255.0f, fmaxf(0.0f, sb / wb))); +} + +// ============================================================================ +// Runtime-radius version (fallback) +// ============================================================================ + +// Grayscale runtime version +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM) +k_bilateral_filter_shared(const InT* __restrict__ input, OutT* __restrict__ output, + int width, int height, int radius) { + + extern __shared__ float smem[]; + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int x = blockIdx.x * BLOCK_X + tx; + const int y = blockIdx.y * BLOCK_Y + ty; + + const int tile_w = BLOCK_X + 2 * radius; + const int tile_h = BLOCK_Y + 2 * radius; + const int smem_size = tile_w * tile_h; + const int thread_id = ty * BLOCK_X + tx; + const 
int threads_per_block = BLOCK_X * BLOCK_Y; + + // Collaborative loading + for (int i = thread_id; i < smem_size; i += threads_per_block) { + int sy = i / tile_w; + int sx = i % tile_w; + int gx = blockIdx.x * BLOCK_X + sx - radius; + int gy = blockIdx.y * BLOCK_Y + sy - radius; + gx = max(0, min(width - 1, gx)); + gy = max(0, min(height - 1, gy)); + smem[i] = static_cast(input[gy * width + gx]); + } + + __syncthreads(); + + if (x >= width || y >= height) + return; + + const int lx = tx + radius; + const int ly = ty + radius; + const float center = smem[ly * tile_w + lx]; + + float sum = 0.0f; + float weight_sum = 0.0f; + const int lut_width = 2 * radius + 1; + +#pragma unroll 4 + for (int dy = -radius; dy <= radius; ++dy) { +#pragma unroll 4 + for (int dx = -radius; dx <= radius; ++dx) { + float spatial_weight = d_spatial_lut[(dy + radius) * lut_width + (dx + radius)]; + if (spatial_weight == 0.0f) continue; + + float neighbor = smem[(ly + dy) * tile_w + (lx + dx)]; + + int diff = static_cast(fabsf(neighbor - center) + 0.5f); + diff = min(diff, COLOR_LUT_SIZE - 1); + float color_weight = d_color_lut[diff]; + + float w = spatial_weight * color_weight; + sum += neighbor * w; + weight_sum += w; + } + } + + output[y * width + x] = to_output(sum / weight_sum); +} + +// RGB runtime version +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM) +k_bilateral_filter_rgb_shared(const InT* __restrict__ input, + OutT* __restrict__ output, int width, int height, + int radius) { + + extern __shared__ float smem[]; + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int x = blockIdx.x * BLOCK_X + tx; + const int y = blockIdx.y * BLOCK_Y + ty; + + const int tile_w = BLOCK_X + 2 * radius; + const int tile_h = BLOCK_Y + 2 * radius; + const int tile_size = tile_w * tile_h; + + float* smem_r = smem; + float* smem_g = smem + tile_size; + float* smem_b = smem + 2 * tile_size; + + const int thread_id = ty * BLOCK_X + tx; + const int 
threads_per_block = BLOCK_X * BLOCK_Y; + + for (int i = thread_id; i < tile_size; i += threads_per_block) { + int sy = i / tile_w; + int sx = i % tile_w; + int gx = blockIdx.x * BLOCK_X + sx - radius; + int gy = blockIdx.y * BLOCK_Y + sy - radius; + gx = max(0, min(width - 1, gx)); + gy = max(0, min(height - 1, gy)); + int gidx = (gy * width + gx) * 3; + smem_r[i] = static_cast(input[gidx]); + smem_g[i] = static_cast(input[gidx + 1]); + smem_b[i] = static_cast(input[gidx + 2]); + } + + __syncthreads(); + + if (x >= width || y >= height) + return; + + const int lx = tx + radius; + const int ly = ty + radius; + const int lidx = ly * tile_w + lx; + + const float center_r = smem_r[lidx]; + const float center_g = smem_g[lidx]; + const float center_b = smem_b[lidx]; + + float sum_r = 0.0f, sum_g = 0.0f, sum_b = 0.0f; + float wsum_r = 0.0f, wsum_g = 0.0f, wsum_b = 0.0f; + const int lut_width = 2 * radius + 1; + +#pragma unroll 4 + for (int dy = -radius; dy <= radius; ++dy) { +#pragma unroll 4 + for (int dx = -radius; dx <= radius; ++dx) { + float spatial_weight = d_spatial_lut[(dy + radius) * lut_width + (dx + radius)]; + if (spatial_weight == 0.0f) continue; + + int nidx = (ly + dy) * tile_w + (lx + dx); + float nr = smem_r[nidx]; + float ng = smem_g[nidx]; + float nb = smem_b[nidx]; + + int diff_r = static_cast(fabsf(nr - center_r) + 0.5f); + int diff_g = static_cast(fabsf(ng - center_g) + 0.5f); + int diff_b = static_cast(fabsf(nb - center_b) + 0.5f); + diff_r = min(diff_r, COLOR_LUT_SIZE - 1); + diff_g = min(diff_g, COLOR_LUT_SIZE - 1); + diff_b = min(diff_b, COLOR_LUT_SIZE - 1); + + float cw_r = d_color_lut[diff_r]; + float cw_g = d_color_lut[diff_g]; + float cw_b = d_color_lut[diff_b]; + + float w_r = spatial_weight * cw_r; + float w_g = spatial_weight * cw_g; + float w_b = spatial_weight * cw_b; + + sum_r += nr * w_r; + sum_g += ng * w_g; + sum_b += nb * w_b; + wsum_r += w_r; + wsum_g += w_g; + wsum_b += w_b; + } + } + + int out_idx = (y * width + x) * 3; + 
output[out_idx] = to_output(sum_r / wsum_r); + output[out_idx + 1] = to_output(sum_g / wsum_g); + output[out_idx + 2] = to_output(sum_b / wsum_b); +} + +// ============================================================================ +// LUT initialization +// ============================================================================ + +static void init_spatial_lut(int radius, float sigma_spatial) { + float coeff = -0.5f / (sigma_spatial * sigma_spatial); + int w = 2 * radius + 1; + std::vector lut(LUT_SIZE, 0.0f); + + const int r2 = radius * radius; + for (int dy = -radius; dy <= radius; ++dy) { + for (int dx = -radius; dx <= radius; ++dx) { + // Opt C/F: circular window - zero out corners outside inscribed circle. + // For r=5: 121 positions → 81 inside, 40 outside (33%). + // TEMPLATE kernels with #pragma unroll: compiler eliminates dead iterations + // at compile time (dead code elimination), achieving +13%~+65% speedup. + if (dx * dx + dy * dy > r2) { + lut[(dy + radius) * w + (dx + radius)] = 0.0f; + } else { + lut[(dy + radius) * w + (dx + radius)] = expf((dx * dx + dy * dy) * coeff); + } + } + } + + CUDA_CHECK(cudaMemcpyToSymbol(d_spatial_lut, lut.data(), w * w * sizeof(float))); +} + +static void init_color_lut(float sigma_color) { + float coeff = -0.5f / (sigma_color * sigma_color); + std::vector lut(COLOR_LUT_SIZE); + + for (int i = 0; i < COLOR_LUT_SIZE; ++i) { + // i ∈ [0,255], i*i ∈ [0,65025]: no overflow for int + lut[i] = expf(static_cast(i * i) * coeff); + } + + CUDA_CHECK(cudaMemcpyToSymbol(d_color_lut, lut.data(), COLOR_LUT_SIZE * sizeof(float))); +} + +// Opt B (disabled): cudaFuncSetCacheConfig was tested and showed no benefit on sm_89. +// Moreover, calling it on many template instantiations triggers a JIT crash in the +// CUDA 13.1 PTX compiler under WSL2. The function has been removed entirely. 
+ +// Opt1: LUT cache - only re-upload when params change +static void ensure_luts(int radius, float sigma_spatial, float sigma_color) { + static int cached_radius = -1; + static float cached_sigma_s = -1.f; + static float cached_sigma_c = -1.f; + + if (radius == cached_radius && sigma_spatial == cached_sigma_s && + sigma_color == cached_sigma_c) { + return; + } + + init_spatial_lut(radius, sigma_spatial); + init_color_lut(sigma_color); + + cached_radius = radius; + cached_sigma_s = sigma_spatial; + cached_sigma_c = sigma_color; +} + +// ============================================================================ +// Kernel dispatch with template specialization +// ============================================================================ + +enum class FilterMode { + STANDARD, // Shared memory + LUT (runtime radius) + TEMPLATE, // Template-based (compile-time radius) + SEPARABLE, // Separable approximation (float intermediate) + SEPARABLE_FP16, // Separable approximation (__half intermediate, half bandwidth) + ADAPTIVE, // Adaptive radius: per-pixel radius from local gradient + FUSED // Opt M: Fused H+V single kernel, zero intermediate buffer +}; + +static FilterMode g_filter_mode = FilterMode::TEMPLATE; + +void set_bilateral_filter_mode(int mode) { + g_filter_mode = static_cast(mode); +} + +static FilterMode get_filter_mode() { + static bool initialized = false; + if (!initialized) { + const char* env = getenv("BILATERAL_MODE"); + if (env) { + int mode = atoi(env); + if (mode >= 0 && mode <= 5) { + g_filter_mode = static_cast(mode); + } + } + initialized = true; + } + return g_filter_mode; +} + +// Float-path template launchers removed: the float bilateral_filter_cuda() now uses +// only runtime-radius shared-memory kernels. All template specializations are +// reserved for the uint8 direct path (dispatch_u8_kernel). 
+ +// ============================================================================ +// Opt2: uint8 I/O launchers - kernel reads/writes uint8 directly from GPU +// ============================================================================ + +// Grayscale template uint8 launcher +template +static void launch_u8_gray(const uint8_t* d_in, uint8_t* d_out, int w, int h, cudaStream_t s) { + dim3 block(BLOCK_X, BLOCK_Y); + dim3 grid((w + BLOCK_X - 1) / BLOCK_X, (h + BLOCK_Y - 1) / BLOCK_Y); + k_bilateral_filter_gray_template + <<>>(d_in, d_out, w, h); +} + +// RGB template uint8 launcher +template +static void launch_u8_rgb(const uint8_t* d_in, uint8_t* d_out, int w, int h, cudaStream_t s) { + dim3 block(BLOCK_X, BLOCK_Y); + dim3 grid((w + BLOCK_X - 1) / BLOCK_X, (h + BLOCK_Y - 1) / BLOCK_Y); + k_bilateral_filter_rgb_template + <<>>(d_in, d_out, w, h); +} + +// Grayscale separable uint8 launcher (uint8→float→uint8 via d_temp) +template +static void launch_u8_sep_gray(const uint8_t* d_in, uint8_t* d_out, float* d_temp, int w, int h, + cudaStream_t s) { + dim3 block(BLOCK_X, BLOCK_Y); + dim3 grid((w + BLOCK_X - 1) / BLOCK_X, (h + BLOCK_Y - 1) / BLOCK_Y); + k_bilateral_horizontal_gray<<>>(d_in, d_temp, w, h); + k_bilateral_vertical_gray<<>>(d_temp, d_out, w, h); +} + +// RGB separable uint8 launcher (uint8→float→uint8 via d_temp) +template +static void launch_u8_sep_rgb(const uint8_t* d_in, uint8_t* d_out, float* d_temp, int w, int h, + cudaStream_t s) { + dim3 block(BLOCK_X, BLOCK_Y); + dim3 grid((w + BLOCK_X - 1) / BLOCK_X, (h + BLOCK_Y - 1) / BLOCK_Y); + k_bilateral_horizontal_rgb<<>>(d_in, d_temp, w, h); + k_bilateral_vertical_rgb<<>>(d_temp, d_out, w, h); +} + +// Opt N: FP16 intermediate separable launchers — half-bandwidth intermediate buffer. +// H kernel: uint8 → smem(float) → compute(float) → output(__half) +// V kernel: input(__half) → smem(float) → compute(float) → output(uint8) +// Compute stays FP32 for precision; only the H↔V transfer uses FP16. 
+ +template +static void launch_u8_sep_gray_fp16(const uint8_t* d_in, uint8_t* d_out, __half* d_temp, + int w, int h, cudaStream_t s) { + dim3 block(BLOCK_X, BLOCK_Y); + dim3 grid((w + BLOCK_X - 1) / BLOCK_X, (h + BLOCK_Y - 1) / BLOCK_Y); + k_bilateral_horizontal_gray<<>>(d_in, d_temp, w, h); + k_bilateral_vertical_gray<<>>(d_temp, d_out, w, h); +} + +template +static void launch_u8_sep_rgb_fp16(const uint8_t* d_in, uint8_t* d_out, __half* d_temp, + int w, int h, cudaStream_t s) { + dim3 block(BLOCK_X, BLOCK_Y); + dim3 grid((w + BLOCK_X - 1) / BLOCK_X, (h + BLOCK_Y - 1) / BLOCK_Y); + k_bilateral_horizontal_rgb<<>>(d_in, d_temp, w, h); + k_bilateral_vertical_rgb<<>>(d_temp, d_out, w, h); +} + +// Opt N2: Full FP16 compute launchers — smem + arithmetic in __half +template +static void launch_u8_sep_gray_fp16_compute(const uint8_t* d_in, uint8_t* d_out, __half* d_temp, + int w, int h, cudaStream_t s) { + dim3 block(BLOCK_X, BLOCK_Y); + dim3 grid((w + BLOCK_X - 1) / BLOCK_X, (h + BLOCK_Y - 1) / BLOCK_Y); + k_bilateral_horizontal_gray_fp16<<>>(d_in, d_temp, w, h); + k_bilateral_vertical_gray_fp16<<>>(d_temp, d_out, w, h); +} + +template +static void launch_u8_sep_rgb_fp16_compute(const uint8_t* d_in, uint8_t* d_out, __half* d_temp, + int w, int h, cudaStream_t s) { + dim3 block(BLOCK_X, BLOCK_Y); + dim3 grid((w + BLOCK_X - 1) / BLOCK_X, (h + BLOCK_Y - 1) / BLOCK_Y); + k_bilateral_horizontal_rgb_fp16<<>>(d_in, d_temp, w, h); + k_bilateral_vertical_rgb_fp16<<>>(d_temp, d_out, w, h); +} + +// Opt M: Fused H+V launchers — single kernel, no intermediate buffer +template +static void launch_u8_fused_gray(const uint8_t* d_in, uint8_t* d_out, int w, int h, + cudaStream_t s) { + dim3 block(BLOCK_X, BLOCK_Y); + dim3 grid((w + BLOCK_X - 1) / BLOCK_X, (h + BLOCK_Y - 1) / BLOCK_Y); + k_bilateral_fused_gray<<>>(d_in, d_out, w, h); +} + +template +static void launch_u8_fused_rgb(const uint8_t* d_in, uint8_t* d_out, int w, int h, + cudaStream_t s) { + dim3 block(BLOCK_X, BLOCK_Y); + 
dim3 grid((w + BLOCK_X - 1) / BLOCK_X, (h + BLOCK_Y - 1) / BLOCK_Y); + k_bilateral_fused_rgb<<>>(d_in, d_out, w, h); +} + +// ============================================================================ +// Opt1: persistent GPU buffers - allocated once, reused across calls +// Opt3: cudaHostRegister cache - page-lock caller's memory for DMA transfers +// ============================================================================ + +static struct { + uint8_t* d_in_u8 = nullptr; + uint8_t* d_out_u8 = nullptr; + float* d_temp = nullptr; // separable float intermediate + __half* d_temp_h16 = nullptr; // separable FP16 intermediate + uint8_t* d_radius_map = nullptr; // adaptive mode: per-pixel radius + size_t n_u8 = 0; + size_t n_temp = 0; + size_t n_temp_h16 = 0; + size_t n_radius_map = 0; + // Opt3: cached registered host pointers + const uint8_t* h_in_reg = nullptr; + uint8_t* h_out_reg = nullptr; + size_t n_reg = 0; +} g_bufs; + +// Opt3: register caller's heap memory as page-locked so cudaMemcpy uses DMA. +// Called once per unique (h_in, h_out, n) triple; re-registers when they change. 
+static void ensure_registered(const uint8_t* h_in, uint8_t* h_out, size_t n) { + if (h_in == g_bufs.h_in_reg && h_out == g_bufs.h_out_reg && n == g_bufs.n_reg) + return; + // Unregister previous pointers (ignore errors on first call / already-unregistered) + if (g_bufs.h_in_reg) + cudaHostUnregister(const_cast(g_bufs.h_in_reg)); + if (g_bufs.h_out_reg) + cudaHostUnregister(g_bufs.h_out_reg); + g_bufs.h_in_reg = nullptr; + g_bufs.h_out_reg = nullptr; + g_bufs.n_reg = 0; + // Register new pointers; if either fails, CUDA_CHECK will abort + CUDA_CHECK(cudaHostRegister(const_cast(h_in), n, cudaHostRegisterDefault)); + CUDA_CHECK(cudaHostRegister(h_out, n, cudaHostRegisterDefault)); + g_bufs.h_in_reg = h_in; + g_bufs.h_out_reg = h_out; + g_bufs.n_reg = n; +} + +static void ensure_io_buffers(size_t n_u8) { + if (n_u8 == g_bufs.n_u8) + return; + cudaFree(g_bufs.d_in_u8); + cudaFree(g_bufs.d_out_u8); + CUDA_CHECK(cudaMalloc(&g_bufs.d_in_u8, n_u8)); + CUDA_CHECK(cudaMalloc(&g_bufs.d_out_u8, n_u8)); + g_bufs.n_u8 = n_u8; +} + +static void ensure_temp_buffer(size_t n_bytes) { + if (n_bytes == g_bufs.n_temp) + return; + cudaFree(g_bufs.d_temp); + CUDA_CHECK(cudaMalloc(&g_bufs.d_temp, n_bytes)); + g_bufs.n_temp = n_bytes; +} + +// Opt N: FP16 intermediate buffer for SEPARABLE_FP16 mode +static void ensure_temp_h16_buffer(size_t n_bytes) { + if (n_bytes == g_bufs.n_temp_h16) + return; + cudaFree(g_bufs.d_temp_h16); + CUDA_CHECK(cudaMalloc(&g_bufs.d_temp_h16, n_bytes)); + g_bufs.n_temp_h16 = n_bytes; +} + +// Adaptive mode: per-pixel radius map buffer +static void ensure_radius_map_buffer(size_t n_pixels) { + if (n_pixels == g_bufs.n_radius_map) + return; + cudaFree(g_bufs.d_radius_map); + CUDA_CHECK(cudaMalloc(&g_bufs.d_radius_map, n_pixels)); + g_bufs.n_radius_map = n_pixels; +} + +// ============================================================================ +// Adaptive radius: compute per-pixel radius from local gradient magnitude +// 
============================================================================ + +// Sobel-based gradient magnitude → radius map. +// High gradient (edge) → small radius; low gradient (flat) → large radius. +// r_min, r_max are the allowed radius bounds. grad_threshold controls the +// gradient value at which radius reaches r_min (fully edge). +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM) +k_compute_radius_map_gray(const uint8_t* __restrict__ input, + uint8_t* __restrict__ radius_map, + int width, int height, + int r_min, int r_max, float inv_grad_threshold) { + const int x = blockIdx.x * BLOCK_X + threadIdx.x; + const int y = blockIdx.y * BLOCK_Y + threadIdx.y; + if (x >= width || y >= height) + return; + + // Clamp coords for border pixels + int x0 = max(x - 1, 0), x2 = min(x + 1, width - 1); + int y0 = max(y - 1, 0), y2 = min(y + 1, height - 1); + + // Sobel gradient: Gx and Gy using 3x3 Sobel operator + float p00 = input[y0 * width + x0], p01 = input[y0 * width + x], p02 = input[y0 * width + x2]; + float p10 = input[y * width + x0], p12 = input[y * width + x2]; + float p20 = input[y2 * width + x0], p21 = input[y2 * width + x], p22 = input[y2 * width + x2]; + + float gx = -p00 + p02 - 2.0f * p10 + 2.0f * p12 - p20 + p22; + float gy = -p00 - 2.0f * p01 - p02 + p20 + 2.0f * p21 + p22; + float grad = sqrtf(gx * gx + gy * gy); + + // Map gradient to radius: linear interpolation + // grad=0 → r_max (flat area, smooth more), grad>=threshold → r_min (edge, smooth less) + float t = fminf(grad * inv_grad_threshold, 1.0f); + int r = __float2int_rn(static_cast(r_max) - t * static_cast(r_max - r_min)); + r = max(r_min, min(r_max, r)); + + radius_map[y * width + x] = static_cast(r); +} + +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM) +k_compute_radius_map_rgb(const uint8_t* __restrict__ input, + uint8_t* __restrict__ radius_map, + int width, int height, + int r_min, int r_max, float inv_grad_threshold) { + const int x = 
blockIdx.x * BLOCK_X + threadIdx.x; + const int y = blockIdx.y * BLOCK_Y + threadIdx.y; + if (x >= width || y >= height) + return; + + int x0 = max(x - 1, 0), x2 = min(x + 1, width - 1); + int y0 = max(y - 1, 0), y2 = min(y + 1, height - 1); + + // Compute gradient on luminance approximation: (R + G + B) / 3 + auto lum = [&](int py, int px) -> float { + int idx = (py * width + px) * 3; + return (static_cast(input[idx]) + + static_cast(input[idx + 1]) + + static_cast(input[idx + 2])) * (1.0f / 3.0f); + }; + + float p00 = lum(y0, x0), p01 = lum(y0, x), p02 = lum(y0, x2); + float p10 = lum(y, x0), p12 = lum(y, x2); + float p20 = lum(y2, x0), p21 = lum(y2, x), p22 = lum(y2, x2); + + float gx = -p00 + p02 - 2.0f * p10 + 2.0f * p12 - p20 + p22; + float gy = -p00 - 2.0f * p01 - p02 + p20 + 2.0f * p21 + p22; + float grad = sqrtf(gx * gx + gy * gy); + + float t = fminf(grad * inv_grad_threshold, 1.0f); + int r = __float2int_rn(static_cast(r_max) - t * static_cast(r_max - r_min)); + r = max(r_min, min(r_max, r)); + + radius_map[y * width + x] = static_cast(r); +} + +// ============================================================================ +// Adaptive bilateral filter kernels: read per-pixel radius from radius_map +// Shared memory is allocated for r_max halo (worst case) so all threads +// in a block can access the full neighborhood regardless of their radius. 
+// ============================================================================ + +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM) +k_bilateral_adaptive_gray(const InT* __restrict__ input, + OutT* __restrict__ output, + const uint8_t* __restrict__ radius_map, + int width, int height, int r_max) { + + extern __shared__ float smem[]; + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int x = blockIdx.x * BLOCK_X + tx; + const int y = blockIdx.y * BLOCK_Y + ty; + + const int tile_w = BLOCK_X + 2 * r_max; + const int tile_h = BLOCK_Y + 2 * r_max; + const int tile_size = tile_w * tile_h; + const int thread_id = ty * BLOCK_X + tx; + const int threads_per_block = BLOCK_X * BLOCK_Y; + + // Collaborative loading with r_max halo + for (int i = thread_id; i < tile_size; i += threads_per_block) { + int sy = i / tile_w; + int sx = i % tile_w; + int gx = blockIdx.x * BLOCK_X + sx - r_max; + int gy = blockIdx.y * BLOCK_Y + sy - r_max; + gx = max(0, min(width - 1, gx)); + gy = max(0, min(height - 1, gy)); + smem[i] = static_cast(input[gy * width + gx]); + } + + __syncthreads(); + + if (x >= width || y >= height) + return; + + // Read per-pixel radius + const int my_radius = static_cast(radius_map[y * width + x]); + const int lx = tx + r_max; + const int ly = ty + r_max; + const float center = smem[ly * tile_w + lx]; + // Spatial LUT was built with r_max, so use r_max-based width and offset + const int lut_width = 2 * r_max + 1; + const int lut_center = r_max; // center offset in the LUT + + float sum = 0.0f; + float weight_sum = 0.0f; + + for (int dy = -my_radius; dy <= my_radius; ++dy) { + for (int dx = -my_radius; dx <= my_radius; ++dx) { + float neighbor = smem[(ly + dy) * tile_w + (lx + dx)]; + + float spatial_weight = d_spatial_lut[(dy + lut_center) * lut_width + (dx + lut_center)]; + + int diff = static_cast(fabsf(neighbor - center) + 0.5f); + diff = min(diff, COLOR_LUT_SIZE - 1); + float color_weight = d_color_lut[diff]; 
+ + float w = spatial_weight * color_weight; + sum += neighbor * w; + weight_sum += w; + } + } + + output[y * width + x] = to_output(sum / weight_sum); +} + +template +__global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_SM) +k_bilateral_adaptive_rgb(const InT* __restrict__ input, + OutT* __restrict__ output, + const uint8_t* __restrict__ radius_map, + int width, int height, int r_max) { + + extern __shared__ float smem[]; + + const int tx = threadIdx.x; + const int ty = threadIdx.y; + const int x = blockIdx.x * BLOCK_X + tx; + const int y = blockIdx.y * BLOCK_Y + ty; + + const int tile_w = BLOCK_X + 2 * r_max; + const int tile_h = BLOCK_Y + 2 * r_max; + const int tile_size = tile_w * tile_h; + const int thread_id = ty * BLOCK_X + tx; + const int threads_per_block = BLOCK_X * BLOCK_Y; + + float* smem_r = smem; + float* smem_g = smem + tile_size; + float* smem_b = smem + 2 * tile_size; + + // Collaborative loading with r_max halo + for (int i = thread_id; i < tile_size; i += threads_per_block) { + int sy = i / tile_w; + int sx = i % tile_w; + int gx = blockIdx.x * BLOCK_X + sx - r_max; + int gy = blockIdx.y * BLOCK_Y + sy - r_max; + gx = max(0, min(width - 1, gx)); + gy = max(0, min(height - 1, gy)); + int gidx = (gy * width + gx) * 3; + smem_r[i] = static_cast(input[gidx]); + smem_g[i] = static_cast(input[gidx + 1]); + smem_b[i] = static_cast(input[gidx + 2]); + } + + __syncthreads(); + + if (x >= width || y >= height) + return; + + const int my_radius = static_cast(radius_map[y * width + x]); + const int lx = tx + r_max; + const int ly = ty + r_max; + const int lidx = ly * tile_w + lx; + // Spatial LUT was built with r_max, so use r_max-based width and offset + const int lut_width = 2 * r_max + 1; + const int lut_center = r_max; + + const float center_r = smem_r[lidx]; + const float center_g = smem_g[lidx]; + const float center_b = smem_b[lidx]; + + float sum_r = 0.0f, sum_g = 0.0f, sum_b = 0.0f; + float wsum = 0.0f; + + for (int dy = -my_radius; dy 
<= my_radius; ++dy) { + for (int dx = -my_radius; dx <= my_radius; ++dx) { + int nidx = (ly + dy) * tile_w + (lx + dx); + float nr = smem_r[nidx]; + float ng = smem_g[nidx]; + float nb = smem_b[nidx]; + + float spatial_weight = d_spatial_lut[(dy + lut_center) * lut_width + (dx + lut_center)]; + + // Mean absolute channel difference (approximation of OpenCV's L2 distance) + int diff = static_cast( + (fabsf(nr - center_r) + fabsf(ng - center_g) + fabsf(nb - center_b)) * + (1.0f / 3.0f) + 0.5f); + diff = min(diff, COLOR_LUT_SIZE - 1); + + float w = spatial_weight * d_color_lut[diff]; + sum_r += nr * w; + sum_g += ng * w; + sum_b += nb * w; + wsum += w; + } + } + + float rcp_wsum = __frcp_rn(wsum); + int out_idx = (y * width + x) * 3; + output[out_idx] = to_output(sum_r * rcp_wsum); + output[out_idx + 1] = to_output(sum_g * rcp_wsum); + output[out_idx + 2] = to_output(sum_b * rcp_wsum); +} + +// Float-path bilateral_filter_cuda: simplified to use only runtime-radius shared-memory +// kernels (no template specializations). Template kernels are only used in the uint8 +// direct path (dispatch_u8_kernel) which is the primary performance path. 
+void bilateral_filter_cuda(const float* d_input, float* d_output, int width, int height, + int channels, int radius, float sigma_spatial, float sigma_color, + cudaStream_t stream) { + + radius = min(radius, MAX_RADIUS); + ensure_luts(radius, sigma_spatial, sigma_color); + + dim3 block(BLOCK_X, BLOCK_Y); + dim3 grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y); + + if (channels == 1) { + int tile_w = BLOCK_X + 2 * radius; + int tile_h = BLOCK_Y + 2 * radius; + size_t smem = tile_w * tile_h * sizeof(float); + k_bilateral_filter_shared<<>>( + d_input, d_output, width, height, radius); + } else { + int tile_w = BLOCK_X + 2 * radius; + int tile_h = BLOCK_Y + 2 * radius; + size_t smem = 3 * tile_w * tile_h * sizeof(float); + k_bilateral_filter_rgb_shared<<>>( + d_input, d_output, width, height, radius); + } + + CUDA_CHECK(cudaGetLastError()); + if (stream == 0) { + CUDA_CHECK(cudaDeviceSynchronize()); + } +} + +// ============================================================================ +// Opt E: Strip pipelining - internal kernel-only dispatcher (no H2D/D2H) +// Called per-strip or once for the full image. +// All GPU buffers (d_in, d_out, d_temp, d_temp_h16, d_rmap) are pre-allocated +// by the caller. ensure_luts() must also be called before this function. 
// ============================================================================
// Kernel-only dispatcher for the uint8 path: selects the kernel family by
// FilterMode, with <5>-radius template specializations where available and a
// runtime-radius shared-memory fallback otherwise. Performs no host<->device
// copies and no synchronization; everything is enqueued on `stream`.
// NOTE(review): all <<<...>>> launch configurations were stripped in the mangled
// source; reconstructed from the grid/block/smem/stream locals — confirm.
static void dispatch_u8_kernel(
    const uint8_t* d_in, uint8_t* d_out,
    float* d_temp, __half* d_temp_h16, uint8_t* d_rmap,
    int width, int height, int channels,
    int radius, float sigma_spatial, float sigma_color,
    FilterMode mode, cudaStream_t stream) {

    const dim3 block(BLOCK_X, BLOCK_Y);
    const dim3 grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y);

    if (channels == 1) {
        if (mode == FilterMode::FUSED) {
            // Opt M: Fused H+V in single kernel — no intermediate buffer
            if (radius == 5) {
                launch_u8_fused_gray<5>(d_in, d_out, width, height, stream);
            } else {
                // Fallback to 2-pass separable
                size_t smem = (BLOCK_X + 2 * radius) * (BLOCK_Y + 2 * radius) * sizeof(float);
                k_bilateral_filter_shared<<<grid, block, smem, stream>>>(
                    d_in, d_out, width, height, radius);
            }
        } else if (mode == FilterMode::ADAPTIVE) {
            int r_max = radius;
            int r_min = max(2, radius - 1);
            // Flat regions (low gradient) get the small radius; edges keep r_max.
            float grad_threshold = fmaxf(sigma_color * 1.5f, 20.0f);
            float inv_grad_threshold = 1.0f / grad_threshold;
            k_compute_radius_map_gray<<<grid, block, 0, stream>>>(
                d_in, d_rmap, width, height, r_min, r_max, inv_grad_threshold);
            ensure_luts(r_max, sigma_spatial, sigma_color);
            int tile_w = BLOCK_X + 2 * r_max;
            int tile_h = BLOCK_Y + 2 * r_max;
            size_t smem_bytes = static_cast<size_t>(tile_w) * tile_h * sizeof(float);
            k_bilateral_adaptive_gray<<<grid, block, smem_bytes, stream>>>(
                d_in, d_out, d_rmap, width, height, r_max);
        } else if (mode == FilterMode::SEPARABLE_FP16) {
            // Opt N: FP16 intermediate buffer — FP32 compute, __half H↔V transfer.
            // Opt N2 (full FP16 compute) was tested but ~8% slower due to scalar __half
            // having no throughput advantage over FP32 + float↔half conversion overhead.
            if (radius == 5) {
                launch_u8_sep_gray_fp16<5>(d_in, d_out, d_temp_h16, width, height, stream);
            } else {
                size_t smem = (BLOCK_X + 2 * radius) * (BLOCK_Y + 2 * radius) * sizeof(float);
                k_bilateral_filter_shared<<<grid, block, smem, stream>>>(
                    d_in, d_out, width, height, radius);
            }
        } else if (mode == FilterMode::SEPARABLE) {
            // Only radius=5 specialization; others fallback to runtime shared-memory
            if (radius == 5) {
                launch_u8_sep_gray<5>(d_in, d_out, d_temp, width, height, stream);
            } else {
                size_t smem = (BLOCK_X + 2 * radius) * (BLOCK_Y + 2 * radius) * sizeof(float);
                k_bilateral_filter_shared<<<grid, block, smem, stream>>>(
                    d_in, d_out, width, height, radius);
            }
        } else if (mode == FilterMode::TEMPLATE) {
            if (radius == 5) {
                launch_u8_gray<5>(d_in, d_out, width, height, stream);
            } else {
                size_t smem = (BLOCK_X + 2 * radius) * (BLOCK_Y + 2 * radius) * sizeof(float);
                k_bilateral_filter_shared<<<grid, block, smem, stream>>>(
                    d_in, d_out, width, height, radius);
            }
        } else {
            // STANDARD
            size_t smem = (BLOCK_X + 2 * radius) * (BLOCK_Y + 2 * radius) * sizeof(float);
            k_bilateral_filter_shared<<<grid, block, smem, stream>>>(
                d_in, d_out, width, height, radius);
        }
    } else {
        // RGB — same mode ladder; shared-memory sizes carry the 3x plane factor.
        if (mode == FilterMode::FUSED) {
            // Opt M: Fused H+V in single kernel — no intermediate buffer
            if (radius == 5) {
                launch_u8_fused_rgb<5>(d_in, d_out, width, height, stream);
            } else {
                size_t smem = 3 * (BLOCK_X + 2 * radius) * (BLOCK_Y + 2 * radius) * sizeof(float);
                k_bilateral_filter_rgb_shared<<<grid, block, smem, stream>>>(
                    d_in, d_out, width, height, radius);
            }
        } else if (mode == FilterMode::ADAPTIVE) {
            int r_max = radius;
            int r_min = max(2, radius - 1);
            float grad_threshold = fmaxf(sigma_color * 1.5f, 20.0f);
            float inv_grad_threshold = 1.0f / grad_threshold;
            k_compute_radius_map_rgb<<<grid, block, 0, stream>>>(
                d_in, d_rmap, width, height, r_min, r_max, inv_grad_threshold);
            ensure_luts(r_max, sigma_spatial, sigma_color);
            int tile_w = BLOCK_X + 2 * r_max;
            int tile_h = BLOCK_Y + 2 * r_max;
            size_t smem_bytes = 3 * static_cast<size_t>(tile_w) * tile_h * sizeof(float);
            k_bilateral_adaptive_rgb<<<grid, block, smem_bytes, stream>>>(
                d_in, d_out, d_rmap, width, height, r_max);
        } else if (mode == FilterMode::SEPARABLE_FP16) {
            // Opt N: FP16 intermediate buffer — FP32 compute, __half H↔V transfer.
            if (radius == 5) {
                launch_u8_sep_rgb_fp16<5>(d_in, d_out, d_temp_h16, width, height, stream);
            } else {
                size_t smem = 3 * (BLOCK_X + 2 * radius) * (BLOCK_Y + 2 * radius) * sizeof(float);
                k_bilateral_filter_rgb_shared<<<grid, block, smem, stream>>>(
                    d_in, d_out, width, height, radius);
            }
        } else if (mode == FilterMode::SEPARABLE) {
            if (radius == 5) {
                launch_u8_sep_rgb<5>(d_in, d_out, d_temp, width, height, stream);
            } else {
                size_t smem = 3 * (BLOCK_X + 2 * radius) * (BLOCK_Y + 2 * radius) * sizeof(float);
                k_bilateral_filter_rgb_shared<<<grid, block, smem, stream>>>(
                    d_in, d_out, width, height, radius);
            }
        } else if (mode == FilterMode::TEMPLATE) {
            if (radius == 5) {
                launch_u8_rgb<5>(d_in, d_out, width, height, stream);
            } else {
                size_t smem = 3 * (BLOCK_X + 2 * radius) * (BLOCK_Y + 2 * radius) * sizeof(float);
                k_bilateral_filter_rgb_shared<<<grid, block, smem, stream>>>(
                    d_in, d_out, width, height, radius);
            }
        } else {
            // STANDARD
            size_t smem = 3 * (BLOCK_X + 2 * radius) * (BLOCK_Y + 2 * radius) * sizeof(float);
            k_bilateral_filter_rgb_shared<<<grid, block, smem, stream>>>(
                d_in, d_out, width, height, radius);
        }
    }
}

// ============================================================================
// Opt E: Strip pipeline state - per-strip GPU buffers + CUDA streams.
// Enables H2D / kernel / D2H overlap across strips, hiding PCIe latency.
+// ============================================================================ + +static constexpr int MAX_STRIPS = 8; + +static struct { + int count = 0; + cudaStream_t streams[MAX_STRIPS]; + uint8_t* d_in [MAX_STRIPS]; + uint8_t* d_out [MAX_STRIPS]; + float* d_temp[MAX_STRIPS]; // separable float intermediate + __half* d_temph[MAX_STRIPS]; // separable FP16 intermediate + uint8_t* d_rmap[MAX_STRIPS]; // adaptive radius map + size_t cap_in [MAX_STRIPS]; // current allocated bytes + size_t cap_temp[MAX_STRIPS]; + size_t cap_rmap[MAX_STRIPS]; +} g_strip; + +// Lazily allocate / resize per-strip buffers. +static void ensure_strip_buffers(int s, size_t in_bytes, size_t temp_bytes, + size_t temph_bytes, size_t rmap_bytes) { + if (in_bytes > g_strip.cap_in[s]) { + cudaFree(g_strip.d_in[s]); + cudaFree(g_strip.d_out[s]); + CUDA_CHECK(cudaMalloc(&g_strip.d_in[s], in_bytes)); + CUDA_CHECK(cudaMalloc(&g_strip.d_out[s], in_bytes)); + g_strip.cap_in[s] = in_bytes; + } + if (temp_bytes > g_strip.cap_temp[s]) { + cudaFree(g_strip.d_temp[s]); + cudaFree(g_strip.d_temph[s]); + CUDA_CHECK(cudaMalloc(&g_strip.d_temp[s], temp_bytes)); + CUDA_CHECK(cudaMalloc(&g_strip.d_temph[s], temph_bytes)); + g_strip.cap_temp[s] = temp_bytes; + } + if (rmap_bytes > g_strip.cap_rmap[s]) { + cudaFree(g_strip.d_rmap[s]); + CUDA_CHECK(cudaMalloc(&g_strip.d_rmap[s], rmap_bytes)); + g_strip.cap_rmap[s] = rmap_bytes; + } +} + +// Create streams on first use. +static void ensure_strip_streams(int num_strips) { + if (g_strip.count >= num_strips) return; + for (int s = g_strip.count; s < num_strips; ++s) { + CUDA_CHECK(cudaStreamCreateWithFlags(&g_strip.streams[s], cudaStreamNonBlocking)); + g_strip.d_in[s] = g_strip.d_out[s] = nullptr; + g_strip.d_temp[s] = nullptr; + g_strip.d_temph[s] = nullptr; + g_strip.d_rmap[s] = nullptr; + g_strip.cap_in[s] = g_strip.cap_temp[s] = g_strip.cap_rmap[s] = 0; + } + g_strip.count = num_strips; +} + +// Strip pipeline entry point. 
// Splits the image into num_strips horizontal strips and pipelines:
//   H2D(i) → kernel(i) → D2H(i) on stream[i]
// Streams run concurrently; GPU copy engine overlaps with compute engine.
//
// Each strip input includes halo rows (±radius) so border pixels are correct.
// Only the non-halo rows are copied back to the host output.
static void apply_strip_pipeline(
    const uint8_t* h_input, uint8_t* h_output,
    int width, int height, int channels,
    int radius, float sigma_spatial, float sigma_color,
    FilterMode mode, int num_strips) {

    ensure_strip_streams(num_strips);

    const int rows_per_strip = (height + num_strips - 1) / num_strips;
    const size_t row_bytes = static_cast<size_t>(width) * channels;

    for (int s = 0; s < num_strips; ++s) {
        const int row_start = s * rows_per_strip;
        const int row_end = std::min(row_start + rows_per_strip, height);
        if (row_start >= height) break;

        // Halo rows are clamped at the image borders.
        const int halo_top = std::min(radius, row_start);
        const int halo_bot = std::min(radius, height - row_end);
        const int strip_h = halo_top + (row_end - row_start) + halo_bot;
        const size_t in_bytes = static_cast<size_t>(strip_h) * row_bytes;

        // Temp buffer for separable float intermediate
        const size_t temp_bytes = static_cast<size_t>(strip_h) * width * channels * sizeof(float);
        const size_t temph_bytes = static_cast<size_t>(strip_h) * width * channels * sizeof(__half);
        const size_t rmap_bytes = static_cast<size_t>(strip_h) * width;  // 1 byte/pixel

        ensure_strip_buffers(s, in_bytes, temp_bytes, temph_bytes, rmap_bytes);

        // Async H2D: copy strip input with halo
        const uint8_t* src = h_input + (row_start - halo_top) * row_bytes;
        CUDA_CHECK(cudaMemcpyAsync(g_strip.d_in[s], src, in_bytes,
                                   cudaMemcpyHostToDevice, g_strip.streams[s]));

        // Kernel on this strip (local dimensions: width × strip_h)
        dispatch_u8_kernel(
            g_strip.d_in[s], g_strip.d_out[s],
            g_strip.d_temp[s], g_strip.d_temph[s], g_strip.d_rmap[s],
            width, strip_h, channels,
            radius, sigma_spatial, sigma_color, mode, g_strip.streams[s]);

        // Async D2H: copy back only the non-halo rows
        const size_t out_offset = static_cast<size_t>(halo_top) * row_bytes;
        const size_t out_bytes = static_cast<size_t>(row_end - row_start) * row_bytes;
        uint8_t* dst = h_output + static_cast<size_t>(row_start) * row_bytes;
        CUDA_CHECK(cudaMemcpyAsync(dst, g_strip.d_out[s] + out_offset, out_bytes,
                                   cudaMemcpyDeviceToHost, g_strip.streams[s]));
    }

    // Wait for all strips to complete
    CUDA_CHECK(cudaDeviceSynchronize());
}

// Public uint8 entry point: host-buffer in/out, mode selected via env + get_filter_mode().
// Either routes through the strip pipeline (BILATERAL_STRIP > 1) or does one
// synchronous H2D → kernel → D2H round trip.
void apply_bilateral_filter_cuda(const uint8_t* h_input, uint8_t* h_output, int width, int height,
                                 int channels, int radius, float sigma_spatial, float sigma_color) {
    // Opt B: ensure_l1_cache_prefer() was disabled - triggers JIT crash on WSL2
    // (cudaFuncSetCacheConfig on all template instantiations forces simultaneous JIT,
    // which crashes libnvidia-ptxjitcompiler.so in WSL2 environment.
    // The optimization itself also showed no benefit on sm_89 Ada architecture.)

    const size_t n_u8 = static_cast<size_t>(width) * height * channels;

    // (Re)allocate only when image size changes
    ensure_io_buffers(n_u8);

    // Opt3: page-lock caller's buffers for faster H2D/D2H transfers
    ensure_registered(h_input, h_output, n_u8);

    radius = min(radius, MAX_RADIUS);
    ensure_luts(radius, sigma_spatial, sigma_color);

    // Opt E: check BILATERAL_STRIP env var for strip pipeline (read once, cached)
    static int num_strips = -1;
    if (num_strips < 0) {
        const char* env = getenv("BILATERAL_STRIP");
        num_strips = (env && atoi(env) > 1) ? std::min(atoi(env), MAX_STRIPS) : 1;
    }

    // Opt2: dispatch uint8 I/O kernels directly - no float conversion pipeline
    const FilterMode mode = get_filter_mode();

    // Opt E: strip pipeline - H2D/kernel/D2H overlap across num_strips streams
    if (num_strips > 1) {
        apply_strip_pipeline(h_input, h_output, width, height, channels,
                             radius, sigma_spatial, sigma_color, mode, num_strips);
        return;
    }

    // Single-shot path: one synchronous H2D → kernel → D2H
    CUDA_CHECK(cudaMemcpy(g_bufs.d_in_u8, h_input, n_u8, cudaMemcpyHostToDevice));

    // Pre-allocate intermediate buffers required by the selected mode
    if (mode == FilterMode::SEPARABLE_FP16) {
        // Opt N: FP16 intermediate buffer — half the bandwidth between H and V passes
        ensure_temp_h16_buffer(static_cast<size_t>(width) * height * channels * sizeof(__half));
    } else if (mode == FilterMode::SEPARABLE) {
        ensure_temp_buffer(static_cast<size_t>(width) * height * channels * sizeof(float));
    } else if (mode == FilterMode::ADAPTIVE) {
        ensure_radius_map_buffer(static_cast<size_t>(width) * height);
    }

    dispatch_u8_kernel(g_bufs.d_in_u8, g_bufs.d_out_u8,
                       g_bufs.d_temp, g_bufs.d_temp_h16, g_bufs.d_radius_map,
                       width, height, channels,
                       radius, sigma_spatial, sigma_color, mode, 0);

    // Check kernel launch errors, then sync
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    CUDA_CHECK(cudaMemcpy(h_output, g_bufs.d_out_u8, n_u8, cudaMemcpyDeviceToHost));
}
diff --git a/08_bilateral_filter/Andy1314Chen/src/bilateral_filter_opencv.cpp b/08_bilateral_filter/Andy1314Chen/src/bilateral_filter_opencv.cpp
new file mode 100644
index 0000000..6ecd942
--- /dev/null
+++ b/08_bilateral_filter/Andy1314Chen/src/bilateral_filter_opencv.cpp
@@ -0,0 +1,93 @@
#include "bilateral_filter_opencv.h"

// NOTE(review): the two system includes here were stripped in the mangled paste;
// reconstructed from usage (cv::bilateralFilter → imgproc, std::memcpy → cstring) —
// confirm against the original file.
#include <opencv2/imgproc.hpp>

#include <cstring>

#ifdef HAVE_OPENCV_CUDA
#include <opencv2/cudaimgproc.hpp>
#endif

// OpenCV CPU baseline: wraps cv::bilateralFilter over the raw ImageData buffers.
void apply_bilateral_filter_opencv(const ImageData& input, ImageData& output,
                                   const FilterParams& params) {
    int cv_type = input.channels == 1 ?
CV_8UC1 : CV_8UC3; + cv::Mat src(input.height, input.width, cv_type, const_cast(input.data.data())); + cv::Mat dst; + + int d = 2 * params.radius + 1; + cv::bilateralFilter(src, dst, d, params.sigma_color, params.sigma_spatial); + + output.width = input.width; + output.height = input.height; + output.channels = input.channels; + output.data.resize(output.size()); + std::memcpy(output.data.data(), dst.data, output.size()); +} + +#ifdef HAVE_OPENCV_CUDA +bool apply_bilateral_filter_opencv_cuda(const ImageData& input, ImageData& output, + const FilterParams& params) { + // Check CUDA device availability at runtime + if (cv::cuda::getCudaEnabledDeviceCount() <= 0) { + return false; + } + + int cv_type = input.channels == 1 ? CV_8UC1 : CV_8UC3; + cv::Mat src(input.height, input.width, cv_type, const_cast(input.data.data())); + + // Upload to GPU + cv::cuda::GpuMat d_src, d_dst; + d_src.upload(src); + + int kernel_size = 2 * params.radius + 1; + cv::cuda::bilateralFilter(d_src, d_dst, kernel_size, params.sigma_color, params.sigma_spatial); + + // Download result + cv::Mat dst; + d_dst.download(dst); + + output.width = input.width; + output.height = input.height; + output.channels = input.channels; + output.data.resize(output.size()); + std::memcpy(output.data.data(), dst.data, output.size()); + return true; +} +#endif + +double compute_mae(const ImageData& img1, const ImageData& img2) { + if (img1.width != img2.width || img1.height != img2.height || img1.channels != img2.channels) { + return -1.0; + } + + double sum = 0.0; + size_t count = img1.size(); + + for (size_t i = 0; i < count; ++i) { + double diff = + std::abs(static_cast(img1.data[i]) - static_cast(img2.data[i])); + sum += diff; + } + + return sum / count; +} + +double compute_psnr(const ImageData& img1, const ImageData& img2) { + if (img1.width != img2.width || img1.height != img2.height || img1.channels != img2.channels) { + return -1.0; + } + + double sum_sq = 0.0; + size_t count = img1.size(); + + for 
(size_t i = 0; i < count; ++i) { + double diff = static_cast(img1.data[i]) - static_cast(img2.data[i]); + sum_sq += diff * diff; + } + + double mse = sum_sq / count; + if (mse < 1e-10) { + return 999.99; // practically identical + } + // MAX = 255 for uint8 + return 10.0 * std::log10(255.0 * 255.0 / mse); +} diff --git a/08_bilateral_filter/Andy1314Chen/src/image_io.cpp b/08_bilateral_filter/Andy1314Chen/src/image_io.cpp new file mode 100644 index 0000000..9df03d5 --- /dev/null +++ b/08_bilateral_filter/Andy1314Chen/src/image_io.cpp @@ -0,0 +1,102 @@ +#include "image_io.h" +#include +#include +#include +#include +#include + +bool load_raw_image(const char* path, ImageData* img) { + FILE* fp = fopen(path, "rb"); + if (!fp) { + fprintf(stderr, "Failed to open image file: %s\n", path); + return false; + } + + if (fread(&img->width, sizeof(int), 1, fp) != 1 || + fread(&img->height, sizeof(int), 1, fp) != 1 || + fread(&img->channels, sizeof(int), 1, fp) != 1) { + fprintf(stderr, "Failed to read image header\n"); + fclose(fp); + return false; + } + + if (img->width <= 0 || img->height <= 0 || img->channels <= 0 || img->channels > 4) { + fprintf(stderr, "Invalid image dimensions: %dx%d, channels=%d\n", + img->width, img->height, img->channels); + fclose(fp); + return false; + } + + size_t data_size = img->size(); + img->data.resize(data_size); + + if (fread(img->data.data(), sizeof(uint8_t), data_size, fp) != data_size) { + fprintf(stderr, "Failed to read image data\n"); + fclose(fp); + return false; + } + + fclose(fp); + return true; +} + +bool save_raw_image(const char* path, const ImageData& img) { + FILE* fp = fopen(path, "wb"); + if (!fp) { + fprintf(stderr, "Failed to create output file: %s\n", path); + return false; + } + + fwrite(&img.width, sizeof(int), 1, fp); + fwrite(&img.height, sizeof(int), 1, fp); + fwrite(&img.channels, sizeof(int), 1, fp); + fwrite(img.data.data(), sizeof(uint8_t), img.size(), fp); + + fclose(fp); + return true; +} + +bool 
load_params(const char* path, FilterParams* params) { + std::ifstream file(path); + if (!file.is_open()) { + fprintf(stderr, "Failed to open params file: %s\n", path); + return false; + } + + std::string line; + while (std::getline(file, line)) { + if (line.empty() || line[0] == '#') continue; + + size_t eq_pos = line.find('='); + if (eq_pos == std::string::npos) continue; + + std::string key = line.substr(0, eq_pos); + std::string value = line.substr(eq_pos + 1); + + auto trim = [](std::string& s) { + size_t start = s.find_first_not_of(" \t\r\n"); + size_t end = s.find_last_not_of(" \t\r\n#"); + if (start != std::string::npos && end != std::string::npos) { + s = s.substr(start, end - start + 1); + } + }; + trim(key); + trim(value); + + if (key == "radius") { + params->radius = std::stoi(value); + } else if (key == "sigma_spatial") { + params->sigma_spatial = std::stof(value); + } else if (key == "sigma_color") { + params->sigma_color = std::stof(value); + } + } + + if (params->radius <= 0 || params->sigma_spatial <= 0 || params->sigma_color <= 0) { + fprintf(stderr, "Invalid parameters: radius=%d, sigma_spatial=%.2f, sigma_color=%.2f\n", + params->radius, params->sigma_spatial, params->sigma_color); + return false; + } + + return true; +} diff --git a/08_bilateral_filter/Andy1314Chen/src/main.cpp b/08_bilateral_filter/Andy1314Chen/src/main.cpp new file mode 100644 index 0000000..dba30a5 --- /dev/null +++ b/08_bilateral_filter/Andy1314Chen/src/main.cpp @@ -0,0 +1,472 @@ +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "bilateral_filter.h" +#include "bilateral_filter_cuda.cuh" +#include "bilateral_filter_opencv.h" +#include "image_io.h" + +constexpr int WARMUP_RUNS = 5; +constexpr int BENCHMARK_RUNS = 50; + +void print_usage(const char* prog) { + fprintf(stderr, "Usage:\n"); + fprintf(stderr, " %s input.raw params.txt output.raw # CPU only\n", prog); + fprintf(stderr, " %s --cuda input.raw params.txt output.raw # CUDA 
only\n", prog); + fprintf(stderr, " %s --opencv input.raw params.txt output.raw # OpenCV only\n", prog); + fprintf(stderr, + " %s --bench input.raw params.txt # Benchmark CUDA vs OpenCV\n", prog); + fprintf(stderr, " %s --compare input.raw params.txt # Compare CPU vs OpenCV\n", + prog); + fprintf(stderr, + " %s --compare-all input.raw params.txt # Compare CPU vs CUDA vs OpenCV\n", + prog); +} + +struct BenchmarkResult { + double mean_ms; + double min_ms; + double max_ms; + double stddev_ms; +}; + +BenchmarkResult compute_stats(const std::vector& times) { + BenchmarkResult result = {0.0, 0.0, 0.0, 0.0}; + if (times.empty()) { + return result; + } + + double sum = std::accumulate(times.begin(), times.end(), 0.0); + result.mean_ms = sum / times.size(); + + result.min_ms = *std::min_element(times.begin(), times.end()); + result.max_ms = *std::max_element(times.begin(), times.end()); + + double sq_sum = 0.0; + for (double t : times) { + sq_sum += (t - result.mean_ms) * (t - result.mean_ms); + } + result.stddev_ms = sqrt(sq_sum / times.size()); + + return result; +} + +// Helper: benchmark OpenCV CUDA if available. Returns true if benchmarked. 
+#ifdef HAVE_OPENCV_CUDA +static bool bench_opencv_cuda(const ImageData& input, const FilterParams& params, + ImageData& output_cv_cuda, BenchmarkResult& cv_cuda_stats, + double /*megapixels*/) { + // Probe: check if OpenCV CUDA bilateral filter works + if (!apply_bilateral_filter_opencv_cuda(input, output_cv_cuda, params)) { + fprintf(stderr, "OpenCV CUDA: no CUDA device available, skipping\n"); + return false; + } + + // Warmup + for (int i = 0; i < WARMUP_RUNS; ++i) { + apply_bilateral_filter_opencv_cuda(input, output_cv_cuda, params); + } + + // Benchmark + std::vector times; + for (int i = 0; i < BENCHMARK_RUNS; ++i) { + auto start = std::chrono::high_resolution_clock::now(); + apply_bilateral_filter_opencv_cuda(input, output_cv_cuda, params); + auto end = std::chrono::high_resolution_clock::now(); + times.push_back(std::chrono::duration(end - start).count()); + } + + cv_cuda_stats = compute_stats(times); + return true; +} +#endif + +int main(int argc, char* argv[]) { + if (argc < 4) { + print_usage(argv[0]); + return 1; + } + + bool compare_mode = (strcmp(argv[1], "--compare") == 0); + bool compare_all_mode = (strcmp(argv[1], "--compare-all") == 0); + bool bench_mode = (strcmp(argv[1], "--bench") == 0); + bool cuda_only = (strcmp(argv[1], "--cuda") == 0); + bool opencv_only = (strcmp(argv[1], "--opencv") == 0); + + const char* input_path; + const char* params_path; + const char* output_path = nullptr; + + if (compare_mode || compare_all_mode || bench_mode) { + if (argc != 4) { + print_usage(argv[0]); + return 1; + } + input_path = argv[2]; + params_path = argv[3]; + } else if (cuda_only || opencv_only) { + if (argc != 5) { + print_usage(argv[0]); + return 1; + } + input_path = argv[2]; + params_path = argv[3]; + output_path = argv[4]; + } else { + if (argc != 4) { + print_usage(argv[0]); + return 1; + } + input_path = argv[1]; + params_path = argv[2]; + output_path = argv[3]; + } + + ImageData input; + if (!load_raw_image(input_path, &input)) { + return 1; + 
} + printf("Loaded image: %dx%d, channels=%d\n", input.width, input.height, input.channels); + + FilterParams params; + if (!load_params(params_path, ¶ms)) { + return 1; + } + printf("Parameters: radius=%d, sigma_spatial=%.2f, sigma_color=%.2f\n", params.radius, + params.sigma_spatial, params.sigma_color); + + double megapixels = static_cast(input.width) * input.height / 1e6; + + if (compare_all_mode) { + ImageData output_cpu, output_cuda, output_opencv; + + output_cuda.width = input.width; + output_cuda.height = input.height; + output_cuda.channels = input.channels; + output_cuda.data.resize(input.size()); + + printf("\nBenchmarking (warmup=%d, runs=%d)...\n", WARMUP_RUNS, BENCHMARK_RUNS); + + // Warmup runs + for (int i = 0; i < WARMUP_RUNS; ++i) { + apply_bilateral_filter_cpu(input, output_cpu, params); + apply_bilateral_filter_cuda(input.data.data(), output_cuda.data.data(), input.width, + input.height, input.channels, params.radius, + params.sigma_spatial, params.sigma_color); + apply_bilateral_filter_opencv(input, output_opencv, params); + } + + // Benchmark CPU + std::vector cpu_times; + for (int i = 0; i < BENCHMARK_RUNS; ++i) { + auto start = std::chrono::high_resolution_clock::now(); + apply_bilateral_filter_cpu(input, output_cpu, params); + auto end = std::chrono::high_resolution_clock::now(); + cpu_times.push_back(std::chrono::duration(end - start).count()); + } + + // Benchmark CUDA + std::vector cuda_times; + for (int i = 0; i < BENCHMARK_RUNS; ++i) { + auto start = std::chrono::high_resolution_clock::now(); + apply_bilateral_filter_cuda(input.data.data(), output_cuda.data.data(), input.width, + input.height, input.channels, params.radius, + params.sigma_spatial, params.sigma_color); + auto end = std::chrono::high_resolution_clock::now(); + cuda_times.push_back(std::chrono::duration(end - start).count()); + } + + // Benchmark OpenCV CPU + std::vector cv_times; + for (int i = 0; i < BENCHMARK_RUNS; ++i) { + auto start = 
std::chrono::high_resolution_clock::now(); + apply_bilateral_filter_opencv(input, output_opencv, params); + auto end = std::chrono::high_resolution_clock::now(); + cv_times.push_back(std::chrono::duration(end - start).count()); + } + + auto cpu_stats = compute_stats(cpu_times); + auto cuda_stats = compute_stats(cuda_times); + auto cv_stats = compute_stats(cv_times); + + double mae_cpu_cv = compute_mae(output_cpu, output_opencv); + double mae_cuda_cv = compute_mae(output_cuda, output_opencv); + double psnr_cpu_cv = compute_psnr(output_cpu, output_opencv); + double psnr_cuda_cv = compute_psnr(output_cuda, output_opencv); + + // OpenCV CUDA baseline +#ifdef HAVE_OPENCV_CUDA + ImageData output_cv_cuda; + BenchmarkResult cv_cuda_stats; + bool have_cv_cuda = + bench_opencv_cuda(input, params, output_cv_cuda, cv_cuda_stats, megapixels); + double mae_cv_cuda = -1.0, psnr_cv_cuda = -1.0; + if (have_cv_cuda) { + mae_cv_cuda = compute_mae(output_cv_cuda, output_opencv); + psnr_cv_cuda = compute_psnr(output_cv_cuda, output_opencv); + } +#endif + + printf("\n=== Benchmark Results (mean ± stddev) ===\n"); + printf("CPU : %.2f ± %.2f ms [min=%.2f, max=%.2f] (%.2f MP/s)\n", cpu_stats.mean_ms, + cpu_stats.stddev_ms, cpu_stats.min_ms, cpu_stats.max_ms, + megapixels / (cpu_stats.mean_ms / 1000.0)); + printf("CUDA : %.2f ± %.2f ms [min=%.2f, max=%.2f] (%.2f MP/s)\n", cuda_stats.mean_ms, + cuda_stats.stddev_ms, cuda_stats.min_ms, cuda_stats.max_ms, + megapixels / (cuda_stats.mean_ms / 1000.0)); + printf("OpenCV : %.2f ± %.2f ms [min=%.2f, max=%.2f] (%.2f MP/s)\n", cv_stats.mean_ms, + cv_stats.stddev_ms, cv_stats.min_ms, cv_stats.max_ms, + megapixels / (cv_stats.mean_ms / 1000.0)); +#ifdef HAVE_OPENCV_CUDA + if (have_cv_cuda) { + printf("OpenCVCUDA: %.2f ± %.2f ms [min=%.2f, max=%.2f] (%.2f MP/s)\n", + cv_cuda_stats.mean_ms, cv_cuda_stats.stddev_ms, cv_cuda_stats.min_ms, + cv_cuda_stats.max_ms, megapixels / (cv_cuda_stats.mean_ms / 1000.0)); + } +#endif + + printf("\nSpeedup (based on 
mean):\n"); + printf(" CUDA vs CPU: %.2fx\n", cpu_stats.mean_ms / cuda_stats.mean_ms); + printf(" CUDA vs OpenCV: %.2fx\n", cv_stats.mean_ms / cuda_stats.mean_ms); + printf(" OpenCV vs CPU: %.2fx\n", cpu_stats.mean_ms / cv_stats.mean_ms); +#ifdef HAVE_OPENCV_CUDA + if (have_cv_cuda) { + printf(" CUDA vs OpenCVCUDA: %.2fx\n", cv_cuda_stats.mean_ms / cuda_stats.mean_ms); + } +#endif + + printf("\nQuality (vs OpenCV CPU):\n"); + printf(" CPU: MAE=%.4f PSNR=%.2f dB %s\n", mae_cpu_cv, psnr_cpu_cv, + mae_cpu_cv < 1.0 ? "✓" : "✗"); + printf(" CUDA: MAE=%.4f PSNR=%.2f dB %s\n", mae_cuda_cv, psnr_cuda_cv, + mae_cuda_cv < 1.0 ? "✓" : "✗"); +#ifdef HAVE_OPENCV_CUDA + if (have_cv_cuda) { + printf(" OpenCVCUDA: MAE=%.4f PSNR=%.2f dB %s\n", mae_cv_cuda, psnr_cv_cuda, + mae_cv_cuda < 1.0 ? "✓" : "✗"); + } +#endif + + } else if (bench_mode) { + // Benchmark CUDA vs OpenCV (skip slow CPU) + ImageData output_cuda, output_opencv; + + output_cuda.width = input.width; + output_cuda.height = input.height; + output_cuda.channels = input.channels; + output_cuda.data.resize(input.size()); + + printf("\nBenchmarking CUDA vs OpenCV (warmup=%d, runs=%d)...\n", WARMUP_RUNS, + BENCHMARK_RUNS); + + // Warmup + for (int i = 0; i < WARMUP_RUNS; ++i) { + apply_bilateral_filter_cuda(input.data.data(), output_cuda.data.data(), input.width, + input.height, input.channels, params.radius, + params.sigma_spatial, params.sigma_color); + } + + // Benchmark CUDA + std::vector cuda_times; + for (int i = 0; i < BENCHMARK_RUNS; ++i) { + auto start = std::chrono::high_resolution_clock::now(); + apply_bilateral_filter_cuda(input.data.data(), output_cuda.data.data(), input.width, + input.height, input.channels, params.radius, + params.sigma_spatial, params.sigma_color); + auto end = std::chrono::high_resolution_clock::now(); + cuda_times.push_back(std::chrono::duration(end - start).count()); + } + + // Warmup OpenCV CPU + for (int i = 0; i < WARMUP_RUNS; ++i) { + apply_bilateral_filter_opencv(input, 
output_opencv, params); + } + + // Benchmark OpenCV CPU + std::vector cv_times; + for (int i = 0; i < BENCHMARK_RUNS; ++i) { + auto start = std::chrono::high_resolution_clock::now(); + apply_bilateral_filter_opencv(input, output_opencv, params); + auto end = std::chrono::high_resolution_clock::now(); + cv_times.push_back(std::chrono::duration(end - start).count()); + } + + auto cuda_stats = compute_stats(cuda_times); + auto cv_stats = compute_stats(cv_times); + double mae = compute_mae(output_cuda, output_opencv); + double psnr = compute_psnr(output_cuda, output_opencv); + + // OpenCV CUDA baseline +#ifdef HAVE_OPENCV_CUDA + ImageData output_cv_cuda; + BenchmarkResult cv_cuda_stats; + bool have_cv_cuda = + bench_opencv_cuda(input, params, output_cv_cuda, cv_cuda_stats, megapixels); + double mae_cv_cuda = -1.0, psnr_cv_cuda = -1.0; + if (have_cv_cuda) { + mae_cv_cuda = compute_mae(output_cv_cuda, output_opencv); + psnr_cv_cuda = compute_psnr(output_cv_cuda, output_opencv); + } +#endif + + printf("\n=== Benchmark Results ===\n"); + printf("CUDA : %.2f ± %.2f ms [min=%.2f, max=%.2f] (%.2f MP/s)\n", cuda_stats.mean_ms, + cuda_stats.stddev_ms, cuda_stats.min_ms, cuda_stats.max_ms, + megapixels / (cuda_stats.mean_ms / 1000.0)); + printf("OpenCV : %.2f ± %.2f ms [min=%.2f, max=%.2f] (%.2f MP/s)\n", cv_stats.mean_ms, + cv_stats.stddev_ms, cv_stats.min_ms, cv_stats.max_ms, + megapixels / (cv_stats.mean_ms / 1000.0)); +#ifdef HAVE_OPENCV_CUDA + if (have_cv_cuda) { + printf("OpenCVCUDA: %.2f ± %.2f ms [min=%.2f, max=%.2f] (%.2f MP/s)\n", + cv_cuda_stats.mean_ms, cv_cuda_stats.stddev_ms, cv_cuda_stats.min_ms, + cv_cuda_stats.max_ms, megapixels / (cv_cuda_stats.mean_ms / 1000.0)); + } +#endif + + printf("\nSpeedup: CUDA is %.2fx %s than OpenCV\n", + cv_stats.mean_ms > cuda_stats.mean_ms ? cv_stats.mean_ms / cuda_stats.mean_ms + : cuda_stats.mean_ms / cv_stats.mean_ms, + cv_stats.mean_ms > cuda_stats.mean_ms ? 
"faster" : "slower"); +#ifdef HAVE_OPENCV_CUDA + if (have_cv_cuda) { + printf("Speedup: CUDA is %.2fx %s than OpenCVCUDA\n", + cv_cuda_stats.mean_ms > cuda_stats.mean_ms + ? cv_cuda_stats.mean_ms / cuda_stats.mean_ms + : cuda_stats.mean_ms / cv_cuda_stats.mean_ms, + cv_cuda_stats.mean_ms > cuda_stats.mean_ms ? "faster" : "slower"); + } +#endif + + printf("MAE: %.4f PSNR: %.2f dB %s\n", mae, psnr, mae < 1.0 ? "✓" : "✗"); +#ifdef HAVE_OPENCV_CUDA + if (have_cv_cuda) { + printf("OpenCVCUDA vs OpenCV: MAE=%.4f PSNR=%.2f dB %s\n", mae_cv_cuda, psnr_cv_cuda, + mae_cv_cuda < 1.0 ? "✓" : "✗"); + } +#endif + + } else if (compare_mode) { + ImageData output_cpu, output_opencv; + + printf("\nBenchmarking (warmup=%d, runs=%d)...\n", WARMUP_RUNS, BENCHMARK_RUNS); + + // Warmup + for (int i = 0; i < WARMUP_RUNS; ++i) { + apply_bilateral_filter_cpu(input, output_cpu, params); + apply_bilateral_filter_opencv(input, output_opencv, params); + } + + // Benchmark + std::vector<double> cpu_times, cv_times; + for (int i = 0; i < BENCHMARK_RUNS; ++i) { + auto start = std::chrono::high_resolution_clock::now(); + apply_bilateral_filter_cpu(input, output_cpu, params); + auto end = std::chrono::high_resolution_clock::now(); + cpu_times.push_back(std::chrono::duration<double, std::milli>(end - start).count()); + } + for (int i = 0; i < BENCHMARK_RUNS; ++i) { + auto start = std::chrono::high_resolution_clock::now(); + apply_bilateral_filter_opencv(input, output_opencv, params); + auto end = std::chrono::high_resolution_clock::now(); + cv_times.push_back(std::chrono::duration<double, std::milli>(end - start).count()); + } + + auto cpu_stats = compute_stats(cpu_times); + auto cv_stats = compute_stats(cv_times); + double mae = compute_mae(output_cpu, output_opencv); + double psnr = compute_psnr(output_cpu, output_opencv); + + printf("\n=== Benchmark Results ===\n"); + printf("CPU : %.2f ± %.2f ms (%.2f MP/s)\n", cpu_stats.mean_ms, cpu_stats.stddev_ms, + megapixels / (cpu_stats.mean_ms / 1000.0)); + printf("OpenCV : %.2f ± %.2f ms (%.2f 
MP/s)\n", cv_stats.mean_ms, cv_stats.stddev_ms, + megapixels / (cv_stats.mean_ms / 1000.0)); + printf("Speedup: %.2fx\n", cpu_stats.mean_ms / cv_stats.mean_ms); + printf("MAE: %.4f PSNR: %.2f dB %s\n", mae, psnr, mae < 1.0 ? "✓" : "✗"); + + } else if (cuda_only) { + ImageData output; + output.width = input.width; + output.height = input.height; + output.channels = input.channels; + output.data.resize(input.size()); + + // Warmup + for (int i = 0; i < WARMUP_RUNS; ++i) { + apply_bilateral_filter_cuda(input.data.data(), output.data.data(), input.width, + input.height, input.channels, params.radius, + params.sigma_spatial, params.sigma_color); + } + + // Benchmark + std::vector<double> times; + for (int i = 0; i < BENCHMARK_RUNS; ++i) { + auto start = std::chrono::high_resolution_clock::now(); + apply_bilateral_filter_cuda(input.data.data(), output.data.data(), input.width, + input.height, input.channels, params.radius, + params.sigma_spatial, params.sigma_color); + auto end = std::chrono::high_resolution_clock::now(); + times.push_back(std::chrono::duration<double, std::milli>(end - start).count()); + } + + auto stats = compute_stats(times); + printf("CUDA: %.2f ± %.2f ms [min=%.2f, max=%.2f] (%.2f MP/s)\n", stats.mean_ms, + stats.stddev_ms, stats.min_ms, stats.max_ms, megapixels / (stats.mean_ms / 1000.0)); + + if (!save_raw_image(output_path, output)) { + return 1; + } + printf("Output saved to: %s\n", output_path); + + } else if (opencv_only) { + ImageData output; + + // Warmup + for (int i = 0; i < WARMUP_RUNS; ++i) { + apply_bilateral_filter_opencv(input, output, params); + } + + // Benchmark + std::vector<double> times; + for (int i = 0; i < BENCHMARK_RUNS; ++i) { + auto start = std::chrono::high_resolution_clock::now(); + apply_bilateral_filter_opencv(input, output, params); + auto end = std::chrono::high_resolution_clock::now(); + times.push_back(std::chrono::duration<double, std::milli>(end - start).count()); + } + + auto stats = compute_stats(times); + printf("OpenCV: %.2f ± %.2f ms (%.2f MP/s)\n", 
stats.mean_ms, stats.stddev_ms, + megapixels / (stats.mean_ms / 1000.0)); + + if (!save_raw_image(output_path, output)) { + return 1; + } + printf("Output saved to: %s\n", output_path); + + } else { + ImageData output; + + auto start = std::chrono::high_resolution_clock::now(); + apply_bilateral_filter_cpu(input, output, params); + auto end = std::chrono::high_resolution_clock::now(); + + double elapsed_ms = std::chrono::duration<double, std::milli>(end - start).count(); + printf("CPU time: %.2f ms (%.2f MP/s)\n", elapsed_ms, megapixels / (elapsed_ms / 1000.0)); + + if (!save_raw_image(output_path, output)) { + return 1; + } + printf("Output saved to: %s\n", output_path); + } + + return 0; +} diff --git a/08_bilateral_filter/Andy1314Chen/tests/test_data/params.txt b/08_bilateral_filter/Andy1314Chen/tests/test_data/params.txt new file mode 100644 index 0000000..6fa2360 --- /dev/null +++ b/08_bilateral_filter/Andy1314Chen/tests/test_data/params.txt @@ -0,0 +1,3 @@ +radius = 5 +sigma_spatial = 3.0 +sigma_color = 30.0