From db80aad0f21cedccc85d1eca211c4286a18a198e Mon Sep 17 00:00:00 2001
From: Marco Barbone
Date: Fri, 26 Jul 2024 15:44:47 -0400
Subject: [PATCH] added comments for review

---
 CHANGELOG                        |  1 +
 include/cufinufft/impl.h         |  1 +
 include/cufinufft/spreadinterp.h | 24 +++++++++++++++++-------
 include/cufinufft/utils.h        | 25 ++++++++++++++++++++-----
 src/cuda/common.cu               | 19 +++++++++----------
 5 files changed, 48 insertions(+), 22 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index ba024e07f..d25d7e5d7 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -47,6 +47,7 @@ V 2.3.0beta (7/21/24)
   precision denormals to 0 and using fma where possible.
 * cuFINUFFT using intrinsics in foldrescale and other places to increase
   performance
+* cuFINUFFT using SM90 float2 vector atomicAdd where supported

 V 2.2.0 (12/12/23)

diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h
index dcf00f31b..c3021a7ff 100644
--- a/include/cufinufft/impl.h
+++ b/include/cufinufft/impl.h
@@ -60,6 +60,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
         Variables and arrays inside the plan struct are set and allocated.
         Melody Shih 07/25/19. Use-facing moved to markdown, Barnett 2/16/21.
+        Marco Barbone 07/26/24. Use the SM (shared-memory) method when enough shared memory is available.
 */

   int ier;
   cuDoubleComplex *d_a = nullptr; // fseries temp data
diff --git a/include/cufinufft/spreadinterp.h b/include/cufinufft/spreadinterp.h
index 0ab7aba9a..2963d381d 100644
--- a/include/cufinufft/spreadinterp.h
+++ b/include/cufinufft/spreadinterp.h
@@ -12,8 +12,10 @@ namespace spreadinterp {
 template<typename T>
 static __forceinline__ __device__ constexpr T fma(const T a, const T b, const T c) {
   if constexpr (std::is_same_v<T, float>) {
+    // fused multiply-add, round to nearest even
     return __fmaf_rn(a, b, c);
   } else if constexpr (std::is_same_v<T, double>) {
+    // fused multiply-add, round to nearest even
     return __fma_rn(a, b, c);
   }
   static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
@@ -21,23 +23,31 @@ static __forceinline__ __device__ constexpr T fma(const T a, const T b, const T
   return T{0};
 }

-template<typename T, typename V>
-constexpr __forceinline__ __host__ __device__ T fold_rescale(const T x, const V N) {
+template<typename T>
+constexpr __forceinline__ __host__ __device__ T fold_rescale(T x, int N) {
   constexpr auto x2pi = T(0.159154943091895345554011992339482617);
   constexpr auto half = T(0.5);
 #if defined(__CUDA_ARCH__)
   if constexpr (std::is_same_v<T, float>) {
-    const auto result = fma(x, x2pi, half);
-    return (result - floorf(result)) * static_cast<T>(N);
+    // fused multiply-add, round to nearest even
+    auto result = __fmaf_rn(x, x2pi, half);
+    // subtract, round down
+    result = __fsub_rd(result, floorf(result));
+    // multiply, round down so the result stays in [0, N)
+    return __fmul_rd(result, static_cast<T>(N));
   } else if constexpr (std::is_same_v<T, double>) {
-    const auto result = fma(x, x2pi, half);
-    return (result - floor(result)) * static_cast<T>(N);
+    // fused multiply-add, round to nearest even
+    auto result = __fma_rn(x, x2pi, half);
+    // subtract, round down
+    result = __dsub_rd(result, floor(result));
+    // multiply, round down so the result stays in [0, N)
+    return __dmul_rd(result, static_cast<T>(N));
   } else {
     static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
                   "Only float and double are supported.");
   }
 #else
-  const auto result = fma(x, x2pi, half);
+  const auto result = std::fma(x, x2pi, half);
   return (result - std::floor(result)) * static_cast<T>(N);
 #endif
 }
diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h
index f556da8d6..b4db528ae 100644
--- a/include/cufinufft/utils.h
+++ b/include/cufinufft/utils.h
@@ -74,11 +74,14 @@ template<typename T> T infnorm(int n, std::complex<T> *a) {
 #ifdef __CUDA_ARCH__
 __forceinline__ __device__ auto interval(const int ns, const float x) {
+  // float-to-int conversion and fused multiply-add, both rounding up
   const auto xstart = __float2int_ru(__fmaf_ru(ns, -.5f, x));
-  const auto xend = __float2int_rd(__fmaf_rd(ns, .5f, x));
+  // float-to-int conversion and fused multiply-add, both rounding down
+  const auto xend   = __float2int_rd(__fmaf_rd(ns, .5f, x));
   return int2{xstart, xend};
 }

 __forceinline__ __device__ auto interval(const int ns, const double x) {
+  // same as above, double-precision version
   const auto xstart = __double2int_ru(__fma_ru(ns, -.5, x));
   const auto xend = __double2int_rd(__fma_rd(ns, .5, x));
   return int2{xstart, xend};
@@ -107,17 +110,29 @@ __forceinline__ __device__ auto interval(const int ns, const double x)
 #define COMPUTE_CAPABILITY_90_OR_HIGHER 0
 #endif

+/**
+ * Atomically adds a complex number to a shared-memory address.
+ * The real and imaginary parts are added with two separate scalar
+ * atomicAdd calls, since CUDA has no atomic operation on a complex
+ * (vector) type in shared memory.
+ */
+
 template<typename T>
-static __forceinline__ __device__ void atomicAddComplexShared(
-    cuda_complex<T> *address, cuda_complex<T> res) {
+static __forceinline__ __device__ void atomicAddComplexShared(cuda_complex<T> *address,
+                                                              cuda_complex<T> res) {
   const auto raw_address = reinterpret_cast<T *>(address);
   atomicAdd(raw_address, res.x);
   atomicAdd(raw_address + 1, res.y);
 }

+/**
+ * Atomically adds a complex number to a global-memory address.
+ * Compute capability 9.0+ (SM90) provides a vectorized float2 atomicAdd
+ * for global memory, so we use it when compiling for those devices.
+ */
 template<typename T>
-static __forceinline__ __device__ void atomicAddComplexGlobal(
-    cuda_complex<T> *address, cuda_complex<T> res) {
+static __forceinline__ __device__ void atomicAddComplexGlobal(cuda_complex<T> *address,
+                                                              cuda_complex<T> res) {
   if constexpr (
       std::is_same_v<cuda_complex<T>, float2> && COMPUTE_CAPABILITY_90_OR_HIGHER) {
     atomicAdd(address, res);
diff --git a/src/cuda/common.cu b/src/cuda/common.cu
index eba170a24..19b0cbd1a 100644
--- a/src/cuda/common.cu
+++ b/src/cuda/common.cu
@@ -202,8 +202,7 @@ void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex

 std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y,
                                    int bin_size_z) {
-  // printf("dim, ns, bin_size_x, bin_size_y, bin_size_z: %d %d %d %d %d\n", dim, ns,
-  //        bin_size_x, bin_size_y, bin_size_z);
+  // shared memory required by the SM (shared-memory) spreader for these bin sizes
   int adjusted_ns = bin_size_x + ((ns + 1) / 2) * 2;

   if (dim == 1) {
@@ -221,17 +220,18 @@ std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y,
   return adjusted_ns * sizeof(cuda_complex<T>);
 }

-// Function to find bin_size_x == bin_size_y where bin_size_x * bin_size_y < MemSize
-template<typename T> int find_bin_size(std::size_t MemSize, int dim, int ns) {
+// Finds the largest bin_size_x == bin_size_y == bin_size_z such that
+// shared_memory_required(dim, ns, binsize, binsize, binsize) fits within mem_size
+// TODO: this can be computed directly with a closed-form formula instead of a loop
+template<typename T> int find_bin_size(std::size_t mem_size, int dim, int ns) {
   int binsize = 1; // Start with the smallest possible bin size
-
   while (true) {
     // Calculate the shared memory required for the current bin_size_x and bin_size_y
     std::size_t required_memory =
         shared_memory_required<T>(dim, ns, binsize, binsize, binsize);

     // Check if the required memory is less than the available memory
-    if (required_memory > MemSize) {
+    if (required_memory > mem_size) {
       // If the condition is met, return the current bin_size_x
       return binsize - 1;
     }
@@ -243,6 +243,9 @@ template<typename T> int find_bin_size(std::size_t MemSize, int dim, int ns) {

 template<typename T>
 void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) {
+  // Marco Barbone 07/26/24. Use the shared memory available on the device to
+  // determine the optimal bin size for the spreader.
+  // TODO: this can still be improved; some sizes are still hardcoded
   int shared_mem_per_block{}, device_id{};
   switch (dim) {
   case 1: {
@@ -290,10 +293,6 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) {
       } break;
       }
     }
-    // const auto shared_mem_required = shared_memory_required(
-    //     dim, ns, opts->gpu_binsizex, opts->gpu_binsizey, opts->gpu_binsizez);
-    // printf("binsizex: %d, binsizey: %d, shared_mem_required %ld (bytes)\n",
-    //        opts->gpu_binsizex, opts->gpu_binsizey, shared_mem_required);
     opts->gpu_binsizez = 1;
   } break;
   case 3: {
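
Review note (not part of the patch): a minimal host-side reference of the fold_rescale semantics above. It mirrors the #else (non-device) branch of the patched function and can serve as a check for the intrinsics path. The name fold_rescale_ref is illustrative only.

    #include <cmath>

    template<typename T>
    T fold_rescale_ref(T x, int N) {
      constexpr T x2pi = T(0.159154943091895345554011992339482617); // 1/(2*pi)
      const T r = std::fma(x, x2pi, T(0.5));          // map the 2*pi-periodic coordinate into [0, 1)
      return (r - std::floor(r)) * static_cast<T>(N); // fold and rescale to [0, N)
    }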
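Review note: a minimal sketch of the capability-gated complex add behind the SM90 CHANGELOG entry and atomicAddComplexGlobal, using the vectorized float2 atomicAdd that CUDA exposes for global memory on compute capability 9.0 and newer. The helper name add_complex is illustrative only; the patch itself dispatches through COMPUTE_CAPABILITY_90_OR_HIGHER and if constexpr rather than this macro guard.

    __device__ void add_complex(float2 *dst, float2 v) {
    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
      atomicAdd(dst, v);       // one vectorized atomic (SM90+, global memory)
    #else
      atomicAdd(&dst->x, v.x); // per-component fallback, as in atomicAddComplexShared
      atomicAdd(&dst->y, v.y);
    #endif
    }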
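Review note on the find_bin_size TODO: a possible closed-form replacement for the search loop, assuming shared_memory_required(dim, ns, b, b, b) grows as (b + 2*ceil(ns/2))^dim elements of cuda_complex<T>. That matches the 1D case visible in the hunk, but the 2D/3D bodies are not shown here, so treat this as a sketch rather than a drop-in; the function name is hypothetical.

    #include <cmath>
    #include <cstddef>

    template<typename T>
    int find_bin_size_closed_form(std::size_t mem_size, int dim, int ns) {
      const int padding  = ((ns + 1) / 2) * 2;                  // same kernel padding as adjusted_ns
      const double elems = double(mem_size) / (2 * sizeof(T));  // a cuda_complex<T> holds two Ts
      const int padded_edge = int(std::floor(std::pow(elems, 1.0 / dim)));
      return padded_edge - padding;                             // largest unpadded bin edge; clamp to >= 1 if needed
    }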