From db80aad0f21cedccc85d1eca211c4286a18a198e Mon Sep 17 00:00:00 2001
From: Marco Barbone
Date: Fri, 26 Jul 2024 15:44:47 -0400
Subject: [PATCH] added comments for review

---
 CHANGELOG                        |  1 +
 include/cufinufft/impl.h         |  1 +
 include/cufinufft/spreadinterp.h | 24 +++++++++++++++++-------
 include/cufinufft/utils.h        | 25 ++++++++++++++++++++-----
 src/cuda/common.cu               | 19 +++++++++----------
 5 files changed, 48 insertions(+), 22 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index ba024e07f..d25d7e5d7 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -47,6 +47,7 @@ V 2.3.0beta (7/21/24)
   precision denormals to 0 and using fma where possible.
 * cuFINUFFT using intrinsics in foldrescale and other places to increase
   performance
+* cuFINUFFT using SM90 float2 vector atomicAdd where supported

 V 2.2.0 (12/12/23)

diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h
index dcf00f31b..c3021a7ff 100644
--- a/include/cufinufft/impl.h
+++ b/include/cufinufft/impl.h
@@ -60,6 +60,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
         Variables and arrays inside the plan struct are set and allocated.
         Melody Shih 07/25/19. Use-facing moved to markdown, Barnett 2/16/21.
+        Marco Barbone 07/26/24. Use the SM (shared-memory) method when enough shared memory is available.
 */

   int ier;
   cuDoubleComplex *d_a = nullptr; // fseries temp data
diff --git a/include/cufinufft/spreadinterp.h b/include/cufinufft/spreadinterp.h
index 0ab7aba9a..2963d381d 100644
--- a/include/cufinufft/spreadinterp.h
+++ b/include/cufinufft/spreadinterp.h
@@ -12,8 +12,10 @@ namespace spreadinterp {
 template<typename T>
 static __forceinline__ __device__ constexpr T fma(const T a, const T b, const T c) {
   if constexpr (std::is_same_v<T, float>) {
+    // fused multiply-add, round to nearest even
     return __fmaf_rn(a, b, c);
   } else if constexpr (std::is_same_v<T, double>) {
+    // fused multiply-add, round to nearest even
     return __fma_rn(a, b, c);
   }
   static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
@@ -21,23 +23,31 @@ static __forceinline__ __device__ constexpr T fma(const T a, const T b, const T
   return T{0};
 }

-template<typename T, typename V>
-constexpr __forceinline__ __host__ __device__ T fold_rescale(const T x, const V N) {
+template<typename T>
+constexpr __forceinline__ __host__ __device__ T fold_rescale(T x, int N) {
   constexpr auto x2pi = T(0.159154943091895345554011992339482617);
   constexpr auto half = T(0.5);
 #if defined(__CUDA_ARCH__)
   if constexpr (std::is_same_v<T, float>) {
-    const auto result = fma(x, x2pi, half);
-    return (result - floorf(result)) * static_cast<T>(N);
+    // fused multiply-add, round to nearest even
+    auto result = __fmaf_rn(x, x2pi, half);
+    // subtract, round down
+    result = __fsub_rd(result, floorf(result));
+    // multiply, round down so the result stays in [0, N)
+    return __fmul_rd(result, static_cast<T>(N));
   } else if constexpr (std::is_same_v<T, double>) {
-    const auto result = fma(x, x2pi, half);
-    return (result - floor(result)) * static_cast<T>(N);
+    // fused multiply-add, round to nearest even
+    auto result = __fma_rn(x, x2pi, half);
+    // subtract, round down
+    result = __dsub_rd(result, floor(result));
+    // multiply, round down so the result stays in [0, N)
+    return __dmul_rd(result, static_cast<T>(N));
   } else {
     static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
                   "Only float and double are supported.");
   }
 #else
-  const auto result = fma(x, x2pi, half);
+  const auto result = std::fma(x, x2pi, half);
   return (result - std::floor(result)) * static_cast<T>(N);
 #endif
 }
diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h
index f556da8d6..b4db528ae 100644
--- a/include/cufinufft/utils.h
+++ b/include/cufinufft/utils.h
@@ -74,11 +74,14 @@ template<typename T> T infnorm(int n, std::complex<T> *a) {
 #ifdef __CUDA_ARCH__
 __forceinline__ __device__ auto interval(const int ns, const float x) {
+  // float-to-int conversion and fused multiply-add, both rounding up
   const auto xstart = __float2int_ru(__fmaf_ru(ns, -.5f, x));
-  const auto xend = __float2int_rd(__fmaf_rd(ns, .5f, x));
+  // float-to-int conversion and fused multiply-add, both rounding down
+  const auto xend   = __float2int_rd(__fmaf_rd(ns, .5f, x));
   return int2{xstart, xend};
 }

 __forceinline__ __device__ auto interval(const int ns, const double x) {
+  // same as above, double-precision version
   const auto xstart = __double2int_ru(__fma_ru(ns, -.5, x));
   const auto xend = __double2int_rd(__fma_rd(ns, .5, x));
   return int2{xstart, xend};
@@ -107,17 +110,29 @@ __forceinline__ __device__ auto interval(const int ns, const double x)
 #define COMPUTE_CAPABILITY_90_OR_HIGHER 0
 #endif

+/**
+ * Atomically adds a complex number to a shared-memory address.
+ * The real and imaginary parts are added with two separate scalar
+ * atomicAdd calls, since CUDA has no atomic operation on a complex
+ * (vector) type in shared memory.
+ */
+
 template<typename T>
-static __forceinline__ __device__ void atomicAddComplexShared(
-    cuda_complex<T> *address, cuda_complex<T> res) {
+static __forceinline__ __device__ void atomicAddComplexShared(cuda_complex<T> *address,
+                                                              cuda_complex<T> res) {
   const auto raw_address = reinterpret_cast<T *>(address);
   atomicAdd(raw_address, res.x);
   atomicAdd(raw_address + 1, res.y);
 }

+/**
+ * Atomically adds a complex number to a global-memory address.
+ * Compute capability 9.0+ (SM90) provides a vectorized float2 atomicAdd
+ * for global memory, so we use it when compiling for those devices.
+ */
 template<typename T>
-static __forceinline__ __device__ void atomicAddComplexGlobal(
-    cuda_complex<T> *address, cuda_complex<T> res) {
+static __forceinline__ __device__ void atomicAddComplexGlobal(cuda_complex<T> *address,
+                                                              cuda_complex<T> res) {
   if constexpr (
       std::is_same_v<cuda_complex<T>, float2> && COMPUTE_CAPABILITY_90_OR_HIGHER) {
     atomicAdd(address, res);
diff --git a/src/cuda/common.cu b/src/cuda/common.cu
index eba170a24..19b0cbd1a 100644
--- a/src/cuda/common.cu
+++ b/src/cuda/common.cu
@@ -202,8 +202,7 @@ void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex

 std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y,
                                    int bin_size_z) {
-  // printf("dim, ns, bin_size_x, bin_size_y, bin_size_z: %d %d %d %d %d\n", dim, ns,
-  //        bin_size_x, bin_size_y, bin_size_z);
+  // shared memory required by the SM (shared-memory) spreader for these bin sizes
   int adjusted_ns = bin_size_x + ((ns + 1) / 2) * 2;

   if (dim == 1) {
@@ -221,17 +220,18 @@ std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y,
   return adjusted_ns * sizeof(cuda_complex<T>);
 }

-// Function to find bin_size_x == bin_size_y where bin_size_x * bin_size_y < MemSize
-template<typename T> int find_bin_size(std::size_t MemSize, int dim, int ns) {
+// Finds the largest bin_size_x == bin_size_y == bin_size_z such that
+// shared_memory_required(dim, ns, binsize, binsize, binsize) fits within mem_size
+// TODO: this can be computed directly with a closed-form formula instead of a loop
+template<typename T> int find_bin_size(std::size_t mem_size, int dim, int ns) {
   int binsize = 1; // Start with the smallest possible bin size
-
   while (true) {
     // Calculate the shared memory required for the current bin_size_x and bin_size_y
     std::size_t required_memory =
         shared_memory_required<T>(dim, ns, binsize, binsize, binsize);

     // Check if the required memory is less than the available memory
-    if (required_memory > MemSize) {
+    if (required_memory > mem_size) {
       // If the condition is met, return the current bin_size_x
       return binsize - 1;
     }
@@ -243,6 +243,9 @@ template<typename T> int find_bin_size(std::size_t MemSize, int dim, int ns) {

 template<typename T>
 void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) {
+  // Marco Barbone 07/26/24. Use the shared memory available on the device to
+  // determine the optimal bin size for the spreader.
+  // TODO: this can still be improved; some sizes are still hardcoded
   int shared_mem_per_block{}, device_id{};
   switch (dim) {
   case 1: {
@@ -290,10 +293,6 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) {
       } break;
       }
     }
-    // const auto shared_mem_required = shared_memory_required(
-    //     dim, ns, opts->gpu_binsizex, opts->gpu_binsizey, opts->gpu_binsizez);
-    // printf("binsizex: %d, binsizey: %d, shared_mem_required %ld (bytes)\n",
-    //        opts->gpu_binsizex, opts->gpu_binsizey, shared_mem_required);
     opts->gpu_binsizez = 1;
   } break;
   case 3: {
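
Review note (not part of the patch): a minimal host-side reference of the fold_rescale semantics above. It mirrors the #else (non-device) branch of the patched function and can serve as a check for the intrinsics path. The name fold_rescale_ref is illustrative only.

    #include <cmath>

    template<typename T>
    T fold_rescale_ref(T x, int N) {
      constexpr T x2pi = T(0.159154943091895345554011992339482617); // 1/(2*pi)
      const T r = std::fma(x, x2pi, T(0.5));          // map the 2*pi-periodic coordinate into [0, 1)
      return (r - std::floor(r)) * static_cast<T>(N); // fold and rescale to [0, N)
    }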
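Review note: a minimal sketch of the capability-gated complex add behind the SM90 CHANGELOG entry and atomicAddComplexGlobal, using the vectorized float2 atomicAdd that CUDA exposes for global memory on compute capability 9.0 and newer. The helper name add_complex is illustrative only; the patch itself dispatches through COMPUTE_CAPABILITY_90_OR_HIGHER and if constexpr rather than this macro guard.

    __device__ void add_complex(float2 *dst, float2 v) {
    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
      atomicAdd(dst, v);       // one vectorized atomic (SM90+, global memory)
    #else
      atomicAdd(&dst->x, v.x); // per-component fallback, as in atomicAddComplexShared
      atomicAdd(&dst->y, v.y);
    #endif
    }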
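Review note on the find_bin_size TODO: a possible closed-form replacement for the search loop, assuming shared_memory_required(dim, ns, b, b, b) grows as (b + 2*ceil(ns/2))^dim elements of cuda_complex<T>. That matches the 1D case visible in the hunk, but the 2D/3D bodies are not shown here, so treat this as a sketch rather than a drop-in; the function name is hypothetical.

    #include <cmath>
    #include <cstddef>

    template<typename T>
    int find_bin_size_closed_form(std::size_t mem_size, int dim, int ns) {
      const int padding  = ((ns + 1) / 2) * 2;                  // same kernel padding as adjusted_ns
      const double elems = double(mem_size) / (2 * sizeof(T));  // a cuda_complex<T> holds two Ts
      const int padded_edge = int(std::floor(std::pow(elems, 1.0 / dim)));
      return padded_edge - padding;                             // largest unpadded bin edge; clamp to >= 1 if needed
    }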