From ae783da138028538738616332675c0da73b5bb1c Mon Sep 17 00:00:00 2001
From: Marco Barbone <mbarbone@flatironinstitute.org>
Date: Wed, 24 Jul 2024 17:31:22 -0400
Subject: [PATCH] pick good defaults for gpu_method and bin size

---
 CMakeLists.txt                  |  4 ++--
 examples/CMakeLists.txt         |  4 ----
 include/cufinufft/impl.h        | 29 ++++++++++++++---------------
 perftest/cuda/bench.py          | 10 ++++++----
 src/cuda/3d/spread3d_wrapper.cu |  1 +
 src/cuda/common.cu              | 15 ++++++---------
 6 files changed, 29 insertions(+), 34 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 93a34f2af..3c9b84f3b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -271,7 +271,7 @@ if (FINUFFT_USE_CUDA)
     enable_language(CUDA)
     find_package(CUDAToolkit REQUIRED)
     add_subdirectory(src/cuda)
-    if (BUILD_TESTING OR FINUFFT_BUILD_TESTS)
+    if (FINUFFT_BUILD_TESTS)
         add_subdirectory(perftest/cuda)
         add_subdirectory(test/cuda)
     endif ()
@@ -280,7 +280,7 @@ if (FINUFFT_USE_CUDA)
 endif ()
 
 # Add tests defined in their own directory
-if (FINUFFT_USE_CPU AND (BUILD_TESTING OR FINUFFT_BUILD_TESTS))
+if (FINUFFT_USE_CPU AND FINUFFT_BUILD_TESTS)
     add_subdirectory(test)
     add_subdirectory(perftest)
 endif ()
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index af6f067bc..8b5afa4f5 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -21,7 +21,3 @@ if(FINUFFT_USE_OPENMP)
     enable_asan(${EXAMPLE})
   endforeach()
 endif()
-
-if (FINUFFT_USE_CUDA)
-  add_subdirectory(cuda)
-endif()
diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h
index 4a1c6ae31..7d63df51e 100644
--- a/include/cufinufft/impl.h
+++ b/include/cufinufft/impl.h
@@ -144,24 +144,23 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
      * For type 2, we always default to method 1 (GM). */
 
     // query the device for the amount of shared memory available
-    int shared_mem_per_block{};
-    cudaDeviceGetAttribute(&shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin,
-                           device_id);
-    RETURN_IF_CUDA_ERROR
-    // compute the amount of shared memory required for the method
-    const auto shared_mem_required =
-        shared_memory_required<T>(dim, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex,
-                                  d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez);
-    printf("Shared memory available: %d KB, required: %d KB\n", shared_mem_per_block,
-           shared_mem_required);
-    if ((shared_mem_required > shared_mem_per_block)) {
+    if (dim == 3 && std::is_same_v<T, double>) {
       d_plan->opts.gpu_method = 1;
-      printf("choosing method 1\n");
     } else {
-      d_plan->opts.gpu_method = 2;
-      printf("choosing method 2\n");
+      int shared_mem_per_block{};
+      cudaDeviceGetAttribute(&shared_mem_per_block,
+                             cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id);
+      RETURN_IF_CUDA_ERROR
+      // compute the amount of shared memory required for the method
+      const auto shared_mem_required = shared_memory_required<T>(
+          dim, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex,
+          d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez);
+      if ((shared_mem_required > shared_mem_per_block)) {
+        d_plan->opts.gpu_method = 1;
+      } else {
+        d_plan->opts.gpu_method = 2;
+      }
     }
-    printf("using method %d\n", d_plan->opts.gpu_method);
   }
 
   int fftsign = (iflag >= 0) ? 1 : -1;
diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py
index 8a9e757a3..aa21acd52 100644
--- a/perftest/cuda/bench.py
+++ b/perftest/cuda/bench.py
@@ -37,7 +37,7 @@ def build_args(args):
 # example command to run:
 # nsys profile -o cuperftest_profile ./cuperftest --prec f --n_runs 10 --method 1 --N1 256 --N2 256 --N3 256 --M 1E8 --tol 1E-6
 # example arguments
-args = {"--prec": "f",
+args = {"--prec": "d",
         "--n_runs": "5",
         "--method": "0",
         "--sort": "1",
@@ -71,8 +71,10 @@ def build_args(args):
 if stderr != '':
     print(stderr)
     exit(0)
-for i in range(1, 7):
-    args["--tol"] = "1E-" + str(i)
+max_range = 8 if args["--prec"] == "d" else 7
+
+for i in range(1, max_range):
+    args["--tol"] = "1E-" + ("0" if i < 10 else "") + str(i)
     print("Running with tol = 1E-" + str(i))
     for method in ['2', '1']:
         args["--method"] = method
@@ -180,4 +182,4 @@ def build_args(args):
 plt.savefig("bench.png")
 plt.savefig("bench.svg")
 plt.savefig("bench.pdf")
-plt.show()
\ No newline at end of file
+plt.show()
diff --git a/src/cuda/3d/spread3d_wrapper.cu b/src/cuda/3d/spread3d_wrapper.cu
index bf78ed905..4fb2b073d 100644
--- a/src/cuda/3d/spread3d_wrapper.cu
+++ b/src/cuda/3d/spread3d_wrapper.cu
@@ -280,6 +280,7 @@ int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M,
 
   blocks.x = (threadsPerBlock.x + numbins[0] - 1) / threadsPerBlock.x;
   blocks.y = (threadsPerBlock.y + numbins[1] - 1) / threadsPerBlock.y;
+  blocks.y = (threadsPerBlock.y + numbins[1] - 1) / threadsPerBlock.y;
   blocks.z = (threadsPerBlock.z + numbins[2] - 1) / threadsPerBlock.z;
 
   ghost_bin_pts_index<<<blocks, threadsPerBlock, 0, stream>>>(
diff --git a/src/cuda/common.cu b/src/cuda/common.cu
index 64c5639dc..ea54a4c77 100644
--- a/src/cuda/common.cu
+++ b/src/cuda/common.cu
@@ -256,11 +256,14 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) {
       if (const auto err = cudaGetLastError(); err != cudaSuccess) {
         throw std::runtime_error(cudaGetErrorString(err));
       }
+      // use half of the available shared memory if double precision
+      if constexpr (std::is_same_v<T, double>) {
+        shared_mem_per_block /= 2;
+      }
       const int bin_size =
           shared_mem_per_block / sizeof(cuda_complex<T>) - ((ns + 1) / 2) * 2;
-      // find the power of 2 that is less than bin_size
-      // this makes the bin_size use the maximum shared memory available
+      // size the x bins from the shared-memory budget computed above
       opts->gpu_binsizex             = bin_size;
       const auto shared_mem_required = shared_memory_required<T>(
           dim, ns, opts->gpu_binsizex, opts->gpu_binsizey, opts->gpu_binsizez);
       //      printf("binsizex: %d, shared_mem_required %ld (bytes)\n",
@@ -310,13 +313,6 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) {
         opts->gpu_binsizex = 16;
         opts->gpu_binsizey = 16;
         opts->gpu_binsizez = 2;
-        //        const auto shared_mem_required = shared_memory_required<T>(
-        //            dim, ns, opts->gpu_binsizex, opts->gpu_binsizey,
-        //            opts->gpu_binsizez);
-        //        printf(
-        //            "binsizex: %d, binsizey: %d, binsizez: %d shared_mem_required %ld
-        //            (bytes)\n", opts->gpu_binsizex, opts->gpu_binsizey,
-        //            opts->gpu_binsizez, shared_mem_required);
       }
     } break;
     case 4: {