From ae783da138028538738616332675c0da73b5bb1c Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Wed, 24 Jul 2024 17:31:22 -0400 Subject: [PATCH] picked good defaults for method --- CMakeLists.txt | 4 ++-- examples/CMakeLists.txt | 4 ---- include/cufinufft/impl.h | 29 ++++++++++++++--------------- perftest/cuda/bench.py | 10 ++++++---- src/cuda/3d/spread3d_wrapper.cu | 1 + src/cuda/common.cu | 15 ++++++--------- 6 files changed, 29 insertions(+), 34 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 93a34f2af..3c9b84f3b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -271,7 +271,7 @@ if (FINUFFT_USE_CUDA) enable_language(CUDA) find_package(CUDAToolkit REQUIRED) add_subdirectory(src/cuda) - if (BUILD_TESTING OR FINUFFT_BUILD_TESTS) + if (FINUFFT_BUILD_TESTS) add_subdirectory(perftest/cuda) add_subdirectory(test/cuda) endif () @@ -280,7 +280,7 @@ if (FINUFFT_USE_CUDA) endif () # Add tests defined in their own directory -if (FINUFFT_USE_CPU AND (BUILD_TESTING OR FINUFFT_BUILD_TESTS)) +if (FINUFFT_USE_CPU AND FINUFFT_BUILD_TESTS) add_subdirectory(test) add_subdirectory(perftest) endif () diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index af6f067bc..8b5afa4f5 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -21,7 +21,3 @@ if(FINUFFT_USE_OPENMP) enable_asan(${EXAMPLE}) endforeach() endif() - -if (FINUFFT_USE_CUDA) - add_subdirectory(cuda) -endif() diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 4a1c6ae31..7d63df51e 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -144,24 +144,23 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran * For type 2, we always default to method 1 (GM). */ // query the device for the amount of shared memory available - int shared_mem_per_block{}; - cudaDeviceGetAttribute(&shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, - device_id); - RETURN_IF_CUDA_ERROR - // compute the amount of shared memory required for the method - const auto shared_mem_required = - shared_memory_required(dim, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, - d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); - printf("Shared memory available: %d KB, required: %d KB\n", shared_mem_per_block, - shared_mem_required); - if ((shared_mem_required > shared_mem_per_block)) { + if (dim == 3 && std::is_same_v) { d_plan->opts.gpu_method = 1; - printf("choosing method 1\n"); } else { - d_plan->opts.gpu_method = 2; - printf("choosing method 2\n"); + int shared_mem_per_block{}; + cudaDeviceGetAttribute(&shared_mem_per_block, + cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id); + RETURN_IF_CUDA_ERROR + // compute the amount of shared memory required for the method + const auto shared_mem_required = shared_memory_required( + dim, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, + d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); + if ((shared_mem_required > shared_mem_per_block)) { + d_plan->opts.gpu_method = 1; + } else { + d_plan->opts.gpu_method = 2; + } } - printf("using method %d\n", d_plan->opts.gpu_method); } int fftsign = (iflag >= 0) ? 1 : -1; diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index 8a9e757a3..aa21acd52 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -37,7 +37,7 @@ def build_args(args): # example command to run: # nsys profile -o cuperftest_profile ./cuperftest --prec f --n_runs 10 --method 1 --N1 256 --N2 256 --N3 256 --M 1E8 --tol 1E-6 # example arguments -args = {"--prec": "f", +args = {"--prec": "d", "--n_runs": "5", "--method": "0", "--sort": "1", @@ -71,8 +71,10 @@ def build_args(args): if stderr != '': print(stderr) exit(0) -for i in range(1, 7): - args["--tol"] = "1E-" + str(i) +max_range = 8 if args["--prec"] == "d" else 7 + +for i in range(1, max_range): + args["--tol"] = "1E-" + ("0" if i < 10 else "") + str(i) print("Running with tol = 1E-" + str(i)) for method in ['2', '1']: args["--method"] = method @@ -180,4 +182,4 @@ def build_args(args): plt.savefig("bench.png") plt.savefig("bench.svg") plt.savefig("bench.pdf") -plt.show() \ No newline at end of file +plt.show() diff --git a/src/cuda/3d/spread3d_wrapper.cu b/src/cuda/3d/spread3d_wrapper.cu index bf78ed905..4fb2b073d 100644 --- a/src/cuda/3d/spread3d_wrapper.cu +++ b/src/cuda/3d/spread3d_wrapper.cu @@ -280,6 +280,7 @@ int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M, blocks.x = (threadsPerBlock.x + numbins[0] - 1) / threadsPerBlock.x; blocks.y = (threadsPerBlock.y + numbins[1] - 1) / threadsPerBlock.y; + blocks.y = (threadsPerBlock.y + numbins[1] - 1) / threadsPerBlock.y; blocks.z = (threadsPerBlock.z + numbins[2] - 1) / threadsPerBlock.z; ghost_bin_pts_index<<>>( diff --git a/src/cuda/common.cu b/src/cuda/common.cu index 64c5639dc..ea54a4c77 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -256,11 +256,15 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { if (const auto err = cudaGetLastError(); err != cudaSuccess) { throw std::runtime_error(cudaGetErrorString(err)); } + // use half of the available shared memory if double precision + if constexpr (std::is_same_v) { + shared_mem_per_block /= 2; + } const int bin_size = shared_mem_per_block / sizeof(cuda_complex) - ((ns + 1) / 2) * 2; - // find the power of 2 that is less than bin_size - // this makes the bin_size use the maximum shared memory available + opts->gpu_binsizex = bin_size; + opts->gpu_binsizex = 1024; const auto shared_mem_required = shared_memory_required( dim, ns, opts->gpu_binsizex, opts->gpu_binsizey, opts->gpu_binsizez); // printf("binsizex: %d, shared_mem_required %ld (bytes)\n", @@ -310,13 +314,6 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { opts->gpu_binsizex = 16; opts->gpu_binsizey = 16; opts->gpu_binsizez = 2; - // const auto shared_mem_required = shared_memory_required( - // dim, ns, opts->gpu_binsizex, opts->gpu_binsizey, - // opts->gpu_binsizez); - // printf( - // "binsizex: %d, binsizey: %d, binsizez: %d shared_mem_required %ld - // (bytes)\n", opts->gpu_binsizex, opts->gpu_binsizey, - // opts->gpu_binsizez, shared_mem_required); } } break; case 4: {