
Commit

picked good defaults for method
DiamonDinoia committed Jul 24, 2024
1 parent bf6328b commit ae783da
Showing 6 changed files with 29 additions and 34 deletions.
4 changes: 2 additions & 2 deletions CMakeLists.txt
@@ -271,7 +271,7 @@ if (FINUFFT_USE_CUDA)
     enable_language(CUDA)
     find_package(CUDAToolkit REQUIRED)
     add_subdirectory(src/cuda)
-    if (BUILD_TESTING OR FINUFFT_BUILD_TESTS)
+    if (FINUFFT_BUILD_TESTS)
         add_subdirectory(perftest/cuda)
         add_subdirectory(test/cuda)
     endif ()
@@ -280,7 +280,7 @@ if (FINUFFT_USE_CUDA)
 endif ()
 
 # Add tests defined in their own directory
-if (FINUFFT_USE_CPU AND (BUILD_TESTING OR FINUFFT_BUILD_TESTS))
+if (FINUFFT_USE_CPU AND FINUFFT_BUILD_TESTS)
     add_subdirectory(test)
     add_subdirectory(perftest)
 endif ()
4 changes: 0 additions & 4 deletions examples/CMakeLists.txt
@@ -21,7 +21,3 @@ if(FINUFFT_USE_OPENMP)
         enable_asan(${EXAMPLE})
     endforeach()
 endif()
-
-if (FINUFFT_USE_CUDA)
-    add_subdirectory(cuda)
-endif()
29 changes: 14 additions & 15 deletions include/cufinufft/impl.h
@@ -144,24 +144,23 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
      * For type 2, we always default to method 1 (GM). */
 
       // query the device for the amount of shared memory available
-    int shared_mem_per_block{};
-    cudaDeviceGetAttribute(&shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin,
-                           device_id);
-    RETURN_IF_CUDA_ERROR
-    // compute the amount of shared memory required for the method
-    const auto shared_mem_required =
-        shared_memory_required<T>(dim, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex,
-                                  d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez);
-    printf("Shared memory available: %d KB, required: %d KB\n", shared_mem_per_block,
-           shared_mem_required);
-    if ((shared_mem_required > shared_mem_per_block)) {
-      if (dim == 3 && std::is_same_v<T, double>) {
-        d_plan->opts.gpu_method = 1;
-        printf("choosing method 1\n");
-      } else {
-        d_plan->opts.gpu_method = 2;
-        printf("choosing method 2\n");
+      int shared_mem_per_block{};
+      cudaDeviceGetAttribute(&shared_mem_per_block,
+                             cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id);
+      RETURN_IF_CUDA_ERROR
+      // compute the amount of shared memory required for the method
+      const auto shared_mem_required = shared_memory_required<T>(
+          dim, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex,
+          d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez);
+      if ((shared_mem_required > shared_mem_per_block)) {
+        d_plan->opts.gpu_method = 1;
+      } else {
+        d_plan->opts.gpu_method = 2;
       }
     }
-    printf("using method %d\n", d_plan->opts.gpu_method);
   }
 
   int fftsign = (iflag >= 0) ? 1 : -1;
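With the debug printfs and the 3D double-precision special case removed, the default selection reduces to one comparison: if the subproblem bins would overflow the device's opt-in shared memory, fall back to the global-memory spreader (GM, method 1); otherwise use the shared-memory spreader (SM, method 2). A minimal standalone sketch of that decision follows; pick_gpu_method and its shared_mem_required argument are hypothetical stand-ins for the plan logic and shared_memory_required<T>() above, not library API.

#include <cuda_runtime.h>
#include <cstddef>

// Sketch of the new default-method heuristic: method 1 = GM (global
// memory), method 2 = SM (shared memory).
int pick_gpu_method(std::size_t shared_mem_required, int device_id) {
  int shared_mem_per_block = 0;
  // opt-in limit: the most a kernel may request as dynamic shared memory
  cudaDeviceGetAttribute(&shared_mem_per_block,
                         cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id);
  // fall back to GM whenever the SM bins cannot fit
  return shared_mem_required > static_cast<std::size_t>(shared_mem_per_block) ? 1 : 2;
}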
10 changes: 6 additions & 4 deletions perftest/cuda/bench.py
@@ -37,7 +37,7 @@ def build_args(args):
 # example command to run:
 # nsys profile -o cuperftest_profile ./cuperftest --prec f --n_runs 10 --method 1 --N1 256 --N2 256 --N3 256 --M 1E8 --tol 1E-6
 # example arguments
-args = {"--prec": "f",
+args = {"--prec": "d",
         "--n_runs": "5",
         "--method": "0",
         "--sort": "1",
@@ -71,8 +71,10 @@ def build_args(args):
 if stderr != '':
     print(stderr)
     exit(0)
-for i in range(1, 7):
-    args["--tol"] = "1E-" + str(i)
+max_range = 8 if args["--prec"] == "d" else 7
+
+for i in range(1, max_range):
+    args["--tol"] = "1E-" + ("0" if i < 10 else "") + str(i)
     print("Running with tol = 1E-" + str(i))
     for method in ['2', '1']:
         args["--method"] = method
@@ -180,4 +182,4 @@ def build_args(args):
 plt.savefig("bench.png")
 plt.savefig("bench.svg")
 plt.savefig("bench.pdf")
-plt.show()
\ No newline at end of file
+plt.show()
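The benchmark now defaults to double precision and sweeps one decade further: tolerances 1E-01 through 1E-07 for "--prec": "d", stopping at 1E-06 for single precision, with the exponent zero-padded. A small C++ analogue of the tolerance formatting, assuming the padding is only for uniform label width:

#include <cstdio>

int main() {
  // Mirror of bench.py's sweep: max_range = 8 for double precision, 7 for single.
  const bool double_prec = true; // corresponds to "--prec": "d"
  const int max_range = double_prec ? 8 : 7;
  for (int i = 1; i < max_range; ++i) {
    char tol[8];
    std::snprintf(tol, sizeof tol, "1E-%02d", i); // zero-padded, like "1E-" + "0" + str(i)
    std::printf("Running with tol = %s\n", tol);
  }
  return 0;
}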
1 change: 1 addition & 0 deletions src/cuda/3d/spread3d_wrapper.cu
@@ -280,6 +280,7 @@ int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M,
 
   blocks.x = (threadsPerBlock.x + numbins[0] - 1) / threadsPerBlock.x;
   blocks.y = (threadsPerBlock.y + numbins[1] - 1) / threadsPerBlock.y;
+  blocks.y = (threadsPerBlock.y + numbins[1] - 1) / threadsPerBlock.y;
   blocks.z = (threadsPerBlock.z + numbins[2] - 1) / threadsPerBlock.z;
 
   ghost_bin_pts_index<<<blocks, threadsPerBlock, 0, stream>>>(
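The single added line repeats the preceding blocks.y assignment verbatim; since the value is identical, the duplicate is redundant rather than harmful. Each grid dimension here is a ceiling division: the smallest number of thread blocks covering the bins along that axis (addition commutes, so the operand order in the source computes the same value). A self-contained sketch of that sizing rule:

#include <cstdio>

// Ceiling division used to size the launch grid: the smallest number of
// blocks of size t that covers n items.
unsigned ceil_div(unsigned n, unsigned t) { return (n + t - 1) / t; }

int main() {
  const unsigned numbins[3] = {100, 37, 8}; // example bin counts per axis
  const unsigned threads[3] = {16, 16, 2};  // threads per block per axis
  for (int d = 0; d < 3; ++d)
    std::printf("dim %d: %u bins / %u threads -> %u blocks\n", d, numbins[d],
                threads[d], ceil_div(numbins[d], threads[d]));
  return 0;
}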
15 changes: 6 additions & 9 deletions src/cuda/common.cu
@@ -256,11 +256,15 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) {
   if (const auto err = cudaGetLastError(); err != cudaSuccess) {
     throw std::runtime_error(cudaGetErrorString(err));
   }
+  // use half of the available shared memory if double precision
+  if constexpr (std::is_same_v<T, double>) {
+    shared_mem_per_block /= 2;
+  }
   const int bin_size =
       shared_mem_per_block / sizeof(cuda_complex<T>) - ((ns + 1) / 2) * 2;
   // find the power of 2 that is less than bin_size
   // this makes the bin_size use the maximum shared memory available
 
-  opts->gpu_binsizex = bin_size;
+  opts->gpu_binsizex = 1024;
   const auto shared_mem_required = shared_memory_required<T>(
       dim, ns, opts->gpu_binsizex, opts->gpu_binsizey, opts->gpu_binsizez);
   // printf("binsizex: %d, shared_mem_required %ld (bytes)\n",
@@ -310,13 +314,6 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) {
     opts->gpu_binsizex = 16;
     opts->gpu_binsizey = 16;
     opts->gpu_binsizez = 2;
-    // const auto shared_mem_required = shared_memory_required<T>(
-    //     dim, ns, opts->gpu_binsizex, opts->gpu_binsizey,
-    //     opts->gpu_binsizez);
-    // printf(
-    //     "binsizex: %d, binsizey: %d, binsizez: %d shared_mem_required %ld
-    //     (bytes)\n", opts->gpu_binsizex, opts->gpu_binsizey,
-    //     opts->gpu_binsizez, shared_mem_required);
   }
 } break;
 case 4: {
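The new branch halves the shared-memory budget for double precision before sizing the 1D bin: bin_size counts how many complex values fit in the per-block budget after subtracting 2*ceil(ns/2) halo values for a spreading kernel of width ns, while the commit pins the default gpu_binsizex at 1024 and keeps the computed budget for the subsequent shared_mem_required check. A hedged sketch of that arithmetic, with std::complex<T> assumed to match the size of cuda_complex<T>:

#include <complex>
#include <cstdio>
#include <type_traits>

// Sketch of the 1D bin-size budget from the diff above.
template <typename T>
int max_bin_size(int shared_mem_per_block, int ns) {
  // use half of the available shared memory if double precision
  if constexpr (std::is_same_v<T, double>) shared_mem_per_block /= 2;
  // capacity in complex values, minus 2 * ceil(ns/2) halo values for the
  // spreading kernel of width ns
  return shared_mem_per_block / static_cast<int>(sizeof(std::complex<T>)) -
         ((ns + 1) / 2) * 2;
}

int main() {
  // e.g. a 48 KB opt-in shared-memory budget and kernel width ns = 8
  std::printf("float:  %d\n", max_bin_size<float>(48 * 1024, 8));  // 6136
  std::printf("double: %d\n", max_bin_size<double>(48 * 1024, 8)); // 1528
  return 0;
}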
