From 5188adac206485c1e95593b5059b469c23a6a87a Mon Sep 17 00:00:00 2001 From: Libin Lu Date: Mon, 29 Apr 2024 16:12:57 -0400 Subject: [PATCH 1/4] cufinufft with modeord: 0 CMCL-style, 1 FFT-style --- CHANGELOG | 2 + include/cufinufft/cudeconvolve.h | 12 ++--- include/cufinufft_opts.h | 3 ++ python/cufinufft/cufinufft/_cufinufft.py | 3 +- python/cufinufft/cufinufft/_plan.py | 4 +- python/cufinufft/tests/test_basic.py | 34 ++++++++++++- src/cuda/cufinufft.cu | 2 + src/cuda/deconvolve_wrapper.cu | 62 ++++++++++++------------ 8 files changed, 82 insertions(+), 40 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 14c04dceb..2a068b7ab 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,8 @@ List of features / changes made / release notes, in reverse chronological order. If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately). +* cufinufft now supports modeord (type 1,2 only): 0 CMCL-style increasing mode + order, 1 FFT-style mode order. * CPU plan stage now caps # threads at omp_get_max_threads (being 1 for single-thread build); warns if this cap was activated (PR 431) * new docs troubleshooting accuracy limitations due to condition number of the diff --git a/include/cufinufft/cudeconvolve.h b/include/cufinufft/cudeconvolve.h index 4daa4767e..bf4c49e63 100644 --- a/include/cufinufft/cudeconvolve.h +++ b/include/cufinufft/cudeconvolve.h @@ -6,22 +6,22 @@ namespace cufinufft { namespace deconvolve { template <typename T> -__global__ void deconvolve_1d(int ms, int nf1, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1); +__global__ void deconvolve_1d(int ms, int nf1, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1, int modeord); template <typename T> -__global__ void amplify_1d(int ms, int nf1, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf2); +__global__ void amplify_1d(int ms, int nf1, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf2, int modeord); template <typename T> __global__ void deconvolve_2d(int ms, int mt, int nf1, int nf2, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk, - T *fwkerhalf1, T *fwkerhalf2); + T *fwkerhalf1, T *fwkerhalf2, int modeord); template <typename T> __global__ void amplify_2d(int ms, int mt, int nf1, int nf2, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk, - T *fwkerhalf1, T *fwkerhalf2); + T *fwkerhalf1, T *fwkerhalf2, int modeord); template <typename T> __global__ void deconvolve_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, int fw_width, cuda_complex<T> *fw, - cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3); + cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3, int modeord); template <typename T> __global__ void amplify_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, int fw_width, cuda_complex<T> *fw, - cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3); + cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3, int modeord); template <typename T> int cudeconvolve1d(cufinufft_plan_t<T> *d_mem, int blksize); diff --git a/include/cufinufft_opts.h b/include/cufinufft_opts.h index 9760884ae..a3da46f0d 100644 --- a/include/cufinufft_opts.h +++ b/include/cufinufft_opts.h @@ -26,6 +26,9 @@ typedef struct cufinufft_opts { // see cufinufft_default_opts() for defaults int gpu_device_id; void *gpu_stream; + + int modeord; // (type 1,2 only): 0 CMCL-style increasing mode order + // 1 FFT-style mode order } cufinufft_opts; #endif diff --git a/python/cufinufft/cufinufft/_cufinufft.py b/python/cufinufft/cufinufft/_cufinufft.py index 429ce9796..2fccba192 100644 --- a/python/cufinufft/cufinufft/_cufinufft.py +++ 
b/python/cufinufft/cufinufft/_cufinufft.py @@ -63,7 +63,8 @@ def _get_NufftOpts(): ('gpu_spreadinterponly', c_int), ('gpu_maxbatchsize', c_int), ('gpu_device_id', c_int), - ('gpu_stream', c_void_p) + ('gpu_stream', c_void_p), + ('modeord', c_int) ] return fields diff --git a/python/cufinufft/cufinufft/_plan.py b/python/cufinufft/cufinufft/_plan.py index 263c12cd7..67bcee59b 100644 --- a/python/cufinufft/cufinufft/_plan.py +++ b/python/cufinufft/cufinufft/_plan.py @@ -65,7 +65,9 @@ class Plan: memory), ``gpu_sort`` (for ``gpu_method == 1``, 0: no sort, 1: sort), ``gpu_kerevalmeth`` (0: direct exp(sqrt), 1: Horner evaluation), ``gpu_device_id`` (GPU - ID), and ``gpu_stream`` (CUDA stream pointer). + ID), ``gpu_stream`` (CUDA stream pointer) and + ``modeord`` (0: CMCL-compatible mode ordering, + 1: FFT-style mode ordering). """ def __init__(self, nufft_type, n_modes, n_trans=1, eps=1e-6, isign=None, diff --git a/python/cufinufft/tests/test_basic.py b/python/cufinufft/tests/test_basic.py index f4bc5713a..6d65ac85f 100644 --- a/python/cufinufft/tests/test_basic.py +++ b/python/cufinufft/tests/test_basic.py @@ -9,7 +9,7 @@ # NOTE: Tests below fail for tolerance 1e-4 (error executing plan). DTYPES = [np.float32, np.float64] -SHAPES = [(16,), (16, 16), (16, 16, 16)] +SHAPES = [(16,), (16, 16), (16, 16, 16), (19,), (17, 19), (17, 19, 24)] MS = [256, 1024, 4096] TOLS = [1e-3, 1e-6] OUTPUT_ARGS = [False, True] @@ -47,6 +47,38 @@ def test_type1(to_gpu, to_cpu, dtype, shape, M, tol, output_arg): utils.verify_type1(k, c, fk, tol) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("shape", SHAPES) +@pytest.mark.parametrize("M", MS) +@pytest.mark.parametrize("tol", TOLS) +@pytest.mark.parametrize("output_arg", OUTPUT_ARGS) +def test_type1_modeord(to_gpu, to_cpu, dtype, shape, M, tol, output_arg): + complex_dtype = utils._complex_dtype(dtype) + + k, c = utils.type1_problem(dtype, shape, M) + + k_gpu = to_gpu(k) + c_gpu = to_gpu(c) + + plan = Plan(1, shape, eps=tol, dtype=complex_dtype, modeord=1) + + # Since k_gpu is an array of shape (dim, M), this will expand to + # plan.setpts(k_gpu[0], ..., k_gpu[dim]), allowing us to handle all + # dimensions with the same call. + plan.setpts(*k_gpu) + + if output_arg: + fk_gpu = _compat.array_empty_like(c_gpu, shape, dtype=complex_dtype) + plan.execute(c_gpu, out=fk_gpu) + else: + fk_gpu = plan.execute(c_gpu) + + fk = to_cpu(fk_gpu) + fk = np.fft.fftshift(fk) + + utils.verify_type1(k, c, fk, tol) + + @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("shape", SHAPES) @pytest.mark.parametrize("M", MS) diff --git a/src/cuda/cufinufft.cu b/src/cuda/cufinufft.cu index 60cdd4482..091158299 100644 --- a/src/cuda/cufinufft.cu +++ b/src/cuda/cufinufft.cu @@ -121,5 +121,7 @@ void cufinufft_default_opts(cufinufft_opts *opts) // By default, only use device 0 opts->gpu_device_id = 0; + + opts->modeord = 0; } } diff --git a/src/cuda/deconvolve_wrapper.cu b/src/cuda/deconvolve_wrapper.cu index ffb65b8da..5ef561b6b 100644 --- a/src/cuda/deconvolve_wrapper.cu +++ b/src/cuda/deconvolve_wrapper.cu @@ -11,13 +11,13 @@ namespace cufinufft { namespace deconvolve { /* Kernel for copying fw to fk with amplification by prefac/ker */ // Note: assume modeord=0: CMCL-compatible mode ordering in fk (from -N/2 up -// to N/2-1) +// to N/2-1), modeord=1: FFT-compatible mode ordering in fk (from 0 to N/2-1, then -N/2 up to -1). 
template <typename T> -__global__ void deconvolve_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1) { +__global__ void deconvolve_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1, int modeord) { for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms; i += blockDim.x * gridDim.x) { - int w1 = i - ms / 2 >= 0 ? i - ms / 2 : nf1 + i - ms / 2; + int w1 = ( modeord == 0 ) ? ( (i - ms / 2 >= 0) ? i - ms / 2 : nf1 + i - ms / 2 ) : ( (i - ms + ms / 2 >= 0) ? nf1 + i - ms : i ); - T kervalue = fwkerhalf1[abs(i - ms / 2)]; + T kervalue = fwkerhalf1[(modeord==0) ? abs(i - ms / 2) : ((i - ms + ms / 2 >= 0) ? ms - i : i)]; fk[i].x = fw[w1].x / kervalue; fk[i].y = fw[w1].y / kervalue; } @@ -25,16 +25,16 @@ __global__ void deconvolve_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> template <typename T> __global__ void deconvolve_2d(int ms, int mt, int nf1, int nf2, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1, - T *fwkerhalf2) { + T *fwkerhalf2, int modeord) { for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt; i += blockDim.x * gridDim.x) { int k1 = i % ms; int k2 = i / ms; int outidx = k1 + k2 * ms; - int w1 = k1 - ms / 2 >= 0 ? k1 - ms / 2 : nf1 + k1 - ms / 2; - int w2 = k2 - mt / 2 >= 0 ? k2 - mt / 2 : nf2 + k2 - mt / 2; + int w1 = ( modeord == 0 ) ? ( (k1 - ms / 2 >= 0) ? k1 - ms / 2 : nf1 + k1 - ms / 2 ) : ( (k1 - ms + ms / 2 >= 0) ? nf1 + k1 - ms : k1 ); + int w2 = ( modeord == 0 ) ? ( (k2 - mt / 2 >= 0) ? k2 - mt / 2 : nf2 + k2 - mt / 2 ) : ( (k2 - mt + mt / 2 >= 0) ? nf2 + k2 - mt : k2 ); int inidx = w1 + w2 * nf1; - T kervalue = fwkerhalf1[abs(k1 - ms / 2)] * fwkerhalf2[abs(k2 - mt / 2)]; + T kervalue = fwkerhalf1[(modeord==0) ? abs(k1 - ms / 2) : ((k1 - ms + ms / 2 >= 0) ? ms - k1 : k1)] * fwkerhalf2[(modeord==0) ? abs(k2 - mt / 2) : ((k2 - mt + mt / 2 >= 0) ? mt - k2 : k2)]; fk[outidx].x = fw[inidx].x / kervalue; fk[outidx].y = fw[inidx].y / kervalue; } @@ -42,18 +42,18 @@ __global__ void deconvolve_2d(int ms, int mt, int nf1, int nf2, cuda_complex<T> template <typename T> __global__ void deconvolve_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, cuda_complex<T> *fw, - cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3) { + cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3, int modeord) { for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt * mu; i += blockDim.x * gridDim.x) { int k1 = i % ms; int k2 = (i / ms) % mt; int k3 = (i / ms / mt); int outidx = k1 + k2 * ms + k3 * ms * mt; - int w1 = k1 - ms / 2 >= 0 ? k1 - ms / 2 : nf1 + k1 - ms / 2; - int w2 = k2 - mt / 2 >= 0 ? k2 - mt / 2 : nf2 + k2 - mt / 2; - int w3 = k3 - mu / 2 >= 0 ? k3 - mu / 2 : nf3 + k3 - mu / 2; + int w1 = ( modeord == 0 ) ? ( (k1 - ms / 2 >= 0) ? k1 - ms / 2 : nf1 + k1 - ms / 2 ) : ( (k1 - ms + ms / 2 >= 0) ? nf1 + k1 - ms : k1 ); + int w2 = ( modeord == 0 ) ? ( (k2 - mt / 2 >= 0) ? k2 - mt / 2 : nf2 + k2 - mt / 2 ) : ( (k2 - mt + mt / 2 >= 0) ? nf2 + k2 - mt : k2 ); + int w3 = ( modeord == 0 ) ? ( (k3 - mu / 2 >= 0) ? k3 - mu / 2 : nf3 + k3 - mu / 2 ) : ( (k3 - mu + mu / 2 >= 0) ? nf3 + k3 - mu : k3 ); int inidx = w1 + w2 * nf1 + w3 * nf1 * nf2; - T kervalue = fwkerhalf1[abs(k1 - ms / 2)] * fwkerhalf2[abs(k2 - mt / 2)] * fwkerhalf3[abs(k3 - mu / 2)]; + T kervalue = fwkerhalf1[(modeord==0) ? abs(k1 - ms / 2) : ((k1 - ms + ms / 2 >= 0) ? ms - k1 : k1)] * fwkerhalf2[(modeord==0) ? abs(k2 - mt / 2) : ((k2 - mt + mt / 2 >= 0) ? mt - k2 : k2)] * fwkerhalf3[(modeord==0) ? abs(k3 - mu / 2) : ((k3 - mu + mu / 2 >= 0) ? 
mu - k3 : k3)]; fk[outidx].x = fw[inidx].x / kervalue; fk[outidx].y = fw[inidx].y / kervalue; } @@ -61,11 +61,11 @@ __global__ void deconvolve_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, /* Kernel for copying fk to fw with same amplification */ template <typename T> -__global__ void amplify_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1) { +__global__ void amplify_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1, int modeord) { for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms; i += blockDim.x * gridDim.x) { - int w1 = i - ms / 2 >= 0 ? i - ms / 2 : nf1 + i - ms / 2; + int w1 = ( modeord == 0 ) ? ( (i - ms / 2 >= 0) ? i - ms / 2 : nf1 + i - ms / 2 ) : ( (i - ms + ms / 2 >= 0) ? nf1 + i - ms : i ); - T kervalue = fwkerhalf1[abs(i - ms / 2)]; + T kervalue = fwkerhalf1[(modeord==0) ? abs(i - ms / 2) : ((i - ms + ms / 2 >= 0) ? ms - i : i)]; fw[w1].x = fk[i].x / kervalue; fw[w1].y = fk[i].y / kervalue; } @@ -73,16 +73,16 @@ __global__ void amplify_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> template <typename T> __global__ void amplify_2d(int ms, int mt, int nf1, int nf2, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1, - T *fwkerhalf2) { + T *fwkerhalf2, int modeord) { for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt; i += blockDim.x * gridDim.x) { int k1 = i % ms; int k2 = i / ms; int inidx = k1 + k2 * ms; - int w1 = k1 - ms / 2 >= 0 ? k1 - ms / 2 : nf1 + k1 - ms / 2; - int w2 = k2 - mt / 2 >= 0 ? k2 - mt / 2 : nf2 + k2 - mt / 2; + int w1 = ( modeord == 0 ) ? ( (k1 - ms / 2 >= 0) ? k1 - ms / 2 : nf1 + k1 - ms / 2 ) : ( (k1 - ms + ms / 2 >= 0) ? nf1 + k1 - ms : k1 ); + int w2 = ( modeord == 0 ) ? ( (k2 - mt / 2 >= 0) ? k2 - mt / 2 : nf2 + k2 - mt / 2 ) : ( (k2 - mt + mt / 2 >= 0) ? nf2 + k2 - mt : k2 ); int outidx = w1 + w2 * nf1; - T kervalue = fwkerhalf1[abs(k1 - ms / 2)] * fwkerhalf2[abs(k2 - mt / 2)]; + T kervalue = fwkerhalf1[(modeord==0) ? abs(k1 - ms / 2) : ((k1 - ms + ms / 2 >= 0) ? ms - k1 : k1)] * fwkerhalf2[(modeord==0) ? abs(k2 - mt / 2) : ((k2 - mt + mt / 2 >= 0) ? mt - k2 : k2)]; fw[outidx].x = fk[inidx].x / kervalue; fw[outidx].y = fk[inidx].y / kervalue; } @@ -90,18 +90,18 @@ __global__ void amplify_2d(int ms, int mt, int nf1, int nf2, cuda_complex<T> *fw template <typename T> __global__ void amplify_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, cuda_complex<T> *fw, cuda_complex<T> *fk, - T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3) { + T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3, int modeord) { for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt * mu; i += blockDim.x * gridDim.x) { int k1 = i % ms; int k2 = (i / ms) % mt; int k3 = (i / ms / mt); int inidx = k1 + k2 * ms + k3 * ms * mt; - int w1 = k1 - ms / 2 >= 0 ? k1 - ms / 2 : nf1 + k1 - ms / 2; - int w2 = k2 - mt / 2 >= 0 ? k2 - mt / 2 : nf2 + k2 - mt / 2; - int w3 = k3 - mu / 2 >= 0 ? k3 - mu / 2 : nf3 + k3 - mu / 2; + int w1 = ( modeord == 0 ) ? ( (k1 - ms / 2 >= 0) ? k1 - ms / 2 : nf1 + k1 - ms / 2 ) : ( (k1 - ms + ms / 2 >= 0) ? nf1 + k1 - ms : k1 ); + int w2 = ( modeord == 0 ) ? ( (k2 - mt / 2 >= 0) ? k2 - mt / 2 : nf2 + k2 - mt / 2 ) : ( (k2 - mt + mt / 2 >= 0) ? nf2 + k2 - mt : k2 ); + int w3 = ( modeord == 0 ) ? ( (k3 - mu / 2 >= 0) ? k3 - mu / 2 : nf3 + k3 - mu / 2 ) : ( (k3 - mu + mu / 2 >= 0) ? nf3 + k3 - mu : k3 ); int outidx = w1 + w2 * nf1 + w3 * nf1 * nf2; - T kervalue = fwkerhalf1[abs(k1 - ms / 2)] * fwkerhalf2[abs(k2 - mt / 2)] * fwkerhalf3[abs(k3 - mu / 2)]; + T kervalue = fwkerhalf1[(modeord==0) ? abs(k1 - ms / 2) : ((k1 - ms + ms / 2 >= 0) ? 
ms - k1 : k1)] * fwkerhalf2[(modeord==0) ? abs(k2 - mt / 2) : ((k2 - mt + mt / 2 >= 0) ? mt - k2 : k2)] * fwkerhalf3[(modeord==0) ? abs(k3 - mu / 2) : ((k3 - mu + mu / 2 >= 0) ? mu - k3 : k3)]; fw[outidx].x = fk[inidx].x / kervalue; fw[outidx].y = fk[inidx].y / kervalue; } @@ -125,13 +125,13 @@ int cudeconvolve1d(cufinufft_plan_t<T> *d_plan, int blksize) if (d_plan->spopts.spread_direction == 1) { for (int t = 0; t < blksize; t++) { deconvolve_1d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, nf1, d_plan->fw + t * nf1, - d_plan->fk + t * nmodes, d_plan->fwkerhalf1); + d_plan->fk + t * nmodes, d_plan->fwkerhalf1, d_plan->opts.modeord); } } else { checkCudaErrors(cudaMemsetAsync(d_plan->fw, 0, maxbatchsize * nf1 * sizeof(cuda_complex<T>), stream)); for (int t = 0; t < blksize; t++) { amplify_1d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, nf1, d_plan->fw + t * nf1, - d_plan->fk + t * nmodes, d_plan->fwkerhalf1); + d_plan->fk + t * nmodes, d_plan->fwkerhalf1, d_plan->opts.modeord); } } return 0; @@ -158,14 +158,14 @@ int cudeconvolve2d(cufinufft_plan_t<T> *d_plan, int blksize) for (int t = 0; t < blksize; t++) { deconvolve_2d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, mt, nf1, nf2, d_plan->fw + t * nf1 * nf2, d_plan->fk + t * nmodes, d_plan->fwkerhalf1, - d_plan->fwkerhalf2); + d_plan->fwkerhalf2, d_plan->opts.modeord); } } else { checkCudaErrors(cudaMemsetAsync(d_plan->fw, 0, maxbatchsize * nf1 * nf2 * sizeof(cuda_complex<T>), stream)); for (int t = 0; t < blksize; t++) { amplify_2d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, mt, nf1, nf2, d_plan->fw + t * nf1 * nf2, d_plan->fk + t * nmodes, d_plan->fwkerhalf1, - d_plan->fwkerhalf2); + d_plan->fwkerhalf2, d_plan->opts.modeord); } } return 0; @@ -193,7 +193,7 @@ int cudeconvolve3d(cufinufft_plan_t<T> *d_plan, int blksize) for (int t = 0; t < blksize; t++) { deconvolve_3d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>( ms, mt, mu, nf1, nf2, nf3, d_plan->fw + t * nf1 * nf2 * nf3, d_plan->fk + t * nmodes, - d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3); + d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3, d_plan->opts.modeord); } } else { checkCudaErrors( @@ -201,7 +201,7 @@ int cudeconvolve3d(cufinufft_plan_t<T> *d_plan, int blksize) for (int t = 0; t < blksize; t++) { amplify_3d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>( ms, mt, mu, nf1, nf2, nf3, d_plan->fw + t * nf1 * nf2 * nf3, d_plan->fk + t * nmodes, - d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3); + d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3, d_plan->opts.modeord); } } return 0; From 67fa61eda2dea223c532c516c3ea554e9f97a647 Mon Sep 17 00:00:00 2001 From: Libin Lu Date: Tue, 28 May 2024 10:18:25 -0400 Subject: [PATCH 2/4] cufinufft python: improve testing of modeord option --- python/cufinufft/tests/test_basic.py | 52 ++++++++--------------------- 1 file changed, 15 insertions(+), 37 deletions(-) diff --git a/python/cufinufft/tests/test_basic.py b/python/cufinufft/tests/test_basic.py index 6d65ac85f..b66e00f18 100644 --- a/python/cufinufft/tests/test_basic.py +++ b/python/cufinufft/tests/test_basic.py @@ -14,6 +14,7 @@ TOLS = [1e-3, 1e-6] OUTPUT_ARGS = [False, True] CONTIGUOUS = [False, True] +MODEORDS = [0, 1] @pytest.mark.parametrize("dtype", DTYPES) @@ -21,7 +22,8 @@ @pytest.mark.parametrize("M", MS) @pytest.mark.parametrize("tol", TOLS) @pytest.mark.parametrize("output_arg", OUTPUT_ARGS) -def test_type1(to_gpu, to_cpu, dtype, shape, M, tol, output_arg): +@pytest.mark.parametrize("modeord", MODEORDS) +def test_type1(to_gpu, 
to_cpu, dtype, shape, M, tol, output_arg, modeord): complex_dtype = utils._complex_dtype(dtype) k, c = utils.type1_problem(dtype, shape, M) @@ -29,7 +31,7 @@ def test_type1(to_gpu, to_cpu, dtype, shape, M, tol, output_arg): k_gpu = to_gpu(k) c_gpu = to_gpu(c) - plan = Plan(1, shape, eps=tol, dtype=complex_dtype) + plan = Plan(1, shape, eps=tol, dtype=complex_dtype, modeord=modeord) # Since k_gpu is an array of shape (dim, M), this will expand to # plan.setpts(k_gpu[0], ..., k_gpu[dim]), allowing us to handle all # dimensions with the same call. plan.setpts(*k_gpu) @@ -43,38 +45,8 @@ def test_type1(to_gpu, to_cpu, dtype, shape, M, tol, output_arg): fk_gpu = plan.execute(c_gpu) fk = to_cpu(fk_gpu) - - utils.verify_type1(k, c, fk, tol) - - -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("shape", SHAPES) -@pytest.mark.parametrize("M", MS) -@pytest.mark.parametrize("tol", TOLS) -@pytest.mark.parametrize("output_arg", OUTPUT_ARGS) -def test_type1_modeord(to_gpu, to_cpu, dtype, shape, M, tol, output_arg): - complex_dtype = utils._complex_dtype(dtype) - - k, c = utils.type1_problem(dtype, shape, M) - - k_gpu = to_gpu(k) - c_gpu = to_gpu(c) - - plan = Plan(1, shape, eps=tol, dtype=complex_dtype, modeord=1) - - # Since k_gpu is an array of shape (dim, M), this will expand to - # plan.setpts(k_gpu[0], ..., k_gpu[dim]), allowing us to handle all - # dimensions with the same call. - plan.setpts(*k_gpu) - - if output_arg: - fk_gpu = _compat.array_empty_like(c_gpu, shape, dtype=complex_dtype) - plan.execute(c_gpu, out=fk_gpu) - else: - fk_gpu = plan.execute(c_gpu) - - fk = to_cpu(fk_gpu) - fk = np.fft.fftshift(fk) + if modeord == 1: + fk = np.fft.fftshift(fk) utils.verify_type1(k, c, fk, tol) @@ -85,12 +57,13 @@ def test_type1_modeord(to_gpu, to_cpu, dtype, shape, M, tol, output_arg): @pytest.mark.parametrize("tol", TOLS) @pytest.mark.parametrize("output_arg", OUTPUT_ARGS) @pytest.mark.parametrize("contiguous", CONTIGUOUS) -def test_type2(to_gpu, to_cpu, dtype, shape, M, tol, output_arg, contiguous): +@pytest.mark.parametrize("modeord", MODEORDS) +def test_type2(to_gpu, to_cpu, dtype, shape, M, tol, output_arg, contiguous, modeord): complex_dtype = utils._complex_dtype(dtype) k, fk = utils.type2_problem(dtype, shape, M) - plan = Plan(2, shape, eps=tol, dtype=complex_dtype) + plan = Plan(2, shape, eps=tol, dtype=complex_dtype, modeord=modeord) check_result = True @@ -113,7 +86,12 @@ def _execute(*args, **kwargs): return plan.execute(*args, **kwargs) k_gpu = to_gpu(k) - fk_gpu = to_gpu(fk) + + if modeord == 1: + _fk = np.fft.ifftshift(fk) + else: + _fk = fk + fk_gpu = to_gpu(_fk) plan.setpts(*k_gpu) From 63623c05028f19b00fa2e9f507842fbf1ea506cc Mon Sep 17 00:00:00 2001 From: Libin Lu Date: Wed, 29 May 2024 18:12:12 -0400 Subject: [PATCH 3/4] more human-readable code for cuda kernels in deconvolve_wrapper.cu --- src/cuda/deconvolve_wrapper.cu | 184 ++++++++++++++++++++++++++------- 1 file changed, 144 insertions(+), 40 deletions(-) diff --git a/src/cuda/deconvolve_wrapper.cu b/src/cuda/deconvolve_wrapper.cu index 5ef561b6b..f0213b60d 100644 --- a/src/cuda/deconvolve_wrapper.cu +++ b/src/cuda/deconvolve_wrapper.cu @@ -14,10 +14,21 @@ namespace deconvolve { // to N/2-1), modeord=1: FFT-compatible mode ordering in fk (from 0 to N/2-1, then -N/2 up to -1). template <typename T> __global__ void deconvolve_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1, int modeord) { + int pivot1, w1, fwkerind1; + T kervalue; + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms; i += blockDim.x * gridDim.x) { - int w1 = ( modeord == 0 ) ? 
( (i - ms / 2 >= 0) ? i - ms / 2 : nf1 + i - ms / 2 ) : ( (i - ms + ms / 2 >= 0) ? nf1 + i - ms : i ); + if (modeord == 0) { + pivot1 = i - ms / 2; + w1 = (pivot1 >= 0) ? pivot1 : nf1 + pivot1; + fwkerind1 = abs(pivot1); + } else { + pivot1 = i - ms + ms / 2; + w1 = (pivot1 >= 0) ? nf1 + i - ms : i; + fwkerind1 = (pivot1 >= 0) ? ms - i : i; + } - T kervalue = fwkerhalf1[(modeord==0) ? abs(i - ms / 2) : ((i - ms + ms / 2 >= 0) ? ms - i : i)]; + kervalue = fwkerhalf1[fwkerind1]; fk[i].x = fw[w1].x / kervalue; fk[i].y = fw[w1].y / kervalue; } @@ -26,15 +37,33 @@ __global__ void deconvolve_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> template <typename T> __global__ void deconvolve_2d(int ms, int mt, int nf1, int nf2, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, int modeord) { + int pivot1, pivot2, w1, w2, fwkerind1, fwkerind2; + int k1, k2, inidx, outidx; + T kervalue; + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt; i += blockDim.x * gridDim.x) { - int k1 = i % ms; - int k2 = i / ms; - int outidx = k1 + k2 * ms; - int w1 = ( modeord == 0 ) ? ( (k1 - ms / 2 >= 0) ? k1 - ms / 2 : nf1 + k1 - ms / 2 ) : ( (k1 - ms + ms / 2 >= 0) ? nf1 + k1 - ms : k1 ); - int w2 = ( modeord == 0 ) ? ( (k2 - mt / 2 >= 0) ? k2 - mt / 2 : nf2 + k2 - mt / 2 ) : ( (k2 - mt + mt / 2 >= 0) ? nf2 + k2 - mt : k2 ); - int inidx = w1 + w2 * nf1; - - T kervalue = fwkerhalf1[(modeord==0) ? abs(k1 - ms / 2) : ((k1 - ms + ms / 2 >= 0) ? ms - k1 : k1)] * fwkerhalf2[(modeord==0) ? abs(k2 - mt / 2) : ((k2 - mt + mt / 2 >= 0) ? mt - k2 : k2)]; + k1 = i % ms; + k2 = i / ms; + outidx = k1 + k2 * ms; + + if (modeord == 0) { + pivot1 = k1 - ms / 2; + pivot2 = k2 - mt / 2; + w1 = (pivot1 >= 0) ? pivot1 : nf1 + pivot1; + w2 = (pivot2 >= 0) ? pivot2 : nf2 + pivot2; + fwkerind1 = abs(pivot1); + fwkerind2 = abs(pivot2); + } else { + pivot1 = k1 - ms + ms / 2; + pivot2 = k2 - mt + mt / 2; + w1 = (pivot1 >= 0) ? nf1 + k1 - ms : k1; + w2 = (pivot2 >= 0) ? nf2 + k2 - mt : k2; + fwkerind1 = (pivot1 >= 0) ? ms - k1 : k1; + fwkerind2 = (pivot2 >= 0) ? mt - k2 : k2; + } + + inidx = w1 + w2 * nf1; + kervalue = fwkerhalf1[fwkerind1] * fwkerhalf2[fwkerind2]; fk[outidx].x = fw[inidx].x / kervalue; fk[outidx].y = fw[inidx].y / kervalue; } @@ -43,17 +72,40 @@ __global__ void deconvolve_2d(int ms, int mt, int nf1, int nf2, cuda_complex<T> template <typename T> __global__ void deconvolve_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3, int modeord) { + int pivot1, pivot2, pivot3, w1, w2, w3, fwkerind1, fwkerind2, fwkerind3; + int k1, k2, k3, inidx, outidx; + T kervalue; + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt * mu; i += blockDim.x * gridDim.x) { - int k1 = i % ms; - int k2 = (i / ms) % mt; - int k3 = (i / ms / mt); - int outidx = k1 + k2 * ms + k3 * ms * mt; - int w1 = ( modeord == 0 ) ? ( (k1 - ms / 2 >= 0) ? k1 - ms / 2 : nf1 + k1 - ms / 2 ) : ( (k1 - ms + ms / 2 >= 0) ? nf1 + k1 - ms : k1 ); - int w2 = ( modeord == 0 ) ? ( (k2 - mt / 2 >= 0) ? k2 - mt / 2 : nf2 + k2 - mt / 2 ) : ( (k2 - mt + mt / 2 >= 0) ? nf2 + k2 - mt : k2 ); - int w3 = ( modeord == 0 ) ? ( (k3 - mu / 2 >= 0) ? k3 - mu / 2 : nf3 + k3 - mu / 2 ) : ( (k3 - mu + mu / 2 >= 0) ? nf3 + k3 - mu : k3 ); - int inidx = w1 + w2 * nf1 + w3 * nf1 * nf2; - - T kervalue = fwkerhalf1[(modeord==0) ? abs(k1 - ms / 2) : ((k1 - ms + ms / 2 >= 0) ? ms - k1 : k1)] * fwkerhalf2[(modeord==0) ? abs(k2 - mt / 2) : ((k2 - mt + mt / 2 >= 0) ? mt - k2 : k2)] * fwkerhalf3[(modeord==0) ? 
abs(k3 - mu / 2) : ((k3 - mu + mu / 2 >= 0) ? mu - k3 : k3)]; + k1 = i % ms; + k2 = (i / ms) % mt; + k3 = (i / ms / mt); + outidx = k1 + k2 * ms + k3 * ms * mt; + + if (modeord == 0) { + pivot1 = k1 - ms / 2; + pivot2 = k2 - mt / 2; + pivot3 = k3 - mu / 2; + w1 = (pivot1 >= 0) ? pivot1 : nf1 + pivot1; + w2 = (pivot2 >= 0) ? pivot2 : nf2 + pivot2; + w3 = (pivot3 >= 0) ? pivot3 : nf3 + pivot3; + fwkerind1 = abs(pivot1); + fwkerind2 = abs(pivot2); + fwkerind3 = abs(pivot3); + } else { + pivot1 = k1 - ms + ms / 2; + pivot2 = k2 - mt + mt / 2; + pivot3 = k3 - mu + mu / 2; + w1 = (pivot1 >= 0) ? nf1 + k1 - ms : k1; + w2 = (pivot2 >= 0) ? nf2 + k2 - mt : k2; + w3 = (pivot3 >= 0) ? nf3 + k3 - mu : k3; + fwkerind1 = (pivot1 >= 0) ? ms - k1 : k1; + fwkerind2 = (pivot2 >= 0) ? mt - k2 : k2; + fwkerind3 = (pivot3 >= 0) ? mu - k3 : k3; + } + + inidx = w1 + w2 * nf1 + w3 * nf1 * nf2; + kervalue = fwkerhalf1[fwkerind1] * fwkerhalf2[fwkerind2] * fwkerhalf3[fwkerind3]; fk[outidx].x = fw[inidx].x / kervalue; fk[outidx].y = fw[inidx].y / kervalue; } @@ -62,10 +114,21 @@ __global__ void deconvolve_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, } /* Kernel for copying fk to fw with same amplification */ template <typename T> __global__ void amplify_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1, int modeord) { + int pivot1, w1, fwkerind1; + T kervalue; + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms; i += blockDim.x * gridDim.x) { - int w1 = ( modeord == 0 ) ? ( (i - ms / 2 >= 0) ? i - ms / 2 : nf1 + i - ms / 2 ) : ( (i - ms + ms / 2 >= 0) ? nf1 + i - ms : i ); + if (modeord == 0) { + pivot1 = i - ms / 2; + w1 = (pivot1 >= 0) ? pivot1 : nf1 + pivot1; + fwkerind1 = abs(pivot1); + } else { + pivot1 = i - ms + ms / 2; + w1 = (pivot1 >= 0) ? nf1 + i - ms : i; + fwkerind1 = (pivot1 >= 0) ? ms - i : i; + } - T kervalue = fwkerhalf1[(modeord==0) ? abs(i - ms / 2) : ((i - ms + ms / 2 >= 0) ? ms - i : i)]; + kervalue = fwkerhalf1[fwkerind1]; fw[w1].x = fk[i].x / kervalue; fw[w1].y = fk[i].y / kervalue; } @@ -74,15 +137,33 @@ __global__ void amplify_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> template <typename T> __global__ void amplify_2d(int ms, int mt, int nf1, int nf2, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, int modeord) { + int pivot1, pivot2, w1, w2, fwkerind1, fwkerind2; + int k1, k2, inidx, outidx; + T kervalue; + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt; i += blockDim.x * gridDim.x) { - int k1 = i % ms; - int k2 = i / ms; - int inidx = k1 + k2 * ms; - int w1 = ( modeord == 0 ) ? ( (k1 - ms / 2 >= 0) ? k1 - ms / 2 : nf1 + k1 - ms / 2 ) : ( (k1 - ms + ms / 2 >= 0) ? nf1 + k1 - ms : k1 ); - int w2 = ( modeord == 0 ) ? ( (k2 - mt / 2 >= 0) ? k2 - mt / 2 : nf2 + k2 - mt / 2 ) : ( (k2 - mt + mt / 2 >= 0) ? nf2 + k2 - mt : k2 ); - int outidx = w1 + w2 * nf1; - - T kervalue = fwkerhalf1[(modeord==0) ? abs(k1 - ms / 2) : ((k1 - ms + ms / 2 >= 0) ? ms - k1 : k1)] * fwkerhalf2[(modeord==0) ? abs(k2 - mt / 2) : ((k2 - mt + mt / 2 >= 0) ? mt - k2 : k2)]; + k1 = i % ms; + k2 = i / ms; + inidx = k1 + k2 * ms; + + if (modeord == 0) { + pivot1 = k1 - ms / 2; + pivot2 = k2 - mt / 2; + w1 = (pivot1 >= 0) ? pivot1 : nf1 + pivot1; + w2 = (pivot2 >= 0) ? pivot2 : nf2 + pivot2; + fwkerind1 = abs(pivot1); + fwkerind2 = abs(pivot2); + } else { + pivot1 = k1 - ms + ms / 2; + pivot2 = k2 - mt + mt / 2; + w1 = (pivot1 >= 0) ? nf1 + k1 - ms : k1; + w2 = (pivot2 >= 0) ? nf2 + k2 - mt : k2; + fwkerind1 = (pivot1 >= 0) ? ms - k1 : k1; + fwkerind2 = (pivot2 >= 0) ? 
mt - k2 : k2; + } + + outidx = w1 + w2 * nf1; + kervalue = fwkerhalf1[fwkerind1] * fwkerhalf2[fwkerind2]; fw[outidx].x = fk[inidx].x / kervalue; fw[outidx].y = fk[inidx].y / kervalue; } @@ -91,17 +172,40 @@ __global__ void amplify_2d(int ms, int mt, int nf1, int nf2, cuda_complex<T> *fw template <typename T> __global__ void amplify_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3, int modeord) { + int pivot1, pivot2, pivot3, w1, w2, w3, fwkerind1, fwkerind2, fwkerind3; + int k1, k2, k3, inidx, outidx; + T kervalue; + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt * mu; i += blockDim.x * gridDim.x) { - int k1 = i % ms; - int k2 = (i / ms) % mt; - int k3 = (i / ms / mt); - int inidx = k1 + k2 * ms + k3 * ms * mt; - int w1 = ( modeord == 0 ) ? ( (k1 - ms / 2 >= 0) ? k1 - ms / 2 : nf1 + k1 - ms / 2 ) : ( (k1 - ms + ms / 2 >= 0) ? nf1 + k1 - ms : k1 ); - int w2 = ( modeord == 0 ) ? ( (k2 - mt / 2 >= 0) ? k2 - mt / 2 : nf2 + k2 - mt / 2 ) : ( (k2 - mt + mt / 2 >= 0) ? nf2 + k2 - mt : k2 ); - int w3 = ( modeord == 0 ) ? ( (k3 - mu / 2 >= 0) ? k3 - mu / 2 : nf3 + k3 - mu / 2 ) : ( (k3 - mu + mu / 2 >= 0) ? nf3 + k3 - mu : k3 ); - int outidx = w1 + w2 * nf1 + w3 * nf1 * nf2; - - T kervalue = fwkerhalf1[(modeord==0) ? abs(k1 - ms / 2) : ((k1 - ms + ms / 2 >= 0) ? ms - k1 : k1)] * fwkerhalf2[(modeord==0) ? abs(k2 - mt / 2) : ((k2 - mt + mt / 2 >= 0) ? mt - k2 : k2)] * fwkerhalf3[(modeord==0) ? abs(k3 - mu / 2) : ((k3 - mu + mu / 2 >= 0) ? mu - k3 : k3)]; + k1 = i % ms; + k2 = (i / ms) % mt; + k3 = (i / ms / mt); + inidx = k1 + k2 * ms + k3 * ms * mt; + + if (modeord == 0) { + pivot1 = k1 - ms / 2; + pivot2 = k2 - mt / 2; + pivot3 = k3 - mu / 2; + w1 = (pivot1 >= 0) ? pivot1 : nf1 + pivot1; + w2 = (pivot2 >= 0) ? pivot2 : nf2 + pivot2; + w3 = (pivot3 >= 0) ? pivot3 : nf3 + pivot3; + fwkerind1 = abs(pivot1); + fwkerind2 = abs(pivot2); + fwkerind3 = abs(pivot3); + } else { + pivot1 = k1 - ms + ms / 2; + pivot2 = k2 - mt + mt / 2; + pivot3 = k3 - mu + mu / 2; + w1 = (pivot1 >= 0) ? nf1 + k1 - ms : k1; + w2 = (pivot2 >= 0) ? nf2 + k2 - mt : k2; + w3 = (pivot3 >= 0) ? nf3 + k3 - mu : k3; + fwkerind1 = (pivot1 >= 0) ? ms - k1 : k1; + fwkerind2 = (pivot2 >= 0) ? mt - k2 : k2; + fwkerind3 = (pivot3 >= 0) ? 
mu - k3 : k3; + } + + outidx = w1 + w2 * nf1 + w3 * nf1 * nf2; + kervalue = fwkerhalf1[fwkerind1] * fwkerhalf2[fwkerind2] * fwkerhalf3[fwkerind3]; fw[outidx].x = fk[inidx].x / kervalue; fw[outidx].y = fk[inidx].y / kervalue; } From bb78e88f89ec21eba34de71facc82510fcf68196 Mon Sep 17 00:00:00 2001 From: Libin Lu Date: Wed, 29 May 2024 18:23:05 -0400 Subject: [PATCH 4/4] templating modeord in cuda deconv kernels --- include/cufinufft/cudeconvolve.h | 30 ++++++------- src/cuda/1d/cufinufft1d.cu | 18 ++++++-- src/cuda/2d/cufinufft2d.cu | 18 ++++++-- src/cuda/3d/cufinufft3d.cu | 18 ++++++-- src/cuda/deconvolve_wrapper.cu | 72 +++++++++++++++++--------------- 5 files changed, 96 insertions(+), 60 deletions(-) diff --git a/include/cufinufft/cudeconvolve.h b/include/cufinufft/cudeconvolve.h index bf4c49e63..6395f16c3 100644 --- a/include/cufinufft/cudeconvolve.h +++ b/include/cufinufft/cudeconvolve.h @@ -5,29 +5,29 @@ namespace cufinufft { namespace deconvolve { -template <typename T> -__global__ void deconvolve_1d(int ms, int nf1, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1, int modeord); -template <typename T> -__global__ void amplify_1d(int ms, int nf1, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf2, int modeord); -template <typename T> +template <typename T, int modeord> +__global__ void deconvolve_1d(int ms, int nf1, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1); +template <typename T, int modeord> +__global__ void amplify_1d(int ms, int nf1, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf2); +template <typename T, int modeord> __global__ void deconvolve_2d(int ms, int mt, int nf1, int nf2, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk, - T *fwkerhalf1, T *fwkerhalf2, int modeord); -template <typename T> + T *fwkerhalf1, T *fwkerhalf2); +template <typename T, int modeord> __global__ void amplify_2d(int ms, int mt, int nf1, int nf2, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk, - T *fwkerhalf1, T *fwkerhalf2, int modeord); + T *fwkerhalf1, T *fwkerhalf2); -template <typename T> +template <typename T, int modeord> __global__ void deconvolve_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, int fw_width, cuda_complex<T> *fw, - cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3, int modeord); -template <typename T> + cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3); +template <typename T, int modeord> __global__ void amplify_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, int fw_width, cuda_complex<T> *fw, - cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3, int modeord); + cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3); -template <typename T> +template <typename T, int modeord> int cudeconvolve1d(cufinufft_plan_t<T> *d_mem, int blksize); -template <typename T> +template <typename T, int modeord> int cudeconvolve2d(cufinufft_plan_t<T> *d_mem, int blksize); -template <typename T> +template <typename T, int modeord> int cudeconvolve3d(cufinufft_plan_t<T> *d_mem, int blksize); } // namespace deconvolve } // namespace cufinufft diff --git a/src/cuda/1d/cufinufft1d.cu b/src/cuda/1d/cufinufft1d.cu index 246a064f6..62574c06b 100644 --- a/src/cuda/1d/cufinufft1d.cu +++ b/src/cuda/1d/cufinufft1d.cu @@ -59,8 +59,13 @@ int cufinufft1d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_pla return FINUFFT_ERR_CUDA_FAILURE; // Step 3: deconvolve and shuffle - if ((ier = cudeconvolve1d<T>(d_plan, blksize))) - return ier; + if (d_plan->opts.modeord == 0) { + if ((ier = cudeconvolve1d<T, 0>(d_plan, blksize))) + return ier; + } else { + if ((ier = cudeconvolve1d<T, 1>(d_plan, blksize))) + return ier; + } } return 0; @@ -95,8 +100,13 @@ int cufinufft1d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_pla d_plan->fk = d_fkstart; // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw - if ((ier = cudeconvolve1d<T>(d_plan, 
blksize))) - return ier; + if (d_plan->opts.modeord == 0) { + if ((ier = cudeconvolve1d<T, 0>(d_plan, blksize))) + return ier; + } else { + if ((ier = cudeconvolve1d<T, 1>(d_plan, blksize))) + return ier; + } // Step 2: FFT cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); diff --git a/src/cuda/2d/cufinufft2d.cu b/src/cuda/2d/cufinufft2d.cu index b566f49ce..3cd85281f 100644 --- a/src/cuda/2d/cufinufft2d.cu +++ b/src/cuda/2d/cufinufft2d.cu @@ -59,8 +59,13 @@ int cufinufft2d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_pla return FINUFFT_ERR_CUDA_FAILURE; // Step 3: deconvolve and shuffle - if ((ier = cudeconvolve2d<T>(d_plan, blksize))) - return ier; + if (d_plan->opts.modeord == 0) { + if ((ier = cudeconvolve2d<T, 0>(d_plan, blksize))) + return ier; + } else { + if ((ier = cudeconvolve2d<T, 1>(d_plan, blksize))) + return ier; + } } return 0; @@ -95,8 +100,13 @@ int cufinufft2d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_pla d_plan->fk = d_fkstart; // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw - if ((ier = cudeconvolve2d<T>(d_plan, blksize))) - return ier; + if (d_plan->opts.modeord == 0) { + if ((ier = cudeconvolve2d<T, 0>(d_plan, blksize))) + return ier; + } else { + if ((ier = cudeconvolve2d<T, 1>(d_plan, blksize))) + return ier; + } // Step 2: FFT cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); diff --git a/src/cuda/3d/cufinufft3d.cu b/src/cuda/3d/cufinufft3d.cu index fa02ef860..c1209039a 100644 --- a/src/cuda/3d/cufinufft3d.cu +++ b/src/cuda/3d/cufinufft3d.cu @@ -57,8 +57,13 @@ int cufinufft3d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_pla return FINUFFT_ERR_CUDA_FAILURE; // Step 3: deconvolve and shuffle - if ((ier = cudeconvolve3d<T>(d_plan, blksize))) - return ier; + if (d_plan->opts.modeord == 0) { + if ((ier = cudeconvolve3d<T, 0>(d_plan, blksize))) + return ier; + } else { + if ((ier = cudeconvolve3d<T, 1>(d_plan, blksize))) + return ier; + } } return 0; @@ -91,8 +96,13 @@ int cufinufft3d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_pla d_plan->fk = d_fkstart; // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw - if ((ier = cudeconvolve3d<T>(d_plan, blksize))) - return ier; + if (d_plan->opts.modeord == 0) { + if ((ier = cudeconvolve3d<T, 0>(d_plan, blksize))) + return ier; + } else { + if ((ier = cudeconvolve3d<T, 1>(d_plan, blksize))) + return ier; + } // Step 2: FFT RETURN_IF_CUDA_ERROR diff --git a/src/cuda/deconvolve_wrapper.cu b/src/cuda/deconvolve_wrapper.cu index f0213b60d..75d8adda3 100644 --- a/src/cuda/deconvolve_wrapper.cu +++ b/src/cuda/deconvolve_wrapper.cu @@ -12,8 +12,8 @@ namespace deconvolve { /* Kernel for copying fw to fk with amplification by prefac/ker */ // Note: assume modeord=0: CMCL-compatible mode ordering in fk (from -N/2 up // to N/2-1), modeord=1: FFT-compatible mode ordering in fk (from 0 to N/2-1, then -N/2 up to -1). 
-template <typename T> -__global__ void deconvolve_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1, int modeord) { +template <typename T, int modeord> +__global__ void deconvolve_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1) { int pivot1, w1, fwkerind1; T kervalue; @@ -34,9 +34,9 @@ __global__ void deconvolve_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> } } -template <typename T> +template <typename T, int modeord> __global__ void deconvolve_2d(int ms, int mt, int nf1, int nf2, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1, - T *fwkerhalf2, int modeord) { + T *fwkerhalf2) { int pivot1, pivot2, w1, w2, fwkerind1, fwkerind2; int k1, k2, inidx, outidx; T kervalue; @@ -69,9 +69,9 @@ __global__ void deconvolve_2d(int ms, int mt, int nf1, int nf2, cuda_complex<T> } } -template <typename T> +template <typename T, int modeord> __global__ void deconvolve_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, cuda_complex<T> *fw, - cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3, int modeord) { + cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3) { int pivot1, pivot2, pivot3, w1, w2, w3, fwkerind1, fwkerind2, fwkerind3; int k1, k2, k3, inidx, outidx; T kervalue; @@ -112,8 +112,8 @@ __global__ void deconvolve_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, } /* Kernel for copying fk to fw with same amplification */ -template <typename T> -__global__ void amplify_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1, int modeord) { +template <typename T, int modeord> +__global__ void amplify_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1) { int pivot1, w1, fwkerind1; T kervalue; @@ -134,9 +134,9 @@ __global__ void amplify_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> } } -template <typename T> +template <typename T, int modeord> __global__ void amplify_2d(int ms, int mt, int nf1, int nf2, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1, - T *fwkerhalf2, int modeord) { + T *fwkerhalf2) { int pivot1, pivot2, w1, w2, fwkerind1, fwkerind2; int k1, k2, inidx, outidx; T kervalue; @@ -169,9 +169,9 @@ __global__ void amplify_2d(int ms, int mt, int nf1, int nf2, cuda_complex<T> *fw } } -template <typename T> +template <typename T, int modeord> __global__ void amplify_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, cuda_complex<T> *fw, cuda_complex<T> *fk, - T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3, int modeord) { + T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3) { int pivot1, pivot2, pivot3, w1, w2, w3, fwkerind1, fwkerind2, fwkerind3; int k1, k2, k3, inidx, outidx; T kervalue; @@ -211,7 +211,7 @@ __global__ void amplify_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, cu } } -template <typename T> +template <typename T, int modeord> int cudeconvolve1d(cufinufft_plan_t<T> *d_plan, int blksize) /* wrapper for deconvolution & amplification in 1D. 
@@ -228,20 +228,20 @@ int cudeconvolve1d(cufinufft_plan_t<T> *d_plan, int blksize) if (d_plan->spopts.spread_direction == 1) { for (int t = 0; t < blksize; t++) { - deconvolve_1d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, nf1, d_plan->fw + t * nf1, - d_plan->fk + t * nmodes, d_plan->fwkerhalf1, d_plan->opts.modeord); + deconvolve_1d<T, modeord><<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, nf1, d_plan->fw + t * nf1, + d_plan->fk + t * nmodes, d_plan->fwkerhalf1); } } else { checkCudaErrors(cudaMemsetAsync(d_plan->fw, 0, maxbatchsize * nf1 * sizeof(cuda_complex<T>), stream)); for (int t = 0; t < blksize; t++) { - amplify_1d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, nf1, d_plan->fw + t * nf1, - d_plan->fk + t * nmodes, d_plan->fwkerhalf1, d_plan->opts.modeord); + amplify_1d<T, modeord><<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, nf1, d_plan->fw + t * nf1, + d_plan->fk + t * nmodes, d_plan->fwkerhalf1); } } return 0; } -template <typename T> +template <typename T, int modeord> int cudeconvolve2d(cufinufft_plan_t<T> *d_plan, int blksize) /* wrapper for deconvolution & amplification in 2D. @@ -260,22 +260,22 @@ int cudeconvolve2d(cufinufft_plan_t<T> *d_plan, int blksize) if (d_plan->spopts.spread_direction == 1) { for (int t = 0; t < blksize; t++) { - deconvolve_2d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, mt, nf1, nf2, d_plan->fw + t * nf1 * nf2, + deconvolve_2d<T, modeord><<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, mt, nf1, nf2, d_plan->fw + t * nf1 * nf2, d_plan->fk + t * nmodes, d_plan->fwkerhalf1, - d_plan->fwkerhalf2, d_plan->opts.modeord); + d_plan->fwkerhalf2); } } else { checkCudaErrors(cudaMemsetAsync(d_plan->fw, 0, maxbatchsize * nf1 * nf2 * sizeof(cuda_complex<T>), stream)); for (int t = 0; t < blksize; t++) { - amplify_2d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, mt, nf1, nf2, d_plan->fw + t * nf1 * nf2, + amplify_2d<T, modeord><<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, mt, nf1, nf2, d_plan->fw + t * nf1 * nf2, d_plan->fk + t * nmodes, d_plan->fwkerhalf1, - d_plan->fwkerhalf2, d_plan->opts.modeord); + d_plan->fwkerhalf2); } } return 0; } -template <typename T> +template <typename T, int modeord> int cudeconvolve3d(cufinufft_plan_t<T> *d_plan, int blksize) /* wrapper for deconvolution & amplification in 3D. 
@@ -295,28 +295,34 @@ int cudeconvolve3d(cufinufft_plan_t<T> *d_plan, int blksize) int maxbatchsize = d_plan->maxbatchsize; if (d_plan->spopts.spread_direction == 1) { for (int t = 0; t < blksize; t++) { - deconvolve_3d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>( + deconvolve_3d<T, modeord><<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>( ms, mt, mu, nf1, nf2, nf3, d_plan->fw + t * nf1 * nf2 * nf3, d_plan->fk + t * nmodes, - d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3, d_plan->opts.modeord); + d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3); } } else { checkCudaErrors( cudaMemsetAsync(d_plan->fw, 0, maxbatchsize * nf1 * nf2 * nf3 * sizeof(cuda_complex<T>), stream)); for (int t = 0; t < blksize; t++) { - amplify_3d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>( + amplify_3d<T, modeord><<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>( ms, mt, mu, nf1, nf2, nf3, d_plan->fw + t * nf1 * nf2 * nf3, d_plan->fk + t * nmodes, - d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3, d_plan->opts.modeord); + d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3); } } return 0; } -template int cudeconvolve1d<float>(cufinufft_plan_t<float> *d_plan, int blksize); -template int cudeconvolve1d<double>(cufinufft_plan_t<double> *d_plan, int blksize); +template int cudeconvolve1d<float, 0>(cufinufft_plan_t<float> *d_plan, int blksize); +template int cudeconvolve1d<float, 1>(cufinufft_plan_t<float> *d_plan, int blksize); +template int cudeconvolve1d<double, 0>(cufinufft_plan_t<double> *d_plan, int blksize); +template int cudeconvolve1d<double, 1>(cufinufft_plan_t<double> *d_plan, int blksize); -template int cudeconvolve2d<float>(cufinufft_plan_t<float> *d_plan, int blksize); -template int cudeconvolve2d<double>(cufinufft_plan_t<double> *d_plan, int blksize); +template int cudeconvolve2d<float, 0>(cufinufft_plan_t<float> *d_plan, int blksize); +template int cudeconvolve2d<float, 1>(cufinufft_plan_t<float> *d_plan, int blksize); +template int cudeconvolve2d<double, 0>(cufinufft_plan_t<double> *d_plan, int blksize); +template int cudeconvolve2d<double, 1>(cufinufft_plan_t<double> *d_plan, int blksize); -template int cudeconvolve3d<float>(cufinufft_plan_t<float> *d_plan, int blksize); -template int cudeconvolve3d<double>(cufinufft_plan_t<double> *d_plan, int blksize); +template int cudeconvolve3d<float, 0>(cufinufft_plan_t<float> *d_plan, int blksize); +template int cudeconvolve3d<float, 1>(cufinufft_plan_t<float> *d_plan, int blksize); +template int cudeconvolve3d<double, 0>(cufinufft_plan_t<double> *d_plan, int blksize); +template int cudeconvolve3d<double, 1>(cufinufft_plan_t<double> *d_plan, int blksize); } // namespace deconvolve } // namespace cufinufft
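Note for reviewers: the two orderings only permute which fine-grid sample and which half-kernel weight each output mode reads, which is why the kernels need no extra memory and no FFT changes. The index arithmetic can be sanity-checked on the host; the following standalone Python sketch (not part of the patches; mode_indices is a hypothetical helper that mirrors the 1D kernel logic above, assuming only NumPy) confirms that the modeord=0 layout is exactly the fftshift of the modeord=1 layout, for both odd and even mode counts:

import numpy as np

def mode_indices(ms, nf1, modeord):
    # For each output mode index i (ms modes, fine grid of size nf1), return
    # the fine-grid index w and the index into the half-kernel table
    # fwkerhalf, mirroring deconvolve_1d above.
    w = np.empty(ms, dtype=int)
    kidx = np.empty(ms, dtype=int)
    for i in range(ms):
        if modeord == 0:            # CMCL: frequencies -ms//2 .. (ms-1)//2
            pivot = i - ms // 2
            w[i] = pivot if pivot >= 0 else nf1 + pivot
            kidx[i] = abs(pivot)
        else:                       # FFT: 0 .. (ms - ms//2 - 1), then negative half
            pivot = i - ms + ms // 2
            if pivot >= 0:          # negative frequency i - ms
                w[i] = nf1 + i - ms
                kidx[i] = ms - i
            else:                   # nonnegative frequency i
                w[i] = i
                kidx[i] = i
    return w, kidx

for ms in (5, 6):                   # odd and even numbers of modes
    w0, k0 = mode_indices(ms, 16, 0)
    w1, k1 = mode_indices(ms, 16, 1)
    assert np.array_equal(np.fft.fftshift(w1), w0)
    assert np.array_equal(np.fft.fftshift(k1), k0)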
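From Python the new option is just a plan keyword argument, as exercised by the tests above. A minimal usage sketch (an illustration, not a test from this series; it assumes a CUDA GPU, CuPy, and a cufinufft build containing these patches) showing that the two orderings agree up to an fftshift for a 2D type-1 transform:

import numpy as np
import cupy as cp
from cufinufft import Plan

M, shape = 4096, (16, 16)
rng = np.random.default_rng(0)
# nonuniform points (float32 to match a complex64 plan) and strengths
k = cp.asarray(rng.uniform(-np.pi, np.pi, size=(2, M)).astype(np.float32))
c = cp.asarray((rng.standard_normal(M) + 1j * rng.standard_normal(M)).astype(np.complex64))

fk = {}
for modeord in (0, 1):
    plan = Plan(1, shape, eps=1e-6, dtype=np.complex64, modeord=modeord)
    plan.setpts(*k)                 # expands to setpts(k[0], k[1])
    fk[modeord] = plan.execute(c)

# recentering the FFT-style output reproduces the CMCL-style output
assert cp.allclose(cp.fft.fftshift(fk[1]), fk[0])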