Merge pull request #447 from lu1and10/cufinufft-modeord
cufinufft modeord
ahbarnett authored Jun 4, 2024
2 parents 9509b88 + a404a80 commit 7535a82
Showing 11 changed files with 251 additions and 91 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG
@@ -1,6 +1,8 @@
List of features / changes made / release notes, in reverse chronological order.
If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately).

* cufinufft now supports modeord (type 1 and 2 only): 0 = CMCL-style increasing mode
  order, 1 = FFT-style mode order.
* New doc page: migration guide from NFFT3 (2d1 case only).
* New foldrescale, removes [-3pi,3pi) restriction on NU points, and slight
speedup at large tols. Deprecates both opts.chkbnds and error code
18 changes: 9 additions & 9 deletions include/cufinufft/cudeconvolve.h
@@ -5,29 +5,29 @@

namespace cufinufft {
namespace deconvolve {
template <typename T>
template <typename T, int modeord>
__global__ void deconvolve_1d(int ms, int nf1, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1);
template <typename T>
template <typename T, int modeord>
__global__ void amplify_1d(int ms, int nf1, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf2);
template <typename T>
template <typename T, int modeord>
__global__ void deconvolve_2d(int ms, int mt, int nf1, int nf2, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk,
T *fwkerhalf1, T *fwkerhalf2);
template <typename T>
template <typename T, int modeord>
__global__ void amplify_2d(int ms, int mt, int nf1, int nf2, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk,
T *fwkerhalf1, T *fwkerhalf2);

template <typename T>
template <typename T, int modeord>
__global__ void deconvolve_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, int fw_width, cuda_complex<T> *fw,
cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3);
template <typename T>
template <typename T, int modeord>
__global__ void amplify_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, int fw_width, cuda_complex<T> *fw,
cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3);

template <typename T>
template <typename T, int modeord>
int cudeconvolve1d(cufinufft_plan_t<T> *d_mem, int blksize);
template <typename T>
template <typename T, int modeord>
int cudeconvolve2d(cufinufft_plan_t<T> *d_mem, int blksize);
template <typename T>
template <typename T, int modeord>
int cudeconvolve3d(cufinufft_plan_t<T> *d_mem, int blksize);
} // namespace deconvolve
} // namespace cufinufft
3 changes: 3 additions & 0 deletions include/cufinufft_opts.h
@@ -26,6 +26,9 @@ typedef struct cufinufft_opts { // see cufinufft_default_opts() for defaults
int gpu_device_id;

void *gpu_stream;

int modeord; // (type 1,2 only): 0 CMCL-style increasing mode order
// 1 FFT-style mode order
} cufinufft_opts;

#endif
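
The two mode orderings differ only by a circular shift along each mode axis. A small NumPy sketch (not part of this commit) showing how a length-8 axis is indexed under each convention:

import numpy as np

N = 8
# modeord = 0 (CMCL-style): modes in increasing order -N/2, ..., N/2-1
cmcl_modes = np.arange(-(N // 2), (N + 1) // 2)
# modeord = 1 (FFT-style): the order an FFT produces: 0, 1, ..., N/2-1, -N/2, ..., -1
fft_modes = np.fft.ifftshift(cmcl_modes)

print(cmcl_modes)  # [-4 -3 -2 -1  0  1  2  3]
print(fft_modes)   # [ 0  1  2  3 -4 -3 -2 -1]

That is, a modeord=1 array is brought to CMCL order by fftshift, and a CMCL-ordered array to FFT order by ifftshift; the tests added below rely on exactly this relation.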
3 changes: 2 additions & 1 deletion python/cufinufft/cufinufft/_cufinufft.py
@@ -63,7 +63,8 @@ def _get_NufftOpts():
('gpu_spreadinterponly', c_int),
('gpu_maxbatchsize', c_int),
('gpu_device_id', c_int),
('gpu_stream', c_void_p)
('gpu_stream', c_void_p),
('modeord', c_int)
]
return fields

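The ctypes field list has to mirror cufinufft_opts member for member, in the same order, which is why modeord is appended after gpu_stream to match the header change above. A minimal standalone sketch of that pattern (hypothetical struct, not the real bindings):

import ctypes

# Hypothetical C struct:  struct opts { void *gpu_stream; int modeord; };
class Opts(ctypes.Structure):
    _fields_ = [
        ("gpu_stream", ctypes.c_void_p),
        ("modeord", ctypes.c_int),  # position and type must match the C header
    ]

opts = Opts()
opts.modeord = 1  # 0 (default) = CMCL-style, 1 = FFT-style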
4 changes: 3 additions & 1 deletion python/cufinufft/cufinufft/_plan.py
@@ -65,7 +65,9 @@ class Plan:
memory), ``gpu_sort`` (for ``gpu_method == 1``, 0: no
sort, 1: sort), ``gpu_kerevalmeth`` (0: direct
exp(sqrt), Horner evaluation), ``gpu_device_id`` (GPU
ID), and ``gpu_stream`` (CUDA stream pointer).
ID), ``gpu_stream`` (CUDA stream pointer) and
``modeord`` (0: CMCL-compatible mode ordering,
1: FFT-style mode ordering).
"""

def __init__(self, nufft_type, n_modes, n_trans=1, eps=1e-6, isign=None,
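For reference, a hedged usage sketch of the new option through the Python interface, assuming the cufinufft package with CuPy-backed device arrays (the array sizes and values here are illustrative, not taken from this commit):

import numpy as np
import cupy as cp
from cufinufft import Plan

M, N = 10000, 64
x = cp.asarray(2 * np.pi * (np.random.rand(M) - 0.5))          # nonuniform points in [-pi, pi)
c = cp.asarray(np.random.randn(M) + 1j * np.random.randn(M))   # strengths

# modeord=1 returns coefficients in FFT-style order (0, 1, ..., -1)
plan = Plan(1, (N,), eps=1e-6, dtype=np.complex128, modeord=1)
plan.setpts(x)
fk_fft_order = plan.execute(c)

# convert to CMCL-style increasing order (the modeord=0 default) if needed
fk_cmcl = cp.fft.fftshift(fk_fft_order)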
22 changes: 16 additions & 6 deletions python/cufinufft/tests/test_basic.py
@@ -9,27 +9,29 @@
# NOTE: Tests below fail for tolerance 1e-4 (error executing plan).

DTYPES = [np.float32, np.float64]
SHAPES = [(16,), (16, 16), (16, 16, 16)]
SHAPES = [(16,), (16, 16), (16, 16, 16), (19,), (17, 19), (17, 19, 24)]
MS = [256, 1024, 4096]
TOLS = [1e-3, 1e-6]
OUTPUT_ARGS = [False, True]
CONTIGUOUS = [False, True]
MODEORDS = [0, 1]


@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("shape", SHAPES)
@pytest.mark.parametrize("M", MS)
@pytest.mark.parametrize("tol", TOLS)
@pytest.mark.parametrize("output_arg", OUTPUT_ARGS)
def test_type1(to_gpu, to_cpu, dtype, shape, M, tol, output_arg):
@pytest.mark.parametrize("modeord", MODEORDS)
def test_type1(to_gpu, to_cpu, dtype, shape, M, tol, output_arg, modeord):
complex_dtype = utils._complex_dtype(dtype)

k, c = utils.type1_problem(dtype, shape, M)

k_gpu = to_gpu(k)
c_gpu = to_gpu(c)

plan = Plan(1, shape, eps=tol, dtype=complex_dtype)
plan = Plan(1, shape, eps=tol, dtype=complex_dtype, modeord=modeord)

# Since k_gpu is an array of shape (dim, M), this will expand to
# plan.setpts(k_gpu[0], ..., k_gpu[dim]), allowing us to handle all
@@ -43,6 +45,8 @@ def test_type1(to_gpu, to_cpu, dtype, shape, M, tol, output_arg):
fk_gpu = plan.execute(c_gpu)

fk = to_cpu(fk_gpu)
if modeord == 1:
    fk = np.fft.fftshift(fk)

utils.verify_type1(k, c, fk, tol)

@@ -53,12 +57,13 @@ def test_type1(to_gpu, to_cpu, dtype, shape, M, tol, output_arg):
@pytest.mark.parametrize("tol", TOLS)
@pytest.mark.parametrize("output_arg", OUTPUT_ARGS)
@pytest.mark.parametrize("contiguous", CONTIGUOUS)
def test_type2(to_gpu, to_cpu, dtype, shape, M, tol, output_arg, contiguous):
@pytest.mark.parametrize("modeord", MODEORDS)
def test_type2(to_gpu, to_cpu, dtype, shape, M, tol, output_arg, contiguous, modeord):
complex_dtype = utils._complex_dtype(dtype)

k, fk = utils.type2_problem(dtype, shape, M)

plan = Plan(2, shape, eps=tol, dtype=complex_dtype)
plan = Plan(2, shape, eps=tol, dtype=complex_dtype, modeord=modeord)

check_result = True

@@ -81,7 +86,12 @@ def _execute(*args, **kwargs):
return plan.execute(*args, **kwargs)

k_gpu = to_gpu(k)
fk_gpu = to_gpu(fk)

if modeord == 1:
    _fk = np.fft.ifftshift(fk)
else:
    _fk = fk
fk_gpu = to_gpu(_fk)

plan.setpts(*k_gpu)

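In short, the tests treat modeord=1 data as a cyclically shifted view of the modeord=0 data: type-1 output is brought back to CMCL order with fftshift before verification, and type-2 input is pre-shifted with ifftshift so the same reference problem can be reused. The round trip on a plain NumPy array (no GPU needed), using one of the new odd-sized shapes:

import numpy as np

shape = (17, 19)
fk_cmcl = np.random.randn(*shape) + 1j * np.random.randn(*shape)  # CMCL-ordered coefficients

fk_fft = np.fft.ifftshift(fk_cmcl)  # what a modeord=1 type-2 plan consumes
back = np.fft.fftshift(fk_fft)      # what the type-1 test applies to modeord=1 output

assert np.array_equal(back, fk_cmcl)  # the two orderings differ only by this shift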
18 changes: 14 additions & 4 deletions src/cuda/1d/cufinufft1d.cu
@@ -59,8 +59,13 @@ int cufinufft1d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_pla
return FINUFFT_ERR_CUDA_FAILURE;

// Step 3: deconvolve and shuffle
if ((ier = cudeconvolve1d<T>(d_plan, blksize)))
return ier;
if (d_plan->opts.modeord == 0) {
    if ((ier = cudeconvolve1d<T, 0>(d_plan, blksize)))
        return ier;
} else {
    if ((ier = cudeconvolve1d<T, 1>(d_plan, blksize)))
        return ier;
}
}

return 0;
@@ -95,8 +100,13 @@ int cufinufft1d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_pla
d_plan->fk = d_fkstart;

// Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
if ((ier = cudeconvolve1d<T>(d_plan, blksize)))
return ier;
if (d_plan->opts.modeord == 0) {
    if ((ier = cudeconvolve1d<T, 0>(d_plan, blksize)))
        return ier;
} else {
    if ((ier = cudeconvolve1d<T, 1>(d_plan, blksize)))
        return ier;
}

// Step 2: FFT
cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
18 changes: 14 additions & 4 deletions src/cuda/2d/cufinufft2d.cu
@@ -59,8 +59,13 @@ int cufinufft2d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_pla
return FINUFFT_ERR_CUDA_FAILURE;

// Step 3: deconvolve and shuffle
if ((ier = cudeconvolve2d<T>(d_plan, blksize)))
return ier;
if (d_plan->opts.modeord == 0) {
    if ((ier = cudeconvolve2d<T, 0>(d_plan, blksize)))
        return ier;
} else {
    if ((ier = cudeconvolve2d<T, 1>(d_plan, blksize)))
        return ier;
}
}

return 0;
@@ -95,8 +100,13 @@ int cufinufft2d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_pla
d_plan->fk = d_fkstart;

// Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
if ((ier = cudeconvolve2d<T>(d_plan, blksize)))
return ier;
if (d_plan->opts.modeord == 0) {
    if ((ier = cudeconvolve2d<T, 0>(d_plan, blksize)))
        return ier;
} else {
    if ((ier = cudeconvolve2d<T, 1>(d_plan, blksize)))
        return ier;
}

// Step 2: FFT
cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
18 changes: 14 additions & 4 deletions src/cuda/3d/cufinufft3d.cu
@@ -57,8 +57,13 @@ int cufinufft3d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_pla
return FINUFFT_ERR_CUDA_FAILURE;

// Step 3: deconvolve and shuffle
if ((ier = cudeconvolve3d<T>(d_plan, blksize)))
return ier;
if (d_plan->opts.modeord == 0) {
    if ((ier = cudeconvolve3d<T, 0>(d_plan, blksize)))
        return ier;
} else {
    if ((ier = cudeconvolve3d<T, 1>(d_plan, blksize)))
        return ier;
}
}

return 0;
@@ -91,8 +96,13 @@ int cufinufft3d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_pla
d_plan->fk = d_fkstart;

// Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
if ((ier = cudeconvolve3d<T>(d_plan, blksize)))
return ier;
if (d_plan->opts.modeord == 0) {
    if ((ier = cudeconvolve3d<T, 0>(d_plan, blksize)))
        return ier;
} else {
    if ((ier = cudeconvolve3d<T, 1>(d_plan, blksize)))
        return ier;
}

// Step 2: FFT
RETURN_IF_CUDA_ERROR
2 changes: 2 additions & 0 deletions src/cuda/cufinufft.cu
@@ -121,5 +121,7 @@ void cufinufft_default_opts(cufinufft_opts *opts)

// By default, only use device 0
opts->gpu_device_id = 0;

opts->modeord = 0;
}
}