From 5188adac206485c1e95593b5059b469c23a6a87a Mon Sep 17 00:00:00 2001 From: Libin Lu Date: Mon, 29 Apr 2024 16:12:57 -0400 Subject: [PATCH 1/4] cufinufft with modeord: 0 CMCL-style, 1 FFT-style --- CHANGELOG | 2 + include/cufinufft/cudeconvolve.h | 12 ++--- include/cufinufft_opts.h | 3 ++ python/cufinufft/cufinufft/_cufinufft.py | 3 +- python/cufinufft/cufinufft/_plan.py | 4 +- python/cufinufft/tests/test_basic.py | 34 ++++++++++++- src/cuda/cufinufft.cu | 2 + src/cuda/deconvolve_wrapper.cu | 62 ++++++++++++------------ 8 files changed, 82 insertions(+), 40 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 14c04dceb..2a068b7ab 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,8 @@ List of features / changes made / release notes, in reverse chronological order. If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately). +* cufinufft now supports modeord (type 1,2 only): 0 CMCL-style increasing mode + order, 1 FFT-style mode order. * CPU plan stage now caps # threads at omp_get_max_threads (being 1 for single-thread build); warns if this cap was activated (PR 431) * new docs troubleshooting accuracy limitations due to condition number of the diff --git a/include/cufinufft/cudeconvolve.h b/include/cufinufft/cudeconvolve.h index 4daa4767e..bf4c49e63 100644 --- a/include/cufinufft/cudeconvolve.h +++ b/include/cufinufft/cudeconvolve.h @@ -6,22 +6,22 @@ namespace cufinufft { namespace deconvolve { template <typename T> -__global__ void deconvolve_1d(int ms, int nf1, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1); +__global__ void deconvolve_1d(int ms, int nf1, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1, int modeord); template <typename T> -__global__ void amplify_1d(int ms, int nf1, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf2); +__global__ void amplify_1d(int ms, int nf1, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf2, int modeord); template <typename T> __global__ void deconvolve_2d(int ms, int mt, int nf1, int nf2, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk, - T *fwkerhalf1, T *fwkerhalf2); + T *fwkerhalf1, T *fwkerhalf2, int modeord); template <typename T> __global__ void amplify_2d(int ms, int mt, int nf1, int nf2, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk, - T *fwkerhalf1, T *fwkerhalf2); + T *fwkerhalf1, T *fwkerhalf2, int modeord); template <typename T> __global__ void deconvolve_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, int fw_width, cuda_complex<T> *fw, - cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3); + cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3, int modeord); template <typename T> __global__ void amplify_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, int fw_width, cuda_complex<T> *fw, - cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3); + cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3, int modeord); template <typename T> int cudeconvolve1d(cufinufft_plan_t<T> *d_mem, int blksize); diff --git a/include/cufinufft_opts.h b/include/cufinufft_opts.h index 9760884ae..a3da46f0d 100644 --- a/include/cufinufft_opts.h +++ b/include/cufinufft_opts.h @@ -26,6 +26,9 @@ typedef struct cufinufft_opts { // see cufinufft_default_opts() for defaults int gpu_device_id; void *gpu_stream; + + int modeord; // (type 1,2 only): 0 CMCL-style increasing mode order + // 1 FFT-style mode order } cufinufft_opts; #endif diff --git a/python/cufinufft/cufinufft/_cufinufft.py b/python/cufinufft/cufinufft/_cufinufft.py index 429ce9796..2fccba192 100644 --- a/python/cufinufft/cufinufft/_cufinufft.py +++ 
b/python/cufinufft/cufinufft/_cufinufft.py @@ -63,7 +63,8 @@ def _get_NufftOpts(): ('gpu_spreadinterponly', c_int), ('gpu_maxbatchsize', c_int), ('gpu_device_id', c_int), - ('gpu_stream', c_void_p) + ('gpu_stream', c_void_p), + ('modeord', c_int) ] return fields diff --git a/python/cufinufft/cufinufft/_plan.py b/python/cufinufft/cufinufft/_plan.py index 263c12cd7..67bcee59b 100644 --- a/python/cufinufft/cufinufft/_plan.py +++ b/python/cufinufft/cufinufft/_plan.py @@ -65,7 +65,9 @@ class Plan: memory), ``gpu_sort`` (for ``gpu_method == 1``, 0: no sort, 1: sort), ``gpu_kerevalmeth`` (0: direct exp(sqrt), 1: Horner evaluation), ``gpu_device_id`` (GPU - ID), and ``gpu_stream`` (CUDA stream pointer). + ID), ``gpu_stream`` (CUDA stream pointer) and + ``modeord`` (0: CMCL-compatible mode ordering, + 1: FFT-style mode ordering). """ def __init__(self, nufft_type, n_modes, n_trans=1, eps=1e-6, isign=None, diff --git a/python/cufinufft/tests/test_basic.py b/python/cufinufft/tests/test_basic.py index f4bc5713a..6d65ac85f 100644 --- a/python/cufinufft/tests/test_basic.py +++ b/python/cufinufft/tests/test_basic.py @@ -9,7 +9,7 @@ # NOTE: Tests below fail for tolerance 1e-4 (error executing plan). DTYPES = [np.float32, np.float64] -SHAPES = [(16,), (16, 16), (16, 16, 16)] +SHAPES = [(16,), (16, 16), (16, 16, 16), (19,), (17, 19), (17, 19, 24)] MS = [256, 1024, 4096] TOLS = [1e-3, 1e-6] OUTPUT_ARGS = [False, True] @@ -47,6 +47,38 @@ def test_type1(to_gpu, to_cpu, dtype, shape, M, tol, output_arg): utils.verify_type1(k, c, fk, tol) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("shape", SHAPES) +@pytest.mark.parametrize("M", MS) +@pytest.mark.parametrize("tol", TOLS) +@pytest.mark.parametrize("output_arg", OUTPUT_ARGS) +def test_type1_modeord(to_gpu, to_cpu, dtype, shape, M, tol, output_arg): + complex_dtype = utils._complex_dtype(dtype) + + k, c = utils.type1_problem(dtype, shape, M) + + k_gpu = to_gpu(k) + c_gpu = to_gpu(c) + + plan = Plan(1, shape, eps=tol, dtype=complex_dtype, modeord=1) + + # Since k_gpu is an array of shape (dim, M), this will expand to + # plan.setpts(k_gpu[0], ..., k_gpu[dim]), allowing us to handle all + # dimensions with the same call. + plan.setpts(*k_gpu) + + if output_arg: + fk_gpu = _compat.array_empty_like(c_gpu, shape, dtype=complex_dtype) + plan.execute(c_gpu, out=fk_gpu) + else: + fk_gpu = plan.execute(c_gpu) + + fk = to_cpu(fk_gpu) + fk = np.fft.fftshift(fk) + + utils.verify_type1(k, c, fk, tol) + + @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("shape", SHAPES) @pytest.mark.parametrize("M", MS) diff --git a/src/cuda/cufinufft.cu b/src/cuda/cufinufft.cu index 60cdd4482..091158299 100644 --- a/src/cuda/cufinufft.cu +++ b/src/cuda/cufinufft.cu @@ -121,5 +121,7 @@ void cufinufft_default_opts(cufinufft_opts *opts) // By default, only use device 0 opts->gpu_device_id = 0; + + opts->modeord = 0; } } diff --git a/src/cuda/deconvolve_wrapper.cu b/src/cuda/deconvolve_wrapper.cu index ffb65b8da..5ef561b6b 100644 --- a/src/cuda/deconvolve_wrapper.cu +++ b/src/cuda/deconvolve_wrapper.cu @@ -11,13 +11,13 @@ namespace cufinufft { namespace deconvolve { /* Kernel for copying fw to fk with amplification by prefac/ker */ // Note: assume modeord=0: CMCL-compatible mode ordering in fk (from -N/2 up -// to N/2-1) +// to N/2-1), modeord=1: FFT-compatible mode ordering in fk (from 0 to N/2-1, then -N/2 up to -1). 
template <typename T> -__global__ void deconvolve_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1) { +__global__ void deconvolve_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1, int modeord) { for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms; i += blockDim.x * gridDim.x) { - int w1 = i - ms / 2 >= 0 ? i - ms / 2 : nf1 + i - ms / 2; + int w1 = ( modeord == 0 ) ? ( (i - ms / 2 >= 0) ? i - ms / 2 : nf1 + i - ms / 2 ) : ( (i - ms + ms / 2 >= 0) ? nf1 + i - ms : i ); - T kervalue = fwkerhalf1[abs(i - ms / 2)]; + T kervalue = fwkerhalf1[(modeord==0) ? abs(i - ms / 2) : ((i - ms + ms / 2 >= 0) ? ms - i : i)]; fk[i].x = fw[w1].x / kervalue; fk[i].y = fw[w1].y / kervalue; } @@ -25,16 +25,16 @@ __global__ void deconvolve_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> template <typename T> __global__ void deconvolve_2d(int ms, int mt, int nf1, int nf2, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1, - T *fwkerhalf2) { + T *fwkerhalf2, int modeord) { for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt; i += blockDim.x * gridDim.x) { int k1 = i % ms; int k2 = i / ms; int outidx = k1 + k2 * ms; - int w1 = k1 - ms / 2 >= 0 ? k1 - ms / 2 : nf1 + k1 - ms / 2; - int w2 = k2 - mt / 2 >= 0 ? k2 - mt / 2 : nf2 + k2 - mt / 2; + int w1 = ( modeord == 0 ) ? ( (k1 - ms / 2 >= 0) ? k1 - ms / 2 : nf1 + k1 - ms / 2 ) : ( (k1 - ms + ms / 2 >= 0) ? nf1 + k1 - ms : k1 ); + int w2 = ( modeord == 0 ) ? ( (k2 - mt / 2 >= 0) ? k2 - mt / 2 : nf2 + k2 - mt / 2 ) : ( (k2 - mt + mt / 2 >= 0) ? nf2 + k2 - mt : k2 ); int inidx = w1 + w2 * nf1; - T kervalue = fwkerhalf1[abs(k1 - ms / 2)] * fwkerhalf2[abs(k2 - mt / 2)]; + T kervalue = fwkerhalf1[(modeord==0) ? abs(k1 - ms / 2) : ((k1 - ms + ms / 2 >= 0) ? ms - k1 : k1)] * fwkerhalf2[(modeord==0) ? abs(k2 - mt / 2) : ((k2 - mt + mt / 2 >= 0) ? mt - k2 : k2)]; fk[outidx].x = fw[inidx].x / kervalue; fk[outidx].y = fw[inidx].y / kervalue; } @@ -42,18 +42,18 @@ __global__ void deconvolve_2d(int ms, int mt, int nf1, int nf2, cuda_complex<T> template <typename T> __global__ void deconvolve_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, cuda_complex<T> *fw, - cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3) { + cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3, int modeord) { for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt * mu; i += blockDim.x * gridDim.x) { int k1 = i % ms; int k2 = (i / ms) % mt; int k3 = (i / ms / mt); int outidx = k1 + k2 * ms + k3 * ms * mt; - int w1 = k1 - ms / 2 >= 0 ? k1 - ms / 2 : nf1 + k1 - ms / 2; - int w2 = k2 - mt / 2 >= 0 ? k2 - mt / 2 : nf2 + k2 - mt / 2; - int w3 = k3 - mu / 2 >= 0 ? k3 - mu / 2 : nf3 + k3 - mu / 2; + int w1 = ( modeord == 0 ) ? ( (k1 - ms / 2 >= 0) ? k1 - ms / 2 : nf1 + k1 - ms / 2 ) : ( (k1 - ms + ms / 2 >= 0) ? nf1 + k1 - ms : k1 ); + int w2 = ( modeord == 0 ) ? ( (k2 - mt / 2 >= 0) ? k2 - mt / 2 : nf2 + k2 - mt / 2 ) : ( (k2 - mt + mt / 2 >= 0) ? nf2 + k2 - mt : k2 ); + int w3 = ( modeord == 0 ) ? ( (k3 - mu / 2 >= 0) ? k3 - mu / 2 : nf3 + k3 - mu / 2 ) : ( (k3 - mu + mu / 2 >= 0) ? nf3 + k3 - mu : k3 ); int inidx = w1 + w2 * nf1 + w3 * nf1 * nf2; - T kervalue = fwkerhalf1[abs(k1 - ms / 2)] * fwkerhalf2[abs(k2 - mt / 2)] * fwkerhalf3[abs(k3 - mu / 2)]; + T kervalue = fwkerhalf1[(modeord==0) ? abs(k1 - ms / 2) : ((k1 - ms + ms / 2 >= 0) ? ms - k1 : k1)] * fwkerhalf2[(modeord==0) ? abs(k2 - mt / 2) : ((k2 - mt + mt / 2 >= 0) ? mt - k2 : k2)] * fwkerhalf3[(modeord==0) ? abs(k3 - mu / 2) : ((k3 - mu + mu / 2 >= 0) ? 
mu - k3 : k3)]; fk[outidx].x = fw[inidx].x / kervalue; fk[outidx].y = fw[inidx].y / kervalue; } @@ -61,11 +61,11 @@ __global__ void deconvolve_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, /* Kernel for copying fk to fw with same amplification */ template <typename T> -__global__ void amplify_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1) { +__global__ void amplify_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1, int modeord) { for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms; i += blockDim.x * gridDim.x) { - int w1 = i - ms / 2 >= 0 ? i - ms / 2 : nf1 + i - ms / 2; + int w1 = ( modeord == 0 ) ? ( (i - ms / 2 >= 0) ? i - ms / 2 : nf1 + i - ms / 2 ) : ( (i - ms + ms / 2 >= 0) ? nf1 + i - ms : i ); - T kervalue = fwkerhalf1[abs(i - ms / 2)]; + T kervalue = fwkerhalf1[(modeord==0) ? abs(i - ms / 2) : ((i - ms + ms / 2 >= 0) ? ms - i : i)]; fw[w1].x = fk[i].x / kervalue; fw[w1].y = fk[i].y / kervalue; } @@ -73,16 +73,16 @@ __global__ void amplify_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> template <typename T> __global__ void amplify_2d(int ms, int mt, int nf1, int nf2, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1, - T *fwkerhalf2) { + T *fwkerhalf2, int modeord) { for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt; i += blockDim.x * gridDim.x) { int k1 = i % ms; int k2 = i / ms; int inidx = k1 + k2 * ms; - int w1 = k1 - ms / 2 >= 0 ? k1 - ms / 2 : nf1 + k1 - ms / 2; - int w2 = k2 - mt / 2 >= 0 ? k2 - mt / 2 : nf2 + k2 - mt / 2; + int w1 = ( modeord == 0 ) ? ( (k1 - ms / 2 >= 0) ? k1 - ms / 2 : nf1 + k1 - ms / 2 ) : ( (k1 - ms + ms / 2 >= 0) ? nf1 + k1 - ms : k1 ); + int w2 = ( modeord == 0 ) ? ( (k2 - mt / 2 >= 0) ? k2 - mt / 2 : nf2 + k2 - mt / 2 ) : ( (k2 - mt + mt / 2 >= 0) ? nf2 + k2 - mt : k2 ); int outidx = w1 + w2 * nf1; - T kervalue = fwkerhalf1[abs(k1 - ms / 2)] * fwkerhalf2[abs(k2 - mt / 2)]; + T kervalue = fwkerhalf1[(modeord==0) ? abs(k1 - ms / 2) : ((k1 - ms + ms / 2 >= 0) ? ms - k1 : k1)] * fwkerhalf2[(modeord==0) ? abs(k2 - mt / 2) : ((k2 - mt + mt / 2 >= 0) ? mt - k2 : k2)]; fw[outidx].x = fk[inidx].x / kervalue; fw[outidx].y = fk[inidx].y / kervalue; } @@ -90,18 +90,18 @@ __global__ void amplify_2d(int ms, int mt, int nf1, int nf2, cuda_complex<T> *fw template <typename T> __global__ void amplify_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, cuda_complex<T> *fw, cuda_complex<T> *fk, - T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3) { + T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3, int modeord) { for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt * mu; i += blockDim.x * gridDim.x) { int k1 = i % ms; int k2 = (i / ms) % mt; int k3 = (i / ms / mt); int inidx = k1 + k2 * ms + k3 * ms * mt; - int w1 = k1 - ms / 2 >= 0 ? k1 - ms / 2 : nf1 + k1 - ms / 2; - int w2 = k2 - mt / 2 >= 0 ? k2 - mt / 2 : nf2 + k2 - mt / 2; - int w3 = k3 - mu / 2 >= 0 ? k3 - mu / 2 : nf3 + k3 - mu / 2; + int w1 = ( modeord == 0 ) ? ( (k1 - ms / 2 >= 0) ? k1 - ms / 2 : nf1 + k1 - ms / 2 ) : ( (k1 - ms + ms / 2 >= 0) ? nf1 + k1 - ms : k1 ); + int w2 = ( modeord == 0 ) ? ( (k2 - mt / 2 >= 0) ? k2 - mt / 2 : nf2 + k2 - mt / 2 ) : ( (k2 - mt + mt / 2 >= 0) ? nf2 + k2 - mt : k2 ); + int w3 = ( modeord == 0 ) ? ( (k3 - mu / 2 >= 0) ? k3 - mu / 2 : nf3 + k3 - mu / 2 ) : ( (k3 - mu + mu / 2 >= 0) ? nf3 + k3 - mu : k3 ); int outidx = w1 + w2 * nf1 + w3 * nf1 * nf2; - T kervalue = fwkerhalf1[abs(k1 - ms / 2)] * fwkerhalf2[abs(k2 - mt / 2)] * fwkerhalf3[abs(k3 - mu / 2)]; + T kervalue = fwkerhalf1[(modeord==0) ? abs(k1 - ms / 2) : ((k1 - ms + ms / 2 >= 0) ? 
ms - k1 : k1)] * fwkerhalf2[(modeord==0) ? abs(k2 - mt / 2) : ((k2 - mt + mt / 2 >= 0) ? mt - k2 : k2)] * fwkerhalf3[(modeord==0) ? abs(k3 - mu / 2) : ((k3 - mu + mu / 2 >= 0) ? mu - k3 : k3)]; fw[outidx].x = fk[inidx].x / kervalue; fw[outidx].y = fk[inidx].y / kervalue; } @@ -125,13 +125,13 @@ int cudeconvolve1d(cufinufft_plan_t<T> *d_plan, int blksize) if (d_plan->spopts.spread_direction == 1) { for (int t = 0; t < blksize; t++) { deconvolve_1d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, nf1, d_plan->fw + t * nf1, - d_plan->fk + t * nmodes, d_plan->fwkerhalf1); + d_plan->fk + t * nmodes, d_plan->fwkerhalf1, d_plan->opts.modeord); } } else { checkCudaErrors(cudaMemsetAsync(d_plan->fw, 0, maxbatchsize * nf1 * sizeof(cuda_complex<T>), stream)); for (int t = 0; t < blksize; t++) { amplify_1d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, nf1, d_plan->fw + t * nf1, - d_plan->fk + t * nmodes, d_plan->fwkerhalf1); + d_plan->fk + t * nmodes, d_plan->fwkerhalf1, d_plan->opts.modeord); } } return 0; @@ -158,14 +158,14 @@ int cudeconvolve2d(cufinufft_plan_t<T> *d_plan, int blksize) for (int t = 0; t < blksize; t++) { deconvolve_2d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, mt, nf1, nf2, d_plan->fw + t * nf1 * nf2, d_plan->fk + t * nmodes, d_plan->fwkerhalf1, - d_plan->fwkerhalf2); + d_plan->fwkerhalf2, d_plan->opts.modeord); } } else { checkCudaErrors(cudaMemsetAsync(d_plan->fw, 0, maxbatchsize * nf1 * nf2 * sizeof(cuda_complex<T>), stream)); for (int t = 0; t < blksize; t++) { amplify_2d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, mt, nf1, nf2, d_plan->fw + t * nf1 * nf2, d_plan->fk + t * nmodes, d_plan->fwkerhalf1, - d_plan->fwkerhalf2); + d_plan->fwkerhalf2, d_plan->opts.modeord); } } return 0; @@ -193,7 +193,7 @@ int cudeconvolve3d(cufinufft_plan_t<T> *d_plan, int blksize) for (int t = 0; t < blksize; t++) { deconvolve_3d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>( ms, mt, mu, nf1, nf2, nf3, d_plan->fw + t * nf1 * nf2 * nf3, d_plan->fk + t * nmodes, - d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3); + d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3, d_plan->opts.modeord); } } else { checkCudaErrors( @@ -201,7 +201,7 @@ int cudeconvolve3d(cufinufft_plan_t<T> *d_plan, int blksize) for (int t = 0; t < blksize; t++) { amplify_3d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>( ms, mt, mu, nf1, nf2, nf3, d_plan->fw + t * nf1 * nf2 * nf3, d_plan->fk + t * nmodes, - d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3); + d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3, d_plan->opts.modeord); } } return 0; From 67fa61eda2dea223c532c516c3ea554e9f97a647 Mon Sep 17 00:00:00 2001 From: Libin Lu Date: Tue, 28 May 2024 10:18:25 -0400 Subject: [PATCH 2/4] cufinufft python: improve testing of modeord option --- python/cufinufft/tests/test_basic.py | 52 ++++++++--------------------- 1 file changed, 15 insertions(+), 37 deletions(-) diff --git a/python/cufinufft/tests/test_basic.py b/python/cufinufft/tests/test_basic.py index 6d65ac85f..b66e00f18 100644 --- a/python/cufinufft/tests/test_basic.py +++ b/python/cufinufft/tests/test_basic.py @@ -14,6 +14,7 @@ TOLS = [1e-3, 1e-6] OUTPUT_ARGS = [False, True] CONTIGUOUS = [False, True] +MODEORDS = [0, 1] @pytest.mark.parametrize("dtype", DTYPES) @@ -21,7 +22,8 @@ @pytest.mark.parametrize("M", MS) @pytest.mark.parametrize("tol", TOLS) @pytest.mark.parametrize("output_arg", OUTPUT_ARGS) -def test_type1(to_gpu, to_cpu, dtype, shape, M, tol, output_arg): +@pytest.mark.parametrize("modeord", MODEORDS) +def test_type1(to_gpu, 
to_cpu, dtype, shape, M, tol, output_arg, modeord): complex_dtype = utils._complex_dtype(dtype) k, c = utils.type1_problem(dtype, shape, M) @@ -29,7 +31,7 @@ def test_type1(to_gpu, to_cpu, dtype, shape, M, tol, output_arg): k_gpu = to_gpu(k) c_gpu = to_gpu(c) - plan = Plan(1, shape, eps=tol, dtype=complex_dtype) + plan = Plan(1, shape, eps=tol, dtype=complex_dtype, modeord=modeord) # Since k_gpu is an array of shape (dim, M), this will expand to # plan.setpts(k_gpu[0], ..., k_gpu[dim]), allowing us to handle all # dimensions with the same call. plan.setpts(*k_gpu) @@ -43,38 +45,8 @@ def test_type1(to_gpu, to_cpu, dtype, shape, M, tol, output_arg): fk_gpu = plan.execute(c_gpu) fk = to_cpu(fk_gpu) - - utils.verify_type1(k, c, fk, tol) - - -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("shape", SHAPES) -@pytest.mark.parametrize("M", MS) -@pytest.mark.parametrize("tol", TOLS) -@pytest.mark.parametrize("output_arg", OUTPUT_ARGS) -def test_type1_modeord(to_gpu, to_cpu, dtype, shape, M, tol, output_arg): - complex_dtype = utils._complex_dtype(dtype) - - k, c = utils.type1_problem(dtype, shape, M) - - k_gpu = to_gpu(k) - c_gpu = to_gpu(c) - - plan = Plan(1, shape, eps=tol, dtype=complex_dtype, modeord=1) - - # Since k_gpu is an array of shape (dim, M), this will expand to - # plan.setpts(k_gpu[0], ..., k_gpu[dim]), allowing us to handle all - # dimensions with the same call. - plan.setpts(*k_gpu) - - if output_arg: - fk_gpu = _compat.array_empty_like(c_gpu, shape, dtype=complex_dtype) - plan.execute(c_gpu, out=fk_gpu) - else: - fk_gpu = plan.execute(c_gpu) - - fk = to_cpu(fk_gpu) - fk = np.fft.fftshift(fk) + if modeord == 1: + fk = np.fft.fftshift(fk) utils.verify_type1(k, c, fk, tol) @@ -85,12 +57,13 @@ def test_type1_modeord(to_gpu, to_cpu, dtype, shape, M, tol, output_arg): @pytest.mark.parametrize("tol", TOLS) @pytest.mark.parametrize("output_arg", OUTPUT_ARGS) @pytest.mark.parametrize("contiguous", CONTIGUOUS) -def test_type2(to_gpu, to_cpu, dtype, shape, M, tol, output_arg, contiguous): +@pytest.mark.parametrize("modeord", MODEORDS) +def test_type2(to_gpu, to_cpu, dtype, shape, M, tol, output_arg, contiguous, modeord): complex_dtype = utils._complex_dtype(dtype) k, fk = utils.type2_problem(dtype, shape, M) - plan = Plan(2, shape, eps=tol, dtype=complex_dtype) + plan = Plan(2, shape, eps=tol, dtype=complex_dtype, modeord=modeord) check_result = True @@ -113,7 +86,12 @@ def _execute(*args, **kwargs): return plan.execute(*args, **kwargs) k_gpu = to_gpu(k) - fk_gpu = to_gpu(fk) + + if modeord == 1: + _fk = np.fft.ifftshift(fk) + else: + _fk = fk + fk_gpu = to_gpu(_fk) plan.setpts(*k_gpu) From 63623c05028f19b00fa2e9f507842fbf1ea506cc Mon Sep 17 00:00:00 2001 From: Libin Lu Date: Wed, 29 May 2024 18:12:12 -0400 Subject: [PATCH 3/4] more human-readable code for cuda kernels in deconvolve_wrapper.cu --- src/cuda/deconvolve_wrapper.cu | 184 ++++++++++++++++++++++++++------- 1 file changed, 144 insertions(+), 40 deletions(-) diff --git a/src/cuda/deconvolve_wrapper.cu b/src/cuda/deconvolve_wrapper.cu index 5ef561b6b..f0213b60d 100644 --- a/src/cuda/deconvolve_wrapper.cu +++ b/src/cuda/deconvolve_wrapper.cu @@ -14,10 +14,21 @@ namespace deconvolve { // to N/2-1), modeord=1: FFT-compatible mode ordering in fk (from 0 to N/2-1, then -N/2 up to -1). template <typename T> __global__ void deconvolve_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1, int modeord) { + int pivot1, w1, fwkerind1; + T kervalue; + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms; i += blockDim.x * gridDim.x) { - int w1 = ( modeord == 0 ) ? 
( (i - ms / 2 >= 0) ? i - ms / 2 : nf1 + i - ms / 2 ) : ( (i - ms + ms / 2 >= 0) ? nf1 + i - ms : i ); + if (modeord == 0) { + pivot1 = i - ms / 2; + w1 = (pivot1 >= 0) ? pivot1 : nf1 + pivot1; + fwkerind1 = abs(pivot1); + } else { + pivot1 = i - ms + ms / 2; + w1 = (pivot1 >= 0) ? nf1 + i - ms : i; + fwkerind1 = (pivot1 >= 0) ? ms - i : i; + } - T kervalue = fwkerhalf1[(modeord==0) ? abs(i - ms / 2) : ((i - ms + ms / 2 >= 0) ? ms - i : i)]; + kervalue = fwkerhalf1[fwkerind1]; fk[i].x = fw[w1].x / kervalue; fk[i].y = fw[w1].y / kervalue; } @@ -26,15 +37,33 @@ __global__ void deconvolve_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> template <typename T> __global__ void deconvolve_2d(int ms, int mt, int nf1, int nf2, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, int modeord) { + int pivot1, pivot2, w1, w2, fwkerind1, fwkerind2; + int k1, k2, inidx, outidx; + T kervalue; + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt; i += blockDim.x * gridDim.x) { - int k1 = i % ms; - int k2 = i / ms; - int outidx = k1 + k2 * ms; - int w1 = ( modeord == 0 ) ? ( (k1 - ms / 2 >= 0) ? k1 - ms / 2 : nf1 + k1 - ms / 2 ) : ( (k1 - ms + ms / 2 >= 0) ? nf1 + k1 - ms : k1 ); - int w2 = ( modeord == 0 ) ? ( (k2 - mt / 2 >= 0) ? k2 - mt / 2 : nf2 + k2 - mt / 2 ) : ( (k2 - mt + mt / 2 >= 0) ? nf2 + k2 - mt : k2 ); - int inidx = w1 + w2 * nf1; - - T kervalue = fwkerhalf1[(modeord==0) ? abs(k1 - ms / 2) : ((k1 - ms + ms / 2 >= 0) ? ms - k1 : k1)] * fwkerhalf2[(modeord==0) ? abs(k2 - mt / 2) : ((k2 - mt + mt / 2 >= 0) ? mt - k2 : k2)]; + k1 = i % ms; + k2 = i / ms; + outidx = k1 + k2 * ms; + + if (modeord == 0) { + pivot1 = k1 - ms / 2; + pivot2 = k2 - mt / 2; + w1 = (pivot1 >= 0) ? pivot1 : nf1 + pivot1; + w2 = (pivot2 >= 0) ? pivot2 : nf2 + pivot2; + fwkerind1 = abs(pivot1); + fwkerind2 = abs(pivot2); + } else { + pivot1 = k1 - ms + ms / 2; + pivot2 = k2 - mt + mt / 2; + w1 = (pivot1 >= 0) ? nf1 + k1 - ms : k1; + w2 = (pivot2 >= 0) ? nf2 + k2 - mt : k2; + fwkerind1 = (pivot1 >= 0) ? ms - k1 : k1; + fwkerind2 = (pivot2 >= 0) ? mt - k2 : k2; + } + + inidx = w1 + w2 * nf1; + kervalue = fwkerhalf1[fwkerind1] * fwkerhalf2[fwkerind2]; fk[outidx].x = fw[inidx].x / kervalue; fk[outidx].y = fw[inidx].y / kervalue; } @@ -43,17 +72,40 @@ __global__ void deconvolve_2d(int ms, int mt, int nf1, int nf2, cuda_complex<T> template <typename T> __global__ void deconvolve_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3, int modeord) { + int pivot1, pivot2, pivot3, w1, w2, w3, fwkerind1, fwkerind2, fwkerind3; + int k1, k2, k3, inidx, outidx; + T kervalue; + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt * mu; i += blockDim.x * gridDim.x) { - int k1 = i % ms; - int k2 = (i / ms) % mt; - int k3 = (i / ms / mt); - int outidx = k1 + k2 * ms + k3 * ms * mt; - int w1 = ( modeord == 0 ) ? ( (k1 - ms / 2 >= 0) ? k1 - ms / 2 : nf1 + k1 - ms / 2 ) : ( (k1 - ms + ms / 2 >= 0) ? nf1 + k1 - ms : k1 ); - int w2 = ( modeord == 0 ) ? ( (k2 - mt / 2 >= 0) ? k2 - mt / 2 : nf2 + k2 - mt / 2 ) : ( (k2 - mt + mt / 2 >= 0) ? nf2 + k2 - mt : k2 ); - int w3 = ( modeord == 0 ) ? ( (k3 - mu / 2 >= 0) ? k3 - mu / 2 : nf3 + k3 - mu / 2 ) : ( (k3 - mu + mu / 2 >= 0) ? nf3 + k3 - mu : k3 ); - int inidx = w1 + w2 * nf1 + w3 * nf1 * nf2; - - T kervalue = fwkerhalf1[(modeord==0) ? abs(k1 - ms / 2) : ((k1 - ms + ms / 2 >= 0) ? ms - k1 : k1)] * fwkerhalf2[(modeord==0) ? abs(k2 - mt / 2) : ((k2 - mt + mt / 2 >= 0) ? mt - k2 : k2)] * fwkerhalf3[(modeord==0) ? 
abs(k3 - mu / 2) : ((k3 - mu + mu / 2 >= 0) ? mu - k3 : k3)]; + k1 = i % ms; + k2 = (i / ms) % mt; + k3 = (i / ms / mt); + outidx = k1 + k2 * ms + k3 * ms * mt; + + if (modeord == 0) { + pivot1 = k1 - ms / 2; + pivot2 = k2 - mt / 2; + pivot3 = k3 - mu / 2; + w1 = (pivot1 >= 0) ? pivot1 : nf1 + pivot1; + w2 = (pivot2 >= 0) ? pivot2 : nf2 + pivot2; + w3 = (pivot3 >= 0) ? pivot3 : nf3 + pivot3; + fwkerind1 = abs(pivot1); + fwkerind2 = abs(pivot2); + fwkerind3 = abs(pivot3); + } else { + pivot1 = k1 - ms + ms / 2; + pivot2 = k2 - mt + mt / 2; + pivot3 = k3 - mu + mu / 2; + w1 = (pivot1 >= 0) ? nf1 + k1 - ms : k1; + w2 = (pivot2 >= 0) ? nf2 + k2 - mt : k2; + w3 = (pivot3 >= 0) ? nf3 + k3 - mu : k3; + fwkerind1 = (pivot1 >= 0) ? ms - k1 : k1; + fwkerind2 = (pivot2 >= 0) ? mt - k2 : k2; + fwkerind3 = (pivot3 >= 0) ? mu - k3 : k3; + } + + inidx = w1 + w2 * nf1 + w3 * nf1 * nf2; + kervalue = fwkerhalf1[fwkerind1] * fwkerhalf2[fwkerind2] * fwkerhalf3[fwkerind3]; fk[outidx].x = fw[inidx].x / kervalue; fk[outidx].y = fw[inidx].y / kervalue; } @@ -62,10 +114,21 @@ __global__ void deconvolve_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, } /* Kernel for copying fk to fw with same amplification */ template <typename T> __global__ void amplify_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1, int modeord) { + int pivot1, w1, fwkerind1; + T kervalue; + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms; i += blockDim.x * gridDim.x) { - int w1 = ( modeord == 0 ) ? ( (i - ms / 2 >= 0) ? i - ms / 2 : nf1 + i - ms / 2 ) : ( (i - ms + ms / 2 >= 0) ? nf1 + i - ms : i ); + if (modeord == 0) { + pivot1 = i - ms / 2; + w1 = (pivot1 >= 0) ? pivot1 : nf1 + pivot1; + fwkerind1 = abs(pivot1); + } else { + pivot1 = i - ms + ms / 2; + w1 = (pivot1 >= 0) ? nf1 + i - ms : i; + fwkerind1 = (pivot1 >= 0) ? ms - i : i; + } - T kervalue = fwkerhalf1[(modeord==0) ? abs(i - ms / 2) : ((i - ms + ms / 2 >= 0) ? ms - i : i)]; + kervalue = fwkerhalf1[fwkerind1]; fw[w1].x = fk[i].x / kervalue; fw[w1].y = fk[i].y / kervalue; } @@ -74,15 +137,33 @@ __global__ void amplify_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> template <typename T> __global__ void amplify_2d(int ms, int mt, int nf1, int nf2, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, int modeord) { + int pivot1, pivot2, w1, w2, fwkerind1, fwkerind2; + int k1, k2, inidx, outidx; + T kervalue; + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt; i += blockDim.x * gridDim.x) { - int k1 = i % ms; - int k2 = i / ms; - int inidx = k1 + k2 * ms; - int w1 = ( modeord == 0 ) ? ( (k1 - ms / 2 >= 0) ? k1 - ms / 2 : nf1 + k1 - ms / 2 ) : ( (k1 - ms + ms / 2 >= 0) ? nf1 + k1 - ms : k1 ); - int w2 = ( modeord == 0 ) ? ( (k2 - mt / 2 >= 0) ? k2 - mt / 2 : nf2 + k2 - mt / 2 ) : ( (k2 - mt + mt / 2 >= 0) ? nf2 + k2 - mt : k2 ); - int outidx = w1 + w2 * nf1; - - T kervalue = fwkerhalf1[(modeord==0) ? abs(k1 - ms / 2) : ((k1 - ms + ms / 2 >= 0) ? ms - k1 : k1)] * fwkerhalf2[(modeord==0) ? abs(k2 - mt / 2) : ((k2 - mt + mt / 2 >= 0) ? mt - k2 : k2)]; + k1 = i % ms; + k2 = i / ms; + inidx = k1 + k2 * ms; + + if (modeord == 0) { + pivot1 = k1 - ms / 2; + pivot2 = k2 - mt / 2; + w1 = (pivot1 >= 0) ? pivot1 : nf1 + pivot1; + w2 = (pivot2 >= 0) ? pivot2 : nf2 + pivot2; + fwkerind1 = abs(pivot1); + fwkerind2 = abs(pivot2); + } else { + pivot1 = k1 - ms + ms / 2; + pivot2 = k2 - mt + mt / 2; + w1 = (pivot1 >= 0) ? nf1 + k1 - ms : k1; + w2 = (pivot2 >= 0) ? nf2 + k2 - mt : k2; + fwkerind1 = (pivot1 >= 0) ? ms - k1 : k1; + fwkerind2 = (pivot2 >= 0) ? 
mt - k2 : k2; + } + + outidx = w1 + w2 * nf1; + kervalue = fwkerhalf1[fwkerind1] * fwkerhalf2[fwkerind2]; fw[outidx].x = fk[inidx].x / kervalue; fw[outidx].y = fk[inidx].y / kervalue; } @@ -91,17 +172,40 @@ __global__ void amplify_2d(int ms, int mt, int nf1, int nf2, cuda_complex<T> *fw template <typename T> __global__ void amplify_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3, int modeord) { + int pivot1, pivot2, pivot3, w1, w2, w3, fwkerind1, fwkerind2, fwkerind3; + int k1, k2, k3, inidx, outidx; + T kervalue; + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt * mu; i += blockDim.x * gridDim.x) { - int k1 = i % ms; - int k2 = (i / ms) % mt; - int k3 = (i / ms / mt); - int inidx = k1 + k2 * ms + k3 * ms * mt; - int w1 = ( modeord == 0 ) ? ( (k1 - ms / 2 >= 0) ? k1 - ms / 2 : nf1 + k1 - ms / 2 ) : ( (k1 - ms + ms / 2 >= 0) ? nf1 + k1 - ms : k1 ); - int w2 = ( modeord == 0 ) ? ( (k2 - mt / 2 >= 0) ? k2 - mt / 2 : nf2 + k2 - mt / 2 ) : ( (k2 - mt + mt / 2 >= 0) ? nf2 + k2 - mt : k2 ); - int w3 = ( modeord == 0 ) ? ( (k3 - mu / 2 >= 0) ? k3 - mu / 2 : nf3 + k3 - mu / 2 ) : ( (k3 - mu + mu / 2 >= 0) ? nf3 + k3 - mu : k3 ); - int outidx = w1 + w2 * nf1 + w3 * nf1 * nf2; - - T kervalue = fwkerhalf1[(modeord==0) ? abs(k1 - ms / 2) : ((k1 - ms + ms / 2 >= 0) ? ms - k1 : k1)] * fwkerhalf2[(modeord==0) ? abs(k2 - mt / 2) : ((k2 - mt + mt / 2 >= 0) ? mt - k2 : k2)] * fwkerhalf3[(modeord==0) ? abs(k3 - mu / 2) : ((k3 - mu + mu / 2 >= 0) ? mu - k3 : k3)]; + k1 = i % ms; + k2 = (i / ms) % mt; + k3 = (i / ms / mt); + inidx = k1 + k2 * ms + k3 * ms * mt; + + if (modeord == 0) { + pivot1 = k1 - ms / 2; + pivot2 = k2 - mt / 2; + pivot3 = k3 - mu / 2; + w1 = (pivot1 >= 0) ? pivot1 : nf1 + pivot1; + w2 = (pivot2 >= 0) ? pivot2 : nf2 + pivot2; + w3 = (pivot3 >= 0) ? pivot3 : nf3 + pivot3; + fwkerind1 = abs(pivot1); + fwkerind2 = abs(pivot2); + fwkerind3 = abs(pivot3); + } else { + pivot1 = k1 - ms + ms / 2; + pivot2 = k2 - mt + mt / 2; + pivot3 = k3 - mu + mu / 2; + w1 = (pivot1 >= 0) ? nf1 + k1 - ms : k1; + w2 = (pivot2 >= 0) ? nf2 + k2 - mt : k2; + w3 = (pivot3 >= 0) ? nf3 + k3 - mu : k3; + fwkerind1 = (pivot1 >= 0) ? ms - k1 : k1; + fwkerind2 = (pivot2 >= 0) ? mt - k2 : k2; + fwkerind3 = (pivot3 >= 0) ? 
mu - k3 : k3; + } + + outidx = w1 + w2 * nf1 + w3 * nf1 * nf2; + kervalue = fwkerhalf1[fwkerind1] * fwkerhalf2[fwkerind2] * fwkerhalf3[fwkerind3]; fw[outidx].x = fk[inidx].x / kervalue; fw[outidx].y = fk[inidx].y / kervalue; } From bb78e88f89ec21eba34de71facc82510fcf68196 Mon Sep 17 00:00:00 2001 From: Libin Lu Date: Wed, 29 May 2024 18:23:05 -0400 Subject: [PATCH 4/4] templating modeord in cuda deconv kernels --- include/cufinufft/cudeconvolve.h | 30 ++++++------- src/cuda/1d/cufinufft1d.cu | 18 ++++++-- src/cuda/2d/cufinufft2d.cu | 18 ++++++-- src/cuda/3d/cufinufft3d.cu | 18 ++++++-- src/cuda/deconvolve_wrapper.cu | 72 +++++++++++++++++--------------- 5 files changed, 96 insertions(+), 60 deletions(-) diff --git a/include/cufinufft/cudeconvolve.h b/include/cufinufft/cudeconvolve.h index bf4c49e63..6395f16c3 100644 --- a/include/cufinufft/cudeconvolve.h +++ b/include/cufinufft/cudeconvolve.h @@ -5,29 +5,29 @@ namespace cufinufft { namespace deconvolve { -template <typename T> -__global__ void deconvolve_1d(int ms, int nf1, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1, int modeord); -template <typename T> -__global__ void amplify_1d(int ms, int nf1, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf2, int modeord); -template <typename T> +template <typename T, int modeord> +__global__ void deconvolve_1d(int ms, int nf1, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1); +template <typename T, int modeord> +__global__ void amplify_1d(int ms, int nf1, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf2); +template <typename T, int modeord> __global__ void deconvolve_2d(int ms, int mt, int nf1, int nf2, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk, - T *fwkerhalf1, T *fwkerhalf2, int modeord); -template <typename T> + T *fwkerhalf1, T *fwkerhalf2); +template <typename T, int modeord> __global__ void amplify_2d(int ms, int mt, int nf1, int nf2, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk, - T *fwkerhalf1, T *fwkerhalf2, int modeord); + T *fwkerhalf1, T *fwkerhalf2); -template <typename T> +template <typename T, int modeord> __global__ void deconvolve_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, int fw_width, cuda_complex<T> *fw, - cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3, int modeord); -template <typename T> + cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3); +template <typename T, int modeord> __global__ void amplify_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, int fw_width, cuda_complex<T> *fw, - cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3, int modeord); + cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3); -template <typename T> +template <typename T, int modeord> int cudeconvolve1d(cufinufft_plan_t<T> *d_mem, int blksize); -template <typename T> +template <typename T, int modeord> int cudeconvolve2d(cufinufft_plan_t<T> *d_mem, int blksize); -template <typename T> +template <typename T, int modeord> int cudeconvolve3d(cufinufft_plan_t<T> *d_mem, int blksize); } // namespace deconvolve } // namespace cufinufft diff --git a/src/cuda/1d/cufinufft1d.cu b/src/cuda/1d/cufinufft1d.cu index 246a064f6..62574c06b 100644 --- a/src/cuda/1d/cufinufft1d.cu +++ b/src/cuda/1d/cufinufft1d.cu @@ -59,8 +59,13 @@ int cufinufft1d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_pla return FINUFFT_ERR_CUDA_FAILURE; // Step 3: deconvolve and shuffle - if ((ier = cudeconvolve1d<T>(d_plan, blksize))) - return ier; + if (d_plan->opts.modeord == 0) { + if ((ier = cudeconvolve1d<T, 0>(d_plan, blksize))) + return ier; + } else { + if ((ier = cudeconvolve1d<T, 1>(d_plan, blksize))) + return ier; + } } return 0; @@ -95,8 +100,13 @@ int cufinufft1d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_pla d_plan->fk = d_fkstart; // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw - if ((ier = cudeconvolve1d<T>(d_plan, 
blksize))) - return ier; + if (d_plan->opts.modeord == 0) { + if ((ier = cudeconvolve1d<T, 0>(d_plan, blksize))) + return ier; + } else { + if ((ier = cudeconvolve1d<T, 1>(d_plan, blksize))) + return ier; + } // Step 2: FFT cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); diff --git a/src/cuda/2d/cufinufft2d.cu b/src/cuda/2d/cufinufft2d.cu index b566f49ce..3cd85281f 100644 --- a/src/cuda/2d/cufinufft2d.cu +++ b/src/cuda/2d/cufinufft2d.cu @@ -59,8 +59,13 @@ int cufinufft2d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_pla return FINUFFT_ERR_CUDA_FAILURE; // Step 3: deconvolve and shuffle - if ((ier = cudeconvolve2d<T>(d_plan, blksize))) - return ier; + if (d_plan->opts.modeord == 0) { + if ((ier = cudeconvolve2d<T, 0>(d_plan, blksize))) + return ier; + } else { + if ((ier = cudeconvolve2d<T, 1>(d_plan, blksize))) + return ier; + } } return 0; @@ -95,8 +100,13 @@ int cufinufft2d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_pla d_plan->fk = d_fkstart; // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw - if ((ier = cudeconvolve2d<T>(d_plan, blksize))) - return ier; + if (d_plan->opts.modeord == 0) { + if ((ier = cudeconvolve2d<T, 0>(d_plan, blksize))) + return ier; + } else { + if ((ier = cudeconvolve2d<T, 1>(d_plan, blksize))) + return ier; + } // Step 2: FFT cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); diff --git a/src/cuda/3d/cufinufft3d.cu b/src/cuda/3d/cufinufft3d.cu index fa02ef860..c1209039a 100644 --- a/src/cuda/3d/cufinufft3d.cu +++ b/src/cuda/3d/cufinufft3d.cu @@ -57,8 +57,13 @@ int cufinufft3d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_pla return FINUFFT_ERR_CUDA_FAILURE; // Step 3: deconvolve and shuffle - if ((ier = cudeconvolve3d<T>(d_plan, blksize))) - return ier; + if (d_plan->opts.modeord == 0) { + if ((ier = cudeconvolve3d<T, 0>(d_plan, blksize))) + return ier; + } else { + if ((ier = cudeconvolve3d<T, 1>(d_plan, blksize))) + return ier; + } } return 0; @@ -91,8 +96,13 @@ int cufinufft3d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_pla d_plan->fk = d_fkstart; // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw - if ((ier = cudeconvolve3d<T>(d_plan, blksize))) - return ier; + if (d_plan->opts.modeord == 0) { + if ((ier = cudeconvolve3d<T, 0>(d_plan, blksize))) + return ier; + } else { + if ((ier = cudeconvolve3d<T, 1>(d_plan, blksize))) + return ier; + } // Step 2: FFT RETURN_IF_CUDA_ERROR diff --git a/src/cuda/deconvolve_wrapper.cu b/src/cuda/deconvolve_wrapper.cu index f0213b60d..75d8adda3 100644 --- a/src/cuda/deconvolve_wrapper.cu +++ b/src/cuda/deconvolve_wrapper.cu @@ -12,8 +12,8 @@ namespace deconvolve { /* Kernel for copying fw to fk with amplification by prefac/ker */ // Note: assume modeord=0: CMCL-compatible mode ordering in fk (from -N/2 up // to N/2-1), modeord=1: FFT-compatible mode ordering in fk (from 0 to N/2-1, then -N/2 up to -1). 
-template <typename T> -__global__ void deconvolve_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1, int modeord) { +template <typename T, int modeord> +__global__ void deconvolve_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1) { int pivot1, w1, fwkerind1; T kervalue; @@ -34,9 +34,9 @@ __global__ void deconvolve_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> } } -template <typename T> +template <typename T, int modeord> __global__ void deconvolve_2d(int ms, int mt, int nf1, int nf2, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1, - T *fwkerhalf2, int modeord) { + T *fwkerhalf2) { int pivot1, pivot2, w1, w2, fwkerind1, fwkerind2; int k1, k2, inidx, outidx; T kervalue; @@ -69,9 +69,9 @@ __global__ void deconvolve_2d(int ms, int mt, int nf1, int nf2, cuda_complex<T> } } -template <typename T> +template <typename T, int modeord> __global__ void deconvolve_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, cuda_complex<T> *fw, - cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3, int modeord) { + cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3) { int pivot1, pivot2, pivot3, w1, w2, w3, fwkerind1, fwkerind2, fwkerind3; int k1, k2, k3, inidx, outidx; T kervalue; @@ -112,8 +112,8 @@ __global__ void deconvolve_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, } /* Kernel for copying fk to fw with same amplification */ -template <typename T> -__global__ void amplify_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1, int modeord) { +template <typename T, int modeord> +__global__ void amplify_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1) { int pivot1, w1, fwkerind1; T kervalue; @@ -134,9 +134,9 @@ __global__ void amplify_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> } } -template <typename T> +template <typename T, int modeord> __global__ void amplify_2d(int ms, int mt, int nf1, int nf2, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1, - T *fwkerhalf2, int modeord) { + T *fwkerhalf2) { int pivot1, pivot2, w1, w2, fwkerind1, fwkerind2; int k1, k2, inidx, outidx; T kervalue; @@ -169,9 +169,9 @@ __global__ void amplify_2d(int ms, int mt, int nf1, int nf2, cuda_complex<T> *fw } } -template <typename T> +template <typename T, int modeord> __global__ void amplify_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, cuda_complex<T> *fw, cuda_complex<T> *fk, - T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3, int modeord) { + T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3) { int pivot1, pivot2, pivot3, w1, w2, w3, fwkerind1, fwkerind2, fwkerind3; int k1, k2, k3, inidx, outidx; T kervalue; @@ -211,7 +211,7 @@ __global__ void amplify_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, cu } } -template <typename T> +template <typename T, int modeord> int cudeconvolve1d(cufinufft_plan_t<T> *d_plan, int blksize) /* wrapper for deconvolution & amplification in 1D. 
@@ -228,20 +228,20 @@ int cudeconvolve1d(cufinufft_plan_t<T> *d_plan, int blksize) if (d_plan->spopts.spread_direction == 1) { for (int t = 0; t < blksize; t++) { - deconvolve_1d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, nf1, d_plan->fw + t * nf1, - d_plan->fk + t * nmodes, d_plan->fwkerhalf1, d_plan->opts.modeord); + deconvolve_1d<T, modeord><<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, nf1, d_plan->fw + t * nf1, + d_plan->fk + t * nmodes, d_plan->fwkerhalf1); } } else { checkCudaErrors(cudaMemsetAsync(d_plan->fw, 0, maxbatchsize * nf1 * sizeof(cuda_complex<T>), stream)); for (int t = 0; t < blksize; t++) { - amplify_1d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, nf1, d_plan->fw + t * nf1, - d_plan->fk + t * nmodes, d_plan->fwkerhalf1, d_plan->opts.modeord); + amplify_1d<T, modeord><<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, nf1, d_plan->fw + t * nf1, + d_plan->fk + t * nmodes, d_plan->fwkerhalf1); } } return 0; } -template <typename T> +template <typename T, int modeord> int cudeconvolve2d(cufinufft_plan_t<T> *d_plan, int blksize) /* wrapper for deconvolution & amplification in 2D. @@ -260,22 +260,22 @@ int cudeconvolve2d(cufinufft_plan_t<T> *d_plan, int blksize) if (d_plan->spopts.spread_direction == 1) { for (int t = 0; t < blksize; t++) { - deconvolve_2d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, mt, nf1, nf2, d_plan->fw + t * nf1 * nf2, + deconvolve_2d<T, modeord><<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, mt, nf1, nf2, d_plan->fw + t * nf1 * nf2, d_plan->fk + t * nmodes, d_plan->fwkerhalf1, - d_plan->fwkerhalf2, d_plan->opts.modeord); + d_plan->fwkerhalf2); } } else { checkCudaErrors(cudaMemsetAsync(d_plan->fw, 0, maxbatchsize * nf1 * nf2 * sizeof(cuda_complex<T>), stream)); for (int t = 0; t < blksize; t++) { - amplify_2d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, mt, nf1, nf2, d_plan->fw + t * nf1 * nf2, + amplify_2d<T, modeord><<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, mt, nf1, nf2, d_plan->fw + t * nf1 * nf2, d_plan->fk + t * nmodes, d_plan->fwkerhalf1, - d_plan->fwkerhalf2, d_plan->opts.modeord); + d_plan->fwkerhalf2); } } return 0; } -template <typename T> +template <typename T, int modeord> int cudeconvolve3d(cufinufft_plan_t<T> *d_plan, int blksize) /* wrapper for deconvolution & amplification in 3D. 
@@ -295,28 +295,34 @@ int cudeconvolve3d(cufinufft_plan_t<T> *d_plan, int blksize) int maxbatchsize = d_plan->maxbatchsize; if (d_plan->spopts.spread_direction == 1) { for (int t = 0; t < blksize; t++) { - deconvolve_3d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>( + deconvolve_3d<T, modeord><<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>( ms, mt, mu, nf1, nf2, nf3, d_plan->fw + t * nf1 * nf2 * nf3, d_plan->fk + t * nmodes, - d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3, d_plan->opts.modeord); + d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3); } } else { checkCudaErrors( cudaMemsetAsync(d_plan->fw, 0, maxbatchsize * nf1 * nf2 * nf3 * sizeof(cuda_complex<T>), stream)); for (int t = 0; t < blksize; t++) { - amplify_3d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>( + amplify_3d<T, modeord><<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>( ms, mt, mu, nf1, nf2, nf3, d_plan->fw + t * nf1 * nf2 * nf3, d_plan->fk + t * nmodes, - d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3, d_plan->opts.modeord); + d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3); } } return 0; } -template int cudeconvolve1d<float>(cufinufft_plan_t<float> *d_plan, int blksize); -template int cudeconvolve1d<double>(cufinufft_plan_t<double> *d_plan, int blksize); +template int cudeconvolve1d<float, 0>(cufinufft_plan_t<float> *d_plan, int blksize); +template int cudeconvolve1d<float, 1>(cufinufft_plan_t<float> *d_plan, int blksize); +template int cudeconvolve1d<double, 0>(cufinufft_plan_t<double> *d_plan, int blksize); +template int cudeconvolve1d<double, 1>(cufinufft_plan_t<double> *d_plan, int blksize); -template int cudeconvolve2d<float>(cufinufft_plan_t<float> *d_plan, int blksize); -template int cudeconvolve2d<double>(cufinufft_plan_t<double> *d_plan, int blksize); +template int cudeconvolve2d<float, 0>(cufinufft_plan_t<float> *d_plan, int blksize); +template int cudeconvolve2d<float, 1>(cufinufft_plan_t<float> *d_plan, int blksize); +template int cudeconvolve2d<double, 0>(cufinufft_plan_t<double> *d_plan, int blksize); +template int cudeconvolve2d<double, 1>(cufinufft_plan_t<double> *d_plan, int blksize); -template int cudeconvolve3d<float>(cufinufft_plan_t<float> *d_plan, int blksize); -template int cudeconvolve3d<double>(cufinufft_plan_t<double> *d_plan, int blksize); +template int cudeconvolve3d<float, 0>(cufinufft_plan_t<float> *d_plan, int blksize); +template int cudeconvolve3d<float, 1>(cufinufft_plan_t<float> *d_plan, int blksize); +template int cudeconvolve3d<double, 0>(cufinufft_plan_t<double> *d_plan, int blksize); +template int cudeconvolve3d<double, 1>(cufinufft_plan_t<double> *d_plan, int blksize); } // namespace deconvolve } // namespace cufinufft
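Note for reviewers: the two orderings only permute which fine-grid sample and which half-kernel weight each output mode reads, which is why the kernels need no extra memory and no FFT changes. The index arithmetic can be sanity-checked on the host; the following standalone Python sketch (not part of the patches; mode_indices is a hypothetical helper that mirrors the 1D kernel logic above, assuming only NumPy) confirms that the modeord=0 layout is exactly the fftshift of the modeord=1 layout, for both odd and even mode counts:

import numpy as np

def mode_indices(ms, nf1, modeord):
    # For each output mode index i (ms modes, fine grid of size nf1), return
    # the fine-grid index w and the index into the half-kernel table
    # fwkerhalf, mirroring deconvolve_1d above.
    w = np.empty(ms, dtype=int)
    kidx = np.empty(ms, dtype=int)
    for i in range(ms):
        if modeord == 0:            # CMCL: frequencies -ms//2 .. (ms-1)//2
            pivot = i - ms // 2
            w[i] = pivot if pivot >= 0 else nf1 + pivot
            kidx[i] = abs(pivot)
        else:                       # FFT: 0 .. (ms - ms//2 - 1), then negative half
            pivot = i - ms + ms // 2
            if pivot >= 0:          # negative frequency i - ms
                w[i] = nf1 + i - ms
                kidx[i] = ms - i
            else:                   # nonnegative frequency i
                w[i] = i
                kidx[i] = i
    return w, kidx

for ms in (5, 6):                   # odd and even numbers of modes
    w0, k0 = mode_indices(ms, 16, 0)
    w1, k1 = mode_indices(ms, 16, 1)
    assert np.array_equal(np.fft.fftshift(w1), w0)
    assert np.array_equal(np.fft.fftshift(k1), k0)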
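From Python the new option is just a plan keyword argument, as exercised by the tests above. A minimal usage sketch (an illustration, not a test from this series; it assumes a CUDA GPU, CuPy, and a cufinufft build containing these patches) showing that the two orderings agree up to an fftshift for a 2D type-1 transform:

import numpy as np
import cupy as cp
from cufinufft import Plan

M, shape = 4096, (16, 16)
rng = np.random.default_rng(0)
# nonuniform points (float32 to match a complex64 plan) and strengths
k = cp.asarray(rng.uniform(-np.pi, np.pi, size=(2, M)).astype(np.float32))
c = cp.asarray((rng.standard_normal(M) + 1j * rng.standard_normal(M)).astype(np.complex64))

fk = {}
for modeord in (0, 1):
    plan = Plan(1, shape, eps=1e-6, dtype=np.complex64, modeord=modeord)
    plan.setpts(*k)                 # expands to setpts(k[0], k[1])
    fk[modeord] = plan.execute(c)

# recentering the FFT-style output reproduces the CMCL-style output
assert cp.allclose(cp.fft.fftshift(fk[1]), fk[0])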