From 338b5e26b231dc512039de303ea54755a2e48c3e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Joakim=20And=C3=A9n?= <janden@kth.se>
Date: Thu, 28 Dec 2023 00:16:37 +0100
Subject: [PATCH] cuda: update interface to take 64-bit M

For compatibility with FINUFFT.
---
 include/cufinufft.h              |  4 ++--
 include/cufinufft/impl.h         |  2 +-
 include/cufinufft/spreadinterp.h | 38 ++++++++++++++++----------------
 include/cufinufft/types.h        |  2 +-
 src/cuda/1d/interp1d_wrapper.cu  |  4 ++--
 src/cuda/1d/spread1d_wrapper.cu  | 18 +++++++--------
 src/cuda/1d/spreadinterp1d.cuh   | 18 +++++++--------
 src/cuda/2d/interp2d_wrapper.cu  |  6 ++---
 src/cuda/2d/spread2d_wrapper.cu  | 18 +++++++--------
 src/cuda/2d/spreadinterp2d.cuh   | 20 ++++++++---------
 src/cuda/3d/interp3d_wrapper.cu  | 14 ++++++------
 src/cuda/3d/spreadinterp3d.cuh   | 30 ++++++++++++-------------
 src/cuda/cufinufft.cu            |  4 ++--
 13 files changed, 89 insertions(+), 89 deletions(-)
diff --git a/include/cufinufft.h b/include/cufinufft.h
index 3c498fed0..923fe1678 100644
--- a/include/cufinufft.h
+++ b/include/cufinufft.h
@@ -19,9 +19,9 @@ int cufinufft_makeplan(int type, int dim, const int64_t *n_modes, int iflag, int
 int cufinufftf_makeplan(int type, int dim, const int64_t *n_modes, int iflag, int ntr, float eps,
                         cufinufftf_plan *d_plan_ptr, cufinufft_opts *opts);
 
-int cufinufft_setpts(cufinufft_plan d_plan, int M, double *d_x, double *d_y, double *d_z, int N, double *d_s,
+int cufinufft_setpts(cufinufft_plan d_plan, int64_t M, double *d_x, double *d_y, double *d_z, int N, double *d_s,
                      double *d_t, double *d_u);
-int cufinufftf_setpts(cufinufftf_plan d_plan, int M, float *d_x, float *d_y, float *d_z, int N, float *d_s,
+int cufinufftf_setpts(cufinufftf_plan d_plan, int64_t M, float *d_x, float *d_y, float *d_z, int N, float *d_s,
                       float *d_t, float *d_u);
 
 int cufinufft_execute(cufinufft_plan d_plan, cuDoubleComplex *d_c, cuDoubleComplex *d_fk);
diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h
index 34b969b46..680899250 100644
--- a/include/cufinufft/impl.h
+++ b/include/cufinufft/impl.h
@@ -264,7 +264,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
 }
 
 template <typename T>
-int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_t, T *d_u, cufinufft_plan_t<T> *d_plan)
+int cufinufft_setpts_impl(int64_t M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_t, T *d_u, cufinufft_plan_t<T> *d_plan)
 /*
     "setNUpts" stage (in single or double precision).
 
diff --git a/include/cufinufft/spreadinterp.h b/include/cufinufft/spreadinterp.h
index 7849579a7..bbe027baa 100644
--- a/include/cufinufft/spreadinterp.h
+++ b/include/cufinufft/spreadinterp.h
@@ -88,46 +88,46 @@ int cuinterp3d(cufinufft_plan_t<T> *d_plan, int blksize);
 
 // Wrappers for methods of spreading
 template <typename T>
-int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t<T> *d_plan);
+int cuspread1d_nuptsdriven_prop(int nf1, int64_t M, cufinufft_plan_t<T> *d_plan);
 template <typename T>
-int cuspread1d_nuptsdriven(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize);
+int cuspread1d_nuptsdriven(int nf1, int64_t M, cufinufft_plan_t<T> *d_plan, int blksize);
 template <typename T>
-int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t<T> *d_plan);
+int cuspread1d_subprob_prop(int nf1, int64_t M, cufinufft_plan_t<T> *d_plan);
 template <typename T>
-int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize);
+int cuspread1d_subprob(int nf1, int64_t M, cufinufft_plan_t<T> *d_plan, int blksize);
 
 template <typename T>
-int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan);
+int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int64_t M, cufinufft_plan_t<T> *d_plan);
 template <typename T>
-int cuspread2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan, int blksize);
+int cuspread2d_nuptsdriven(int nf1, int nf2, int64_t M, cufinufft_plan_t<T> *d_plan, int blksize);
 template <typename T>
-int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan);
+int cuspread2d_subprob_prop(int nf1, int nf2, int64_t M, cufinufft_plan_t<T> *d_plan);
 template <typename T>
-int cuspread2d_subprob(int nf1, int nf2, int m, cufinufft_plan_t<T> *d_plan, int blksize);
+int cuspread2d_subprob(int nf1, int nf2, int64_t m, cufinufft_plan_t<T> *d_plan, int blksize);
 template <typename T>
-int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan);
+int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int64_t M, cufinufft_plan_t<T> *d_plan);
 template <typename T>
-int cuspread3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan, int blksize);
+int cuspread3d_nuptsdriven(int nf1, int nf2, int nf3, int64_t M, cufinufft_plan_t<T> *d_plan, int blksize);
 template <typename T>
-int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan);
+int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int64_t M, cufinufft_plan_t<T> *d_plan);
 template <typename T>
-int cuspread3d_blockgather(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan, int blksize);
+int cuspread3d_blockgather(int nf1, int nf2, int nf3, int64_t M, cufinufft_plan_t<T> *d_plan, int blksize);
 template <typename T>
-int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan);
+int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int64_t M, cufinufft_plan_t<T> *d_plan);
 template <typename T>
-int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan, int blksize);
+int cuspread3d_subprob(int nf1, int nf2, int nf3, int64_t M, cufinufft_plan_t<T> *d_plan, int blksize);
 
 // Wrappers for methods of interpolation
 template <typename T>
-int cuinterp1d_nuptsdriven(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize);
+int cuinterp1d_nuptsdriven(int nf1, int64_t M, cufinufft_plan_t<T> *d_plan, int blksize);
 template <typename T>
-int cuinterp2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan, int blksize);
+int cuinterp2d_nuptsdriven(int nf1, int nf2, int64_t M, cufinufft_plan_t<T> *d_plan, int blksize);
 template <typename T>
-int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan, int blksize);
+int cuinterp2d_subprob(int nf1, int nf2, int64_t M, cufinufft_plan_t<T> *d_plan, int blksize);
 template <typename T>
-int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan, int blksize);
+int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int64_t M, cufinufft_plan_t<T> *d_plan, int blksize);
 template <typename T>
-int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan, int blksize);
+int cuinterp3d_subprob(int nf1, int nf2, int nf3, int64_t M, cufinufft_plan_t<T> *d_plan, int blksize);
 
 } // namespace spreadinterp
 } // namespace cufinufft
diff --git a/include/cufinufft/types.h b/include/cufinufft/types.h
index 246b4aaa1..d8254b024 100644
--- a/include/cufinufft/types.h
+++ b/include/cufinufft/types.h
@@ -34,7 +34,7 @@ struct cufinufft_plan_t {
 
     int type;
     int dim;
-    CUFINUFFT_BIGINT M;
+    int64_t M;
     CUFINUFFT_BIGINT nf1;
     CUFINUFFT_BIGINT nf2;
     CUFINUFFT_BIGINT nf3;
diff --git a/src/cuda/1d/interp1d_wrapper.cu b/src/cuda/1d/interp1d_wrapper.cu
index 9992f1936..4cb1c5cdf 100644
--- a/src/cuda/1d/interp1d_wrapper.cu
+++ b/src/cuda/1d/interp1d_wrapper.cu
@@ -27,7 +27,7 @@ int cuinterp1d(cufinufft_plan_t<T> *d_plan, int blksize)
 */
 {
     int nf1 = d_plan->nf1;
-    int M = d_plan->M;
+    int64_t M = d_plan->M;
 
     int ier;
     switch (d_plan->opts.gpu_method) {
@@ -43,7 +43,7 @@ int cuinterp1d(cufinufft_plan_t<T> *d_plan, int blksize)
 }
 
 template <typename T>
-int cuinterp1d_nuptsdriven(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize) {
+int cuinterp1d_nuptsdriven(int nf1, int64_t M, cufinufft_plan_t<T> *d_plan, int blksize) {
     auto &stream = d_plan->stream;
     dim3 threadsPerBlock;
     dim3 blocks;
diff --git a/src/cuda/1d/spread1d_wrapper.cu b/src/cuda/1d/spread1d_wrapper.cu
index 19101603f..ab6b22580 100644
--- a/src/cuda/1d/spread1d_wrapper.cu
+++ b/src/cuda/1d/spread1d_wrapper.cu
@@ -32,7 +32,7 @@ int cuspread1d(cufinufft_plan_t<T> *d_plan, int blksize)
 */
 {
     int nf1 = d_plan->nf1;
-    int M = d_plan->M;
+    int64_t M = d_plan->M;
 
     int ier;
     switch (d_plan->opts.gpu_method) {
@@ -51,7 +51,7 @@ int cuspread1d(cufinufft_plan_t<T> *d_plan, int blksize)
 }
 
 template <typename T>
-int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t<T> *d_plan) {
+int cuspread1d_nuptsdriven_prop(int nf1, int64_t M, cufinufft_plan_t<T> *d_plan) {
     auto &stream = d_plan->stream;
 
     if (d_plan->opts.gpu_sort) {
@@ -96,7 +96,7 @@ int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t<T> *d_plan) {
 }
 
 template <typename T>
-int cuspread1d_nuptsdriven(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize) {
+int cuspread1d_nuptsdriven(int nf1, int64_t M, cufinufft_plan_t<T> *d_plan, int blksize) {
     auto &stream = d_plan->stream;
     dim3 threadsPerBlock;
     dim3 blocks;
@@ -135,7 +135,7 @@ int cuspread1d_nuptsdriven(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blks
 }
 
 template <typename T>
-int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t<T> *d_plan)
+int cuspread1d_subprob_prop(int nf1, int64_t M, cufinufft_plan_t<T> *d_plan)
 /*
     This function determines the properties for spreading that are independent
     of the strength of the nodes,  only relates to the locations of the nodes,
@@ -217,7 +217,7 @@ int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t<T> *d_plan)
 }
 
 template <typename T>
-int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize) {
+int cuspread1d_subprob(int nf1, int64_t M, cufinufft_plan_t<T> *d_plan, int blksize) {
     auto &stream = d_plan->stream;
 
     int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells
@@ -275,10 +275,10 @@ int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize)
 
 template int cuspread1d<float>(cufinufft_plan_t<float> *d_plan, int blksize);
 template int cuspread1d<double>(cufinufft_plan_t<double> *d_plan, int blksize);
-template int cuspread1d_nuptsdriven_prop<float>(int nf1, int M, cufinufft_plan_t<float> *d_plan);
-template int cuspread1d_nuptsdriven_prop<double>(int nf1, int M, cufinufft_plan_t<double> *d_plan);
-template int cuspread1d_subprob_prop<float>(int nf1, int M, cufinufft_plan_t<float> *d_plan);
-template int cuspread1d_subprob_prop<double>(int nf1, int M, cufinufft_plan_t<double> *d_plan);
+template int cuspread1d_nuptsdriven_prop<float>(int nf1, int64_t M, cufinufft_plan_t<float> *d_plan);
+template int cuspread1d_nuptsdriven_prop<double>(int nf1, int64_t M, cufinufft_plan_t<double> *d_plan);
+template int cuspread1d_subprob_prop<float>(int nf1, int64_t M, cufinufft_plan_t<float> *d_plan);
+template int cuspread1d_subprob_prop<double>(int nf1, int64_t M, cufinufft_plan_t<double> *d_plan);
 
 } // namespace spreadinterp
 } // namespace cufinufft
diff --git a/src/cuda/1d/spreadinterp1d.cuh b/src/cuda/1d/spreadinterp1d.cuh
index 77ecf9a43..5a8eebb94 100644
--- a/src/cuda/1d/spreadinterp1d.cuh
+++ b/src/cuda/1d/spreadinterp1d.cuh
@@ -16,14 +16,14 @@ namespace spreadinterp {
 /* Kernels for NUptsdriven Method */
 
 template <typename T, int KEREVALMETH>
-__global__ void spread_1d_nuptsdriven(const T *x, const cuda_complex<T> *c, cuda_complex<T> *fw, int M, int ns, int nf1,
+__global__ void spread_1d_nuptsdriven(const T *x, const cuda_complex<T> *c, cuda_complex<T> *fw, int64_t M, int ns, int nf1,
                                       T es_c, T es_beta, T sigma, const int *idxnupts, int pirange) {
     int xx, ix;
     T ker1[MAX_NSPREAD];
 
     T x_rescaled;
     cuda_complex<T> cnow;
-    for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) {
+    for (int64_t i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) {
         x_rescaled = RESCALE(x[idxnupts[i]], nf1, pirange);
         cnow = c[idxnupts[i]];
         int xstart = ceil(x_rescaled - ns / 2.0);
@@ -47,12 +47,12 @@ __global__ void spread_1d_nuptsdriven(const T *x, const cuda_complex<T> *c, cuda
 /* Kernels for SubProb Method */
 // SubProb properties
 template <typename T>
-__global__ void calc_bin_size_noghost_1d(int M, int nf1, int bin_size_x, int nbinx, int *bin_size, const T *x,
+__global__ void calc_bin_size_noghost_1d(int64_t M, int nf1, int bin_size_x, int nbinx, int *bin_size, const T *x,
                                          int *sortidx, int pirange) {
     int binx;
     int oldidx;
     T x_rescaled;
-    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) {
+    for (int64_t i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) {
         x_rescaled = RESCALE(x[i], nf1, pirange);
         binx = floor(x_rescaled / bin_size_x);
         binx = binx >= nbinx ? binx - 1 : binx;
@@ -66,11 +66,11 @@ __global__ void calc_bin_size_noghost_1d(int M, int nf1, int bin_size_x, int nbi
 }
 
 template <typename T>
-__global__ void calc_inverse_of_global_sort_idx_1d(int M, int bin_size_x, int nbinx, const int *bin_startpts,
+__global__ void calc_inverse_of_global_sort_idx_1d(int64_t M, int bin_size_x, int nbinx, const int *bin_startpts,
                                                    const int *sortidx, const T *x, int *index, int pirange, int nf1) {
     int binx;
     T x_rescaled;
-    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) {
+    for (int64_t i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) {
         x_rescaled = RESCALE(x[i], nf1, pirange);
         binx = floor(x_rescaled / bin_size_x);
         binx = binx >= nbinx ? binx - 1 : binx;
@@ -81,7 +81,7 @@ __global__ void calc_inverse_of_global_sort_idx_1d(int M, int bin_size_x, int nb
 }
 
 template <typename T, int KEREVALMETH>
-__global__ void spread_1d_subprob(const T *x, const cuda_complex<T> *c, cuda_complex<T> *fw, int M, int ns, int nf1,
+__global__ void spread_1d_subprob(const T *x, const cuda_complex<T> *c, cuda_complex<T> *fw, int64_t M, int ns, int nf1,
                                   T es_c, T es_beta, T sigma, const int *binstartpts, const int *bin_size,
                                   int bin_size_x, const int *subprob_to_bin, const int *subprobstartpts,
                                   const int *numsubprob, int maxsubprobsize, int nbinx, const int *idxnupts,
@@ -147,10 +147,10 @@ __global__ void spread_1d_subprob(const T *x, const cuda_complex<T> *c, cuda_com
 /* --------------------- 1d Interpolation Kernels ----------------------------*/
 /* Kernels for NUptsdriven Method */
 template <typename T, int KEREVALMETH>
-__global__ void interp_1d_nuptsdriven(const T *x, cuda_complex<T> *c, const cuda_complex<T> *fw, int M, int ns, int nf1,
+__global__ void interp_1d_nuptsdriven(const T *x, cuda_complex<T> *c, const cuda_complex<T> *fw, int64_t M, int ns, int nf1,
                                       T es_c, T es_beta, T sigma, const int *idxnupts, int pirange) {
     T ker1[MAX_NSPREAD];
-    for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) {
+    for (int64_t i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) {
         T x_rescaled = RESCALE(x[idxnupts[i]], nf1, pirange);
 
         int xstart = ceil(x_rescaled - ns / 2.0);
diff --git a/src/cuda/2d/interp2d_wrapper.cu b/src/cuda/2d/interp2d_wrapper.cu
index fa6fecbaa..ab58996ad 100644
--- a/src/cuda/2d/interp2d_wrapper.cu
+++ b/src/cuda/2d/interp2d_wrapper.cu
@@ -28,7 +28,7 @@ int cuinterp2d(cufinufft_plan_t<T> *d_plan, int blksize)
 {
     int nf1 = d_plan->nf1;
     int nf2 = d_plan->nf2;
-    int M = d_plan->M;
+    int64_t M = d_plan->M;
 
     int ier;
     switch (d_plan->opts.gpu_method) {
@@ -47,7 +47,7 @@ int cuinterp2d(cufinufft_plan_t<T> *d_plan, int blksize)
 }
 
 template <typename T>
-int cuinterp2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan, int blksize) {
+int cuinterp2d_nuptsdriven(int nf1, int nf2, int64_t M, cufinufft_plan_t<T> *d_plan, int blksize) {
     auto &stream = d_plan->stream;
 
     dim3 threadsPerBlock;
@@ -90,7 +90,7 @@ int cuinterp2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan,
 }
 
 template <typename T>
-int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan, int blksize) {
+int cuinterp2d_subprob(int nf1, int nf2, int64_t M, cufinufft_plan_t<T> *d_plan, int blksize) {
     auto &stream = d_plan->stream;
 
     int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells
diff --git a/src/cuda/2d/spread2d_wrapper.cu b/src/cuda/2d/spread2d_wrapper.cu
index 79c33ad10..cc216ff9e 100644
--- a/src/cuda/2d/spread2d_wrapper.cu
+++ b/src/cuda/2d/spread2d_wrapper.cu
@@ -33,7 +33,7 @@ int cuspread2d(cufinufft_plan_t<T> *d_plan, int blksize)
 {
     int nf1 = d_plan->nf1;
     int nf2 = d_plan->nf2;
-    int M = d_plan->M;
+    int64_t M = d_plan->M;
 
     int ier;
     switch (d_plan->opts.gpu_method) {
@@ -52,7 +52,7 @@ int cuspread2d(cufinufft_plan_t<T> *d_plan, int blksize)
 }
 
 template <typename T>
-int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan) {
+int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int64_t M, cufinufft_plan_t<T> *d_plan) {
     auto &stream = d_plan->stream;
 
     if (d_plan->opts.gpu_sort) {
@@ -105,7 +105,7 @@ int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_
 }
 
 template <typename T>
-int cuspread2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan, int blksize) {
+int cuspread2d_nuptsdriven(int nf1, int nf2, int64_t M, cufinufft_plan_t<T> *d_plan, int blksize) {
     auto &stream = d_plan->stream;
     dim3 threadsPerBlock;
     dim3 blocks;
@@ -146,7 +146,7 @@ int cuspread2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan,
 }
 
 template <typename T>
-int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan)
+int cuspread2d_subprob_prop(int nf1, int nf2, int64_t M, cufinufft_plan_t<T> *d_plan)
 /*
     This function determines the properties for spreading that are independent
     of the strength of the nodes,  only relates to the locations of the nodes,
@@ -233,7 +233,7 @@ int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan
 }
 
 template <typename T>
-int cuspread2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan, int blksize) {
+int cuspread2d_subprob(int nf1, int nf2, int64_t M, cufinufft_plan_t<T> *d_plan, int blksize) {
     auto &stream = d_plan->stream;
 
     int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells
@@ -296,10 +296,10 @@ int cuspread2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan, int
 
 template int cuspread2d<float>(cufinufft_plan_t<float> *d_plan, int blksize);
 template int cuspread2d<double>(cufinufft_plan_t<double> *d_plan, int blksize);
-template int cuspread2d_subprob_prop<float>(int nf1, int nf2, int M, cufinufft_plan_t<float> *d_plan);
-template int cuspread2d_subprob_prop<double>(int nf1, int nf2, int M, cufinufft_plan_t<double> *d_plan);
-template int cuspread2d_nuptsdriven_prop<float>(int nf1, int nf2, int M, cufinufft_plan_t<float> *d_plan);
-template int cuspread2d_nuptsdriven_prop<double>(int nf1, int nf2, int M, cufinufft_plan_t<double> *d_plan);
+template int cuspread2d_subprob_prop<float>(int nf1, int nf2, int64_t M, cufinufft_plan_t<float> *d_plan);
+template int cuspread2d_subprob_prop<double>(int nf1, int nf2, int64_t M, cufinufft_plan_t<double> *d_plan);
+template int cuspread2d_nuptsdriven_prop<float>(int nf1, int nf2, int64_t M, cufinufft_plan_t<float> *d_plan);
+template int cuspread2d_nuptsdriven_prop<double>(int nf1, int nf2, int64_t M, cufinufft_plan_t<double> *d_plan);
 
 } // namespace spreadinterp
 } // namespace cufinufft
diff --git a/src/cuda/2d/spreadinterp2d.cuh b/src/cuda/2d/spreadinterp2d.cuh
index f6f3341d3..fc82115e0 100644
--- a/src/cuda/2d/spreadinterp2d.cuh
+++ b/src/cuda/2d/spreadinterp2d.cuh
@@ -16,7 +16,7 @@ namespace spreadinterp {
 /* Kernels for NUptsdriven Method */
 
 template <typename T, int KEREVALMETH>
-__global__ void spread_2d_nupts_driven(const T *x, const T *y, const cuda_complex<T> *c, cuda_complex<T> *fw, int M,
+__global__ void spread_2d_nupts_driven(const T *x, const T *y, const cuda_complex<T> *c, cuda_complex<T> *fw, int64_t M,
                                        int ns, int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts,
                                        int pirange) {
     int xstart, ystart, xend, yend;
@@ -28,7 +28,7 @@ __global__ void spread_2d_nupts_driven(const T *x, const T *y, const cuda_comple
     T x_rescaled, y_rescaled;
     T kervalue1, kervalue2;
     cuda_complex<T> cnow;
-    for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) {
+    for (int64_t i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) {
         x_rescaled = RESCALE(x[idxnupts[i]], nf1, pirange);
         y_rescaled = RESCALE(y[idxnupts[i]], nf2, pirange);
         cnow = c[idxnupts[i]];
@@ -66,12 +66,12 @@ __global__ void spread_2d_nupts_driven(const T *x, const T *y, const cuda_comple
 /* Kernels for SubProb Method */
 // SubProb properties
 template <typename T>
-__global__ void calc_bin_size_noghost_2d(int M, int nf1, int nf2, int bin_size_x, int bin_size_y, int nbinx, int nbiny,
+__global__ void calc_bin_size_noghost_2d(int64_t M, int nf1, int nf2, int bin_size_x, int bin_size_y, int nbinx, int nbiny,
                                          int *bin_size, T *x, T *y, int *sortidx, int pirange) {
     int binidx, binx, biny;
     int oldidx;
     T x_rescaled, y_rescaled;
-    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) {
+    for (int64_t i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) {
         x_rescaled = RESCALE(x[i], nf1, pirange);
         y_rescaled = RESCALE(y[i], nf2, pirange);
         binx = floor(x_rescaled / bin_size_x);
@@ -90,13 +90,13 @@ __global__ void calc_bin_size_noghost_2d(int M, int nf1, int nf2, int bin_size_x
 }
 
 template <typename T>
-__global__ void calc_inverse_of_global_sort_index_2d(int M, int bin_size_x, int bin_size_y, int nbinx, int nbiny,
+__global__ void calc_inverse_of_global_sort_index_2d(int64_t M, int bin_size_x, int bin_size_y, int nbinx, int nbiny,
                                                      const int *bin_startpts, const int *sortidx, const T *x,
                                                      const T *y, int *index, int pirange, int nf1, int nf2) {
     int binx, biny;
     int binidx;
     T x_rescaled, y_rescaled;
-    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) {
+    for (int64_t i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) {
         x_rescaled = RESCALE(x[i], nf1, pirange);
         y_rescaled = RESCALE(y[i], nf2, pirange);
         binx = floor(x_rescaled / bin_size_x);
@@ -112,7 +112,7 @@ __global__ void calc_inverse_of_global_sort_index_2d(int M, int bin_size_x, int
 }
 
 template <typename T, int KEREVALMETH>
-__global__ void spread_2d_subprob(const T *x, const T *y, const cuda_complex<T> *c, cuda_complex<T> *fw, int M, int ns,
+__global__ void spread_2d_subprob(const T *x, const T *y, const cuda_complex<T> *c, cuda_complex<T> *fw, int64_t M, int ns,
                                   int nf1, int nf2, T es_c, T es_beta, T sigma, int *binstartpts, const int *bin_size,
                                   int bin_size_x, int bin_size_y, int *subprob_to_bin, const int *subprobstartpts,
                                   const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, const int *idxnupts,
@@ -204,10 +204,10 @@ __global__ void spread_2d_subprob(const T *x, const T *y, const cuda_complex<T>
 /* --------------------- 2d Interpolation Kernels ----------------------------*/
 /* Kernels for NUptsdriven Method */
 template <typename T, int KEREVALMETH>
-__global__ void interp_2d_nupts_driven(const T *x, const T *y, cuda_complex<T> *c, const cuda_complex<T> *fw, int M,
+__global__ void interp_2d_nupts_driven(const T *x, const T *y, cuda_complex<T> *c, const cuda_complex<T> *fw, int64_t M,
                                        int ns, int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts,
                                        int pirange) {
-    for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) {
+    for (int64_t i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) {
         T x_rescaled = RESCALE(x[idxnupts[i]], nf1, pirange);
         T y_rescaled = RESCALE(y[idxnupts[i]], nf2, pirange);
 
@@ -249,7 +249,7 @@ __global__ void interp_2d_nupts_driven(const T *x, const T *y, cuda_complex<T> *
 
 /* Kernels for Subprob Method */
 template <typename T, int KEREVALMETH>
-__global__ void interp_2d_subprob(const T *x, const T *y, cuda_complex<T> *c, const cuda_complex<T> *fw, int M, int ns,
+__global__ void interp_2d_subprob(const T *x, const T *y, cuda_complex<T> *c, const cuda_complex<T> *fw, int64_t M, int ns,
                                   int nf1, int nf2, T es_c, T es_beta, T sigma, int *binstartpts, const int *bin_size,
                                   int bin_size_x, int bin_size_y, int *subprob_to_bin, const int *subprobstartpts,
                                   const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, const int *idxnupts,
diff --git a/src/cuda/3d/interp3d_wrapper.cu b/src/cuda/3d/interp3d_wrapper.cu
index e9476ce8e..7de430eed 100644
--- a/src/cuda/3d/interp3d_wrapper.cu
+++ b/src/cuda/3d/interp3d_wrapper.cu
@@ -29,7 +29,7 @@ int cuinterp3d(cufinufft_plan_t<T> *d_plan, int blksize)
     int nf1 = d_plan->nf1;
     int nf2 = d_plan->nf2;
     int nf3 = d_plan->nf3;
-    int M = d_plan->M;
+    int64_t M = d_plan->M;
 
     int ier;
     switch (d_plan->opts.gpu_method) {
@@ -48,7 +48,7 @@ int cuinterp3d(cufinufft_plan_t<T> *d_plan, int blksize)
 }
 
 template <typename T>
-int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan, int blksize) {
+int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int64_t M, cufinufft_plan_t<T> *d_plan, int blksize) {
     auto &stream = d_plan->stream;
 
     dim3 threadsPerBlock;
@@ -93,7 +93,7 @@ int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T>
 }
 
 template <typename T>
-int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan, int blksize) {
+int cuinterp3d_subprob(int nf1, int nf2, int nf3, int64_t M, cufinufft_plan_t<T> *d_plan, int blksize) {
     auto &stream = d_plan->stream;
 
     int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells
@@ -155,13 +155,13 @@ int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_
 template int cuinterp3d<float>(cufinufft_plan_t<float> *d_plan, int blksize);
 template int cuinterp3d<double>(cufinufft_plan_t<double> *d_plan, int blksize);
 
-template int cuinterp3d_nuptsdriven<float>(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<float> *d_plan,
+template int cuinterp3d_nuptsdriven<float>(int nf1, int nf2, int nf3, int64_t M, cufinufft_plan_t<float> *d_plan,
                                            int blksize);
-template int cuinterp3d_nuptsdriven<double>(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<double> *d_plan,
+template int cuinterp3d_nuptsdriven<double>(int nf1, int nf2, int nf3, int64_t M, cufinufft_plan_t<double> *d_plan,
                                             int blksize);
 
-template int cuinterp3d_subprob<float>(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<float> *d_plan, int blksize);
-template int cuinterp3d_subprob<double>(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<double> *d_plan,
+template int cuinterp3d_subprob<float>(int nf1, int nf2, int nf3, int64_t M, cufinufft_plan_t<float> *d_plan, int blksize);
+template int cuinterp3d_subprob<double>(int nf1, int nf2, int nf3, int64_t M, cufinufft_plan_t<double> *d_plan,
                                         int blksize);
 
 } // namespace spreadinterp
diff --git a/src/cuda/3d/spreadinterp3d.cuh b/src/cuda/3d/spreadinterp3d.cuh
index 80930b4be..497b48dd8 100644
--- a/src/cuda/3d/spreadinterp3d.cuh
+++ b/src/cuda/3d/spreadinterp3d.cuh
@@ -16,13 +16,13 @@ namespace spreadinterp {
 /* Kernels for bin sort NUpts */
 
 template <typename T>
-__global__ void calc_bin_size_noghost_3d(int M, int nf1, int nf2, int nf3, int bin_size_x, int bin_size_y,
+__global__ void calc_bin_size_noghost_3d(int64_t M, int nf1, int nf2, int nf3, int bin_size_x, int bin_size_y,
                                          int bin_size_z, int nbinx, int nbiny, int nbinz, int *bin_size, const T *x,
                                          const T *y, const T *z, int *sortidx, int pirange) {
     int binidx, binx, biny, binz;
     int oldidx;
     T x_rescaled, y_rescaled, z_rescaled;
-    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) {
+    for (int64_t i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) {
         x_rescaled = RESCALE(x[i], nf1, pirange);
         y_rescaled = RESCALE(y[i], nf2, pirange);
         z_rescaled = RESCALE(z[i], nf3, pirange);
@@ -44,14 +44,14 @@ __global__ void calc_bin_size_noghost_3d(int M, int nf1, int nf2, int nf3, int b
 }
 
 template <typename T>
-__global__ void calc_inverse_of_global_sort_index_3d(int M, int bin_size_x, int bin_size_y, int bin_size_z, int nbinx,
+__global__ void calc_inverse_of_global_sort_index_3d(int64_t M, int bin_size_x, int bin_size_y, int bin_size_z, int nbinx,
                                                      int nbiny, int nbinz, const int *bin_startpts, const int *sortidx,
                                                      const T *x, const T *y, const T *z, int *index, int pirange,
                                                      int nf1, int nf2, int nf3) {
     int binx, biny, binz;
     int binidx;
     T x_rescaled, y_rescaled, z_rescaled;
-    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) {
+    for (int64_t i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) {
         x_rescaled = RESCALE(x[i], nf1, pirange);
         y_rescaled = RESCALE(y[i], nf2, pirange);
         z_rescaled = RESCALE(z[i], nf3, pirange);
@@ -73,7 +73,7 @@ __global__ void calc_inverse_of_global_sort_index_3d(int M, int bin_size_x, int
 /* Kernels for NUptsdriven method */
 template <typename T, int KEREVALMETH>
 __global__ void spread_3d_nupts_driven(const T *x, const T *y, const T *z, const cuda_complex<T> *c,
-                                       cuda_complex<T> *fw, int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta,
+                                       cuda_complex<T> *fw, int64_t M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta,
                                        T sigma, const int *idxnupts, int pirange) {
     int xx, yy, zz, ix, iy, iz;
     int outidx;
@@ -84,7 +84,7 @@ __global__ void spread_3d_nupts_driven(const T *x, const T *y, const T *z, const
     T ker1val, ker2val, ker3val;
 
     T x_rescaled, y_rescaled, z_rescaled;
-    for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) {
+    for (int64_t i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) {
         x_rescaled = RESCALE(x[idxnupts[i]], nf1, pirange);
         y_rescaled = RESCALE(y[idxnupts[i]], nf2, pirange);
         z_rescaled = RESCALE(z[idxnupts[i]], nf3, pirange);
@@ -131,7 +131,7 @@ __global__ void spread_3d_nupts_driven(const T *x, const T *y, const T *z, const
 
 /* Kernels for Subprob method */
 template <typename T, int KEREVALMETH>
-__global__ void spread_3d_subprob(T *x, T *y, T *z, cuda_complex<T> *c, cuda_complex<T> *fw, int M, int ns, int nf1,
+__global__ void spread_3d_subprob(T *x, T *y, T *z, cuda_complex<T> *c, cuda_complex<T> *fw, int64_t M, int ns, int nf1,
                                   int nf2, int nf3, T sigma, T es_c, T es_beta, int *binstartpts, int *bin_size,
                                   int bin_size_x, int bin_size_y, int bin_size_z, int *subprob_to_bin,
                                   int *subprobstartpts, int *numsubprob, int maxsubprobsize, int nbinx, int nbiny,
@@ -235,14 +235,14 @@ __global__ void spread_3d_subprob(T *x, T *y, T *z, cuda_complex<T> *c, cuda_com
 
 /* Kernels for BlockGather Method */
 template <typename T>
-__global__ void locate_nupts_to_bins_ghost(int M, int bin_size_x, int bin_size_y, int bin_size_z, int nobinx,
+__global__ void locate_nupts_to_bins_ghost(int64_t M, int bin_size_x, int bin_size_y, int bin_size_z, int nobinx,
                                            int nobiny, int nobinz, int binsperobinx, int binsperobiny, int binsperobinz,
                                            int *bin_size, const T *x, const T *y, const T *z, int *sortidx, int pirange,
                                            int nf1, int nf2, int nf3) {
     int binidx, binx, biny, binz;
     int oldidx;
     T x_rescaled, y_rescaled, z_rescaled;
-    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) {
+    for (int64_t i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) {
         x_rescaled = RESCALE(x[i], nf1, pirange);
         y_rescaled = RESCALE(y[i], nf2, pirange);
         z_rescaled = RESCALE(z[i], nf3, pirange);
@@ -261,7 +261,7 @@ __global__ void locate_nupts_to_bins_ghost(int M, int bin_size_x, int bin_size_y
 }
 
 template <typename T>
-__global__ void calc_inverse_of_global_sort_index_ghost(int M, int bin_size_x, int bin_size_y, int bin_size_z,
+__global__ void calc_inverse_of_global_sort_index_ghost(int64_t M, int bin_size_x, int bin_size_y, int bin_size_z,
                                                         int nobinx, int nobiny, int nobinz, int binsperobinx,
                                                         int binsperobiny, int binsperobinz, int *bin_startpts,
                                                         const int *sortidx, const T *x, const T *y, const T *z,
@@ -269,7 +269,7 @@ __global__ void calc_inverse_of_global_sort_index_ghost(int M, int bin_size_x, i
     int binx, biny, binz;
     int binidx;
     T x_rescaled, y_rescaled, z_rescaled;
-    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) {
+    for (int64_t i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) {
         x_rescaled = RESCALE(x[i], nf1, pirange);
         y_rescaled = RESCALE(y[i], nf2, pirange);
         z_rescaled = RESCALE(z[i], nf3, pirange);
@@ -289,7 +289,7 @@ __global__ void calc_inverse_of_global_sort_index_ghost(int M, int bin_size_x, i
 
 template <typename T, int KEREVALMETH>
 __global__ void spread_3d_block_gather(const T *x, const T *y, const T *z, const cuda_complex<T> *c,
-                                       cuda_complex<T> *fw, int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta,
+                                       cuda_complex<T> *fw, int64_t M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta,
                                        T sigma, const int *binstartpts, int obin_size_x, int obin_size_y,
                                        int obin_size_z, int binsperobin, int *subprob_to_bin,
                                        const int *subprobstartpts, int maxsubprobsize, int nobinx, int nobiny,
@@ -403,9 +403,9 @@ __global__ void spread_3d_block_gather(const T *x, const T *y, const T *z, const
 /* Kernels for NUptsdriven Method */
 template <typename T, int KEREVALMETH>
 __global__ void interp_3d_nupts_driven(const T *x, const T *y, const T *z, cuda_complex<T> *c,
-                                       const cuda_complex<T> *fw, int M, int ns, int nf1, int nf2, int nf3, T es_c,
+                                       const cuda_complex<T> *fw, int64_t M, int ns, int nf1, int nf2, int nf3, T es_c,
                                        T es_beta, T sigma, int *idxnupts, int pirange) {
-    for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) {
+    for (int64_t i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) {
         T x_rescaled = RESCALE(x[idxnupts[i]], nf1, pirange);
         T y_rescaled = RESCALE(y[idxnupts[i]], nf2, pirange);
         T z_rescaled = RESCALE(z[idxnupts[i]], nf3, pirange);
@@ -459,7 +459,7 @@ __global__ void interp_3d_nupts_driven(const T *x, const T *y, const T *z, cuda_
 /* Kernels for SubProb Method */
 template <typename T, int KEREVALMETH>
 __global__ void interp_3d_subprob(const T *x, const T *y, const T *z, cuda_complex<T> *c, const cuda_complex<T> *fw,
-                                  int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma,
+                                  int64_t M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma,
                                   const int *binstartpts, const int *bin_size, int bin_size_x, int bin_size_y,
                                   int bin_size_z, const int *subprob_to_bin, const int *subprobstartpts,
                                   const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, int nbinz,
diff --git a/src/cuda/cufinufft.cu b/src/cuda/cufinufft.cu
index 60cdd4482..5084e492a 100644
--- a/src/cuda/cufinufft.cu
+++ b/src/cuda/cufinufft.cu
@@ -53,12 +53,12 @@ int cufinufft_makeplan(int type, int dim, const int64_t *nmodes, int iflag, int
                                    opts);
 }
 
-int cufinufftf_setpts(cufinufftf_plan d_plan, int M, float *d_x, float *d_y, float *d_z, int N, float *d_s,
+int cufinufftf_setpts(cufinufftf_plan d_plan, int64_t M, float *d_x, float *d_y, float *d_z, int N, float *d_s,
                       float *d_t, float *d_u) {
     return cufinufft_setpts_impl(M, d_x, d_y, d_z, N, d_s, d_t, d_u, (cufinufft_plan_t<float> *)d_plan);
 }
 
-int cufinufft_setpts(cufinufft_plan d_plan, int M, double *d_x, double *d_y, double *d_z, int N, double *d_s,
+int cufinufft_setpts(cufinufft_plan d_plan, int64_t M, double *d_x, double *d_y, double *d_z, int N, double *d_s,
                      double *d_t, double *d_u) {
     return cufinufft_setpts_impl(M, d_x, d_y, d_z, N, d_s, d_t, d_u, (cufinufft_plan_t<double> *)d_plan);
 }