From 338b5e26b231dc512039de303ea54755a2e48c3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joakim=20And=C3=A9n?= Date: Thu, 28 Dec 2023 00:16:37 +0100 Subject: [PATCH] cuda: update interface to take 64-bit M For compatibility with FINUFFT. --- include/cufinufft.h | 4 ++-- include/cufinufft/impl.h | 2 +- include/cufinufft/spreadinterp.h | 38 ++++++++++++++++---------------- include/cufinufft/types.h | 2 +- src/cuda/1d/interp1d_wrapper.cu | 4 ++-- src/cuda/1d/spread1d_wrapper.cu | 18 +++++++-------- src/cuda/1d/spreadinterp1d.cuh | 18 +++++++-------- src/cuda/2d/interp2d_wrapper.cu | 6 ++--- src/cuda/2d/spread2d_wrapper.cu | 18 +++++++-------- src/cuda/2d/spreadinterp2d.cuh | 20 ++++++++--------- src/cuda/3d/interp3d_wrapper.cu | 14 ++++++------ src/cuda/3d/spreadinterp3d.cuh | 30 ++++++++++++------------- src/cuda/cufinufft.cu | 4 ++-- 13 files changed, 89 insertions(+), 89 deletions(-) diff --git a/include/cufinufft.h b/include/cufinufft.h index 3c498fed0..923fe1678 100644 --- a/include/cufinufft.h +++ b/include/cufinufft.h @@ -19,9 +19,9 @@ int cufinufft_makeplan(int type, int dim, const int64_t *n_modes, int iflag, int int cufinufftf_makeplan(int type, int dim, const int64_t *n_modes, int iflag, int ntr, float eps, cufinufftf_plan *d_plan_ptr, cufinufft_opts *opts); -int cufinufft_setpts(cufinufft_plan d_plan, int M, double *d_x, double *d_y, double *d_z, int N, double *d_s, +int cufinufft_setpts(cufinufft_plan d_plan, int64_t M, double *d_x, double *d_y, double *d_z, int N, double *d_s, double *d_t, double *d_u); -int cufinufftf_setpts(cufinufftf_plan d_plan, int M, float *d_x, float *d_y, float *d_z, int N, float *d_s, +int cufinufftf_setpts(cufinufftf_plan d_plan, int64_t M, float *d_x, float *d_y, float *d_z, int N, float *d_s, float *d_t, float *d_u); int cufinufft_execute(cufinufft_plan d_plan, cuDoubleComplex *d_c, cuDoubleComplex *d_fk); diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 34b969b46..680899250 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -264,7 +264,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran } template -int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_t, T *d_u, cufinufft_plan_t *d_plan) +int cufinufft_setpts_impl(int64_t M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_t, T *d_u, cufinufft_plan_t *d_plan) /* "setNUpts" stage (in single or double precision). diff --git a/include/cufinufft/spreadinterp.h b/include/cufinufft/spreadinterp.h index 7849579a7..bbe027baa 100644 --- a/include/cufinufft/spreadinterp.h +++ b/include/cufinufft/spreadinterp.h @@ -88,46 +88,46 @@ int cuinterp3d(cufinufft_plan_t *d_plan, int blksize); // Wrappers for methods of spreading template -int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t *d_plan); +int cuspread1d_nuptsdriven_prop(int nf1, int64_t M, cufinufft_plan_t *d_plan); template -int cuspread1d_nuptsdriven(int nf1, int M, cufinufft_plan_t *d_plan, int blksize); +int cuspread1d_nuptsdriven(int nf1, int64_t M, cufinufft_plan_t *d_plan, int blksize); template -int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t *d_plan); +int cuspread1d_subprob_prop(int nf1, int64_t M, cufinufft_plan_t *d_plan); template -int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t *d_plan, int blksize); +int cuspread1d_subprob(int nf1, int64_t M, cufinufft_plan_t *d_plan, int blksize); template -int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan); +int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int64_t M, cufinufft_plan_t *d_plan); template -int cuspread2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int blksize); +int cuspread2d_nuptsdriven(int nf1, int nf2, int64_t M, cufinufft_plan_t *d_plan, int blksize); template -int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan); +int cuspread2d_subprob_prop(int nf1, int nf2, int64_t M, cufinufft_plan_t *d_plan); template -int cuspread2d_subprob(int nf1, int nf2, int m, cufinufft_plan_t *d_plan, int blksize); +int cuspread2d_subprob(int nf1, int nf2, int64_t m, cufinufft_plan_t *d_plan, int blksize); template -int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan); +int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int64_t M, cufinufft_plan_t *d_plan); template -int cuspread3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize); +int cuspread3d_nuptsdriven(int nf1, int nf2, int nf3, int64_t M, cufinufft_plan_t *d_plan, int blksize); template -int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan); +int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int64_t M, cufinufft_plan_t *d_plan); template -int cuspread3d_blockgather(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize); +int cuspread3d_blockgather(int nf1, int nf2, int nf3, int64_t M, cufinufft_plan_t *d_plan, int blksize); template -int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan); +int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int64_t M, cufinufft_plan_t *d_plan); template -int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize); +int cuspread3d_subprob(int nf1, int nf2, int nf3, int64_t M, cufinufft_plan_t *d_plan, int blksize); // Wrappers for methods of interpolation template -int cuinterp1d_nuptsdriven(int nf1, int M, cufinufft_plan_t *d_plan, int blksize); +int cuinterp1d_nuptsdriven(int nf1, int64_t M, cufinufft_plan_t *d_plan, int blksize); template -int cuinterp2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int blksize); +int cuinterp2d_nuptsdriven(int nf1, int nf2, int64_t M, cufinufft_plan_t *d_plan, int blksize); template -int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int blksize); +int cuinterp2d_subprob(int nf1, int nf2, int64_t M, cufinufft_plan_t *d_plan, int blksize); template -int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize); +int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int64_t M, cufinufft_plan_t *d_plan, int blksize); template -int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize); +int cuinterp3d_subprob(int nf1, int nf2, int nf3, int64_t M, cufinufft_plan_t *d_plan, int blksize); } // namespace spreadinterp } // namespace cufinufft diff --git a/include/cufinufft/types.h b/include/cufinufft/types.h index 246b4aaa1..d8254b024 100644 --- a/include/cufinufft/types.h +++ b/include/cufinufft/types.h @@ -34,7 +34,7 @@ struct cufinufft_plan_t { int type; int dim; - CUFINUFFT_BIGINT M; + int64_t M; CUFINUFFT_BIGINT nf1; CUFINUFFT_BIGINT nf2; CUFINUFFT_BIGINT nf3; diff --git a/src/cuda/1d/interp1d_wrapper.cu b/src/cuda/1d/interp1d_wrapper.cu index 9992f1936..4cb1c5cdf 100644 --- a/src/cuda/1d/interp1d_wrapper.cu +++ b/src/cuda/1d/interp1d_wrapper.cu @@ -27,7 +27,7 @@ int cuinterp1d(cufinufft_plan_t *d_plan, int blksize) */ { int nf1 = d_plan->nf1; - int M = d_plan->M; + int64_t M = d_plan->M; int ier; switch (d_plan->opts.gpu_method) { @@ -43,7 +43,7 @@ int cuinterp1d(cufinufft_plan_t *d_plan, int blksize) } template -int cuinterp1d_nuptsdriven(int nf1, int M, cufinufft_plan_t *d_plan, int blksize) { +int cuinterp1d_nuptsdriven(int nf1, int64_t M, cufinufft_plan_t *d_plan, int blksize) { auto &stream = d_plan->stream; dim3 threadsPerBlock; dim3 blocks; diff --git a/src/cuda/1d/spread1d_wrapper.cu b/src/cuda/1d/spread1d_wrapper.cu index 19101603f..ab6b22580 100644 --- a/src/cuda/1d/spread1d_wrapper.cu +++ b/src/cuda/1d/spread1d_wrapper.cu @@ -32,7 +32,7 @@ int cuspread1d(cufinufft_plan_t *d_plan, int blksize) */ { int nf1 = d_plan->nf1; - int M = d_plan->M; + int64_t M = d_plan->M; int ier; switch (d_plan->opts.gpu_method) { @@ -51,7 +51,7 @@ int cuspread1d(cufinufft_plan_t *d_plan, int blksize) } template -int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t *d_plan) { +int cuspread1d_nuptsdriven_prop(int nf1, int64_t M, cufinufft_plan_t *d_plan) { auto &stream = d_plan->stream; if (d_plan->opts.gpu_sort) { @@ -96,7 +96,7 @@ int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t *d_plan) { } template -int cuspread1d_nuptsdriven(int nf1, int M, cufinufft_plan_t *d_plan, int blksize) { +int cuspread1d_nuptsdriven(int nf1, int64_t M, cufinufft_plan_t *d_plan, int blksize) { auto &stream = d_plan->stream; dim3 threadsPerBlock; dim3 blocks; @@ -135,7 +135,7 @@ int cuspread1d_nuptsdriven(int nf1, int M, cufinufft_plan_t *d_plan, int blks } template -int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t *d_plan) +int cuspread1d_subprob_prop(int nf1, int64_t M, cufinufft_plan_t *d_plan) /* This function determines the properties for spreading that are independent of the strength of the nodes, only relates to the locations of the nodes, @@ -217,7 +217,7 @@ int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t *d_plan) } template -int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t *d_plan, int blksize) { +int cuspread1d_subprob(int nf1, int64_t M, cufinufft_plan_t *d_plan, int blksize) { auto &stream = d_plan->stream; int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells @@ -275,10 +275,10 @@ int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t *d_plan, int blksize) template int cuspread1d(cufinufft_plan_t *d_plan, int blksize); template int cuspread1d(cufinufft_plan_t *d_plan, int blksize); -template int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t *d_plan); -template int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t *d_plan); -template int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t *d_plan); -template int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t *d_plan); +template int cuspread1d_nuptsdriven_prop(int nf1, int64_t M, cufinufft_plan_t *d_plan); +template int cuspread1d_nuptsdriven_prop(int nf1, int64_t M, cufinufft_plan_t *d_plan); +template int cuspread1d_subprob_prop(int nf1, int64_t M, cufinufft_plan_t *d_plan); +template int cuspread1d_subprob_prop(int nf1, int64_t M, cufinufft_plan_t *d_plan); } // namespace spreadinterp } // namespace cufinufft diff --git a/src/cuda/1d/spreadinterp1d.cuh b/src/cuda/1d/spreadinterp1d.cuh index 77ecf9a43..5a8eebb94 100644 --- a/src/cuda/1d/spreadinterp1d.cuh +++ b/src/cuda/1d/spreadinterp1d.cuh @@ -16,14 +16,14 @@ namespace spreadinterp { /* Kernels for NUptsdriven Method */ template -__global__ void spread_1d_nuptsdriven(const T *x, const cuda_complex *c, cuda_complex *fw, int M, int ns, int nf1, +__global__ void spread_1d_nuptsdriven(const T *x, const cuda_complex *c, cuda_complex *fw, int64_t M, int ns, int nf1, T es_c, T es_beta, T sigma, const int *idxnupts, int pirange) { int xx, ix; T ker1[MAX_NSPREAD]; T x_rescaled; cuda_complex cnow; - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { + for (int64_t i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { x_rescaled = RESCALE(x[idxnupts[i]], nf1, pirange); cnow = c[idxnupts[i]]; int xstart = ceil(x_rescaled - ns / 2.0); @@ -47,12 +47,12 @@ __global__ void spread_1d_nuptsdriven(const T *x, const cuda_complex *c, cuda /* Kernels for SubProb Method */ // SubProb properties template -__global__ void calc_bin_size_noghost_1d(int M, int nf1, int bin_size_x, int nbinx, int *bin_size, const T *x, +__global__ void calc_bin_size_noghost_1d(int64_t M, int nf1, int bin_size_x, int nbinx, int *bin_size, const T *x, int *sortidx, int pirange) { int binx; int oldidx; T x_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { + for (int64_t i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { x_rescaled = RESCALE(x[i], nf1, pirange); binx = floor(x_rescaled / bin_size_x); binx = binx >= nbinx ? binx - 1 : binx; @@ -66,11 +66,11 @@ __global__ void calc_bin_size_noghost_1d(int M, int nf1, int bin_size_x, int nbi } template -__global__ void calc_inverse_of_global_sort_idx_1d(int M, int bin_size_x, int nbinx, const int *bin_startpts, +__global__ void calc_inverse_of_global_sort_idx_1d(int64_t M, int bin_size_x, int nbinx, const int *bin_startpts, const int *sortidx, const T *x, int *index, int pirange, int nf1) { int binx; T x_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { + for (int64_t i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { x_rescaled = RESCALE(x[i], nf1, pirange); binx = floor(x_rescaled / bin_size_x); binx = binx >= nbinx ? binx - 1 : binx; @@ -81,7 +81,7 @@ __global__ void calc_inverse_of_global_sort_idx_1d(int M, int bin_size_x, int nb } template -__global__ void spread_1d_subprob(const T *x, const cuda_complex *c, cuda_complex *fw, int M, int ns, int nf1, +__global__ void spread_1d_subprob(const T *x, const cuda_complex *c, cuda_complex *fw, int64_t M, int ns, int nf1, T es_c, T es_beta, T sigma, const int *binstartpts, const int *bin_size, int bin_size_x, const int *subprob_to_bin, const int *subprobstartpts, const int *numsubprob, int maxsubprobsize, int nbinx, const int *idxnupts, @@ -147,10 +147,10 @@ __global__ void spread_1d_subprob(const T *x, const cuda_complex *c, cuda_com /* --------------------- 1d Interpolation Kernels ----------------------------*/ /* Kernels for NUptsdriven Method */ template -__global__ void interp_1d_nuptsdriven(const T *x, cuda_complex *c, const cuda_complex *fw, int M, int ns, int nf1, +__global__ void interp_1d_nuptsdriven(const T *x, cuda_complex *c, const cuda_complex *fw, int64_t M, int ns, int nf1, T es_c, T es_beta, T sigma, const int *idxnupts, int pirange) { T ker1[MAX_NSPREAD]; - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { + for (int64_t i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { T x_rescaled = RESCALE(x[idxnupts[i]], nf1, pirange); int xstart = ceil(x_rescaled - ns / 2.0); diff --git a/src/cuda/2d/interp2d_wrapper.cu b/src/cuda/2d/interp2d_wrapper.cu index fa6fecbaa..ab58996ad 100644 --- a/src/cuda/2d/interp2d_wrapper.cu +++ b/src/cuda/2d/interp2d_wrapper.cu @@ -28,7 +28,7 @@ int cuinterp2d(cufinufft_plan_t *d_plan, int blksize) { int nf1 = d_plan->nf1; int nf2 = d_plan->nf2; - int M = d_plan->M; + int64_t M = d_plan->M; int ier; switch (d_plan->opts.gpu_method) { @@ -47,7 +47,7 @@ int cuinterp2d(cufinufft_plan_t *d_plan, int blksize) } template -int cuinterp2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int blksize) { +int cuinterp2d_nuptsdriven(int nf1, int nf2, int64_t M, cufinufft_plan_t *d_plan, int blksize) { auto &stream = d_plan->stream; dim3 threadsPerBlock; @@ -90,7 +90,7 @@ int cuinterp2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, } template -int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int blksize) { +int cuinterp2d_subprob(int nf1, int nf2, int64_t M, cufinufft_plan_t *d_plan, int blksize) { auto &stream = d_plan->stream; int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells diff --git a/src/cuda/2d/spread2d_wrapper.cu b/src/cuda/2d/spread2d_wrapper.cu index 79c33ad10..cc216ff9e 100644 --- a/src/cuda/2d/spread2d_wrapper.cu +++ b/src/cuda/2d/spread2d_wrapper.cu @@ -33,7 +33,7 @@ int cuspread2d(cufinufft_plan_t *d_plan, int blksize) { int nf1 = d_plan->nf1; int nf2 = d_plan->nf2; - int M = d_plan->M; + int64_t M = d_plan->M; int ier; switch (d_plan->opts.gpu_method) { @@ -52,7 +52,7 @@ int cuspread2d(cufinufft_plan_t *d_plan, int blksize) } template -int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan) { +int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int64_t M, cufinufft_plan_t *d_plan) { auto &stream = d_plan->stream; if (d_plan->opts.gpu_sort) { @@ -105,7 +105,7 @@ int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_ } template -int cuspread2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int blksize) { +int cuspread2d_nuptsdriven(int nf1, int nf2, int64_t M, cufinufft_plan_t *d_plan, int blksize) { auto &stream = d_plan->stream; dim3 threadsPerBlock; dim3 blocks; @@ -146,7 +146,7 @@ int cuspread2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, } template -int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan) +int cuspread2d_subprob_prop(int nf1, int nf2, int64_t M, cufinufft_plan_t *d_plan) /* This function determines the properties for spreading that are independent of the strength of the nodes, only relates to the locations of the nodes, @@ -233,7 +233,7 @@ int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan } template -int cuspread2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int blksize) { +int cuspread2d_subprob(int nf1, int nf2, int64_t M, cufinufft_plan_t *d_plan, int blksize) { auto &stream = d_plan->stream; int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells @@ -296,10 +296,10 @@ int cuspread2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int template int cuspread2d(cufinufft_plan_t *d_plan, int blksize); template int cuspread2d(cufinufft_plan_t *d_plan, int blksize); -template int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan); -template int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan); -template int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan); -template int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan); +template int cuspread2d_subprob_prop(int nf1, int nf2, int64_t M, cufinufft_plan_t *d_plan); +template int cuspread2d_subprob_prop(int nf1, int nf2, int64_t M, cufinufft_plan_t *d_plan); +template int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int64_t M, cufinufft_plan_t *d_plan); +template int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int64_t M, cufinufft_plan_t *d_plan); } // namespace spreadinterp } // namespace cufinufft diff --git a/src/cuda/2d/spreadinterp2d.cuh b/src/cuda/2d/spreadinterp2d.cuh index f6f3341d3..fc82115e0 100644 --- a/src/cuda/2d/spreadinterp2d.cuh +++ b/src/cuda/2d/spreadinterp2d.cuh @@ -16,7 +16,7 @@ namespace spreadinterp { /* Kernels for NUptsdriven Method */ template -__global__ void spread_2d_nupts_driven(const T *x, const T *y, const cuda_complex *c, cuda_complex *fw, int M, +__global__ void spread_2d_nupts_driven(const T *x, const T *y, const cuda_complex *c, cuda_complex *fw, int64_t M, int ns, int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts, int pirange) { int xstart, ystart, xend, yend; @@ -28,7 +28,7 @@ __global__ void spread_2d_nupts_driven(const T *x, const T *y, const cuda_comple T x_rescaled, y_rescaled; T kervalue1, kervalue2; cuda_complex cnow; - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { + for (int64_t i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { x_rescaled = RESCALE(x[idxnupts[i]], nf1, pirange); y_rescaled = RESCALE(y[idxnupts[i]], nf2, pirange); cnow = c[idxnupts[i]]; @@ -66,12 +66,12 @@ __global__ void spread_2d_nupts_driven(const T *x, const T *y, const cuda_comple /* Kernels for SubProb Method */ // SubProb properties template -__global__ void calc_bin_size_noghost_2d(int M, int nf1, int nf2, int bin_size_x, int bin_size_y, int nbinx, int nbiny, +__global__ void calc_bin_size_noghost_2d(int64_t M, int nf1, int nf2, int bin_size_x, int bin_size_y, int nbinx, int nbiny, int *bin_size, T *x, T *y, int *sortidx, int pirange) { int binidx, binx, biny; int oldidx; T x_rescaled, y_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { + for (int64_t i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { x_rescaled = RESCALE(x[i], nf1, pirange); y_rescaled = RESCALE(y[i], nf2, pirange); binx = floor(x_rescaled / bin_size_x); @@ -90,13 +90,13 @@ __global__ void calc_bin_size_noghost_2d(int M, int nf1, int nf2, int bin_size_x } template -__global__ void calc_inverse_of_global_sort_index_2d(int M, int bin_size_x, int bin_size_y, int nbinx, int nbiny, +__global__ void calc_inverse_of_global_sort_index_2d(int64_t M, int bin_size_x, int bin_size_y, int nbinx, int nbiny, const int *bin_startpts, const int *sortidx, const T *x, const T *y, int *index, int pirange, int nf1, int nf2) { int binx, biny; int binidx; T x_rescaled, y_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { + for (int64_t i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { x_rescaled = RESCALE(x[i], nf1, pirange); y_rescaled = RESCALE(y[i], nf2, pirange); binx = floor(x_rescaled / bin_size_x); @@ -112,7 +112,7 @@ __global__ void calc_inverse_of_global_sort_index_2d(int M, int bin_size_x, int } template -__global__ void spread_2d_subprob(const T *x, const T *y, const cuda_complex *c, cuda_complex *fw, int M, int ns, +__global__ void spread_2d_subprob(const T *x, const T *y, const cuda_complex *c, cuda_complex *fw, int64_t M, int ns, int nf1, int nf2, T es_c, T es_beta, T sigma, int *binstartpts, const int *bin_size, int bin_size_x, int bin_size_y, int *subprob_to_bin, const int *subprobstartpts, const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, const int *idxnupts, @@ -204,10 +204,10 @@ __global__ void spread_2d_subprob(const T *x, const T *y, const cuda_complex /* --------------------- 2d Interpolation Kernels ----------------------------*/ /* Kernels for NUptsdriven Method */ template -__global__ void interp_2d_nupts_driven(const T *x, const T *y, cuda_complex *c, const cuda_complex *fw, int M, +__global__ void interp_2d_nupts_driven(const T *x, const T *y, cuda_complex *c, const cuda_complex *fw, int64_t M, int ns, int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts, int pirange) { - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { + for (int64_t i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { T x_rescaled = RESCALE(x[idxnupts[i]], nf1, pirange); T y_rescaled = RESCALE(y[idxnupts[i]], nf2, pirange); @@ -249,7 +249,7 @@ __global__ void interp_2d_nupts_driven(const T *x, const T *y, cuda_complex * /* Kernels for Subprob Method */ template -__global__ void interp_2d_subprob(const T *x, const T *y, cuda_complex *c, const cuda_complex *fw, int M, int ns, +__global__ void interp_2d_subprob(const T *x, const T *y, cuda_complex *c, const cuda_complex *fw, int64_t M, int ns, int nf1, int nf2, T es_c, T es_beta, T sigma, int *binstartpts, const int *bin_size, int bin_size_x, int bin_size_y, int *subprob_to_bin, const int *subprobstartpts, const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, const int *idxnupts, diff --git a/src/cuda/3d/interp3d_wrapper.cu b/src/cuda/3d/interp3d_wrapper.cu index e9476ce8e..7de430eed 100644 --- a/src/cuda/3d/interp3d_wrapper.cu +++ b/src/cuda/3d/interp3d_wrapper.cu @@ -29,7 +29,7 @@ int cuinterp3d(cufinufft_plan_t *d_plan, int blksize) int nf1 = d_plan->nf1; int nf2 = d_plan->nf2; int nf3 = d_plan->nf3; - int M = d_plan->M; + int64_t M = d_plan->M; int ier; switch (d_plan->opts.gpu_method) { @@ -48,7 +48,7 @@ int cuinterp3d(cufinufft_plan_t *d_plan, int blksize) } template -int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize) { +int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int64_t M, cufinufft_plan_t *d_plan, int blksize) { auto &stream = d_plan->stream; dim3 threadsPerBlock; @@ -93,7 +93,7 @@ int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t } template -int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize) { +int cuinterp3d_subprob(int nf1, int nf2, int nf3, int64_t M, cufinufft_plan_t *d_plan, int blksize) { auto &stream = d_plan->stream; int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells @@ -155,13 +155,13 @@ int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_ template int cuinterp3d(cufinufft_plan_t *d_plan, int blksize); template int cuinterp3d(cufinufft_plan_t *d_plan, int blksize); -template int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, +template int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int64_t M, cufinufft_plan_t *d_plan, int blksize); -template int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, +template int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int64_t M, cufinufft_plan_t *d_plan, int blksize); -template int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize); -template int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, +template int cuinterp3d_subprob(int nf1, int nf2, int nf3, int64_t M, cufinufft_plan_t *d_plan, int blksize); +template int cuinterp3d_subprob(int nf1, int nf2, int nf3, int64_t M, cufinufft_plan_t *d_plan, int blksize); } // namespace spreadinterp diff --git a/src/cuda/3d/spreadinterp3d.cuh b/src/cuda/3d/spreadinterp3d.cuh index 80930b4be..497b48dd8 100644 --- a/src/cuda/3d/spreadinterp3d.cuh +++ b/src/cuda/3d/spreadinterp3d.cuh @@ -16,13 +16,13 @@ namespace spreadinterp { /* Kernels for bin sort NUpts */ template -__global__ void calc_bin_size_noghost_3d(int M, int nf1, int nf2, int nf3, int bin_size_x, int bin_size_y, +__global__ void calc_bin_size_noghost_3d(int64_t M, int nf1, int nf2, int nf3, int bin_size_x, int bin_size_y, int bin_size_z, int nbinx, int nbiny, int nbinz, int *bin_size, const T *x, const T *y, const T *z, int *sortidx, int pirange) { int binidx, binx, biny, binz; int oldidx; T x_rescaled, y_rescaled, z_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { + for (int64_t i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { x_rescaled = RESCALE(x[i], nf1, pirange); y_rescaled = RESCALE(y[i], nf2, pirange); z_rescaled = RESCALE(z[i], nf3, pirange); @@ -44,14 +44,14 @@ __global__ void calc_bin_size_noghost_3d(int M, int nf1, int nf2, int nf3, int b } template -__global__ void calc_inverse_of_global_sort_index_3d(int M, int bin_size_x, int bin_size_y, int bin_size_z, int nbinx, +__global__ void calc_inverse_of_global_sort_index_3d(int64_t M, int bin_size_x, int bin_size_y, int bin_size_z, int nbinx, int nbiny, int nbinz, const int *bin_startpts, const int *sortidx, const T *x, const T *y, const T *z, int *index, int pirange, int nf1, int nf2, int nf3) { int binx, biny, binz; int binidx; T x_rescaled, y_rescaled, z_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { + for (int64_t i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { x_rescaled = RESCALE(x[i], nf1, pirange); y_rescaled = RESCALE(y[i], nf2, pirange); z_rescaled = RESCALE(z[i], nf3, pirange); @@ -73,7 +73,7 @@ __global__ void calc_inverse_of_global_sort_index_3d(int M, int bin_size_x, int /* Kernels for NUptsdriven method */ template __global__ void spread_3d_nupts_driven(const T *x, const T *y, const T *z, const cuda_complex *c, - cuda_complex *fw, int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, + cuda_complex *fw, int64_t M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma, const int *idxnupts, int pirange) { int xx, yy, zz, ix, iy, iz; int outidx; @@ -84,7 +84,7 @@ __global__ void spread_3d_nupts_driven(const T *x, const T *y, const T *z, const T ker1val, ker2val, ker3val; T x_rescaled, y_rescaled, z_rescaled; - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { + for (int64_t i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { x_rescaled = RESCALE(x[idxnupts[i]], nf1, pirange); y_rescaled = RESCALE(y[idxnupts[i]], nf2, pirange); z_rescaled = RESCALE(z[idxnupts[i]], nf3, pirange); @@ -131,7 +131,7 @@ __global__ void spread_3d_nupts_driven(const T *x, const T *y, const T *z, const /* Kernels for Subprob method */ template -__global__ void spread_3d_subprob(T *x, T *y, T *z, cuda_complex *c, cuda_complex *fw, int M, int ns, int nf1, +__global__ void spread_3d_subprob(T *x, T *y, T *z, cuda_complex *c, cuda_complex *fw, int64_t M, int ns, int nf1, int nf2, int nf3, T sigma, T es_c, T es_beta, int *binstartpts, int *bin_size, int bin_size_x, int bin_size_y, int bin_size_z, int *subprob_to_bin, int *subprobstartpts, int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, @@ -235,14 +235,14 @@ __global__ void spread_3d_subprob(T *x, T *y, T *z, cuda_complex *c, cuda_com /* Kernels for BlockGather Method */ template -__global__ void locate_nupts_to_bins_ghost(int M, int bin_size_x, int bin_size_y, int bin_size_z, int nobinx, +__global__ void locate_nupts_to_bins_ghost(int64_t M, int bin_size_x, int bin_size_y, int bin_size_z, int nobinx, int nobiny, int nobinz, int binsperobinx, int binsperobiny, int binsperobinz, int *bin_size, const T *x, const T *y, const T *z, int *sortidx, int pirange, int nf1, int nf2, int nf3) { int binidx, binx, biny, binz; int oldidx; T x_rescaled, y_rescaled, z_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { + for (int64_t i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { x_rescaled = RESCALE(x[i], nf1, pirange); y_rescaled = RESCALE(y[i], nf2, pirange); z_rescaled = RESCALE(z[i], nf3, pirange); @@ -261,7 +261,7 @@ __global__ void locate_nupts_to_bins_ghost(int M, int bin_size_x, int bin_size_y } template -__global__ void calc_inverse_of_global_sort_index_ghost(int M, int bin_size_x, int bin_size_y, int bin_size_z, +__global__ void calc_inverse_of_global_sort_index_ghost(int64_t M, int bin_size_x, int bin_size_y, int bin_size_z, int nobinx, int nobiny, int nobinz, int binsperobinx, int binsperobiny, int binsperobinz, int *bin_startpts, const int *sortidx, const T *x, const T *y, const T *z, @@ -269,7 +269,7 @@ __global__ void calc_inverse_of_global_sort_index_ghost(int M, int bin_size_x, i int binx, biny, binz; int binidx; T x_rescaled, y_rescaled, z_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { + for (int64_t i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { x_rescaled = RESCALE(x[i], nf1, pirange); y_rescaled = RESCALE(y[i], nf2, pirange); z_rescaled = RESCALE(z[i], nf3, pirange); @@ -289,7 +289,7 @@ __global__ void calc_inverse_of_global_sort_index_ghost(int M, int bin_size_x, i template __global__ void spread_3d_block_gather(const T *x, const T *y, const T *z, const cuda_complex *c, - cuda_complex *fw, int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, + cuda_complex *fw, int64_t M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma, const int *binstartpts, int obin_size_x, int obin_size_y, int obin_size_z, int binsperobin, int *subprob_to_bin, const int *subprobstartpts, int maxsubprobsize, int nobinx, int nobiny, @@ -403,9 +403,9 @@ __global__ void spread_3d_block_gather(const T *x, const T *y, const T *z, const /* Kernels for NUptsdriven Method */ template __global__ void interp_3d_nupts_driven(const T *x, const T *y, const T *z, cuda_complex *c, - const cuda_complex *fw, int M, int ns, int nf1, int nf2, int nf3, T es_c, + const cuda_complex *fw, int64_t M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma, int *idxnupts, int pirange) { - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { + for (int64_t i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { T x_rescaled = RESCALE(x[idxnupts[i]], nf1, pirange); T y_rescaled = RESCALE(y[idxnupts[i]], nf2, pirange); T z_rescaled = RESCALE(z[idxnupts[i]], nf3, pirange); @@ -459,7 +459,7 @@ __global__ void interp_3d_nupts_driven(const T *x, const T *y, const T *z, cuda_ /* Kernels for SubProb Method */ template __global__ void interp_3d_subprob(const T *x, const T *y, const T *z, cuda_complex *c, const cuda_complex *fw, - int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma, + int64_t M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma, const int *binstartpts, const int *bin_size, int bin_size_x, int bin_size_y, int bin_size_z, const int *subprob_to_bin, const int *subprobstartpts, const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, int nbinz, diff --git a/src/cuda/cufinufft.cu b/src/cuda/cufinufft.cu index 60cdd4482..5084e492a 100644 --- a/src/cuda/cufinufft.cu +++ b/src/cuda/cufinufft.cu @@ -53,12 +53,12 @@ int cufinufft_makeplan(int type, int dim, const int64_t *nmodes, int iflag, int opts); } -int cufinufftf_setpts(cufinufftf_plan d_plan, int M, float *d_x, float *d_y, float *d_z, int N, float *d_s, +int cufinufftf_setpts(cufinufftf_plan d_plan, int64_t M, float *d_x, float *d_y, float *d_z, int N, float *d_s, float *d_t, float *d_u) { return cufinufft_setpts_impl(M, d_x, d_y, d_z, N, d_s, d_t, d_u, (cufinufft_plan_t *)d_plan); } -int cufinufft_setpts(cufinufft_plan d_plan, int M, double *d_x, double *d_y, double *d_z, int N, double *d_s, +int cufinufft_setpts(cufinufft_plan d_plan, int64_t M, double *d_x, double *d_y, double *d_z, int N, double *d_s, double *d_t, double *d_u) { return cufinufft_setpts_impl(M, d_x, d_y, d_z, N, d_s, d_t, d_u, (cufinufft_plan_t *)d_plan); }