From 1cf0dbdfc72130d703d6ade8d5c88a1a8337c744 Mon Sep 17 00:00:00 2001
From: Martin Reinecke
Date: Tue, 24 Sep 2024 14:56:42 +0200
Subject: [PATCH 01/20] rearrange functions to avoid forward declarations

---
 include/finufft/spreadinterp.h |    9 -
 src/spreadinterp.cpp           | 2364 +++++++++++++++-----------------
 2 files changed, 1141 insertions(+), 1232 deletions(-)

diff --git a/include/finufft/spreadinterp.h b/include/finufft/spreadinterp.h
index 78ecf9f22..7dbdd4cf1 100644
--- a/include/finufft/spreadinterp.h
+++ b/include/finufft/spreadinterp.h
@@ -40,15 +40,6 @@ FINUFFT_EXPORT int FINUFFT_CDECL spreadcheck(UBIGINT N1, UBIGINT N2, UBIGINT N3,
 FINUFFT_EXPORT int FINUFFT_CDECL indexSort(BIGINT *sort_indices, UBIGINT N1, UBIGINT N2,
                                            UBIGINT N3, UBIGINT N, FLT *kx, FLT *ky,
                                            FLT *kz, const finufft_spread_opts &opts);
-FINUFFT_EXPORT int FINUFFT_CDECL interpSorted(
-    const BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3,
-    FLT *FINUFFT_RESTRICT data_uniform, UBIGINT N, FLT *FINUFFT_RESTRICT kx,
-    FLT *FINUFFT_RESTRICT ky, FLT *FINUFFT_RESTRICT kz,
-    FLT *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts);
-FINUFFT_EXPORT int FINUFFT_CDECL spreadSorted(
-    const BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, FLT *data_uniform,
-    UBIGINT N, FLT *kx, FLT *ky, FLT *kz, const FLT *data_nonuniform,
-    const finufft_spread_opts &opts, int did_sort);
 FINUFFT_EXPORT int FINUFFT_CDECL spreadinterpSorted(
     const BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3,
     FLT *FINUFFT_RESTRICT data_uniform, UBIGINT N, FLT *FINUFFT_RESTRICT kx,
diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp
index 12327c2d6..eff98d05c 100644
--- a/src/spreadinterp.cpp
+++ b/src/spreadinterp.cpp
@@ -23,25 +23,132 @@ namespace finufft::spreadinterp {
 
 namespace { // anonymous namespace for internal structs equivalent to declaring everything
 // static
-struct zip_low;
-struct zip_hi;
-template<unsigned cap> struct reverse_index;
-template<unsigned cap> struct shuffle_index;
-struct select_even;
-struct select_odd;
-// forward declaration to clean up the code and be able to use this everywhere in the file
-template<class T, uint8_t N, uint8_t K> static constexpr auto BestSIMDHelper();
-template<class T, uint8_t ns> constexpr auto GetPaddedSIMDWidth();
+struct zip_low {
+  // helper struct to get the lower half of a SIMD register and zip it with itself
+  // it returns index 0, 0, 1, 1, ... N/2, N/2
+  static constexpr unsigned get(unsigned index, unsigned /*size*/) { return index / 2; }
+};
+struct zip_hi {
+  // helper struct to get the upper half of a SIMD register and zip it with itself
+  // it returns index N/2, N/2, N/2+1, N/2+1, ... N-1, N-1
+  static constexpr unsigned get(unsigned index, unsigned size) {
+    return (size + index) / 2;
+  }
+};
+template<unsigned cap> struct reverse_index {
+  static constexpr unsigned get(unsigned index, const unsigned size) {
+    return index < cap ? (cap - 1 - index) : index;
+  }
+};
+template<unsigned cap> struct shuffle_index {
+  static constexpr unsigned get(unsigned index, const unsigned size) {
+    return index < cap ? (cap - 1 - index) : size + size + cap - 1 - index;
+  }
+};
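+// (Editor's illustration, not part of the upstream patch: for an 8-lane batch,
+//  i.e. size = 8, zip_low yields lane indices 0,0,1,1,2,2,3,3 and zip_hi yields
+//  4,4,5,5,6,6,7,7, so swizzling a kernel vector k0..k7 duplicates each weight
+//  for the interleaved real/imag pairs; reverse_index<4> maps lanes 0,1,2,3 to
+//  3,2,1,0 and leaves lanes 4..7 unchanged.)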
+struct select_even {
+  static constexpr unsigned get(unsigned index, unsigned /*size*/) { return index * 2; }
+};
+struct select_odd {
+  static constexpr unsigned get(unsigned index, unsigned /*size*/) {
+    return index * 2 + 1;
+  }
+};
+
+// this finds the largest SIMD instruction set that can handle N elements
+// void otherwise -> compile error
+template<class T, uint8_t N, uint8_t K> static constexpr auto BestSIMDHelper() {
+  if constexpr (N % K == 0) { // returns void in the worst case
+    return xsimd::make_sized_batch<T, K>{};
+  } else {
+    return BestSIMDHelper<T, N, (K >> 1)>();
+  }
+}
+template<class T, uint8_t N = 1> constexpr uint8_t min_simd_width() {
+  // finds the smallest simd width that can handle N elements
+  // (batch size is the SIMD width in xsimd terminology)
+  if constexpr (std::is_void_v<xsimd::make_sized_batch_t<T, N>>) {
+    return min_simd_width<T, N * 2>();
+  } else {
+    return N;
+  }
+};
+
+template<class T, uint8_t N> constexpr auto find_optimal_simd_width() {
+  // finds the smallest simd width that minimizes the number of iterations
+  // NOTE: might be suboptimal for some cases, 2^N+1 for example
+  // in the future we might want to implement a more sophisticated algorithm
+  uint8_t optimal_simd_width = min_simd_width<T>();
+  uint8_t min_iterations = (N + optimal_simd_width - 1) / optimal_simd_width;
+  for (uint8_t simd_width = optimal_simd_width;
+       simd_width <= xsimd::batch<T, xsimd::best_arch>::size;
+       simd_width *= 2) {
+    uint8_t iterations = (N + simd_width - 1) / simd_width;
+    if (iterations < min_iterations) {
+      min_iterations = iterations;
+      optimal_simd_width = simd_width;
+    }
+  }
+  return optimal_simd_width;
+}
+
+template<class T, uint8_t ns> constexpr auto GetPaddedSIMDWidth() {
+  // helper function to get the SIMD width with padding for the given number of elements
+  // that minimizes the number of iterations
+  return xsimd::make_sized_batch<T, find_optimal_simd_width<T, ns>()>::type::size;
+}
 template<class T, uint8_t ns>
 using PaddedSIMD = typename xsimd::make_sized_batch<T, GetPaddedSIMDWidth<T, ns>()>::type;
-template<class T> uint8_t get_padding(uint8_t ns);
-template<class T, uint8_t ns> constexpr auto get_padding();
+template<class T, uint8_t ns> constexpr auto get_padding() {
+  // helper function to get the padding for the given number of elements
+  // ns is known at compile time; round ns up to the next multiple of the SIMD width,
+  // then subtract ns to get the padding, using a bitwise-AND trick
+  // WARNING: this trick works only for powers of 2
+  // SOURCE: Agner Fog's VCL manual
+  constexpr uint8_t width = GetPaddedSIMDWidth<T, ns>();
+  return ((ns + width - 1) & (-width)) - ns;
+}
+
+template<class T, uint8_t ns> constexpr auto get_padding_helper(uint8_t runtime_ns) {
+  // helper function to get the padding for the given number of elements where ns is
+  // known at runtime; it uses recursion to find the padding
+  // this avoids having a function with a large number of switch cases,
+  // as GetPaddedSIMDWidth requires a compile-time value
+  // it cannot be a lambda function because of the template recursion
+  if constexpr (ns < 2) {
+    return 0;
+  } else {
+    if (runtime_ns == ns) {
+      return get_padding<T, ns>();
+    } else {
+      return get_padding_helper<T, ns - 1>(runtime_ns);
+    }
+  }
+}
+
+template<class T> uint8_t get_padding(uint8_t ns) {
+  // return the padding as a function of the number of elements
+  // 2 * MAX_NSPREAD is the maximum number of elements that we can have,
+  // that's why it is hardcoded here
+  return get_padding_helper<T, 2 * MAX_NSPREAD>(ns);
+}
 template<class T, uint8_t N>
 using BestSIMD = typename decltype(BestSIMDHelper<T, N, xsimd::batch<T>::size>())::type;
-template<class T, uint8_t N = 1> constexpr uint8_t min_simd_width();
-template<class T, uint8_t N> constexpr auto find_optimal_simd_width();
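+// (Editor's worked example, assuming AVX2 where a double batch has 4 lanes:
+//  for ns=7, min_simd_width<double>() = 2, and find_optimal_simd_width<double, 7>()
+//  picks width 4 (2 iterations instead of 4), so GetPaddedSIMDWidth<double, 7>() = 4
+//  and get_padding<double, 7>() = ((7 + 3) & -4) - 7 = 1, i.e. kernel values are
+//  evaluated at 8 = 7 + 1 points so they can be processed in whole batches.)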
+template<class T, class V, size_t... Is>
+constexpr T generate_sequence_impl(V a, V b, index_sequence<Is...>) noexcept {
+  // utility function to generate a sequence of a, b interleaved as function arguments
+  return T(((Is % 2 == 0) ? a : b)...);
+}
+
 template<class T, class V>
-constexpr auto initialize_complex_register(V a, V b) noexcept;
+constexpr auto initialize_complex_register(V a, V b) noexcept {
+  // populates a SIMD register with a and b interleaved
+  // for example:
+  // +-------------------------------+
+  // | a | b | a | b | a | b | a | b |
+  // +-------------------------------+
+  // it uses index_sequence to generate the sequence of a, b at compile time
+  return generate_sequence_impl<T>(a, b, std::make_index_sequence<T::size>{});
+}
 template<class arch_t>
 constexpr auto zip_low_index =
     xsimd::make_batch_constant<xsimd::as_unsigned_integer_t<FLT>, arch_t, zip_low>();
@@ -56,1013 +163,419 @@ constexpr auto select_odd_mask =
     xsimd::make_batch_constant<xsimd::as_unsigned_integer_t<FLT>, arch_t, select_odd>();
 template<class T, size_t N, size_t M, size_t PaddedM>
 constexpr std::array<std::array<T, PaddedM>, N> pad_2D_array_with_zeros(
-    const std::array<std::array<T, M>, N> &input) noexcept;
-template<class T> FINUFFT_ALWAYS_INLINE auto xsimd_to_array(const T &vec) noexcept;
+    const std::array<std::array<T, M>, N> &input) noexcept {
+  constexpr auto pad_with_zeros = [](const auto &input) constexpr noexcept {
+    std::array<T, PaddedM> padded{0};
+    for (auto i = 0; i < input.size(); ++i) {
+      padded[i] = input[i];
+    }
+    return padded;
+  };
+  std::array<std::array<T, PaddedM>, N> output{};
+  for (std::size_t i = 0; i < N; ++i) {
+    output[i] = pad_with_zeros(input[i]);
+  }
+  return output;
+}
+
+template<class T> FINUFFT_ALWAYS_INLINE auto xsimd_to_array(const T &vec) noexcept {
+  constexpr auto alignment = T::arch_type::alignment();
+  alignas(alignment) std::array<typename T::value_type, T::size> array{};
+  vec.store_aligned(array.data());
+  return array;
+}
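+// (Editor's usage sketch, not part of the upstream patch; the spread/interp
+//  kernels later in this file combine the helpers above roughly as:
+//    using simd_t = PaddedSIMD<FLT, 2 * 7>;                  // batch for ns=7 pairs
+//    auto reg = initialize_complex_register<simd_t>(re, im); // |re|im|re|im|...
+//    auto arr = xsimd_to_array(reg);                         // lanes -> std::array
+//  so one register carries a complex strength and can be spilled for scalar sums.)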
 FINUFFT_NEVER_INLINE
 void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset3,
                         UBIGINT padded_size1, UBIGINT size1, UBIGINT size2, UBIGINT size3,
-                        UBIGINT M0);
+                        UBIGINT M0) {
+  printf("size1 %ld, padded_size1 %ld\n", size1, padded_size1);
+  switch (ndims) {
+  case 1:
+    printf("\tsubgrid: off %lld\t siz %lld\t #NU %lld\n", (long long)offset1,
+           (long long)padded_size1, (long long)M0);
+    break;
+  case 2:
+    printf("\tsubgrid: off %lld,%lld\t siz %lld,%lld\t #NU %lld\n", (long long)offset1,
+           (long long)offset2, (long long)padded_size1, (long long)size2, (long long)M0);
+    break;
+  case 3:
+    printf("\tsubgrid: off %lld,%lld,%lld\t siz %lld,%lld,%lld\t #NU %lld\n",
+           (long long)offset1, (long long)offset2, (long long)offset3,
+           (long long)padded_size1, (long long)size2, (long long)size3, (long long)M0);
+    break;
+  default:
+    printf("Invalid number of dimensions: %d\n", ndims);
+    break;
+  }
+}
 } // namespace

 // declarations of purely internal functions... (thus need not be in .h)
-template<uint8_t ns, bool kerevalmeth, class T,
-         class simd_type = xsimd::make_sized_batch_t<T, find_optimal_simd_width<T, ns>()>,
-         typename... V>
-static FINUFFT_ALWAYS_INLINE auto ker_eval(FLT *FINUFFT_RESTRICT ker,
-                                           const finufft_spread_opts &opts,
-                                           const V... elems) noexcept;
-static FINUFFT_ALWAYS_INLINE FLT fold_rescale(FLT x, UBIGINT N) noexcept;
+
+/* local NU coord fold+rescale macro: does the following affine transform to x:
+     (x+PI) mod 2PI, then rescaled to [0,N)
+   Note: folding big numbers can cause numerical inaccuracies
+   Martin Reinecke, 8.5.2024: used floor to speed up the function and removed the
+   range limitation. Marco Barbone, 8.5.2024: changed it from a macro to an inline
+   function.
+*/
+static FINUFFT_ALWAYS_INLINE FLT fold_rescale(const FLT x, const UBIGINT N) noexcept {
+  static constexpr const FLT x2pi = FLT(M_1_2PI);
+  const FLT result = x * x2pi + FLT(0.5);
+  return (result - floor(result)) * FLT(N);
+}
+
 template<class simd_type>
-FINUFFT_ALWAYS_INLINE static simd_type fold_rescale(const simd_type &x,
-                                                    UBIGINT N) noexcept;
-static FINUFFT_ALWAYS_INLINE void set_kernel_args(
-    FLT *args, FLT x, const finufft_spread_opts &opts) noexcept;
-static FINUFFT_ALWAYS_INLINE void evaluate_kernel_vector(
-    FLT *ker, FLT *args, const finufft_spread_opts &opts) noexcept;
+static FINUFFT_ALWAYS_INLINE simd_type fold_rescale(const simd_type &x,
+                                                    const BIGINT N) noexcept {
+  const simd_type x2pi = FLT(M_1_2PI);
+  const simd_type result = xsimd::fma(x, x2pi, simd_type(0.5));
+  return (result - xsimd::floor(result)) * simd_type(FLT(N));
+}
+template<uint8_t ns>
+static void set_kernel_args(FLT *args, FLT x) noexcept
+// Fills vector args[] with kernel arguments x, x+1, ..., x+ns-1.
+// needed for the vectorized kernel eval of Ludvig af K.
+{
+  for (int i = 0; i < ns; i++) args[i] = x + (FLT)i;
+}
+template<uint8_t N>
+static void evaluate_kernel_vector(FLT *ker, FLT *args,
+                                   const finufft_spread_opts &opts) noexcept
+/* Evaluate ES kernel for a vector of N arguments; by Ludvig af K.
+   If opts.kerpad true, args and ker must be allocated for Npad, and args is
+   written to (to pad to length Npad), only first N outputs are correct.
+   Barnett 4/24/18 option to pad to mult of 4 for better SIMD vectorization.
+   Rescaled so max is 1, Barnett 7/21/24
+
+   Obsolete (replaced by Horner), but keep around for experimentation since
+   works for arbitrary beta. Formula must match reference implementation.
+*/
+{
+  FLT b = (FLT)opts.ES_beta;
+  FLT c = (FLT)opts.ES_c;
+  if (!(opts.flags & TF_OMIT_EVALUATE_KERNEL)) {
+    // Note (by Ludvig af K): Splitting kernel evaluation into two loops
+    // seems to benefit auto-vectorization.
+    // gcc 5.4 vectorizes first loop; gcc 7.2 vectorizes both loops
+    int Npad = N;
+    if (opts.kerpad) {               // since always same branch, no speed hit
+      Npad = 4 * (1 + (N - 1) / 4);  // pad N to mult of 4; help i7 GCC, not xeon
+      for (int i = N; i < Npad; ++i) // pad with 1-3 zeros for safe eval
+        args[i] = 0.0;
+    }
+    for (int i = 0; i < Npad; i++) { // Loop 1: Compute exponential arguments
+      // care! 1.0 is double...
+      ker[i] = b * (sqrt((FLT)1.0 - c * args[i] * args[i]) - (FLT)1.0);
+    }
+    if (!(opts.flags & TF_OMIT_EVALUATE_EXPONENTIAL))
+      for (int i = 0; i < Npad; i++) // Loop 2: Compute exponentials
+        ker[i] = exp(ker[i]);
+    if (opts.kerpad) {
+      // padded part should be zero, in spread_subproblem_nd_kernels, there are
+      // out of bound writes to trg arrays
+      for (int i = N; i < Npad; ++i) ker[i] = 0.0;
+    }
+  } else {
+    for (int i = 0; i < N; i++) // dummy for timing only
+      ker[i] = 1.0;
+  }
+  // Separate check from arithmetic (Is this really needed? 
doesn't slow down) + for (int i = 0; i < N; i++) + if (abs(args[i]) >= (FLT)opts.ES_halfwidth) ker[i] = 0.0; +} +// static FINUFFT_ALWAYS_INLINE void set_kernel_args( +// FLT *args, FLT x, const finufft_spread_opts &opts) noexcept; template()>> // aka ns static FINUFFT_ALWAYS_INLINE void eval_kernel_vec_Horner( - FLT *FINUFFT_RESTRICT ker, FLT x, const finufft_spread_opts &opts) noexcept; -template> -static void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker, - BIGINT i1, UBIGINT N1); -template> -static void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, - const FLT *ker2, BIGINT i1, BIGINT i2, UBIGINT N1, UBIGINT N2); -template> -static void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, - const FLT *ker2, const FLT *ker3, BIGINT i1, BIGINT i2, BIGINT i3, - UBIGINT N1, UBIGINT N2, UBIGINT N3); -static void spread_subproblem_1d(BIGINT off1, UBIGINT size1, FLT *du0, UBIGINT M0, - FLT *kx0, FLT *dd0, - const finufft_spread_opts &opts) noexcept; -static void spread_subproblem_2d(BIGINT off1, BIGINT off2, UBIGINT size1, UBIGINT size2, - FLT *FINUFFT_RESTRICT du, UBIGINT M, const FLT *kx, - const FLT *ky, const FLT *dd, - const finufft_spread_opts &opts) noexcept; -static void spread_subproblem_3d(BIGINT off1, BIGINT off2, BIGINT off3, UBIGINT size1, - UBIGINT size2, UBIGINT size3, FLT *du0, UBIGINT M0, - FLT *kx0, FLT *ky0, FLT *kz0, FLT *dd0, - const finufft_spread_opts &opts) noexcept; -template -static void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3, - UBIGINT padded_size1, UBIGINT size1, UBIGINT size2, - UBIGINT size3, UBIGINT N1, UBIGINT N2, UBIGINT N3, - FLT *FINUFFT_RESTRICT data_uniform, const FLT *du0); -static void bin_sort_singlethread(BIGINT *ret, UBIGINT M, const FLT *kx, const FLT *ky, - const FLT *kz, UBIGINT N1, UBIGINT N2, UBIGINT N3, - double bin_size_x, double bin_size_y, double bin_size_z, - int debug); -void bin_sort_multithread(BIGINT *ret, UBIGINT M, FLT *kx, FLT *ky, FLT *kz, UBIGINT N1, - UBIGINT N2, UBIGINT N3, double bin_size_x, double bin_size_y, - double bin_size_z, int debug, int nthr); -static void get_subgrid(BIGINT &offset1, BIGINT &offset2, BIGINT &offset3, - BIGINT &padded_size1, BIGINT &size1, BIGINT &size2, BIGINT &size3, - UBIGINT M0, FLT *kx0, FLT *ky0, FLT *kz0, int ns, int ndims); + FLT *FINUFFT_RESTRICT ker, FLT x, const finufft_spread_opts &opts) noexcept +/* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at +x_j = x + j, for j=0,..,w-1. Thus x in [-w/2,-w/2+1]. w is aka ns. +This is the current evaluation method, since it's faster (except i7 w=16). +Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ -// ========================================================================== -int spreadinterp(UBIGINT N1, UBIGINT N2, UBIGINT N3, FLT *data_uniform, UBIGINT M, - FLT *kx, FLT *ky, FLT *kz, FLT *data_nonuniform, - const finufft_spread_opts &opts) -/* ------------Spreader/interpolator for 1, 2, or 3 dimensions -------------- - If opts.spread_direction=1, evaluate, in the 1D case, - - N1-1 - data_nonuniform[j] = SUM phi(kx[j] - n) data_uniform[n], for j=0...M-1 - n=0 - - If opts.spread_direction=2, evaluate its transpose, in the 1D case, - - M-1 - data_uniform[n] = SUM phi(kx[j] - n) data_nonuniform[j], for n=0...N1-1 - j=0 - - In each case phi is the spreading kernel, which has support - [-opts.nspread/2,opts.nspread/2]. In 2D or 3D, the generalization with - product of 1D kernels is performed. 
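   (Editor's illustration, added in review, not upstream text: in 2D the weight
   applied between NU point j and grid node (n1,n2) is the separable product
   phi(kx[j]-n1)*phi(ky[j]-n2); in 3D a third factor phi(kz[j]-n3) enters. This
   is what the "product of 1D kernels" above means.)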
- For 1D set N2=N3=1; for 2D set N3=1; for 3D set N1,N2,N3>1. - - Notes: - No particular normalization of the spreading kernel is assumed. - Uniform (U) points are centered at coords - [0,1,...,N1-1] in 1D, analogously in 2D and 3D. They are stored in x - fastest, y medium, z slowest ordering, up to however many - dimensions are relevant; note that this is Fortran-style ordering for an - array f(x,y,z), but C style for f[z][y][x]. This is to match the Fortran - interface of the original CMCL libraries. - Non-uniform (NU) points kx,ky,kz are real, and may lie in the central three - periods in each coordinate (these are folded into the central period). - The finufft_spread_opts struct must have been set up already by calling setup_kernel. - It is assumed that 2*opts.nspread < min(N1,N2,N3), so that the kernel - only ever wraps once when falls below 0 or off the top of a uniform grid - dimension. +{ + // scale so local grid offset z in[-1,1] + const FLT z = std::fma(FLT(2.0), x, FLT(w - 1)); + using arch_t = typename simd_type::arch_type; + static constexpr auto alignment = arch_t::alignment(); + static constexpr auto simd_size = simd_type::size; + static constexpr auto padded_ns = (w + simd_size - 1) & ~(simd_size - 1); + static constexpr auto horner_coeffs = []() constexpr noexcept { + if constexpr (upsampfact == 200) { + return get_horner_coeffs_200(); + } else if constexpr (upsampfact == 125) { + return get_horner_coeffs_125(); + } + }(); + static constexpr auto nc = horner_coeffs.size(); + static constexpr auto use_ker_sym = (simd_size < w); - Inputs: - N1,N2,N3 - grid sizes in x (fastest), y (medium), z (slowest) respectively. - If N2==1, 1D spreading is done. If N3==1, 2D spreading. - Otherwise, 3D. - M - number of NU pts. - kx, ky, kz - length-M real arrays of NU point coordinates (only kx read in - 1D, only kx and ky read in 2D). + alignas(alignment) static constexpr auto padded_coeffs = + pad_2D_array_with_zeros(horner_coeffs); - These should lie in the box -pi<=kx<=pi. Points outside this domain are also - correctly folded back into this domain. - opts - spread/interp options struct, documented in ../include/finufft_spread_opts.h + // use kernel symmetry trick if w > simd_size + if constexpr (use_ker_sym) { + static constexpr uint8_t tail = w % simd_size; + static constexpr uint8_t if_odd_degree = ((nc + 1) % 2); + static constexpr uint8_t offset_start = tail ? w - tail : w - simd_size; + static constexpr uint8_t end_idx = (w + (tail > 0)) / 2; + const simd_type zv{z}; + const auto z2v = zv * zv; - Inputs/Outputs: - data_uniform - output values on grid (dir=1) OR input grid data (dir=2) - data_nonuniform - input strengths of the sources (dir=1) - OR output values at targets (dir=2) - Returned value: - 0 indicates success; other values have meanings in ../docs/error.rst, with - following modifications: - 3 : one or more non-trivial box dimensions is less than 2.nspread. - 5 : failed allocate sort indices + // some xsimd constant for shuffle or inverse + static constexpr auto shuffle_batch = []() constexpr noexcept { + if constexpr (tail) { + return xsimd::make_batch_constant, arch_t, + shuffle_index>(); + } else { + return xsimd::make_batch_constant, arch_t, + reverse_index>(); + } + }(); - Magland Dec 2016. Barnett openmp version, many speedups 1/16/17-2/16/17 - error codes 3/13/17. pirange 3/28/17. Rewritten 6/15/17. parallel sort 2/9/18 - No separate subprob indices in t-1 2/11/18. 
- sort_threads (since for M<= end_idx) { + if constexpr (tail) { + // to use aligned store, we need shuffle the previous k_sym and current k_sym + k_prev = k_sym; + k_sym = xsimd::fnma(k_odd, zv, k_even); + xsimd::shuffle(k_sym, k_prev, shuffle_batch).store_aligned(ker + offset); + } else { + xsimd::swizzle(xsimd::fnma(k_odd, zv, k_even), shuffle_batch) + .store_aligned(ker + offset); + } + } + } + } else { + const simd_type zv(z); + for (uint8_t i = 0; i < w; i += simd_size) { + auto k = simd_type::load_aligned(padded_coeffs[0].data() + i); + for (uint8_t j = 1; j < nc; ++j) { + const auto cji = simd_type::load_aligned(padded_coeffs[j].data() + i); + k = xsimd::fma(k, zv, cji); + } + k.store_aligned(ker + i); + } } - int did_sort = indexSort(sort_indices, N1, N2, N3, M, kx, ky, kz, opts); - spreadinterpSorted(sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, - data_nonuniform, opts, did_sort); - free(sort_indices); - return 0; -} - -static constexpr uint8_t ndims_from_Ns(const UBIGINT N1, const UBIGINT N2, - const UBIGINT N3) -/* rule for getting number of spreading dimensions from the list of Ns per dim. - Split out, Barnett 7/26/18 -*/ -{ - return 1 + (N2 > 1) + (N3 > 1); } -int spreadcheck(UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT M, FLT *kx, FLT *ky, FLT *kz, - const finufft_spread_opts &opts) -/* This does just the input checking and reporting for the spreader. - See spreadinterp() for input arguments and meaning of returned value. - Split out by Melody Shih, Jun 2018. Finiteness chk Barnett 7/30/18. - Marco Barbone 5.8.24 removed bounds check as new foldrescale is not limited to - [-3pi,3pi) -*/ -{ - // INPUT CHECKING & REPORTING .... cuboid not too small for spreading? - int minN = 2 * opts.nspread; - if (N1 < minN || (N2 > 1 && N2 < minN) || (N3 > 1 && N3 < minN)) { - fprintf(stderr, - "%s error: one or more non-trivial box dims is less than 2.nspread!\n", - __func__); - return FINUFFT_ERR_SPREAD_BOX_SMALL; - } - if (opts.spread_direction != 1 && opts.spread_direction != 2) { - fprintf(stderr, "%s error: opts.spread_direction must be 1 or 2!\n", __func__); - return FINUFFT_ERR_SPREAD_DIR; +template +static void interp_line_wrap(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker, + const BIGINT i1, const UBIGINT N1) { + /* This function is called when the kernel wraps around the grid. It is + slower than interp_line. + M. 
Barbone July 2024: - moved the logic to a separate function + - using fused multiply-add (fma) for better performance + */ + std::array out{0}; + BIGINT j = i1; + if (i1 < 0) { // wraps at left + j += BIGINT(N1); + for (uint8_t dx = 0; dx < -i1; ++dx, ++j) { + out[0] = std::fma(du[2 * j], ker[dx], out[0]); + out[1] = std::fma(du[2 * j + 1], ker[dx], out[1]); + } + j -= BIGINT(N1); + for (uint8_t dx = -i1; dx < ns; ++dx, ++j) { + out[0] = std::fma(du[2 * j], ker[dx], out[0]); + out[1] = std::fma(du[2 * j + 1], ker[dx], out[1]); + } + } else if (i1 + ns >= N1) { // wraps at right + for (uint8_t dx = 0; dx < N1 - i1; ++dx, ++j) { + out[0] = std::fma(du[2 * j], ker[dx], out[0]); + out[1] = std::fma(du[2 * j + 1], ker[dx], out[1]); + } + j -= BIGINT(N1); + for (uint8_t dx = N1 - i1; dx < ns; ++dx, ++j) { + out[0] = std::fma(du[2 * j], ker[dx], out[0]); + out[1] = std::fma(du[2 * j + 1], ker[dx], out[1]); + } + } else { + // padding is okay for ker, but it might spill over du array + // so this checks for that case and does not explicitly vectorize + for (uint8_t dx = 0; dx < ns; ++dx, ++j) { + out[0] = std::fma(du[2 * j], ker[dx], out[0]); + out[1] = std::fma(du[2 * j + 1], ker[dx], out[1]); + } } - return 0; + target[0] = out[0]; + target[1] = out[1]; } -int indexSort(BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT M, - FLT *kx, FLT *ky, FLT *kz, const finufft_spread_opts &opts) -/* This makes a decision whether or not to sort the NU pts (influenced by - opts.sort), and if yes, calls either single- or multi-threaded bin sort, - writing reordered index list to sort_indices. If decided not to sort, the - identity permutation is written to sort_indices. - The permutation is designed to make RAM access close to contiguous, to - speed up spreading/interpolation, in the case of disordered NU points. - +template> +static void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker, + BIGINT i1, UBIGINT N1) { + /* 1D interpolate complex values from size-ns block of the du (uniform grid + data) array to a single complex output value "target", using as weights the + 1d kernel evaluation list ker1. Inputs: - M - number of input NU points. - kx,ky,kz - length-M arrays of real coords of NU pts. Domain is [-pi, pi), - points outside are folded in. - (only kz used in 1D, only kx and ky used in 2D.) - N1,N2,N3 - integer sizes of overall box (set N2=N3=1 for 1D, N3=1 for 2D). - 1 = x (fastest), 2 = y (medium), 3 = z (slowest). - opts - spreading options struct, see ../include/finufft_spread_opts.h + du : input regular grid of size 2*N1 (alternating real,imag) + ker1 : length-ns real array of 1d kernel evaluations + i1 : start (left-most) x-coord index to read du from, where the indices + of du run from 0 to N1-1, and indices outside that range are wrapped. + ns : kernel width (must be <=MAX_NSPREAD) Outputs: - sort_indices - a good permutation of NU points. (User must preallocate - to length M.) Ie, kx[sort_indices[j]], j=0,..,M-1, is a good - ordering for the x-coords of NU pts, etc. - returned value - whether a sort was done (1) or not (0). + target : size 2 array (containing real,imag) of interpolated output - Barnett 2017; split out by Melody Shih, Jun 2018. Barnett nthr logic 2024. -*/ -{ - CNTime timer{}; - uint8_t ndims = ndims_from_Ns(N1, N2, N3); - auto N = N1 * N2 * N3; // U grid (periodic box) sizes + Periodic wrapping in the du array is applied, assuming N1>=ns. + Internally, dx indices into ker array j is index in complex du array. + Barnett 6/16/17. + M. 
Barbone July 2024: - moved wrapping logic to interp_line_wrap + - using explicit SIMD vectorization to overcome the out[2] array + limitation +*/ + using arch_t = typename simd_type::arch_type; + static constexpr auto padding = get_padding(); + static constexpr auto alignment = arch_t::alignment(); + static constexpr auto simd_size = simd_type::size; + static constexpr auto regular_part = (2 * ns + padding) & (-(2 * simd_size)); + std::array out{0}; + const auto j = i1; + // removing the wrapping leads up to 10% speedup in certain cases + // moved the wrapping to another function to reduce instruction cache pressure + if (i1 < 0 || i1 + ns >= N1 || i1 + ns + (padding + 1) / 2 >= N1) { + return interp_line_wrap(target, du, ker, i1, N1); + } else { // doesn't wrap + // logic largely similar to spread 1D kernel, please see the explanation there + // for the first part of this code + const auto res = [du, j, ker]() constexpr noexcept { + const auto du_ptr = du + 2 * j; + simd_type res_low{0}, res_hi{0}; + for (uint8_t dx{0}; dx < regular_part; dx += 2 * simd_size) { + const auto ker_v = simd_type::load_aligned(ker + dx / 2); + const auto du_pt0 = simd_type::load_unaligned(du_ptr + dx); + const auto du_pt1 = simd_type::load_unaligned(du_ptr + dx + simd_size); + const auto ker0low = xsimd::swizzle(ker_v, zip_low_index); + const auto ker0hi = xsimd::swizzle(ker_v, zip_hi_index); + res_low = xsimd::fma(ker0low, du_pt0, res_low); + res_hi = xsimd::fma(ker0hi, du_pt1, res_hi); + } - // heuristic binning box size for U grid... affects performance: - double bin_size_x = 16, bin_size_y = 4, bin_size_z = 4; - // put in heuristics based on cache sizes (only useful for single-thread) ? + if constexpr (regular_part < 2 * ns) { + const auto ker0 = simd_type::load_unaligned(ker + (regular_part / 2)); + const auto du_pt = simd_type::load_unaligned(du_ptr + regular_part); + const auto ker0low = xsimd::swizzle(ker0, zip_low_index); + res_low = xsimd::fma(ker0low, du_pt, res_low); + } - int better_to_sort = - !(ndims == 1 && (opts.spread_direction == 2 || (M > 1000 * N1))); // 1D small-N or - // dir=2 case: - // don't sort + // This does a horizontal sum using a loop instead of relying on SIMD instructions + // this is faster than the code below but less elegant. + // lambdas here to limit the scope of temporary variables and have the compiler + // optimize the code better + return res_low + res_hi; + }(); + const auto res_array = xsimd_to_array(res); + for (uint8_t i{0}; i < simd_size; i += 2) { + out[0] += res_array[i]; + out[1] += res_array[i + 1]; + } + // this is where the code differs from spread_kernel, the interpolator does an extra + // reduction step to SIMD elements down to 2 elements + // This is known as horizontal sum in SIMD terminology - timer.start(); // if needed, sort all the NU pts... - int did_sort = 0; - auto maxnthr = MY_OMP_GET_MAX_THREADS(); // used if both below opts default - if (opts.nthreads > 0) - maxnthr = opts.nthreads; // user nthreads overrides, without limit - if (opts.sort_threads > 0) - maxnthr = opts.sort_threads; // high-priority override, also no limit - // At this point: maxnthr = the max threads sorting could use - // (we don't print warning here, since: no showwarn in spread_opts, and finufft - // already warned about it. spreadinterp-only advanced users will miss a warning) - if (opts.sort == 1 || (opts.sort == 2 && better_to_sort)) { - // store a good permutation ordering of all NU pts (dim=1,2 or 3) - int sort_debug = (opts.debug >= 2); // show timing output? 
- int sort_nthr = opts.sort_threads; // 0, or user max # threads for sort -#ifndef _OPENMP - sort_nthr = 1; // if single-threaded lib, override user -#endif - if (sort_nthr == 0) // multithreaded auto choice: when N>>M, one thread is better! - sort_nthr = (10 * M > N) ? maxnthr : 1; // heuristic - if (sort_nthr == 1) - bin_sort_singlethread(sort_indices, M, kx, ky, kz, N1, N2, N3, bin_size_x, - bin_size_y, bin_size_z, sort_debug); - else // sort_nthr>1, user fixes # threads (>=2) - bin_sort_multithread(sort_indices, M, kx, ky, kz, N1, N2, N3, bin_size_x, - bin_size_y, bin_size_z, sort_debug, sort_nthr); - if (opts.debug) - printf("\tsorted (%d threads):\t%.3g s\n", sort_nthr, timer.elapsedsec()); - did_sort = 1; - } else { -#pragma omp parallel for num_threads(maxnthr) schedule(static, 1000000) - for (BIGINT i = 0; i < M; i++) // here omp helps xeon, hinders i7 - sort_indices[i] = i; // the identity permutation - if (opts.debug) - printf("\tnot sorted (sort=%d): \t%.3g s\n", (int)opts.sort, timer.elapsedsec()); + // This does a horizontal sum using vector instruction, + // is slower than summing and looping + // clang-format off + // const auto res_real = xsimd::shuffle(res_low, res_hi, select_even_mask); + // const auto res_imag = xsimd::shuffle(res_low, res_hi, select_odd_mask); + // out[0] = xsimd::reduce_add(res_real); + // out[1] = xsimd::reduce_add(res_imag); + // clang-format on } - return did_sort; -} - -int spreadinterpSorted(const BIGINT *sort_indices, const UBIGINT N1, const UBIGINT N2, - const UBIGINT N3, FLT *data_uniform, const UBIGINT M, - FLT *FINUFFT_RESTRICT kx, FLT *FINUFFT_RESTRICT ky, - FLT *FINUFFT_RESTRICT kz, FLT *FINUFFT_RESTRICT data_nonuniform, - const finufft_spread_opts &opts, int did_sort) -/* Logic to select the main spreading (dir=1) vs interpolation (dir=2) routine. - See spreadinterp() above for inputs arguments and definitions. - Return value should always be 0 (no error reporting). - Split out by Melody Shih, Jun 2018; renamed Barnett 5/20/20. -*/ -{ - if (opts.spread_direction == 1) // ========= direction 1 (spreading) ======= - spreadSorted(sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, data_nonuniform, - opts, did_sort); - - else // ================= direction 2 (interpolation) =========== - interpSorted(sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, data_nonuniform, - opts); - - return 0; + target[0] = out[0]; + target[1] = out[1]; } -// -------------------------------------------------------------------------- -int spreadSorted(const BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, - FLT *FINUFFT_RESTRICT data_uniform, UBIGINT M, FLT *FINUFFT_RESTRICT kx, - FLT *FINUFFT_RESTRICT ky, FLT *FINUFFT_RESTRICT kz, - const FLT *data_nonuniform, const finufft_spread_opts &opts, - int did_sort) -// Spread NU pts in sorted order to a uniform grid. See spreadinterp() for doc. -{ - CNTime timer{}; - const auto ndims = ndims_from_Ns(N1, N2, N3); - const auto N = N1 * N2 * N3; // output array size - const auto ns = opts.nspread; // abbrev. 
for w, kernel width - auto nthr = MY_OMP_GET_MAX_THREADS(); // guess # threads to use to spread - if (opts.nthreads > 0) nthr = opts.nthreads; // user override, now without limit -#ifndef _OPENMP - nthr = 1; // single-threaded lib must override user -#endif - if (opts.debug) - printf("\tspread %dD (M=%lld; N1=%lld,N2=%lld,N3=%lld), nthr=%d\n", ndims, - (long long)M, (long long)N1, (long long)N2, (long long)N3, nthr); - timer.start(); - std::fill(data_uniform, data_uniform + 2 * N, 0.0); // zero the output array - if (opts.debug) printf("\tzero output array\t%.3g s\n", timer.elapsedsec()); - if (M == 0) // no NU pts, we're done - return 0; - - auto spread_single = (nthr == 1) || (M * 100 < N); // low-density heuristic? - spread_single = false; // for now - timer.start(); - if (spread_single) { // ------- Basic single-core t1 spreading ------ - for (UBIGINT j = 0; j < M; j++) { - // *** todo, not urgent - // ... (question is: will the index wrapping per NU pt slow it down?) +template +static void interp_square_wrap(FLT *FINUFFT_RESTRICT target, const FLT *du, + const FLT *ker1, const FLT *ker2, const BIGINT i1, + const BIGINT i2, const UBIGINT N1, const UBIGINT N2) { + /* + * This function is called when the kernel wraps around the grid. It is slower than + * the non wrapping version. + * There is an extra case for when ker is padded and spills over the du array. + * In this case uses the old non wrapping version. + */ + std::array out{0}; + using arch_t = typename simd_type::arch_type; + static constexpr auto alignment = arch_t::alignment(); + if (i1 >= 0 && i1 + ns <= N1 && i2 >= 0 && i2 + ns <= N2) { + // store a horiz line (interleaved real,imag) + alignas(alignment) std::array line{0}; + // add remaining const-y lines to the line (expensive inner loop) + for (uint8_t dy{0}; dy < ns; ++dy) { + const auto *l_ptr = du + 2 * (N1 * (i2 + dy) + i1); // (see above) + for (uint8_t l{0}; l < 2 * ns; ++l) { + line[l] = std::fma(ker2[dy], l_ptr[l], line[l]); + } } - if (opts.debug) printf("\tt1 simple spreading:\t%.3g s\n", timer.elapsedsec()); - } else { // ------- Fancy multi-core blocked t1 spreading ---- - // Splits sorted inds (jfm's advanced2), could double RAM. - // choose nb (# subprobs) via used nthreads: - auto nb = std::min((UBIGINT)nthr, M); // simply split one subprob per thr... - if (nb * (BIGINT)opts.max_subproblem_size < M) { // ...or more subprobs to cap size - nb = 1 + (M - 1) / opts.max_subproblem_size; // int div does - // ceil(M/opts.max_subproblem_size) - if (opts.debug) - printf("\tcapping subproblem sizes to max of %d\n", opts.max_subproblem_size); + // apply x kernel to the (interleaved) line and add together + for (uint8_t dx{0}; dx < ns; dx++) { + out[0] = std::fma(line[2 * dx], ker1[dx], out[0]); + out[1] = std::fma(line[2 * dx + 1], ker1[dx], out[1]); } - if (M * 1000 < N) { // low-density heuristic: one thread per NU pt! 
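      // (Editor's note, illustration: nb = M gives one single-point subproblem per
      //  NU pt, so each subgrid is only ~ns wide per dimension and the subproblem
      //  work stays O(M * ns^ndims) even when the output grid N is much larger.)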
- nb = M; - if (opts.debug) printf("\tusing low-density speed rescue nb=M...\n"); + } else { + std::array j1{}, j2{}; // 1d ptr lists + auto x = i1, y = i2; // initialize coords + for (uint8_t d{0}; d < ns; d++) { // set up ptr lists + if (x < 0) x += BIGINT(N1); + if (x >= N1) x -= BIGINT(N1); + j1[d] = x++; + if (y < 0) y += BIGINT(N2); + if (y >= N2) y -= BIGINT(N2); + j2[d] = y++; } - if (!did_sort && nthr == 1) { - nb = 1; - if (opts.debug) printf("\tunsorted nthr=1: forcing single subproblem...\n"); + for (uint8_t dy{0}; dy < ns; dy++) { // use the pts lists + const UBIGINT oy = N1 * j2[dy]; // offset due to y + for (uint8_t dx{0}; dx < ns; dx++) { + const auto k = ker1[dx] * ker2[dy]; + const UBIGINT j = oy + j1[dx]; + out[0] += du[2 * j] * k; + out[1] += du[2 * j + 1] * k; + } } - if (opts.debug && nthr > opts.atomic_threshold) - printf("\tnthr big: switching add_wrapped OMP from critical to atomic (!)\n"); + } + target[0] = out[0]; + target[1] = out[1]; +} - std::vector brk(nb + 1); // NU index breakpoints defining nb subproblems - for (int p = 0; p <= nb; ++p) brk[p] = (M * p + nb - 1) / nb; - -#pragma omp parallel num_threads(nthr) - { - // local copies of NU pts and data for each subproblem - std::vector kx0{}, ky0{}, kz0{}, dd0{}, du0{}; -#pragma omp for schedule(dynamic, 1) // each is big - for (int isub = 0; isub < nb; isub++) { // Main loop through the subproblems - const auto M0 = brk[isub + 1] - brk[isub]; // # NU pts in this subproblem - // copy the location and data vectors for the nonuniform points - kx0.resize(M0); - ky0.resize(M0 * (N2 > 1)); - kz0.resize(M0 * (N3 > 1)); - dd0.resize(2 * M0); // complex strength data - for (auto j = 0; j < M0; j++) { // todo: can avoid this copying? - const auto kk = sort_indices[j + brk[isub]]; // NU pt from subprob index list - kx0[j] = fold_rescale(kx[kk], N1); - if (N2 > 1) ky0[j] = fold_rescale(ky[kk], N2); - if (N3 > 1) kz0[j] = fold_rescale(kz[kk], N3); - dd0[j * 2] = data_nonuniform[kk * 2]; // real part - dd0[j * 2 + 1] = data_nonuniform[kk * 2 + 1]; // imag part - } - // get the subgrid which will include padding by roughly nspread/2 - // get_subgrid sets - BIGINT offset1, offset2, offset3, padded_size1, size1, size2, size3; - // sets offsets and sizes - get_subgrid(offset1, offset2, offset3, padded_size1, size1, size2, size3, M0, - kx0.data(), ky0.data(), kz0.data(), ns, ndims); - if (opts.debug > 1) { - print_subgrid_info(ndims, offset1, offset2, offset3, padded_size1, size1, size2, - size3, M0); - } - // allocate output data for this subgrid - du0.resize(2 * padded_size1 * size2 * size3); // complex - // Spread to subgrid without need for bounds checking or wrapping - if (!(opts.flags & TF_OMIT_SPREADING)) { - if (ndims == 1) - spread_subproblem_1d(offset1, padded_size1, du0.data(), M0, kx0.data(), - dd0.data(), opts); - else if (ndims == 2) - spread_subproblem_2d(offset1, offset2, padded_size1, size2, du0.data(), M0, - kx0.data(), ky0.data(), dd0.data(), opts); - else - spread_subproblem_3d(offset1, offset2, offset3, padded_size1, size2, size3, - du0.data(), M0, kx0.data(), ky0.data(), kz0.data(), - dd0.data(), opts); - } - // do the adding of subgrid to output - if (!(opts.flags & TF_OMIT_WRITE_TO_GRID)) { - if (nthr > opts.atomic_threshold) { // see above for debug reporting - add_wrapped_subgrid(offset1, offset2, offset3, padded_size1, size1, - size2, size3, N1, N2, N3, data_uniform, - du0.data()); // R Blackwell's atomic version - } else { -#pragma omp critical - add_wrapped_subgrid(offset1, offset2, offset3, 
padded_size1, size1, - size2, size3, N1, N2, N3, data_uniform, - du0.data()); - } - } - } // end main loop over subprobs - } - if (opts.debug) - printf("\tt1 fancy spread: \t%.3g s (%ld subprobs)\n", timer.elapsedsec(), nb); - } // end of choice of which t1 spread type to use - return 0; -}; - -// -------------------------------------------------------------------------- -template -FINUFFT_NEVER_INLINE static int interpSorted_kernel( - const BIGINT *sort_indices, const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, - const FLT *data_uniform, const UBIGINT M, FLT *FINUFFT_RESTRICT kx, - FLT *FINUFFT_RESTRICT ky, FLT *FINUFFT_RESTRICT kz, - FLT *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts) -// Interpolate to NU pts in sorted order from a uniform grid. -// See spreadinterp() for doc. -{ - using simd_type = PaddedSIMD; - using arch_t = typename simd_type::arch_type; - static constexpr auto alignment = arch_t::alignment(); - static constexpr auto simd_size = simd_type::size; - static constexpr auto ns2 = ns * FLT(0.5); // half spread width, used as stencil shift - - CNTime timer{}; - const auto ndims = ndims_from_Ns(N1, N2, N3); - auto nthr = MY_OMP_GET_MAX_THREADS(); // guess # threads to use to interp - if (opts.nthreads > 0) nthr = opts.nthreads; // user override, now without limit -#ifndef _OPENMP - nthr = 1; // single-threaded lib must override user -#endif - if (opts.debug) - printf("\tinterp %dD (M=%lld; N1=%lld,N2=%lld,N3=%lld), nthr=%d\n", ndims, - (long long)M, (long long)N1, (long long)N2, (long long)N3, nthr); - timer.start(); -#pragma omp parallel num_threads(nthr) - { - static constexpr auto CHUNKSIZE = simd_size; // number of targets per chunk - alignas(alignment) UBIGINT jlist[CHUNKSIZE]; - alignas(alignment) FLT xjlist[CHUNKSIZE], yjlist[CHUNKSIZE], zjlist[CHUNKSIZE]; - alignas(alignment) FLT outbuf[2 * CHUNKSIZE]; - // Kernels: static alloc is faster, so we do it for up to 3D... - alignas(alignment) std::array kernel_values{0}; - auto *FINUFFT_RESTRICT ker1 = kernel_values.data(); - auto *FINUFFT_RESTRICT ker2 = kernel_values.data() + MAX_NSPREAD; - auto *FINUFFT_RESTRICT ker3 = kernel_values.data() + 2 * MAX_NSPREAD; - - // Loop over interpolation chunks - // main loop over NU trgs, interp each from U - // (note: windows omp doesn't like unsigned loop vars) -#pragma omp for schedule(dynamic, 1000) // assign threads to NU targ pts: - for (BIGINT i = 0; i < M; i += CHUNKSIZE) { - // Setup buffers for this chunk - const UBIGINT bufsize = (i + CHUNKSIZE > M) ? M - i : CHUNKSIZE; - for (int ibuf = 0; ibuf < bufsize; ibuf++) { - UBIGINT j = sort_indices[i + ibuf]; - jlist[ibuf] = j; - xjlist[ibuf] = fold_rescale(kx[j], N1); - if (ndims >= 2) yjlist[ibuf] = fold_rescale(ky[j], N2); - if (ndims == 3) zjlist[ibuf] = fold_rescale(kz[j], N3); - } - - // Loop over targets in chunk - for (int ibuf = 0; ibuf < bufsize; ibuf++) { - const auto xj = xjlist[ibuf]; - const auto yj = (ndims > 1) ? yjlist[ibuf] : 0; - const auto zj = (ndims > 2) ? zjlist[ibuf] : 0; - - auto *FINUFFT_RESTRICT target = outbuf + 2 * ibuf; - - // coords (x,y,z), spread block corner index (i1,i2,i3) of current NU targ - const auto i1 = BIGINT(std::ceil(xj - ns2)); // leftmost grid index - const auto i2 = (ndims > 1) ? BIGINT(std::ceil(yj - ns2)) : 0; // min y grid index - const auto i3 = (ndims > 2) ? BIGINT(std::ceil(zj - ns2)) : 0; // min z grid index - - const auto x1 = std::ceil(xj - ns2) - xj; // shift of ker center, in [-w/2,-w/2+1] - const auto x2 = (ndims > 1) ? 
std::ceil(yj - ns2) - yj : 0; - const auto x3 = (ndims > 2) ? std::ceil(zj - ns2) - zj : 0; - - // eval kernel values patch and use to interpolate from uniform data... - if (!(opts.flags & TF_OMIT_SPREADING)) { - switch (ndims) { - case 1: - ker_eval(kernel_values.data(), opts, x1); - interp_line(target, data_uniform, ker1, i1, N1); - break; - case 2: - ker_eval(kernel_values.data(), opts, x1, x2); - interp_square(target, data_uniform, ker1, ker2, i1, i2, N1, - N2); - break; - case 3: - ker_eval(kernel_values.data(), opts, x1, x2, - x3); - interp_cube(target, data_uniform, ker1, ker2, ker3, i1, i2, i3, - N1, N2, N3); - break; - default: // can't get here - FINUFFT_UNREACHABLE; - break; - } - } - } // end loop over targets in chunk - - // Copy result buffer to output array - for (int ibuf = 0; ibuf < bufsize; ibuf++) { - const UBIGINT j = jlist[ibuf]; - data_nonuniform[2 * j] = outbuf[2 * ibuf]; - data_nonuniform[2 * j + 1] = outbuf[2 * ibuf + 1]; - } - - } // end NU targ loop - } // end parallel section - if (opts.debug) printf("\tt2 spreading loop: \t%.3g s\n", timer.elapsedsec()); - return 0; -} - -template -static int interpSorted_dispatch( - const BIGINT *sort_indices, const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, - FLT *FINUFFT_RESTRICT data_uniform, const UBIGINT M, FLT *FINUFFT_RESTRICT kx, - FLT *FINUFFT_RESTRICT ky, FLT *FINUFFT_RESTRICT kz, - FLT *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts) { - static_assert(MIN_NSPREAD <= NS && NS <= MAX_NSPREAD, - "NS must be in the range (MIN_NSPREAD, MAX_NSPREAD)"); - if constexpr (NS == MIN_NSPREAD) { // Base case - if (opts.kerevalmeth) - return interpSorted_kernel( - sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, data_nonuniform, opts); - else { - return interpSorted_kernel( - sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, data_nonuniform, opts); - } - } else { - if (opts.nspread == NS) { - if (opts.kerevalmeth) { - return interpSorted_kernel(sort_indices, N1, N2, N3, data_uniform, M, - kx, ky, kz, data_nonuniform, opts); - } else { - return interpSorted_kernel(sort_indices, N1, N2, N3, data_uniform, M, - kx, ky, kz, data_nonuniform, opts); - } - } else { - return interpSorted_dispatch(sort_indices, N1, N2, N3, data_uniform, M, kx, - ky, kz, data_nonuniform, opts); - } - } -} - -int interpSorted(const BIGINT *sort_indices, const UBIGINT N1, const UBIGINT N2, - const UBIGINT N3, FLT *FINUFFT_RESTRICT data_uniform, const UBIGINT M, - FLT *FINUFFT_RESTRICT kx, FLT *FINUFFT_RESTRICT ky, - FLT *FINUFFT_RESTRICT kz, FLT *FINUFFT_RESTRICT data_nonuniform, - const finufft_spread_opts &opts) { - return interpSorted_dispatch(sort_indices, N1, N2, N3, data_uniform, M, kx, - ky, kz, data_nonuniform, opts); -} - -/////////////////////////////////////////////////////////////////////////// - -int setup_spreader(finufft_spread_opts &opts, FLT eps, double upsampfac, int kerevalmeth, - int debug, int showwarn, int dim) -/* Initializes spreader kernel parameters given desired NUFFT tolerance eps, - upsampling factor (=sigma in paper, or R in Dutt-Rokhlin), ker eval meth - (either 0:exp(sqrt()), 1: Horner ppval), and some debug-level flags. - Also sets all default options in finufft_spread_opts. See finufft_spread_opts.h for - opts. dim is spatial dimension (1,2, or 3). See finufft.cpp:finufft_plan() for where - upsampfac is set. Must call this before any kernel evals done, otherwise segfault - likely. 
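   (Editor's worked example: at the standard upsampfac=2.0, the width set below is
   ns = ceil(-log10(eps/10)), roughly one grid point of kernel width per digit of
   accuracy, so eps=1e-6 gives ns=7.)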
Returns: 0 : success FINUFFT_WARN_EPS_TOO_SMALL : requested eps cannot be - achieved, but proceed with best possible eps otherwise : failure (see codes in defs.h); - spreading must not proceed Barnett 2017. debug, loosened eps logic 6/14/20. -*/ -{ - if (upsampfac != 2.0 && upsampfac != 1.25) { // nonstandard sigma - if (kerevalmeth == 1) { - fprintf(stderr, - "FINUFFT setup_spreader: nonstandard upsampfac=%.3g cannot be handled by " - "kerevalmeth=1\n", - upsampfac); - return FINUFFT_ERR_HORNER_WRONG_BETA; - } - if (upsampfac <= 1.0) { // no digits would result - fprintf(stderr, "FINUFFT setup_spreader: error, upsampfac=%.3g is <=1.0\n", - upsampfac); - return FINUFFT_ERR_UPSAMPFAC_TOO_SMALL; - } - // calling routine must abort on above errors, since opts is garbage! - if (showwarn && upsampfac > 4.0) - fprintf(stderr, - "FINUFFT setup_spreader warning: upsampfac=%.3g way too large to be " - "beneficial.\n", - upsampfac); - } - - // write out default finufft_spread_opts (some overridden in setup_spreader_for_nufft) - opts.spread_direction = 0; // user should always set to 1 or 2 as desired - opts.sort = 2; // 2:auto-choice - opts.kerpad = 0; // affects only evaluate_kernel_vector - opts.kerevalmeth = kerevalmeth; - opts.upsampfac = upsampfac; - opts.nthreads = 0; // all avail - opts.sort_threads = 0; // 0:auto-choice - // heuristic dir=1 chunking for nthr>>1, typical for intel i7 and skylake... - opts.max_subproblem_size = (dim == 1) ? 10000 : 100000; - opts.flags = 0; // 0:no timing flags (>0 for experts only) - opts.debug = 0; // 0:no debug output - // heuristic nthr above which switch OMP critical to atomic (add_wrapped...): - opts.atomic_threshold = 10; // R Blackwell's value - - int ns, ier = 0; // Set kernel width w (aka ns, nspread) then copy to opts... - if (eps < EPSILON) { // safety; there's no hope of beating e_mach - if (showwarn) - fprintf(stderr, "%s warning: increasing tol=%.3g to eps_mach=%.3g.\n", __func__, - (double)eps, (double)EPSILON); - eps = EPSILON; // only changes local copy (not any opts) - ier = FINUFFT_WARN_EPS_TOO_SMALL; - } - if (upsampfac == 2.0) // standard sigma (see SISC paper) - ns = std::ceil(-log10(eps / (FLT)10.0)); // 1 digit per power of 10 - else // custom sigma - ns = std::ceil(-log(eps) / (PI * sqrt(1.0 - 1.0 / upsampfac))); // formula, gam=1 - ns = max(2, ns); // (we don't have ns=1 version yet) - if (ns > MAX_NSPREAD) { // clip to fit allocated arrays, Horner rules - if (showwarn) - fprintf(stderr, - "%s warning: at upsampfac=%.3g, tol=%.3g would need kernel width ns=%d; " - "clipping to max %d.\n", - __func__, upsampfac, (double)eps, ns, MAX_NSPREAD); - ns = MAX_NSPREAD; - ier = FINUFFT_WARN_EPS_TOO_SMALL; - } - opts.nspread = ns; - // setup for reference kernel eval (via formula): select beta width param... - // (even when kerevalmeth=1, this ker eval needed for FTs in onedim_*_kernel) - opts.ES_halfwidth = (double)ns / 2; // constants to help (see below routines) - opts.ES_c = 4.0 / (double)(ns * ns); - double betaoverns = 2.30; // gives decent betas for default sigma=2.0 - if (ns == 2) betaoverns = 2.20; // some small-width tweaks... - if (ns == 3) betaoverns = 2.26; - if (ns == 4) betaoverns = 2.38; - if (upsampfac != 2.0) { // again, override beta for custom sigma - FLT gamma = 0.97; // must match devel/gen_all_horner_C_code.m ! 
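    // (Editor's worked example: for upsampfac=1.25 the next line gives
    //  betaoverns = 0.97*PI*(1 - 1/2.5) which is about 1.83, so beta is about
    //  1.83*ns, versus 2.30*ns for the standard sigma=2.0.)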
- betaoverns = gamma * PI * (1.0 - 1.0 / (2 * upsampfac)); // formula based on cutoff - } - opts.ES_beta = betaoverns * ns; // set the kernel beta parameter - if (debug) - printf("%s (kerevalmeth=%d) eps=%.3g sigma=%.3g: chose ns=%d beta=%.3g\n", __func__, - kerevalmeth, (double)eps, upsampfac, ns, opts.ES_beta); - - return ier; -} - -FLT evaluate_kernel(FLT x, const finufft_spread_opts &opts) -/* ES ("exp sqrt") kernel evaluation at single real argument: - phi(x) = exp(beta.(sqrt(1 - (2x/n_s)^2) - 1)), for |x| < nspread/2 - related to an asymptotic approximation to the Kaiser--Bessel, itself an - approximation to prolate spheroidal wavefunction (PSWF) of order 0. - This is the "reference implementation", used by eg finufft/onedim_* 2/17/17. - Rescaled so max is 1, Barnett 7/21/24 -*/ -{ - if (abs(x) >= (FLT)opts.ES_halfwidth) - // if spreading/FT careful, shouldn't need this if, but causes no speed hit - return 0.0; - else - return exp((FLT)opts.ES_beta * (sqrt((FLT)1.0 - (FLT)opts.ES_c * x * x) - (FLT)1.0)); -} - -template -void set_kernel_args(FLT *args, FLT x) noexcept -// Fills vector args[] with kernel arguments x, x+1, ..., x+ns-1. -// needed for the vectorized kernel eval of Ludvig af K. -{ - for (int i = 0; i < ns; i++) args[i] = x + (FLT)i; -} -template -void evaluate_kernel_vector(FLT *ker, FLT *args, const finufft_spread_opts &opts) noexcept -/* Evaluate ES kernel for a vector of N arguments; by Ludvig af K. - If opts.kerpad true, args and ker must be allocated for Npad, and args is - written to (to pad to length Npad), only first N outputs are correct. - Barnett 4/24/18 option to pad to mult of 4 for better SIMD vectorization. - Rescaled so max is 1, Barnett 7/21/24 - - Obsolete (replaced by Horner), but keep around for experimentation since - works for arbitrary beta. Formula must match reference implementation. -*/ -{ - FLT b = (FLT)opts.ES_beta; - FLT c = (FLT)opts.ES_c; - if (!(opts.flags & TF_OMIT_EVALUATE_KERNEL)) { - // Note (by Ludvig af K): Splitting kernel evaluation into two loops - // seems to benefit auto-vectorization. - // gcc 5.4 vectorizes first loop; gcc 7.2 vectorizes both loops - int Npad = N; - if (opts.kerpad) { // since always same branch, no speed hit - Npad = 4 * (1 + (N - 1) / 4); // pad N to mult of 4; help i7 GCC, not xeon - for (int i = N; i < Npad; ++i) // pad with 1-3 zeros for safe eval - args[i] = 0.0; - } - for (int i = 0; i < Npad; i++) { // Loop 1: Compute exponential arguments - // care! 1.0 is double... - ker[i] = b * (sqrt((FLT)1.0 - c * args[i] * args[i]) - (FLT)1.0); - } - if (!(opts.flags & TF_OMIT_EVALUATE_EXPONENTIAL)) - for (int i = 0; i < Npad; i++) // Loop 2: Compute exponentials - ker[i] = exp(ker[i]); - if (opts.kerpad) { - // padded part should be zero, in spread_subproblem_nd_kernels, there are - // out of bound writes to trg arrays - for (int i = N; i < Npad; ++i) ker[i] = 0.0; - } - } else { - for (int i = 0; i < N; i++) // dummy for timing only - ker[i] = 1.0; - } - // Separate check from arithmetic (Is this really needed? doesn't slow down) - for (int i = 0; i < N; i++) - if (abs(args[i]) >= (FLT)opts.ES_halfwidth) ker[i] = 0.0; -} - -template // aka ns -void eval_kernel_vec_Horner(FLT *FINUFFT_RESTRICT ker, const FLT x, - const finufft_spread_opts &opts) noexcept -/* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at -x_j = x + j, for j=0,..,w-1. Thus x in [-w/2,-w/2+1]. w is aka ns. -This is the current evaluation method, since it's faster (except i7 w=16). -Two upsampfacs implemented. 
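(Editor's illustration: with per-point coefficients c_0..c_{nc-1}, the loops below
apply Horner's rule, k = ((c_0*z + c_1)*z + c_2)*z + ..., one fused multiply-add per
degree; the symmetric branch accumulates odd/even parts in z^2 so the right half of
the kernel reuses the left half's polynomial work.)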
Params must match ref formula. Barnett 4/24/18 */ - -{ - // scale so local grid offset z in[-1,1] - const FLT z = std::fma(FLT(2.0), x, FLT(w - 1)); - using arch_t = typename simd_type::arch_type; - static constexpr auto alignment = arch_t::alignment(); - static constexpr auto simd_size = simd_type::size; - static constexpr auto padded_ns = (w + simd_size - 1) & ~(simd_size - 1); - static constexpr auto horner_coeffs = []() constexpr noexcept { - if constexpr (upsampfact == 200) { - return get_horner_coeffs_200(); - } else if constexpr (upsampfact == 125) { - return get_horner_coeffs_125(); - } - }(); - static constexpr auto nc = horner_coeffs.size(); - static constexpr auto use_ker_sym = (simd_size < w); - - alignas(alignment) static constexpr auto padded_coeffs = - pad_2D_array_with_zeros(horner_coeffs); - - // use kernel symmetry trick if w > simd_size - if constexpr (use_ker_sym) { - static constexpr uint8_t tail = w % simd_size; - static constexpr uint8_t if_odd_degree = ((nc + 1) % 2); - static constexpr uint8_t offset_start = tail ? w - tail : w - simd_size; - static constexpr uint8_t end_idx = (w + (tail > 0)) / 2; - const simd_type zv{z}; - const auto z2v = zv * zv; - - // some xsimd constant for shuffle or inverse - static constexpr auto shuffle_batch = []() constexpr noexcept { - if constexpr (tail) { - return xsimd::make_batch_constant, arch_t, - shuffle_index>(); - } else { - return xsimd::make_batch_constant, arch_t, - reverse_index>(); - } - }(); - - // process simd vecs - simd_type k_prev, k_sym{0}; - for (uint8_t i{0}, offset = offset_start; i < end_idx; - i += simd_size, offset -= simd_size) { - auto k_odd = [i]() constexpr noexcept { - if constexpr (if_odd_degree) { - return simd_type::load_aligned(padded_coeffs[0].data() + i); - } else { - return simd_type{0}; - } - }(); - auto k_even = simd_type::load_aligned(padded_coeffs[if_odd_degree].data() + i); - for (uint8_t j{1 + if_odd_degree}; j < nc; j += 2) { - const auto cji_odd = simd_type::load_aligned(padded_coeffs[j].data() + i); - const auto cji_even = simd_type::load_aligned(padded_coeffs[j + 1].data() + i); - k_odd = xsimd::fma(k_odd, z2v, cji_odd); - k_even = xsimd::fma(k_even, z2v, cji_even); - } - // left part - xsimd::fma(k_odd, zv, k_even).store_aligned(ker + i); - // right part symmetric to the left part - if (offset >= end_idx) { - if constexpr (tail) { - // to use aligned store, we need shuffle the previous k_sym and current k_sym - k_prev = k_sym; - k_sym = xsimd::fnma(k_odd, zv, k_even); - xsimd::shuffle(k_sym, k_prev, shuffle_batch).store_aligned(ker + offset); - } else { - xsimd::swizzle(xsimd::fnma(k_odd, zv, k_even), shuffle_batch) - .store_aligned(ker + offset); - } - } - } - } else { - const simd_type zv(z); - for (uint8_t i = 0; i < w; i += simd_size) { - auto k = simd_type::load_aligned(padded_coeffs[0].data() + i); - for (uint8_t j = 1; j < nc; ++j) { - const auto cji = simd_type::load_aligned(padded_coeffs[j].data() + i); - k = xsimd::fma(k, zv, cji); - } - k.store_aligned(ker + i); - } - } -} - -template -static void interp_line_wrap(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker, - const BIGINT i1, const UBIGINT N1) { - /* This function is called when the kernel wraps around the grid. It is - slower than interp_line. - M. 
Barbone July 2024: - moved the logic to a separate function - - using fused multiply-add (fma) for better performance - */ - std::array out{0}; - BIGINT j = i1; - if (i1 < 0) { // wraps at left - j += BIGINT(N1); - for (uint8_t dx = 0; dx < -i1; ++dx, ++j) { - out[0] = std::fma(du[2 * j], ker[dx], out[0]); - out[1] = std::fma(du[2 * j + 1], ker[dx], out[1]); - } - j -= BIGINT(N1); - for (uint8_t dx = -i1; dx < ns; ++dx, ++j) { - out[0] = std::fma(du[2 * j], ker[dx], out[0]); - out[1] = std::fma(du[2 * j + 1], ker[dx], out[1]); - } - } else if (i1 + ns >= N1) { // wraps at right - for (uint8_t dx = 0; dx < N1 - i1; ++dx, ++j) { - out[0] = std::fma(du[2 * j], ker[dx], out[0]); - out[1] = std::fma(du[2 * j + 1], ker[dx], out[1]); - } - j -= BIGINT(N1); - for (uint8_t dx = N1 - i1; dx < ns; ++dx, ++j) { - out[0] = std::fma(du[2 * j], ker[dx], out[0]); - out[1] = std::fma(du[2 * j + 1], ker[dx], out[1]); - } - } else { - // padding is okay for ker, but it might spill over du array - // so this checks for that case and does not explicitly vectorize - for (uint8_t dx = 0; dx < ns; ++dx, ++j) { - out[0] = std::fma(du[2 * j], ker[dx], out[0]); - out[1] = std::fma(du[2 * j + 1], ker[dx], out[1]); - } - } - target[0] = out[0]; - target[1] = out[1]; -} - -template -void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker, - const BIGINT i1, const UBIGINT N1) { - /* 1D interpolate complex values from size-ns block of the du (uniform grid - data) array to a single complex output value "target", using as weights the - 1d kernel evaluation list ker1. - Inputs: - du : input regular grid of size 2*N1 (alternating real,imag) - ker1 : length-ns real array of 1d kernel evaluations - i1 : start (left-most) x-coord index to read du from, where the indices - of du run from 0 to N1-1, and indices outside that range are wrapped. - ns : kernel width (must be <=MAX_NSPREAD) - Outputs: - target : size 2 array (containing real,imag) of interpolated output - - Periodic wrapping in the du array is applied, assuming N1>=ns. - Internally, dx indices into ker array j is index in complex du array. - Barnett 6/16/17. - M. 
Barbone July 2024: - moved wrapping logic to interp_line_wrap - - using explicit SIMD vectorization to overcome the out[2] array - limitation -*/ - using arch_t = typename simd_type::arch_type; - static constexpr auto padding = get_padding(); - static constexpr auto alignment = arch_t::alignment(); - static constexpr auto simd_size = simd_type::size; - static constexpr auto regular_part = (2 * ns + padding) & (-(2 * simd_size)); - std::array out{0}; - const auto j = i1; - // removing the wrapping leads up to 10% speedup in certain cases - // moved the wrapping to another function to reduce instruction cache pressure - if (i1 < 0 || i1 + ns >= N1 || i1 + ns + (padding + 1) / 2 >= N1) { - return interp_line_wrap(target, du, ker, i1, N1); - } else { // doesn't wrap - // logic largely similar to spread 1D kernel, please see the explanation there - // for the first part of this code - const auto res = [du, j, ker]() constexpr noexcept { - const auto du_ptr = du + 2 * j; - simd_type res_low{0}, res_hi{0}; - for (uint8_t dx{0}; dx < regular_part; dx += 2 * simd_size) { - const auto ker_v = simd_type::load_aligned(ker + dx / 2); - const auto du_pt0 = simd_type::load_unaligned(du_ptr + dx); - const auto du_pt1 = simd_type::load_unaligned(du_ptr + dx + simd_size); - const auto ker0low = xsimd::swizzle(ker_v, zip_low_index); - const auto ker0hi = xsimd::swizzle(ker_v, zip_hi_index); - res_low = xsimd::fma(ker0low, du_pt0, res_low); - res_hi = xsimd::fma(ker0hi, du_pt1, res_hi); - } - - if constexpr (regular_part < 2 * ns) { - const auto ker0 = simd_type::load_unaligned(ker + (regular_part / 2)); - const auto du_pt = simd_type::load_unaligned(du_ptr + regular_part); - const auto ker0low = xsimd::swizzle(ker0, zip_low_index); - res_low = xsimd::fma(ker0low, du_pt, res_low); - } - - // This does a horizontal sum using a loop instead of relying on SIMD instructions - // this is faster than the code below but less elegant. - // lambdas here to limit the scope of temporary variables and have the compiler - // optimize the code better - return res_low + res_hi; - }(); - const auto res_array = xsimd_to_array(res); - for (uint8_t i{0}; i < simd_size; i += 2) { - out[0] += res_array[i]; - out[1] += res_array[i + 1]; - } - // this is where the code differs from spread_kernel, the interpolator does an extra - // reduction step to SIMD elements down to 2 elements - // This is known as horizontal sum in SIMD terminology - - // This does a horizontal sum using vector instruction, - // is slower than summing and looping - // clang-format off - // const auto res_real = xsimd::shuffle(res_low, res_hi, select_even_mask); - // const auto res_imag = xsimd::shuffle(res_low, res_hi, select_odd_mask); - // out[0] = xsimd::reduce_add(res_real); - // out[1] = xsimd::reduce_add(res_imag); - // clang-format on - } - target[0] = out[0]; - target[1] = out[1]; -} - -template -static void interp_square_wrap(FLT *FINUFFT_RESTRICT target, const FLT *du, - const FLT *ker1, const FLT *ker2, const BIGINT i1, - const BIGINT i2, const UBIGINT N1, const UBIGINT N2) { - /* - * This function is called when the kernel wraps around the grid. It is slower than - * the non wrapping version. - * There is an extra case for when ker is padded and spills over the du array. - * In this case uses the old non wrapping version. 
- */ - std::array out{0}; - using arch_t = typename simd_type::arch_type; - static constexpr auto alignment = arch_t::alignment(); - if (i1 >= 0 && i1 + ns <= N1 && i2 >= 0 && i2 + ns <= N2) { - // store a horiz line (interleaved real,imag) - alignas(alignment) std::array line{0}; - // add remaining const-y lines to the line (expensive inner loop) - for (uint8_t dy{0}; dy < ns; ++dy) { - const auto *l_ptr = du + 2 * (N1 * (i2 + dy) + i1); // (see above) - for (uint8_t l{0}; l < 2 * ns; ++l) { - line[l] = std::fma(ker2[dy], l_ptr[l], line[l]); - } - } - // apply x kernel to the (interleaved) line and add together - for (uint8_t dx{0}; dx < ns; dx++) { - out[0] = std::fma(line[2 * dx], ker1[dx], out[0]); - out[1] = std::fma(line[2 * dx + 1], ker1[dx], out[1]); - } - } else { - std::array j1{}, j2{}; // 1d ptr lists - auto x = i1, y = i2; // initialize coords - for (uint8_t d{0}; d < ns; d++) { // set up ptr lists - if (x < 0) x += BIGINT(N1); - if (x >= N1) x -= BIGINT(N1); - j1[d] = x++; - if (y < 0) y += BIGINT(N2); - if (y >= N2) y -= BIGINT(N2); - j2[d] = y++; - } - for (uint8_t dy{0}; dy < ns; dy++) { // use the pts lists - const UBIGINT oy = N1 * j2[dy]; // offset due to y - for (uint8_t dx{0}; dx < ns; dx++) { - const auto k = ker1[dx] * ker2[dy]; - const UBIGINT j = oy + j1[dx]; - out[0] += du[2 * j] * k; - out[1] += du[2 * j + 1] * k; - } - } - } - target[0] = out[0]; - target[1] = out[1]; -} - -template -void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, - const FLT *ker2, const BIGINT i1, const BIGINT i2, const UBIGINT N1, - const UBIGINT N2) -/* 2D interpolate complex values from a ns*ns block of the du (uniform grid - data) array to a single complex output value "target", using as weights the - ns*ns outer product of the 1d kernel lists ker1 and ker2. - Inputs: - du : input regular grid of size 2*N1*N2 (alternating real,imag) - ker1, ker2 : length-ns real arrays of 1d kernel evaluations - i1 : start (left-most) x-coord index to read du from, where the indices - of du run from 0 to N1-1, and indices outside that range are wrapped. - i2 : start (bottom) y-coord index to read du from. - ns : kernel width (must be <=MAX_NSPREAD) - Outputs: - target : size 2 array (containing real,imag) of interpolated output +template> +static void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, + const FLT *ker2, BIGINT i1, BIGINT i2, UBIGINT N1, UBIGINT N2) +/* 2D interpolate complex values from a ns*ns block of the du (uniform grid + data) array to a single complex output value "target", using as weights the + ns*ns outer product of the 1d kernel lists ker1 and ker2. + Inputs: + du : input regular grid of size 2*N1*N2 (alternating real,imag) + ker1, ker2 : length-ns real arrays of 1d kernel evaluations + i1 : start (left-most) x-coord index to read du from, where the indices + of du run from 0 to N1-1, and indices outside that range are wrapped. + i2 : start (bottom) y-coord index to read du from. + ns : kernel width (must be <=MAX_NSPREAD) + Outputs: + target : size 2 array (containing real,imag) of interpolated output Periodic wrapping in the du array is applied, assuming N1,N2>=ns. 
Internally, dx,dy indices into ker array, l indices the 2*ns interleaved @@ -1217,10 +730,10 @@ static void interp_cube_wrapped(FLT *FINUFFT_RESTRICT target, const FLT *du, target[1] = out[1]; } -template -void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, - const FLT *ker2, const FLT *ker3, const BIGINT i1, const BIGINT i2, - const BIGINT i3, const UBIGINT N1, const UBIGINT N2, const UBIGINT N3) +template> +static void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, + const FLT *ker2, const FLT *ker3, BIGINT i1, BIGINT i2, BIGINT i3, + UBIGINT N1, UBIGINT N2, UBIGINT N3) /* 3D interpolate complex values from a ns*ns*ns block of the du (uniform grid data) array to a single complex output value "target", using as weights the ns*ns*ns outer product of the 1d kernel lists ker1, ker2, and ker3. @@ -1302,12 +815,57 @@ void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, out[0] += res_array[i]; out[1] += res_array[i + 1]; } - } else { - return interp_cube_wrapped(target, du, ker1, ker2, ker3, i1, i2, i3, - N1, N2, N3); + } else { + return interp_cube_wrapped(target, du, ker1, ker2, ker3, i1, i2, i3, + N1, N2, N3); + } + target[0] = out[0]; + target[1] = out[1]; +} + +template()>, + typename... V> +static FINUFFT_ALWAYS_INLINE auto ker_eval(FLT *FINUFFT_RESTRICT ker, + const finufft_spread_opts &opts, + const V... elems) noexcept { + /* Utility function that allows to move the kernel evaluation outside the spreader for + clarity + Inputs are: + ns = kernel width + kerevalmeth = kernel evaluation method + T = (single or double precision) type of the kernel + simd_type = xsimd::batch for Horner + vectorization (default is the optimal simd size) + finufft_spread_opts as Horner needs + the oversampling factor + elems = kernel arguments + Examples usage is + ker_eval(opts, x, y, z) // for 3D or + ker_eval(opts, x, y) // for 2D or + ker_eval(opts, x) // for 1D + */ + const std::array inputs{elems...}; + // compile time loop, no performance overhead + for (auto i = 0; i < sizeof...(elems); ++i) { + // compile time branch no performance overhead + if constexpr (kerevalmeth == 1) { + if (opts.upsampfac == 2.0) { + eval_kernel_vec_Horner(ker + (i * MAX_NSPREAD), inputs[i], + opts); + } + if (opts.upsampfac == 1.25) { + eval_kernel_vec_Horner(ker + (i * MAX_NSPREAD), inputs[i], + opts); + } + } + if constexpr (kerevalmeth == 0) { + alignas(simd_type::arch_type::alignment()) std::array kernel_args{}; + set_kernel_args(kernel_args.data(), inputs[i]); + evaluate_kernel_vector(ker + (i * MAX_NSPREAD), kernel_args.data(), opts); + } } - target[0] = out[0]; - target[1] = out[1]; + return ker; } template @@ -1505,8 +1063,8 @@ static void spread_subproblem_1d_dispatch( } } -void spread_subproblem_1d(BIGINT off1, UBIGINT size1, FLT *du, UBIGINT M, FLT *kx, - FLT *dd, const finufft_spread_opts &opts) noexcept +static void spread_subproblem_1d(BIGINT off1, UBIGINT size1, FLT *du, UBIGINT M, FLT *kx, + FLT *dd, const finufft_spread_opts &opts) noexcept /* spreader from dd (NU) to du (uniform) in 2D without wrapping. See above docs/notes for spread_subproblem_2d. kx,ky (size M) are NU locations in [off+ns/2,off+size-1-ns/2] in both dims. 
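For orientation, here is a minimal sketch of calling the ker_eval helper added above, as the interpolation loops later in this patch do. This is a sketch under stated assumptions: the width 7, the shift values, and the exact template-argument order are inferred from the declaration above, not taken verbatim from this patch; FLT, MAX_NSPREAD, PaddedSIMD and finufft_spread_opts are the library's own names.

// Hypothetical 2D call site (assumed: ns=7, kerevalmeth=1, i.e. Horner; opts
// already set up by setup_spreader).
using simd_type = PaddedSIMD<FLT, 2 * 7>;   // padded batch type, as used in this file
alignas(simd_type::arch_type::alignment())
    std::array<FLT, 3 * MAX_NSPREAD> kernel_values{0};
const FLT x1 = -3.2, x2 = -2.9;             // kernel-center shifts for one NU point
ker_eval<7, 1, FLT, simd_type>(kernel_values.data(), opts, x1, x2);
// kernel_values[0 .. 6] now holds the 7 x-axis weights (ker1);
// kernel_values[MAX_NSPREAD .. MAX_NSPREAD+6] holds the y-axis weights (ker2).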
@@ -1642,10 +1200,10 @@ void spread_subproblem_2d_dispatch( } } -void spread_subproblem_2d(const BIGINT off1, const BIGINT off2, const UBIGINT size1, - const UBIGINT size2, FLT *FINUFFT_RESTRICT du, const UBIGINT M, - const FLT *kx, const FLT *ky, const FLT *dd, - const finufft_spread_opts &opts) noexcept +static void spread_subproblem_2d(BIGINT off1, BIGINT off2, UBIGINT size1, UBIGINT size2, + FLT *FINUFFT_RESTRICT du, UBIGINT M, const FLT *kx, + const FLT *ky, const FLT *dd, + const finufft_spread_opts &opts) noexcept /* spreader from dd (NU) to du (uniform) in 2D without wrapping. See above docs/notes for spread_subproblem_2d. kx,ky (size M) are NU locations in [off+ns/2,off+size-1-ns/2] in both dims. @@ -1765,10 +1323,10 @@ void spread_subproblem_3d_dispatch( } } -void spread_subproblem_3d(BIGINT off1, BIGINT off2, BIGINT off3, UBIGINT size1, - UBIGINT size2, UBIGINT size3, FLT *du, UBIGINT M, FLT *kx, - FLT *ky, FLT *kz, FLT *dd, - const finufft_spread_opts &opts) noexcept +static void spread_subproblem_3d(BIGINT off1, BIGINT off2, BIGINT off3, UBIGINT size1, + UBIGINT size2, UBIGINT size3, FLT *du, UBIGINT M, + FLT *kx, FLT *ky, FLT *kz, FLT *dd, + const finufft_spread_opts &opts) noexcept /* spreader from dd (NU) to du (uniform) in 3D without wrapping. See above docs/notes for spread_subproblem_2d. kx,ky,kz (size M) are NU locations in [off+ns/2,off+size-1-ns/2] in each dim. @@ -1781,10 +1339,10 @@ du (size size1*size2*size3) is uniform complex output array } template -void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3, - UBIGINT padded_size1, UBIGINT size1, UBIGINT size2, - UBIGINT size3, UBIGINT N1, UBIGINT N2, UBIGINT N3, - FLT *FINUFFT_RESTRICT data_uniform, const FLT *const du0) +static void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3, + UBIGINT padded_size1, UBIGINT size1, UBIGINT size2, + UBIGINT size3, UBIGINT N1, UBIGINT N2, UBIGINT N3, + FLT *FINUFFT_RESTRICT data_uniform, const FLT *du0) /* Add a large subgrid (du0) to output grid (data_uniform), with periodic wrapping to N1,N2,N3 box. offset1,2,3 give the offset of the subgrid from the lowest corner of output. @@ -1841,10 +1399,10 @@ void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3, } } -void bin_sort_singlethread( - BIGINT *ret, const UBIGINT M, const FLT *kx, const FLT *ky, const FLT *kz, - const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, const double bin_size_x, - const double bin_size_y, const double bin_size_z, const int debug) +static void bin_sort_singlethread(BIGINT *ret, UBIGINT M, const FLT *kx, const FLT *ky, + const FLT *kz, UBIGINT N1, UBIGINT N2, UBIGINT N3, + double bin_size_x, double bin_size_y, double bin_size_z, + int debug) /* Returns permutation of all nonuniform points with good RAM access, * ie less cache misses for spreading, in 1D, 2D, or 3D. Single-threaded version * @@ -1915,9 +1473,9 @@ void bin_sort_singlethread( } } -void bin_sort_multithread(BIGINT *ret, UBIGINT M, FLT *kx, FLT *ky, FLT *kz, UBIGINT N1, - UBIGINT N2, UBIGINT N3, double bin_size_x, double bin_size_y, - double bin_size_z, int debug, int nthr) +static void bin_sort_multithread( + BIGINT *ret, UBIGINT M, FLT *kx, FLT *ky, FLT *kz, UBIGINT N1, UBIGINT N2, UBIGINT N3, + double bin_size_x, double bin_size_y, double bin_size_z, int debug, int nthr) /* Mostly-OpenMP'ed version of bin_sort. For documentation see: bin_sort_singlethread. Caution: when M (# NU pts) << N (# U pts), is SLOWER than single-thread. 
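Before the multithreaded variant below, the permutation idea behind bin_sort_singlethread can be made concrete with a self-contained 1D counting-sort sketch. This is a simplification under stated assumptions: coordinates are already folded into [0,N) and ret is preallocated to kx.size(); the library routine generalizes this to 1D-3D with its own bin sizes and integer types.

#include <cstdint>
#include <vector>

// Key each point by its bin floor(x/bs); a histogram plus prefix sum gives the
// start offset of every bin, then a stable scatter writes the permutation, so
// points falling in the same bin (hence nearby grid memory) become adjacent.
void bin_sort_1d_sketch(std::vector<std::int64_t> &ret,
                        const std::vector<double> &kx, std::int64_t N, double bs) {
  const std::int64_t nbins = std::int64_t(N / bs) + 1;
  std::vector<std::int64_t> counts(nbins + 1, 0);
  for (double x : kx) ++counts[std::int64_t(x / bs) + 1];               // histogram, offset by one
  for (std::int64_t b = 0; b < nbins; ++b) counts[b + 1] += counts[b];  // prefix sum
  for (std::int64_t i = 0; i < (std::int64_t)kx.size(); ++i)
    ret[counts[std::int64_t(kx[i] / bs)]++] = i;                        // stable scatter of indices
}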
@@ -1985,9 +1543,9 @@ void bin_sort_multithread(BIGINT *ret, UBIGINT M, FLT *kx, FLT *ky, FLT *kz, UBI } } -void get_subgrid(BIGINT &offset1, BIGINT &offset2, BIGINT &offset3, BIGINT &padded_size1, - BIGINT &size1, BIGINT &size2, BIGINT &size3, UBIGINT M, FLT *kx, FLT *ky, - FLT *kz, int ns, int ndims) +static void get_subgrid(BIGINT &offset1, BIGINT &offset2, BIGINT &offset3, + BIGINT &padded_size1, BIGINT &size1, BIGINT &size2, BIGINT &size3, + UBIGINT M, FLT *kx, FLT *ky, FLT *kz, int ns, int ndims) /* Writes out the integer offsets and sizes of a "subgrid" (cuboid subset of Z^ndims) large enough to enclose all of the nonuniform points with (non-periodic) padding of half the kernel width ns to each side in @@ -2056,245 +1614,605 @@ void get_subgrid(BIGINT &offset1, BIGINT &offset2, BIGINT &offset3, BIGINT &padd size3 = 1; } } -/* local NU coord fold+rescale macro: does the following affine transform to x: - (x+PI) mod PI each to [0,N) - Note: folding big numbers can cause numerical inaccuracies - Martin Reinecke, 8.5.2024 used floor to speedup the function and removed the range - limitation Marco Barbone, 8.5.2024 Changed it from a Macro to an inline function + +// ========================================================================== +int spreadinterp(UBIGINT N1, UBIGINT N2, UBIGINT N3, FLT *data_uniform, UBIGINT M, + FLT *kx, FLT *ky, FLT *kz, FLT *data_nonuniform, + const finufft_spread_opts &opts) +/* ------------Spreader/interpolator for 1, 2, or 3 dimensions -------------- + If opts.spread_direction=1, evaluate, in the 1D case, + + N1-1 + data_nonuniform[j] = SUM phi(kx[j] - n) data_uniform[n], for j=0...M-1 + n=0 + + If opts.spread_direction=2, evaluate its transpose, in the 1D case, + + M-1 + data_uniform[n] = SUM phi(kx[j] - n) data_nonuniform[j], for n=0...N1-1 + j=0 + + In each case phi is the spreading kernel, which has support + [-opts.nspread/2,opts.nspread/2]. In 2D or 3D, the generalization with + product of 1D kernels is performed. + For 1D set N2=N3=1; for 2D set N3=1; for 3D set N1,N2,N3>1. + + Notes: + No particular normalization of the spreading kernel is assumed. + Uniform (U) points are centered at coords + [0,1,...,N1-1] in 1D, analogously in 2D and 3D. They are stored in x + fastest, y medium, z slowest ordering, up to however many + dimensions are relevant; note that this is Fortran-style ordering for an + array f(x,y,z), but C style for f[z][y][x]. This is to match the Fortran + interface of the original CMCL libraries. + Non-uniform (NU) points kx,ky,kz are real, and may lie in the central three + periods in each coordinate (these are folded into the central period). + The finufft_spread_opts struct must have been set up already by calling setup_kernel. + It is assumed that 2*opts.nspread < min(N1,N2,N3), so that the kernel + only ever wraps once when falls below 0 or off the top of a uniform grid + dimension. + + Inputs: + N1,N2,N3 - grid sizes in x (fastest), y (medium), z (slowest) respectively. + If N2==1, 1D spreading is done. If N3==1, 2D spreading. + Otherwise, 3D. + M - number of NU pts. + kx, ky, kz - length-M real arrays of NU point coordinates (only kx read in + 1D, only kx and ky read in 2D). + + These should lie in the box -pi<=kx<=pi. Points outside this domain are also + correctly folded back into this domain. 
+ opts - spread/interp options struct, documented in ../include/finufft_spread_opts.h + + Inputs/Outputs: + data_uniform - output values on grid (dir=1) OR input grid data (dir=2) + data_nonuniform - input strengths of the sources (dir=1) + OR output values at targets (dir=2) + Returned value: + 0 indicates success; other values have meanings in ../docs/error.rst, with + following modifications: + 3 : one or more non-trivial box dimensions is less than 2.nspread. + 5 : failed allocate sort indices + + Magland Dec 2016. Barnett openmp version, many speedups 1/16/17-2/16/17 + error codes 3/13/17. pirange 3/28/17. Rewritten 6/15/17. parallel sort 2/9/18 + No separate subprob indices in t-1 2/11/18. + sort_threads (since for M< -simd_type fold_rescale(const simd_type &x, const BIGINT N) noexcept { - const simd_type x2pi = FLT(M_1_2PI); - const simd_type result = xsimd::fma(x, x2pi, simd_type(0.5)); - return (result - xsimd::floor(result)) * simd_type(FLT(N)); +static constexpr uint8_t ndims_from_Ns(const UBIGINT N1, const UBIGINT N2, + const UBIGINT N3) +/* rule for getting number of spreading dimensions from the list of Ns per dim. + Split out, Barnett 7/26/18 +*/ +{ + return 1 + (N2 > 1) + (N3 > 1); } -template -auto ker_eval(FLT *FINUFFT_RESTRICT ker, const finufft_spread_opts &opts, - const V... elems) noexcept { - /* Utility function that allows to move the kernel evaluation outside the spreader for - clarity - Inputs are: - ns = kernel width - kerevalmeth = kernel evaluation method - T = (single or double precision) type of the kernel - simd_type = xsimd::batch for Horner - vectorization (default is the optimal simd size) - finufft_spread_opts as Horner needs - the oversampling factor - elems = kernel arguments - Examples usage is - ker_eval(opts, x, y, z) // for 3D or - ker_eval(opts, x, y) // for 2D or - ker_eval(opts, x) // for 1D - */ - const std::array inputs{elems...}; - // compile time loop, no performance overhead - for (auto i = 0; i < sizeof...(elems); ++i) { - // compile time branch no performance overhead - if constexpr (kerevalmeth == 1) { - if (opts.upsampfac == 2.0) { - eval_kernel_vec_Horner(ker + (i * MAX_NSPREAD), inputs[i], - opts); - } - if (opts.upsampfac == 1.25) { - eval_kernel_vec_Horner(ker + (i * MAX_NSPREAD), inputs[i], - opts); - } +int spreadcheck(UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT M, FLT *kx, FLT *ky, FLT *kz, + const finufft_spread_opts &opts) +/* This does just the input checking and reporting for the spreader. + See spreadinterp() for input arguments and meaning of returned value. + Split out by Melody Shih, Jun 2018. Finiteness chk Barnett 7/30/18. + Marco Barbone 5.8.24 removed bounds check as new foldrescale is not limited to + [-3pi,3pi) +*/ +{ + // INPUT CHECKING & REPORTING .... cuboid not too small for spreading? 
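// Worked example (illustrative numbers, not from this patch): at the widest
// kernel ns = MAX_NSPREAD = 16, minN below is 32, so e.g. a 2D problem with
// N1 = 24 is rejected; this is what guarantees the kernel wraps at most once.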
+ int minN = 2 * opts.nspread; + if (N1 < minN || (N2 > 1 && N2 < minN) || (N3 > 1 && N3 < minN)) { + fprintf(stderr, + "%s error: one or more non-trivial box dims is less than 2.nspread!\n", + __func__); + return FINUFFT_ERR_SPREAD_BOX_SMALL; + } + if (opts.spread_direction != 1 && opts.spread_direction != 2) { + fprintf(stderr, "%s error: opts.spread_direction must be 1 or 2!\n", __func__); + return FINUFFT_ERR_SPREAD_DIR; + } + return 0; +} + +int indexSort(BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT M, + FLT *kx, FLT *ky, FLT *kz, const finufft_spread_opts &opts) +/* This makes a decision whether or not to sort the NU pts (influenced by + opts.sort), and if yes, calls either single- or multi-threaded bin sort, + writing reordered index list to sort_indices. If decided not to sort, the + identity permutation is written to sort_indices. + The permutation is designed to make RAM access close to contiguous, to + speed up spreading/interpolation, in the case of disordered NU points. + + Inputs: + M - number of input NU points. + kx,ky,kz - length-M arrays of real coords of NU pts. Domain is [-pi, pi), + points outside are folded in. + (only kz used in 1D, only kx and ky used in 2D.) + N1,N2,N3 - integer sizes of overall box (set N2=N3=1 for 1D, N3=1 for 2D). + 1 = x (fastest), 2 = y (medium), 3 = z (slowest). + opts - spreading options struct, see ../include/finufft_spread_opts.h + Outputs: + sort_indices - a good permutation of NU points. (User must preallocate + to length M.) Ie, kx[sort_indices[j]], j=0,..,M-1, is a good + ordering for the x-coords of NU pts, etc. + returned value - whether a sort was done (1) or not (0). + + Barnett 2017; split out by Melody Shih, Jun 2018. Barnett nthr logic 2024. +*/ +{ + CNTime timer{}; + uint8_t ndims = ndims_from_Ns(N1, N2, N3); + auto N = N1 * N2 * N3; // U grid (periodic box) sizes + + // heuristic binning box size for U grid... affects performance: + double bin_size_x = 16, bin_size_y = 4, bin_size_z = 4; + // put in heuristics based on cache sizes (only useful for single-thread) ? + + int better_to_sort = + !(ndims == 1 && (opts.spread_direction == 2 || (M > 1000 * N1))); // 1D small-N or + // dir=2 case: + // don't sort + + timer.start(); // if needed, sort all the NU pts... + int did_sort = 0; + auto maxnthr = MY_OMP_GET_MAX_THREADS(); // used if both below opts default + if (opts.nthreads > 0) + maxnthr = opts.nthreads; // user nthreads overrides, without limit + if (opts.sort_threads > 0) + maxnthr = opts.sort_threads; // high-priority override, also no limit + // At this point: maxnthr = the max threads sorting could use + // (we don't print warning here, since: no showwarn in spread_opts, and finufft + // already warned about it. spreadinterp-only advanced users will miss a warning) + if (opts.sort == 1 || (opts.sort == 2 && better_to_sort)) { + // store a good permutation ordering of all NU pts (dim=1,2 or 3) + int sort_debug = (opts.debug >= 2); // show timing output? + int sort_nthr = opts.sort_threads; // 0, or user max # threads for sort +#ifndef _OPENMP + sort_nthr = 1; // if single-threaded lib, override user +#endif + if (sort_nthr == 0) // multithreaded auto choice: when N>>M, one thread is better! + sort_nthr = (10 * M > N) ? 
maxnthr : 1; // heuristic + if (sort_nthr == 1) + bin_sort_singlethread(sort_indices, M, kx, ky, kz, N1, N2, N3, bin_size_x, + bin_size_y, bin_size_z, sort_debug); + else // sort_nthr>1, user fixes # threads (>=2) + bin_sort_multithread(sort_indices, M, kx, ky, kz, N1, N2, N3, bin_size_x, + bin_size_y, bin_size_z, sort_debug, sort_nthr); + if (opts.debug) + printf("\tsorted (%d threads):\t%.3g s\n", sort_nthr, timer.elapsedsec()); + did_sort = 1; + } else { +#pragma omp parallel for num_threads(maxnthr) schedule(static, 1000000) + for (BIGINT i = 0; i < M; i++) // here omp helps xeon, hinders i7 + sort_indices[i] = i; // the identity permutation + if (opts.debug) + printf("\tnot sorted (sort=%d): \t%.3g s\n", (int)opts.sort, timer.elapsedsec()); + } + return did_sort; +} + +// -------------------------------------------------------------------------- +static int spreadSorted(const BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, + FLT *FINUFFT_RESTRICT data_uniform, UBIGINT M, + FLT *FINUFFT_RESTRICT kx, FLT *FINUFFT_RESTRICT ky, + FLT *FINUFFT_RESTRICT kz, const FLT *data_nonuniform, + const finufft_spread_opts &opts, int did_sort) +// Spread NU pts in sorted order to a uniform grid. See spreadinterp() for doc. +{ + CNTime timer{}; + const auto ndims = ndims_from_Ns(N1, N2, N3); + const auto N = N1 * N2 * N3; // output array size + const auto ns = opts.nspread; // abbrev. for w, kernel width + auto nthr = MY_OMP_GET_MAX_THREADS(); // guess # threads to use to spread + if (opts.nthreads > 0) nthr = opts.nthreads; // user override, now without limit +#ifndef _OPENMP + nthr = 1; // single-threaded lib must override user +#endif + if (opts.debug) + printf("\tspread %dD (M=%lld; N1=%lld,N2=%lld,N3=%lld), nthr=%d\n", ndims, + (long long)M, (long long)N1, (long long)N2, (long long)N3, nthr); + timer.start(); + std::fill(data_uniform, data_uniform + 2 * N, 0.0); // zero the output array + if (opts.debug) printf("\tzero output array\t%.3g s\n", timer.elapsedsec()); + if (M == 0) // no NU pts, we're done + return 0; + + auto spread_single = (nthr == 1) || (M * 100 < N); // low-density heuristic? + spread_single = false; // for now + timer.start(); + if (spread_single) { // ------- Basic single-core t1 spreading ------ + for (UBIGINT j = 0; j < M; j++) { + // *** todo, not urgent + // ... (question is: will the index wrapping per NU pt slow it down?) } - if constexpr (kerevalmeth == 0) { - alignas(simd_type::arch_type::alignment()) std::array kernel_args{}; - set_kernel_args(kernel_args.data(), inputs[i]); - evaluate_kernel_vector(ker + (i * MAX_NSPREAD), kernel_args.data(), opts); + if (opts.debug) printf("\tt1 simple spreading:\t%.3g s\n", timer.elapsedsec()); + } else { // ------- Fancy multi-core blocked t1 spreading ---- + // Splits sorted inds (jfm's advanced2), could double RAM. + // choose nb (# subprobs) via used nthreads: + auto nb = std::min((UBIGINT)nthr, M); // simply split one subprob per thr... + if (nb * (BIGINT)opts.max_subproblem_size < M) { // ...or more subprobs to cap size + nb = 1 + (M - 1) / opts.max_subproblem_size; // int div does + // ceil(M/opts.max_subproblem_size) + if (opts.debug) + printf("\tcapping subproblem sizes to max of %d\n", opts.max_subproblem_size); } - } - return ker; -} + if (M * 1000 < N) { // low-density heuristic: one thread per NU pt! 
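// Worked example (illustrative numbers): M = 1000 NU pts on an N = 10^7 grid
// satisfies M*1000 = 10^6 < N, so below each point becomes its own subproblem
// (nb = M); run time is then dominated by zeroing and writing the big grid.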
+ nb = M; + if (opts.debug) printf("\tusing low-density speed rescue nb=M...\n"); + } + if (!did_sort && nthr == 1) { + nb = 1; + if (opts.debug) printf("\tunsorted nthr=1: forcing single subproblem...\n"); + } + if (opts.debug && nthr > opts.atomic_threshold) + printf("\tnthr big: switching add_wrapped OMP from critical to atomic (!)\n"); -namespace { + std::vector brk(nb + 1); // NU index breakpoints defining nb subproblems + for (int p = 0; p <= nb; ++p) brk[p] = (M * p + nb - 1) / nb; -template -constexpr array, N> pad_2D_array_with_zeros( - const array, N> &input) noexcept { - constexpr auto pad_with_zeros = [](const auto &input) constexpr noexcept { - std::array padded{0}; - for (auto i = 0; i < input.size(); ++i) { - padded[i] = input[i]; +#pragma omp parallel num_threads(nthr) + { + // local copies of NU pts and data for each subproblem + std::vector kx0{}, ky0{}, kz0{}, dd0{}, du0{}; +#pragma omp for schedule(dynamic, 1) // each is big + for (int isub = 0; isub < nb; isub++) { // Main loop through the subproblems + const auto M0 = brk[isub + 1] - brk[isub]; // # NU pts in this subproblem + // copy the location and data vectors for the nonuniform points + kx0.resize(M0); + ky0.resize(M0 * (N2 > 1)); + kz0.resize(M0 * (N3 > 1)); + dd0.resize(2 * M0); // complex strength data + for (auto j = 0; j < M0; j++) { // todo: can avoid this copying? + const auto kk = sort_indices[j + brk[isub]]; // NU pt from subprob index list + kx0[j] = fold_rescale(kx[kk], N1); + if (N2 > 1) ky0[j] = fold_rescale(ky[kk], N2); + if (N3 > 1) kz0[j] = fold_rescale(kz[kk], N3); + dd0[j * 2] = data_nonuniform[kk * 2]; // real part + dd0[j * 2 + 1] = data_nonuniform[kk * 2 + 1]; // imag part + } + // get the subgrid which will include padding by roughly nspread/2 + // get_subgrid sets + BIGINT offset1, offset2, offset3, padded_size1, size1, size2, size3; + // sets offsets and sizes + get_subgrid(offset1, offset2, offset3, padded_size1, size1, size2, size3, M0, + kx0.data(), ky0.data(), kz0.data(), ns, ndims); + if (opts.debug > 1) { + print_subgrid_info(ndims, offset1, offset2, offset3, padded_size1, size1, size2, + size3, M0); + } + // allocate output data for this subgrid + du0.resize(2 * padded_size1 * size2 * size3); // complex + // Spread to subgrid without need for bounds checking or wrapping + if (!(opts.flags & TF_OMIT_SPREADING)) { + if (ndims == 1) + spread_subproblem_1d(offset1, padded_size1, du0.data(), M0, kx0.data(), + dd0.data(), opts); + else if (ndims == 2) + spread_subproblem_2d(offset1, offset2, padded_size1, size2, du0.data(), M0, + kx0.data(), ky0.data(), dd0.data(), opts); + else + spread_subproblem_3d(offset1, offset2, offset3, padded_size1, size2, size3, + du0.data(), M0, kx0.data(), ky0.data(), kz0.data(), + dd0.data(), opts); + } + // do the adding of subgrid to output + if (!(opts.flags & TF_OMIT_WRITE_TO_GRID)) { + if (nthr > opts.atomic_threshold) { // see above for debug reporting + add_wrapped_subgrid(offset1, offset2, offset3, padded_size1, size1, + size2, size3, N1, N2, N3, data_uniform, + du0.data()); // R Blackwell's atomic version + } else { +#pragma omp critical + add_wrapped_subgrid(offset1, offset2, offset3, padded_size1, size1, + size2, size3, N1, N2, N3, data_uniform, + du0.data()); + } + } + } // end main loop over subprobs } - return padded; - }; - std::array, N> output{}; - for (std::size_t i = 0; i < N; ++i) { - output[i] = pad_with_zeros(input[i]); - } - return output; -} + if (opts.debug) + printf("\tt1 fancy spread: \t%.3g s (%ld subprobs)\n", timer.elapsedsec(), 
nb); + } // end of choice of which t1 spread type to use + return 0; +}; -template -constexpr T generate_sequence_impl(V a, V b, index_sequence) noexcept { - // utility function to generate a sequence of a, b interleaved as function arguments - return T(((Is % 2 == 0) ? a : b)...); -} +// -------------------------------------------------------------------------- +template +FINUFFT_NEVER_INLINE static int interpSorted_kernel( + const BIGINT *sort_indices, const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, + const FLT *data_uniform, const UBIGINT M, FLT *FINUFFT_RESTRICT kx, + FLT *FINUFFT_RESTRICT ky, FLT *FINUFFT_RESTRICT kz, + FLT *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts) +// Interpolate to NU pts in sorted order from a uniform grid. +// See spreadinterp() for doc. +{ + using simd_type = PaddedSIMD; + using arch_t = typename simd_type::arch_type; + static constexpr auto alignment = arch_t::alignment(); + static constexpr auto simd_size = simd_type::size; + static constexpr auto ns2 = ns * FLT(0.5); // half spread width, used as stencil shift -template -constexpr auto initialize_complex_register(V a, V b) noexcept { - // populates a SIMD register with a and b interleaved - // for example: - // +-------------------------------+ - // | a | b | a | b | a | b | a | b | - // +-------------------------------+ - // it uses index_sequence to generate the sequence of a, b at compile time - return generate_sequence_impl(a, b, std::make_index_sequence{}); -} + CNTime timer{}; + const auto ndims = ndims_from_Ns(N1, N2, N3); + auto nthr = MY_OMP_GET_MAX_THREADS(); // guess # threads to use to interp + if (opts.nthreads > 0) nthr = opts.nthreads; // user override, now without limit +#ifndef _OPENMP + nthr = 1; // single-threaded lib must override user +#endif + if (opts.debug) + printf("\tinterp %dD (M=%lld; N1=%lld,N2=%lld,N3=%lld), nthr=%d\n", ndims, + (long long)M, (long long)N1, (long long)N2, (long long)N3, nthr); + timer.start(); +#pragma omp parallel num_threads(nthr) + { + static constexpr auto CHUNKSIZE = simd_size; // number of targets per chunk + alignas(alignment) UBIGINT jlist[CHUNKSIZE]; + alignas(alignment) FLT xjlist[CHUNKSIZE], yjlist[CHUNKSIZE], zjlist[CHUNKSIZE]; + alignas(alignment) FLT outbuf[2 * CHUNKSIZE]; + // Kernels: static alloc is faster, so we do it for up to 3D... + alignas(alignment) std::array kernel_values{0}; + auto *FINUFFT_RESTRICT ker1 = kernel_values.data(); + auto *FINUFFT_RESTRICT ker2 = kernel_values.data() + MAX_NSPREAD; + auto *FINUFFT_RESTRICT ker3 = kernel_values.data() + 2 * MAX_NSPREAD; -// Below there is some template metaprogramming magic to find the best SIMD type -// for the given number of elements. The code is based on the xsimd library + // Loop over interpolation chunks + // main loop over NU trgs, interp each from U + // (note: windows omp doesn't like unsigned loop vars) +#pragma omp for schedule(dynamic, 1000) // assign threads to NU targ pts: + for (BIGINT i = 0; i < M; i += CHUNKSIZE) { + // Setup buffers for this chunk + const UBIGINT bufsize = (i + CHUNKSIZE > M) ? 
M - i : CHUNKSIZE; + for (int ibuf = 0; ibuf < bufsize; ibuf++) { + UBIGINT j = sort_indices[i + ibuf]; + jlist[ibuf] = j; + xjlist[ibuf] = fold_rescale(kx[j], N1); + if (ndims >= 2) yjlist[ibuf] = fold_rescale(ky[j], N2); + if (ndims == 3) zjlist[ibuf] = fold_rescale(kz[j], N3); + } -// this finds the largest SIMD instruction set that can handle N elements -// void otherwise -> compile error -template constexpr auto BestSIMDHelper() { - if constexpr (N % K == 0) { // returns void in the worst case - return xsimd::make_sized_batch{}; - } else { - return BestSIMDHelper> 1)>(); - } -} + // Loop over targets in chunk + for (int ibuf = 0; ibuf < bufsize; ibuf++) { + const auto xj = xjlist[ibuf]; + const auto yj = (ndims > 1) ? yjlist[ibuf] : 0; + const auto zj = (ndims > 2) ? zjlist[ibuf] : 0; -template constexpr uint8_t min_simd_width() { - // finds the smallest simd width that can handle N elements - // simd size is batch size the SIMD width in xsimd terminology - if constexpr (std::is_void_v>) { - return min_simd_width(); - } else { - return N; - } -}; + auto *FINUFFT_RESTRICT target = outbuf + 2 * ibuf; -template constexpr auto find_optimal_simd_width() { - // finds the smallest simd width that minimizes the number of iterations - // NOTE: might be suboptimal for some cases 2^N+1 for example - // in the future we might want to implement a more sophisticated algorithm - uint8_t optimal_simd_width = min_simd_width(); - uint8_t min_iterations = (N + optimal_simd_width - 1) / optimal_simd_width; - for (uint8_t simd_width = optimal_simd_width; - simd_width <= xsimd::batch::size; - simd_width *= 2) { - uint8_t iterations = (N + simd_width - 1) / simd_width; - if (iterations < min_iterations) { - min_iterations = iterations; - optimal_simd_width = simd_width; - } - } - return optimal_simd_width; -} + // coords (x,y,z), spread block corner index (i1,i2,i3) of current NU targ + const auto i1 = BIGINT(std::ceil(xj - ns2)); // leftmost grid index + const auto i2 = (ndims > 1) ? BIGINT(std::ceil(yj - ns2)) : 0; // min y grid index + const auto i3 = (ndims > 2) ? BIGINT(std::ceil(zj - ns2)) : 0; // min z grid index + + const auto x1 = std::ceil(xj - ns2) - xj; // shift of ker center, in [-w/2,-w/2+1] + const auto x2 = (ndims > 1) ? std::ceil(yj - ns2) - yj : 0; + const auto x3 = (ndims > 2) ? std::ceil(zj - ns2) - zj : 0; + + // eval kernel values patch and use to interpolate from uniform data... 
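// Worked example (illustrative numbers): for ns = 7 (so ns2 = 3.5) and a
// target at xj = 10.2, i1 = ceil(10.2 - 3.5) = 7 is the leftmost grid index
// read, and x1 = 7 - 10.2 = -3.2 lies in [-w/2, -w/2+1] = [-3.5, -2.5] as
// stated above; ker1 then holds the ns kernel weights at offsets x1, x1+1,
// ..., x1+6.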
+ if (!(opts.flags & TF_OMIT_SPREADING)) { + switch (ndims) { + case 1: + ker_eval(kernel_values.data(), opts, x1); + interp_line(target, data_uniform, ker1, i1, N1); + break; + case 2: + ker_eval(kernel_values.data(), opts, x1, x2); + interp_square(target, data_uniform, ker1, ker2, i1, i2, N1, + N2); + break; + case 3: + ker_eval(kernel_values.data(), opts, x1, x2, + x3); + interp_cube(target, data_uniform, ker1, ker2, ker3, i1, i2, i3, + N1, N2, N3); + break; + default: // can't get here + FINUFFT_UNREACHABLE; + break; + } + } + } // end loop over targets in chunk -template constexpr auto GetPaddedSIMDWidth() { - // helper function to get the SIMD width with padding for the given number of elements - // that minimizes the number of iterations - return xsimd::make_sized_batch()>::type::size; -} + // Copy result buffer to output array + for (int ibuf = 0; ibuf < bufsize; ibuf++) { + const UBIGINT j = jlist[ibuf]; + data_nonuniform[2 * j] = outbuf[2 * ibuf]; + data_nonuniform[2 * j + 1] = outbuf[2 * ibuf + 1]; + } -template constexpr auto get_padding() { - // helper function to get the padding for the given number of elements - // ns is known at compile time, rounds ns to the next multiple of the SIMD width - // then subtracts ns to get the padding using a bitwise and trick - // WARING: this trick works only for power of 2s - // SOURCE: Agner Fog's VCL manual - constexpr uint8_t width = GetPaddedSIMDWidth(); - return ((ns + width - 1) & (-width)) - ns; + } // end NU targ loop + } // end parallel section + if (opts.debug) printf("\tt2 spreading loop: \t%.3g s\n", timer.elapsedsec()); + return 0; } -template constexpr auto get_padding_helper(uint8_t runtime_ns) { - // helper function to get the padding for the given number of elements where ns is - // known at runtime, it uses recursion to find the padding - // this allows to avoid having a function with a large number of switch cases - // as GetPaddedSIMDWidth requires a compile time value - // it cannot be a lambda function because of the template recursion - if constexpr (ns < 2) { - return 0; +template +static int interpSorted_dispatch( + const BIGINT *sort_indices, const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, + FLT *FINUFFT_RESTRICT data_uniform, const UBIGINT M, FLT *FINUFFT_RESTRICT kx, + FLT *FINUFFT_RESTRICT ky, FLT *FINUFFT_RESTRICT kz, + FLT *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts) { + static_assert(MIN_NSPREAD <= NS && NS <= MAX_NSPREAD, + "NS must be in the range (MIN_NSPREAD, MAX_NSPREAD)"); + if constexpr (NS == MIN_NSPREAD) { // Base case + if (opts.kerevalmeth) + return interpSorted_kernel( + sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, data_nonuniform, opts); + else { + return interpSorted_kernel( + sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, data_nonuniform, opts); + } } else { - if (runtime_ns == ns) { - return get_padding(); + if (opts.nspread == NS) { + if (opts.kerevalmeth) { + return interpSorted_kernel(sort_indices, N1, N2, N3, data_uniform, M, + kx, ky, kz, data_nonuniform, opts); + } else { + return interpSorted_kernel(sort_indices, N1, N2, N3, data_uniform, M, + kx, ky, kz, data_nonuniform, opts); + } } else { - return get_padding_helper(runtime_ns); + return interpSorted_dispatch(sort_indices, N1, N2, N3, data_uniform, M, kx, + ky, kz, data_nonuniform, opts); } } } -template uint8_t get_padding(uint8_t ns) { - // return the padding as a function of the number of elements - // 2 * MAX_NSPREAD is the maximum number of elements that we can have - // that's why 
is hardcoded here - return get_padding_helper(ns); +static int interpSorted( + const BIGINT *sort_indices, const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, + FLT *FINUFFT_RESTRICT data_uniform, const UBIGINT M, FLT *FINUFFT_RESTRICT kx, + FLT *FINUFFT_RESTRICT ky, FLT *FINUFFT_RESTRICT kz, + FLT *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts) { + return interpSorted_dispatch(sort_indices, N1, N2, N3, data_uniform, M, kx, + ky, kz, data_nonuniform, opts); } -struct zip_low { - // helper struct to get the lower half of a SIMD register and zip it with itself - // it returns index 0, 0, 1, 1, ... N/2, N/2 - static constexpr unsigned get(unsigned index, unsigned /*size*/) { return index / 2; } -}; -struct zip_hi { - // helper struct to get the upper half of a SIMD register and zip it with itself - // it returns index N/2, N/2, N/2+1, N/2+1, ... N, N - static constexpr unsigned get(unsigned index, unsigned size) { - return (size + index) / 2; +int spreadinterpSorted(const BIGINT *sort_indices, const UBIGINT N1, const UBIGINT N2, + const UBIGINT N3, FLT *data_uniform, const UBIGINT M, + FLT *FINUFFT_RESTRICT kx, FLT *FINUFFT_RESTRICT ky, + FLT *FINUFFT_RESTRICT kz, FLT *FINUFFT_RESTRICT data_nonuniform, + const finufft_spread_opts &opts, int did_sort) +/* Logic to select the main spreading (dir=1) vs interpolation (dir=2) routine. + See spreadinterp() above for inputs arguments and definitions. + Return value should always be 0 (no error reporting). + Split out by Melody Shih, Jun 2018; renamed Barnett 5/20/20. +*/ +{ + if (opts.spread_direction == 1) // ========= direction 1 (spreading) ======= + spreadSorted(sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, data_nonuniform, + opts, did_sort); + + else // ================= direction 2 (interpolation) =========== + interpSorted(sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, data_nonuniform, + opts); + + return 0; +} + +/////////////////////////////////////////////////////////////////////////// + +int setup_spreader(finufft_spread_opts &opts, FLT eps, double upsampfac, int kerevalmeth, + int debug, int showwarn, int dim) +/* Initializes spreader kernel parameters given desired NUFFT tolerance eps, + upsampling factor (=sigma in paper, or R in Dutt-Rokhlin), ker eval meth + (either 0:exp(sqrt()), 1: Horner ppval), and some debug-level flags. + Also sets all default options in finufft_spread_opts. See finufft_spread_opts.h for + opts. dim is spatial dimension (1,2, or 3). See finufft.cpp:finufft_plan() for where + upsampfac is set. Must call this before any kernel evals done, otherwise segfault + likely. Returns: 0 : success FINUFFT_WARN_EPS_TOO_SMALL : requested eps cannot be + achieved, but proceed with best possible eps otherwise : failure (see codes in defs.h); + spreading must not proceed Barnett 2017. debug, loosened eps logic 6/14/20. +*/ +{ + if (upsampfac != 2.0 && upsampfac != 1.25) { // nonstandard sigma + if (kerevalmeth == 1) { + fprintf(stderr, + "FINUFFT setup_spreader: nonstandard upsampfac=%.3g cannot be handled by " + "kerevalmeth=1\n", + upsampfac); + return FINUFFT_ERR_HORNER_WRONG_BETA; + } + if (upsampfac <= 1.0) { // no digits would result + fprintf(stderr, "FINUFFT setup_spreader: error, upsampfac=%.3g is <=1.0\n", + upsampfac); + return FINUFFT_ERR_UPSAMPFAC_TOO_SMALL; + } + // calling routine must abort on above errors, since opts is garbage! 
+ if (showwarn && upsampfac > 4.0) + fprintf(stderr, + "FINUFFT setup_spreader warning: upsampfac=%.3g way too large to be " + "beneficial.\n", + upsampfac); } -}; -template struct reverse_index { - static constexpr unsigned get(unsigned index, const unsigned size) { - return index < cap ? (cap - 1 - index) : index; + + // write out default finufft_spread_opts (some overridden in setup_spreader_for_nufft) + opts.spread_direction = 0; // user should always set to 1 or 2 as desired + opts.sort = 2; // 2:auto-choice + opts.kerpad = 0; // affects only evaluate_kernel_vector + opts.kerevalmeth = kerevalmeth; + opts.upsampfac = upsampfac; + opts.nthreads = 0; // all avail + opts.sort_threads = 0; // 0:auto-choice + // heuristic dir=1 chunking for nthr>>1, typical for intel i7 and skylake... + opts.max_subproblem_size = (dim == 1) ? 10000 : 100000; + opts.flags = 0; // 0:no timing flags (>0 for experts only) + opts.debug = 0; // 0:no debug output + // heuristic nthr above which switch OMP critical to atomic (add_wrapped...): + opts.atomic_threshold = 10; // R Blackwell's value + + int ns, ier = 0; // Set kernel width w (aka ns, nspread) then copy to opts... + if (eps < EPSILON) { // safety; there's no hope of beating e_mach + if (showwarn) + fprintf(stderr, "%s warning: increasing tol=%.3g to eps_mach=%.3g.\n", __func__, + (double)eps, (double)EPSILON); + eps = EPSILON; // only changes local copy (not any opts) + ier = FINUFFT_WARN_EPS_TOO_SMALL; } -}; -template struct shuffle_index { - static constexpr unsigned get(unsigned index, const unsigned size) { - return index < cap ? (cap - 1 - index) : size + size + cap - 1 - index; + if (upsampfac == 2.0) // standard sigma (see SISC paper) + ns = std::ceil(-log10(eps / (FLT)10.0)); // 1 digit per power of 10 + else // custom sigma + ns = std::ceil(-log(eps) / (PI * sqrt(1.0 - 1.0 / upsampfac))); // formula, gam=1 + ns = max(2, ns); // (we don't have ns=1 version yet) + if (ns > MAX_NSPREAD) { // clip to fit allocated arrays, Horner rules + if (showwarn) + fprintf(stderr, + "%s warning: at upsampfac=%.3g, tol=%.3g would need kernel width ns=%d; " + "clipping to max %d.\n", + __func__, upsampfac, (double)eps, ns, MAX_NSPREAD); + ns = MAX_NSPREAD; + ier = FINUFFT_WARN_EPS_TOO_SMALL; } -}; - -struct select_even { - static constexpr unsigned get(unsigned index, unsigned /*size*/) { return index * 2; } -}; -struct select_odd { - static constexpr unsigned get(unsigned index, unsigned /*size*/) { - return index * 2 + 1; + opts.nspread = ns; + // setup for reference kernel eval (via formula): select beta width param... + // (even when kerevalmeth=1, this ker eval needed for FTs in onedim_*_kernel) + opts.ES_halfwidth = (double)ns / 2; // constants to help (see below routines) + opts.ES_c = 4.0 / (double)(ns * ns); + double betaoverns = 2.30; // gives decent betas for default sigma=2.0 + if (ns == 2) betaoverns = 2.20; // some small-width tweaks... + if (ns == 3) betaoverns = 2.26; + if (ns == 4) betaoverns = 2.38; + if (upsampfac != 2.0) { // again, override beta for custom sigma + FLT gamma = 0.97; // must match devel/gen_all_horner_C_code.m ! 
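// Worked example (illustrative): for eps = 1e-6, the standard sigma = 2.0 rule
// above gives ns = ceil(-log10(1e-7)) = 7, while sigma = 1.25 gives
// ns = ceil(13.82 / (pi*sqrt(0.2))) = ceil(9.83) = 10; beta then follows from
// the gamma-scaled formula on the next line.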
+ betaoverns = gamma * PI * (1.0 - 1.0 / (2 * upsampfac)); // formula based on cutoff } -}; + opts.ES_beta = betaoverns * ns; // set the kernel beta parameter + if (debug) + printf("%s (kerevalmeth=%d) eps=%.3g sigma=%.3g: chose ns=%d beta=%.3g\n", __func__, + kerevalmeth, (double)eps, upsampfac, ns, opts.ES_beta); -template auto xsimd_to_array(const T &vec) noexcept { - constexpr auto alignment = T::arch_type::alignment(); - alignas(alignment) std::array array{}; - vec.store_aligned(array.data()); - return array; + return ier; } -void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset3, - UBIGINT padded_size1, UBIGINT size1, UBIGINT size2, UBIGINT size3, - UBIGINT M0) { - printf("size1 %ld, padded_size1 %ld\n", size1, padded_size1); - switch (ndims) { - case 1: - printf("\tsubgrid: off %lld\t siz %lld\t #NU %lld\n", (long long)offset1, - (long long)padded_size1, (long long)M0); - break; - case 2: - printf("\tsubgrid: off %lld,%lld\t siz %lld,%lld\t #NU %lld\n", (long long)offset1, - (long long)offset2, (long long)padded_size1, (long long)size2, (long long)M0); - break; - case 3: - printf("\tsubgrid: off %lld,%lld,%lld\t siz %lld,%lld,%lld\t #NU %lld\n", - (long long)offset1, (long long)offset2, (long long)offset3, - (long long)padded_size1, (long long)size2, (long long)size3, (long long)M0); - break; - default: - printf("Invalid number of dimensions: %d\n", ndims); - break; - } +FLT evaluate_kernel(FLT x, const finufft_spread_opts &opts) +/* ES ("exp sqrt") kernel evaluation at single real argument: + phi(x) = exp(beta.(sqrt(1 - (2x/n_s)^2) - 1)), for |x| < nspread/2 + related to an asymptotic approximation to the Kaiser--Bessel, itself an + approximation to prolate spheroidal wavefunction (PSWF) of order 0. + This is the "reference implementation", used by eg finufft/onedim_* 2/17/17. 
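(Check: at x = 0 the formula gives exp(beta*(sqrt(1) - 1)) = exp(0) = 1,
consistent with the rescaling noted next.)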
+ Rescaled so max is 1, Barnett 7/21/24 +*/ +{ + if (abs(x) >= (FLT)opts.ES_halfwidth) + // if spreading/FT careful, shouldn't need this if, but causes no speed hit + return 0.0; + else + return exp((FLT)opts.ES_beta * (sqrt((FLT)1.0 - (FLT)opts.ES_c * x * x) - (FLT)1.0)); } -} // namespace + } // namespace finufft::spreadinterp From 7de00c8073c567ce1d4f2c8d556328724f2b7e74 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Tue, 24 Sep 2024 14:58:25 +0200 Subject: [PATCH 02/20] simplify simple interfaces code --- include/finufft_eitherprec.h | 4 +- src/finufft.cpp | 4 +- src/simpleinterfaces.cpp | 218 ++++++++++------------------------- 3 files changed, 67 insertions(+), 159 deletions(-) diff --git a/include/finufft_eitherprec.h b/include/finufft_eitherprec.h index 47f7860e1..3f0a7d95c 100644 --- a/include/finufft_eitherprec.h +++ b/include/finufft_eitherprec.h @@ -86,8 +86,8 @@ typedef struct FINUFFT_PLAN_S *FINUFFT_PLAN; FINUFFT_EXPORT void FINUFFT_CDECL FINUFFTIFY(_default_opts)(finufft_opts *o); FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_makeplan)( - int type, int dim, FINUFFT_BIGINT *n_modes, int iflag, int n_transf, FINUFFT_FLT tol, - FINUFFT_PLAN *plan, finufft_opts *o); + int type, int dim, const FINUFFT_BIGINT *n_modes, int iflag, int n_transf, + FINUFFT_FLT tol, FINUFFT_PLAN *plan, finufft_opts *o); FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_setpts)( FINUFFT_PLAN plan, FINUFFT_BIGINT M, FINUFFT_FLT *xj, FINUFFT_FLT *yj, FINUFFT_FLT *zj, FINUFFT_BIGINT N, FINUFFT_FLT *s, FINUFFT_FLT *t, FINUFFT_FLT *u); diff --git a/src/finufft.cpp b/src/finufft.cpp index 21e6db7ab..f80fc0bf3 100644 --- a/src/finufft.cpp +++ b/src/finufft.cpp @@ -528,8 +528,8 @@ void FINUFFT_DEFAULT_OPTS(finufft_opts *o) } // PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP -int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans, FLT tol, - FINUFFT_PLAN *pp, finufft_opts *opts) +int FINUFFT_MAKEPLAN(int type, int dim, const BIGINT *n_modes, int iflag, int ntrans, + FLT tol, FINUFFT_PLAN *pp, finufft_opts *opts) // Populates the fields of finufft_plan which is pointed to by "pp". // opts is ptr to a finufft_opts to set options, or NULL to use defaults. // For some of the fields (if "auto" selected) here choose the actual setting. diff --git a/src/simpleinterfaces.cpp b/src/simpleinterfaces.cpp index edd25adfb..1fb49db9e 100644 --- a/src/simpleinterfaces.cpp +++ b/src/simpleinterfaces.cpp @@ -1,6 +1,7 @@ // public header #include // private headers +#include #include #include using namespace std; @@ -23,14 +24,15 @@ using namespace std; namespace finufft { namespace common { -int invokeGuruInterface(int n_dims, int type, int n_transf, BIGINT nj, FLT *xj, FLT *yj, - FLT *zj, CPX *cj, int iflag, FLT eps, BIGINT *n_modes, BIGINT nk, - FLT *s, FLT *t, FLT *u, CPX *fk, finufft_opts *popts) +static int invokeGuruInterface(int n_dims, int type, int n_transf, BIGINT nj, FLT *xj, + FLT *yj, FLT *zj, CPX *cj, int iflag, FLT eps, + const std::array &n_modes, BIGINT nk, FLT *s, + FLT *t, FLT *u, CPX *fk, finufft_opts *popts) // Helper layer between simple interfaces (with opts) and the guru functions. // Author: Andrea Malleo, 2019. { FINUFFT_PLAN plan; - int ier = FINUFFT_MAKEPLAN(type, n_dims, n_modes, iflag, n_transf, eps, &plan, + int ier = FINUFFT_MAKEPLAN(type, n_dims, n_modes.data(), iflag, n_transf, eps, &plan, popts); // popts (ptr to opts) can be NULL if (ier > 1) { // since 1 (a warning) still allows proceeding... 
fprintf(stderr, "FINUFFT invokeGuru: plan error (ier=%d)!\n", ier); @@ -63,219 +65,122 @@ using namespace finufft::common; // Dimension 1111111111111111111111111111111111111111111111111111111111111111 -int FINUFFT1D1(BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, BIGINT ms, CPX *fk, - finufft_opts *opts) -// Type-1 1D complex nonuniform FFT. See ../docs/usage.rst -{ - BIGINT n_modes[] = {ms, 1, 1}; - int n_dims = 1; - int n_transf = 1; - int type = 1; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag, - eps, n_modes, 0, NULL, NULL, NULL, fk, opts); - return ier; -} - int FINUFFT1D1MANY(int n_transf, BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, BIGINT ms, CPX *fk, finufft_opts *opts) // Type-1 1D complex nonuniform FFT for many vectors. See ../docs/usage.rst { - BIGINT n_modes[] = {ms, 1, 1}; - int n_dims = 1; - int type = 1; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag, - eps, n_modes, 0, NULL, NULL, NULL, fk, opts); - return ier; + return invokeGuruInterface(1, 1, n_transf, nj, xj, nullptr, nullptr, cj, iflag, eps, + {ms, 1, 1}, 0, nullptr, nullptr, nullptr, fk, opts); } -int FINUFFT1D2(BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, BIGINT ms, CPX *fk, +int FINUFFT1D1(BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, BIGINT ms, CPX *fk, finufft_opts *opts) -// Type-2 1D complex nonuniform FFT. See ../docs/usage.rst +// Type-1 1D complex nonuniform FFT. See ../docs/usage.rst { - BIGINT n_modes[] = {ms, 1, 1}; - int n_dims = 1; - int n_transf = 1; - int type = 2; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag, - eps, n_modes, 0, NULL, NULL, NULL, fk, opts); - return ier; + return FINUFFT1D1MANY(1, nj, xj, cj, iflag, eps, ms, fk, opts); } int FINUFFT1D2MANY(int n_transf, BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, BIGINT ms, CPX *fk, finufft_opts *opts) // Type-2 1D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - BIGINT n_modes[] = {ms, 1, 1}; - int n_dims = 1; - int type = 2; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag, - eps, n_modes, 0, NULL, NULL, NULL, fk, opts); - return ier; + return invokeGuruInterface(1, 2, n_transf, nj, xj, NULL, NULL, cj, iflag, eps, + {ms, 1, 1}, 0, NULL, NULL, NULL, fk, opts); } -int FINUFFT1D3(BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, BIGINT nk, FLT *s, - CPX *fk, finufft_opts *opts) -// Type-3 1D complex nonuniform FFT. See ../docs/usage.rst +int FINUFFT1D2(BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, BIGINT ms, CPX *fk, + finufft_opts *opts) +// Type-2 1D complex nonuniform FFT. See ../docs/usage.rst { - int n_dims = 1; - int n_transf = 1; - int type = 3; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag, - eps, NULL, nk, s, NULL, NULL, fk, opts); - return ier; + return FINUFFT1D2MANY(1, nj, xj, cj, iflag, eps, ms, fk, opts); } int FINUFFT1D3MANY(int n_transf, BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, BIGINT nk, FLT *s, CPX *fk, finufft_opts *opts) // Type-3 1D complex nonuniform FFT, many vectors. 
See ../docs/usage.rst { - int n_dims = 1; - int type = 3; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag, - eps, NULL, nk, s, NULL, NULL, fk, opts); - return ier; + return invokeGuruInterface(1, 3, n_transf, nj, xj, NULL, NULL, cj, iflag, eps, + {0, 0, 0}, nk, s, NULL, NULL, fk, opts); } - -// Dimension 22222222222222222222222222222222222222222222222222222222222222222 - -int FINUFFT2D1(BIGINT nj, FLT *xj, FLT *yj, CPX *cj, int iflag, FLT eps, BIGINT ms, - BIGINT mt, CPX *fk, finufft_opts *opts) -// Type-1 2D complex nonuniform FFT. See ../docs/usage.rst +int FINUFFT1D3(BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, BIGINT nk, FLT *s, + CPX *fk, finufft_opts *opts) +// Type-3 1D complex nonuniform FFT. See ../docs/usage.rst { - BIGINT n_modes[] = {ms, mt, 1}; - int n_dims = 2; - int n_transf = 1; - int type = 1; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj, iflag, eps, - n_modes, 0, NULL, NULL, NULL, fk, opts); - return ier; + return FINUFFT1D3MANY(1, nj, xj, cj, iflag, eps, nk, s, fk, opts); } +// Dimension 22222222222222222222222222222222222222222222222222222222222222222 + int FINUFFT2D1MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, CPX *c, int iflag, FLT eps, BIGINT ms, BIGINT mt, CPX *fk, finufft_opts *opts) // Type-1 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - BIGINT n_modes[] = {ms, mt, 1}; - int n_dims = 2; - int type = 1; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, c, iflag, eps, - n_modes, 0, NULL, NULL, NULL, fk, opts); - return ier; + return invokeGuruInterface(2, 1, n_transf, nj, xj, yj, NULL, c, iflag, eps, {ms, mt, 1}, + 0, NULL, NULL, NULL, fk, opts); } - -int FINUFFT2D2(BIGINT nj, FLT *xj, FLT *yj, CPX *cj, int iflag, FLT eps, BIGINT ms, +int FINUFFT2D1(BIGINT nj, FLT *xj, FLT *yj, CPX *cj, int iflag, FLT eps, BIGINT ms, BIGINT mt, CPX *fk, finufft_opts *opts) -// Type-2 2D complex nonuniform FFT. See ../docs/usage.rst +// Type-1 2D complex nonuniform FFT. See ../docs/usage.rst { - BIGINT n_modes[] = {ms, mt, 1}; - int n_dims = 2; - int n_transf = 1; - int type = 2; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj, iflag, eps, - n_modes, 0, NULL, NULL, NULL, fk, opts); - return ier; + return FINUFFT2D1MANY(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts); } int FINUFFT2D2MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, CPX *c, int iflag, FLT eps, BIGINT ms, BIGINT mt, CPX *fk, finufft_opts *opts) // Type-2 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - BIGINT n_modes[] = {ms, mt, 1}; - int n_dims = 2; - int type = 2; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, c, iflag, eps, - n_modes, 0, NULL, NULL, NULL, fk, opts); - return ier; + return invokeGuruInterface(2, 2, n_transf, nj, xj, yj, NULL, c, iflag, eps, {ms, mt, 1}, + 0, NULL, NULL, NULL, fk, opts); } - -int FINUFFT2D3(BIGINT nj, FLT *xj, FLT *yj, CPX *cj, int iflag, FLT eps, BIGINT nk, - FLT *s, FLT *t, CPX *fk, finufft_opts *opts) -// Type-3 2D complex nonuniform FFT. See ../docs/usage.rst +int FINUFFT2D2(BIGINT nj, FLT *xj, FLT *yj, CPX *cj, int iflag, FLT eps, BIGINT ms, + BIGINT mt, CPX *fk, finufft_opts *opts) +// Type-2 2D complex nonuniform FFT. 
See ../docs/usage.rst { - int n_dims = 2; - int type = 3; - int n_transf = 1; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj, iflag, eps, - NULL, nk, s, t, NULL, fk, opts); - return ier; + return FINUFFT2D2MANY(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts); } int FINUFFT2D3MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, CPX *cj, int iflag, FLT eps, BIGINT nk, FLT *s, FLT *t, CPX *fk, finufft_opts *opts) // Type-3 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - int n_dims = 2; - int type = 3; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj, iflag, eps, - NULL, nk, s, t, NULL, fk, opts); - return ier; + return invokeGuruInterface(2, 3, n_transf, nj, xj, yj, NULL, cj, iflag, eps, {0, 0, 0}, + nk, s, t, NULL, fk, opts); } - -// Dimension 3333333333333333333333333333333333333333333333333333333333333333 - -int FINUFFT3D1(BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, FLT eps, - BIGINT ms, BIGINT mt, BIGINT mu, CPX *fk, finufft_opts *opts) -// Type-1 3D complex nonuniform FFT. See ../docs/usage.rst +int FINUFFT2D3(BIGINT nj, FLT *xj, FLT *yj, CPX *cj, int iflag, FLT eps, BIGINT nk, + FLT *s, FLT *t, CPX *fk, finufft_opts *opts) +// Type-3 2D complex nonuniform FFT. See ../docs/usage.rst { - BIGINT n_modes[] = {ms, mt, mu}; - int n_dims = 3; - int n_transf = 1; - int type = 1; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps, - n_modes, 0, NULL, NULL, NULL, fk, opts); - return ier; + return FINUFFT2D3MANY(1, nj, xj, yj, cj, iflag, eps, nk, s, t, fk, opts); } +// Dimension 3333333333333333333333333333333333333333333333333333333333333333 + int FINUFFT3D1MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, FLT eps, BIGINT ms, BIGINT mt, BIGINT mu, CPX *fk, finufft_opts *opts) // Type-1 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - BIGINT n_modes[] = {ms, mt, mu}; - int n_dims = 3; - int type = 1; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps, - n_modes, 0, NULL, NULL, NULL, fk, opts); - return ier; + return invokeGuruInterface(3, 1, n_transf, nj, xj, yj, zj, cj, iflag, eps, {ms, mt, mu}, + 0, NULL, NULL, NULL, fk, opts); } - -int FINUFFT3D2(BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, FLT eps, +int FINUFFT3D1(BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, FLT eps, BIGINT ms, BIGINT mt, BIGINT mu, CPX *fk, finufft_opts *opts) -// Type-2 3D complex nonuniform FFT. See ../docs/usage.rst +// Type-1 3D complex nonuniform FFT. See ../docs/usage.rst { - BIGINT n_modes[] = {ms, mt, mu}; - int n_dims = 3; - int n_transf = 1; - int type = 2; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps, - n_modes, 0, NULL, NULL, NULL, fk, opts); - return ier; + return FINUFFT3D1MANY(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts); } int FINUFFT3D2MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, FLT eps, BIGINT ms, BIGINT mt, BIGINT mu, CPX *fk, finufft_opts *opts) // Type-2 3D complex nonuniform FFT, many vectors. 
See ../docs/usage.rst { - BIGINT n_modes[] = {ms, mt, mu}; - n_modes[0] = ms; - n_modes[1] = mt; - n_modes[2] = mu; - int n_dims = 3; - int type = 2; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps, - n_modes, 0, NULL, NULL, NULL, fk, opts); - return ier; + return invokeGuruInterface(3, 2, n_transf, nj, xj, yj, zj, cj, iflag, eps, {ms, mt, mu}, + 0, NULL, NULL, NULL, fk, opts); } - -int FINUFFT3D3(BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, FLT eps, - BIGINT nk, FLT *s, FLT *t, FLT *u, CPX *fk, finufft_opts *opts) -// Type-3 3D complex nonuniform FFT. See ../docs/usage.rst +int FINUFFT3D2(BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, FLT eps, + BIGINT ms, BIGINT mt, BIGINT mu, CPX *fk, finufft_opts *opts) +// Type-2 3D complex nonuniform FFT. See ../docs/usage.rst { - int n_dims = 3; - int n_transf = 1; - int type = 3; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps, - NULL, nk, s, t, u, fk, opts); - return ier; + return FINUFFT3D2MANY(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts); } int FINUFFT3D3MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, @@ -283,9 +188,12 @@ int FINUFFT3D3MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, finufft_opts *opts) // Type-3 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - int n_dims = 3; - int type = 3; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps, - NULL, nk, s, t, u, fk, opts); - return ier; + return invokeGuruInterface(3, 3, n_transf, nj, xj, yj, zj, cj, iflag, eps, {0, 0, 0}, + nk, s, t, u, fk, opts); +} +int FINUFFT3D3(BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, FLT eps, + BIGINT nk, FLT *s, FLT *t, FLT *u, CPX *fk, finufft_opts *opts) +// Type-3 3D complex nonuniform FFT. See ../docs/usage.rst +{ + return FINUFFT3D3MANY(1, nj, xj, yj, zj, cj, iflag, eps, nk, s, t, u, fk, opts); } From 7b4d8e677067c1ebb263b79183197449eb32c49c Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Tue, 24 Sep 2024 16:00:05 +0200 Subject: [PATCH 03/20] templatize spreadinterp --- CMakeLists.txt | 9 +- include/finufft/spreadinterp.h | 26 +- include/finufft/utils.h | 13 + src/spreadinterp.cpp | 613 ++++++++++++++++++--------------- 4 files changed, 363 insertions(+), 298 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 423c8adc4..8446d5500 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -121,9 +121,8 @@ endif() # This set of sources is compiled twice, once in single precision and once in # double precision The single precision compilation is done with -DSINGLE -set(FINUFFT_PRECISION_DEPENDENT_SOURCES - src/finufft.cpp src/fft.cpp src/simpleinterfaces.cpp src/spreadinterp.cpp - src/utils.cpp) +set(FINUFFT_PRECISION_DEPENDENT_SOURCES src/finufft.cpp src/fft.cpp + src/simpleinterfaces.cpp src/utils.cpp) # If we're building for Fortran, make sure we also include the translation # layer. 
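The rearrangement in the first patch above reduces each single-transform entry point to a one-line forward to its MANY counterpart with n_transf = 1, so the argument marshalling lives in exactly one place per dimension and transform type. A minimal sketch of the pattern, using hypothetical demo names rather than the real FINUFFT signatures:

#include <complex>

using BIGINT = long long;            // stand-in for FINUFFT's BIGINT
using CPX    = std::complex<double>; // stand-in for CPX

// The "many vectors" variant owns the real logic (it would forward to the
// guru interface); shown here as a stub.
int demo1d1many(int n_transf, BIGINT nj, const double *xj, const CPX *cj,
                int iflag, double eps, BIGINT ms, CPX *fk) {
  (void)n_transf; (void)nj; (void)xj; (void)cj;
  (void)iflag; (void)eps; (void)ms; (void)fk;
  return 0;
}

// The single-transform wrapper is now a thin shim: no duplicated setup code.
int demo1d1(BIGINT nj, const double *xj, const CPX *cj, int iflag, double eps,
            BIGINT ms, CPX *fk) {
  return demo1d1many(1, nj, xj, cj, iflag, eps, ms, fk);
}

Any future change to the argument handling then touches only the MANY variant.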
@@ -259,10 +258,10 @@ if(FINUFFT_USE_CPU) add_library(finufft_f64 OBJECT ${FINUFFT_PRECISION_DEPENDENT_SOURCES}) set_finufft_options(finufft_f64) if(NOT FINUFFT_STATIC_LINKING) - add_library(finufft SHARED src/utils_precindep.cpp + add_library(finufft SHARED src/spreadinterp.cpp src/utils_precindep.cpp contrib/legendre_rule_fast.cpp) else() - add_library(finufft STATIC src/utils_precindep.cpp + add_library(finufft STATIC src/spreadinterp.cpp src/utils_precindep.cpp contrib/legendre_rule_fast.cpp) endif() target_link_libraries(finufft PRIVATE finufft_f32 finufft_f64) diff --git a/include/finufft/spreadinterp.h b/include/finufft/spreadinterp.h index 7dbdd4cf1..6851b14d7 100644 --- a/include/finufft/spreadinterp.h +++ b/include/finufft/spreadinterp.h @@ -31,22 +31,28 @@ namespace finufft { namespace spreadinterp { // things external (spreadinterp) interface needs... +template FINUFFT_EXPORT int FINUFFT_CDECL spreadinterp( - UBIGINT N1, UBIGINT N2, UBIGINT N3, FLT *data_uniform, UBIGINT N, FLT *kx, FLT *ky, - FLT *kz, FLT *data_nonuniform, const finufft_spread_opts &opts); + UBIGINT N1, UBIGINT N2, UBIGINT N3, T *data_uniform, UBIGINT N, T *kx, T *ky, T *kz, + T *data_nonuniform, const finufft_spread_opts &opts); +template FINUFFT_EXPORT int FINUFFT_CDECL spreadcheck(UBIGINT N1, UBIGINT N2, UBIGINT N3, - UBIGINT N, FLT *kx, FLT *ky, FLT *kz, + UBIGINT N, T *kx, T *ky, T *kz, const finufft_spread_opts &opts); +template FINUFFT_EXPORT int FINUFFT_CDECL indexSort(BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, - UBIGINT N3, UBIGINT N, FLT *kx, FLT *ky, - FLT *kz, const finufft_spread_opts &opts); + UBIGINT N3, UBIGINT N, T *kx, T *ky, T *kz, + const finufft_spread_opts &opts); +template FINUFFT_EXPORT int FINUFFT_CDECL spreadinterpSorted( const BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, - FLT *FINUFFT_RESTRICT data_uniform, UBIGINT N, FLT *FINUFFT_RESTRICT kx, - FLT *FINUFFT_RESTRICT ky, FLT *FINUFFT_RESTRICT kz, - FLT *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts, int did_sort); -FINUFFT_EXPORT FLT FINUFFT_CDECL evaluate_kernel(FLT x, const finufft_spread_opts &opts); -FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(finufft_spread_opts &opts, FLT eps, + T *FINUFFT_RESTRICT data_uniform, UBIGINT N, T *FINUFFT_RESTRICT kx, + T *FINUFFT_RESTRICT ky, T *FINUFFT_RESTRICT kz, T *FINUFFT_RESTRICT data_nonuniform, + const finufft_spread_opts &opts, int did_sort); +template +FINUFFT_EXPORT T FINUFFT_CDECL evaluate_kernel(T x, const finufft_spread_opts &opts); +template +FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(finufft_spread_opts &opts, T eps, double upsampfac, int kerevalmeth, int debug, int showwarn, int dim); diff --git a/include/finufft/utils.h b/include/finufft/utils.h index 9039fee96..2758c726e 100644 --- a/include/finufft/utils.h +++ b/include/finufft/utils.h @@ -15,6 +15,19 @@ FINUFFT_EXPORT FLT FINUFFT_CDECL errtwonorm(BIGINT n, CPX *a, CPX *b); FINUFFT_EXPORT FLT FINUFFT_CDECL twonorm(BIGINT n, CPX *a); FINUFFT_EXPORT FLT FINUFFT_CDECL infnorm(BIGINT n, CPX *a); FINUFFT_EXPORT void FINUFFT_CDECL arrayrange(BIGINT n, FLT *a, FLT *lo, FLT *hi); +template +void arrayrange(BIGINT n, T *a, T *lo, T *hi) +// With a a length-n array, writes out min(a) to lo and max(a) to hi, +// so that all a values lie in [lo,hi]. +// If n==0, lo and hi are not finite. 
+{ + *lo = INFINITY; + *hi = -INFINITY; + for (BIGINT m = 0; m < n; ++m) { + if (a[m] < *lo) *lo = a[m]; + if (a[m] > *hi) *hi = a[m]; + } +} FINUFFT_EXPORT void FINUFFT_CDECL indexedarrayrange(BIGINT n, BIGINT *i, FLT *a, FLT *lo, FLT *hi); FINUFFT_EXPORT void FINUFFT_CDECL arraywidcen(BIGINT n, FLT *a, FLT *w, FLT *c); diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index eff98d05c..c98a5801a 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -1,5 +1,4 @@ -// Spreading/interpolating module within FINUFFT. Uses precision-switching -// macros for FLT, CPX, etc. +// Spreading/interpolating module within FINUFFT. #include #include @@ -149,18 +148,18 @@ constexpr auto initialize_complex_register(V a, V b) noexcept { // it uses index_sequence to generate the sequence of a, b at compile time return generate_sequence_impl(a, b, std::make_index_sequence{}); } -template +template constexpr auto zip_low_index = - xsimd::make_batch_constant, arch_t, zip_low>(); -template + xsimd::make_batch_constant, arch_t, zip_low>(); +template constexpr auto zip_hi_index = - xsimd::make_batch_constant, arch_t, zip_hi>(); -template -constexpr auto select_even_mask = - xsimd::make_batch_constant, arch_t, select_even>(); -template -constexpr auto select_odd_mask = - xsimd::make_batch_constant, arch_t, select_odd>(); + xsimd::make_batch_constant, arch_t, zip_hi>(); +// template +// constexpr auto select_even_mask = +// xsimd::make_batch_constant, arch_t, select_even>(); +// template +// constexpr auto select_odd_mask = +// xsimd::make_batch_constant, arch_t, select_odd>(); template constexpr std::array, N> pad_2D_array_with_zeros( const std::array, N> &input) noexcept { @@ -218,28 +217,29 @@ void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset Martin Reinecke, 8.5.2024 used floor to speedup the function and removed the range limitation Marco Barbone, 8.5.2024 Changed it from a Macro to an inline function */ -static FINUFFT_ALWAYS_INLINE FLT fold_rescale(const FLT x, const UBIGINT N) noexcept { - static constexpr const FLT x2pi = FLT(M_1_2PI); - const FLT result = x * x2pi + FLT(0.5); - return (result - floor(result)) * FLT(N); +template +static FINUFFT_ALWAYS_INLINE T fold_rescale(const T x, const UBIGINT N) noexcept { + static constexpr const T x2pi = T(M_1_2PI); + const T result = x * x2pi + T(0.5); + return (result - floor(result)) * T(N); } -template +template static FINUFFT_ALWAYS_INLINE simd_type fold_rescale(const simd_type &x, const BIGINT N) noexcept { - const simd_type x2pi = FLT(M_1_2PI); + const simd_type x2pi = T(M_1_2PI); const simd_type result = xsimd::fma(x, x2pi, simd_type(0.5)); - return (result - xsimd::floor(result)) * simd_type(FLT(N)); + return (result - xsimd::floor(result)) * simd_type(T(N)); } -template -static void set_kernel_args(FLT *args, FLT x) noexcept +template +static void set_kernel_args(T *args, T x) noexcept // Fills vector args[] with kernel arguments x, x+1, ..., x+ns-1. // needed for the vectorized kernel eval of Ludvig af K. { - for (int i = 0; i < ns; i++) args[i] = x + (FLT)i; + for (int i = 0; i < ns; i++) args[i] = x + T(i); } -template -static void evaluate_kernel_vector(FLT *ker, FLT *args, +template +static void evaluate_kernel_vector(T *ker, T *args, const finufft_spread_opts &opts) noexcept /* Evaluate ES kernel for a vector of N arguments; by Ludvig af K. 
If opts.kerpad true, args and ker must be allocated for Npad, and args is @@ -251,8 +251,8 @@ static void evaluate_kernel_vector(FLT *ker, FLT *args, works for arbitrary beta. Formula must match reference implementation. */ { - FLT b = (FLT)opts.ES_beta; - FLT c = (FLT)opts.ES_c; + T b = (T)opts.ES_beta; + T c = (T)opts.ES_c; if (!(opts.flags & TF_OMIT_EVALUATE_KERNEL)) { // Note (by Ludvig af K): Splitting kernel evaluation into two loops // seems to benefit auto-vectorization. @@ -265,7 +265,7 @@ static void evaluate_kernel_vector(FLT *ker, FLT *args, } for (int i = 0; i < Npad; i++) { // Loop 1: Compute exponential arguments // care! 1.0 is double... - ker[i] = b * (sqrt((FLT)1.0 - c * args[i] * args[i]) - (FLT)1.0); + ker[i] = b * (sqrt((T)1.0 - c * args[i] * args[i]) - (T)1.0); } if (!(opts.flags & TF_OMIT_EVALUATE_EXPONENTIAL)) for (int i = 0; i < Npad; i++) // Loop 2: Compute exponentials @@ -281,15 +281,14 @@ static void evaluate_kernel_vector(FLT *ker, FLT *args, } // Separate check from arithmetic (Is this really needed? doesn't slow down) for (int i = 0; i < N; i++) - if (abs(args[i]) >= (FLT)opts.ES_halfwidth) ker[i] = 0.0; + if (abs(args[i]) >= (T)opts.ES_halfwidth) ker[i] = 0.0; } -// static FINUFFT_ALWAYS_INLINE void set_kernel_args( -// FLT *args, FLT x, const finufft_spread_opts &opts) noexcept; -template()>> // aka ns + xsimd::make_sized_batch_t()>> // aka ns static FINUFFT_ALWAYS_INLINE void eval_kernel_vec_Horner( - FLT *FINUFFT_RESTRICT ker, FLT x, const finufft_spread_opts &opts) noexcept + T *FINUFFT_RESTRICT ker, T x, const finufft_spread_opts &opts) noexcept /* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at x_j = x + j, for j=0,..,w-1. Thus x in [-w/2,-w/2+1]. w is aka ns. This is the current evaluation method, since it's faster (except i7 w=16). @@ -297,23 +296,23 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ { // scale so local grid offset z in[-1,1] - const FLT z = std::fma(FLT(2.0), x, FLT(w - 1)); + const T z = std::fma(T(2.0), x, T(w - 1)); using arch_t = typename simd_type::arch_type; static constexpr auto alignment = arch_t::alignment(); static constexpr auto simd_size = simd_type::size; static constexpr auto padded_ns = (w + simd_size - 1) & ~(simd_size - 1); static constexpr auto horner_coeffs = []() constexpr noexcept { if constexpr (upsampfact == 200) { - return get_horner_coeffs_200(); + return get_horner_coeffs_200(); } else if constexpr (upsampfact == 125) { - return get_horner_coeffs_125(); + return get_horner_coeffs_125(); } }(); static constexpr auto nc = horner_coeffs.size(); static constexpr auto use_ker_sym = (simd_size < w); alignas(alignment) static constexpr auto padded_coeffs = - pad_2D_array_with_zeros(horner_coeffs); + pad_2D_array_with_zeros(horner_coeffs); // use kernel symmetry trick if w > simd_size if constexpr (use_ker_sym) { @@ -327,10 +326,10 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ // some xsimd constant for shuffle or inverse static constexpr auto shuffle_batch = []() constexpr noexcept { if constexpr (tail) { - return xsimd::make_batch_constant, arch_t, + return xsimd::make_batch_constant, arch_t, shuffle_index>(); } else { - return xsimd::make_batch_constant, arch_t, + return xsimd::make_batch_constant, arch_t, reverse_index>(); } }(); @@ -381,15 +380,15 @@ Two upsampfacs implemented. Params must match ref formula. 
Barnett 4/24/18 */ } } -template -static void interp_line_wrap(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker, +template +static void interp_line_wrap(T *FINUFFT_RESTRICT target, const T *du, const T *ker, const BIGINT i1, const UBIGINT N1) { /* This function is called when the kernel wraps around the grid. It is slower than interp_line. M. Barbone July 2024: - moved the logic to a separate function - using fused multiply-add (fma) for better performance */ - std::array out{0}; + std::array out{0}; BIGINT j = i1; if (i1 < 0) { // wraps at left j += BIGINT(N1); @@ -424,9 +423,9 @@ static void interp_line_wrap(FLT *FINUFFT_RESTRICT target, const FLT *du, const target[1] = out[1]; } -template> -static void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker, - BIGINT i1, UBIGINT N1) { +template> +static void interp_line(T *FINUFFT_RESTRICT target, const T *du, const T *ker, BIGINT i1, + UBIGINT N1) { /* 1D interpolate complex values from size-ns block of the du (uniform grid data) array to a single complex output value "target", using as weights the 1d kernel evaluation list ker1. @@ -447,16 +446,16 @@ static void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT * limitation */ using arch_t = typename simd_type::arch_type; - static constexpr auto padding = get_padding(); + static constexpr auto padding = get_padding(); static constexpr auto alignment = arch_t::alignment(); static constexpr auto simd_size = simd_type::size; static constexpr auto regular_part = (2 * ns + padding) & (-(2 * simd_size)); - std::array out{0}; + std::array out{0}; const auto j = i1; // removing the wrapping leads up to 10% speedup in certain cases // moved the wrapping to another function to reduce instruction cache pressure if (i1 < 0 || i1 + ns >= N1 || i1 + ns + (padding + 1) / 2 >= N1) { - return interp_line_wrap(target, du, ker, i1, N1); + return interp_line_wrap(target, du, ker, i1, N1); } else { // doesn't wrap // logic largely similar to spread 1D kernel, please see the explanation there // for the first part of this code @@ -467,8 +466,8 @@ static void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT * const auto ker_v = simd_type::load_aligned(ker + dx / 2); const auto du_pt0 = simd_type::load_unaligned(du_ptr + dx); const auto du_pt1 = simd_type::load_unaligned(du_ptr + dx + simd_size); - const auto ker0low = xsimd::swizzle(ker_v, zip_low_index); - const auto ker0hi = xsimd::swizzle(ker_v, zip_hi_index); + const auto ker0low = xsimd::swizzle(ker_v, zip_low_index); + const auto ker0hi = xsimd::swizzle(ker_v, zip_hi_index); res_low = xsimd::fma(ker0low, du_pt0, res_low); res_hi = xsimd::fma(ker0hi, du_pt1, res_hi); } @@ -476,7 +475,7 @@ static void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT * if constexpr (regular_part < 2 * ns) { const auto ker0 = simd_type::load_unaligned(ker + (regular_part / 2)); const auto du_pt = simd_type::load_unaligned(du_ptr + regular_part); - const auto ker0low = xsimd::swizzle(ker0, zip_low_index); + const auto ker0low = xsimd::swizzle(ker0, zip_low_index); res_low = xsimd::fma(ker0low, du_pt, res_low); } @@ -508,22 +507,22 @@ static void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT * target[1] = out[1]; } -template -static void interp_square_wrap(FLT *FINUFFT_RESTRICT target, const FLT *du, - const FLT *ker1, const FLT *ker2, const BIGINT i1, - const BIGINT i2, const UBIGINT N1, const UBIGINT N2) { +template +static void interp_square_wrap(T *FINUFFT_RESTRICT 
target, const T *du, const T *ker1, + const T *ker2, const BIGINT i1, const BIGINT i2, + const UBIGINT N1, const UBIGINT N2) { /* * This function is called when the kernel wraps around the grid. It is slower than * the non wrapping version. * There is an extra case for when ker is padded and spills over the du array. * In this case uses the old non wrapping version. */ - std::array out{0}; + std::array out{0}; using arch_t = typename simd_type::arch_type; static constexpr auto alignment = arch_t::alignment(); if (i1 >= 0 && i1 + ns <= N1 && i2 >= 0 && i2 + ns <= N2) { // store a horiz line (interleaved real,imag) - alignas(alignment) std::array line{0}; + alignas(alignment) std::array line{0}; // add remaining const-y lines to the line (expensive inner loop) for (uint8_t dy{0}; dy < ns; ++dy) { const auto *l_ptr = du + 2 * (N1 * (i2 + dy) + i1); // (see above) @@ -561,9 +560,9 @@ static void interp_square_wrap(FLT *FINUFFT_RESTRICT target, const FLT *du, target[1] = out[1]; } -template> -static void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, - const FLT *ker2, BIGINT i1, BIGINT i2, UBIGINT N1, UBIGINT N2) +template> +static void interp_square(T *FINUFFT_RESTRICT target, const T *du, const T *ker1, + const T *ker2, BIGINT i1, BIGINT i2, UBIGINT N1, UBIGINT N2) /* 2D interpolate complex values from a ns*ns block of the du (uniform grid data) array to a single complex output value "target", using as weights the ns*ns outer product of the 1d kernel lists ker1 and ker2. @@ -596,10 +595,10 @@ static void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT The code is largely similar to 1D interpolation, please see the explanation there */ { - std::array out{0}; + std::array out{0}; // no wrapping: avoid ptrs using arch_t = typename simd_type::arch_type; - static constexpr auto padding = get_padding(); + static constexpr auto padding = get_padding(); static constexpr auto alignment = arch_t::alignment(); static constexpr auto simd_size = simd_type::size; static constexpr uint8_t line_vectors = (2 * ns + padding) / simd_size; @@ -630,15 +629,15 @@ static void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT for (uint8_t i{0}; i < (line_vectors & ~1); // NOLINT(*-too-small-loop-variable) i += 2) { const auto ker1_v = simd_type::load_aligned(ker1 + i * simd_size / 2); - const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index); - const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index); + const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index); + const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index); res_low = xsimd::fma(ker1low, line[i], res_low); res_hi = xsimd::fma(ker1hi, line[i + 1], res_hi); } if constexpr (line_vectors % 2) { const auto ker1_v = simd_type::load_aligned(ker1 + (line_vectors - 1) * simd_size / 2); - const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index); + const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index); res_low = xsimd::fma(ker1low, line.back(), res_low); } return res_low + res_hi; @@ -651,17 +650,17 @@ static void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT } else { // wraps somewhere: use ptr list // this is slower than above, but occurs much less often, with fractional // rate O(ns/min(N1,N2)). Thus this code doesn't need to be so optimized. 
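// A scalar reference for the separable 2D interpolation above (illustration
// only, not part of the patch): per target, the SIMD code computes
//   out = sum_{dy} sum_{dx} ker2[dy] * ker1[dx] * du(i1+dx, i2+dy)
// over interleaved real/imag data. Written plainly, assuming no wrapping:
//
//   void interp_square_ref(double *target, const double *du,
//                          const double *ker1, const double *ker2,
//                          long i1, long i2, long N1, int ns) {
//     double re = 0, im = 0;
//     for (int dy = 0; dy < ns; ++dy) {
//       const double *row = du + 2 * (N1 * (i2 + dy) + i1); // start of line
//       for (int dx = 0; dx < ns; ++dx) {
//         const double w = ker1[dx] * ker2[dy]; // separable outer-product weight
//         re += w * row[2 * dx];
//         im += w * row[2 * dx + 1];
//       }
//     }
//     target[0] = re; target[1] = im;
//   }
//
// The vectorized path instead accumulates one whole interleaved line per dy
// and applies ker1 once at the end, which is why the rare wrapped fallback
// can afford to stay simple.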
- return interp_square_wrap(target, du, ker1, ker2, i1, i2, N1, N2); + return interp_square_wrap(target, du, ker1, ker2, i1, i2, N1, N2); } target[0] = out[0]; target[1] = out[1]; } -template -static void interp_cube_wrapped(FLT *FINUFFT_RESTRICT target, const FLT *du, - const FLT *ker1, const FLT *ker2, const FLT *ker3, - const BIGINT i1, const BIGINT i2, const BIGINT i3, - const UBIGINT N1, const UBIGINT N2, const UBIGINT N3) { +template +static void interp_cube_wrapped(T *FINUFFT_RESTRICT target, const T *du, const T *ker1, + const T *ker2, const T *ker3, const BIGINT i1, + const BIGINT i2, const BIGINT i3, const UBIGINT N1, + const UBIGINT N2, const UBIGINT N3) { /* * This function is called when the kernel wraps around the cube. * Similarly to 2D and 1D wrapping, this is slower than the non wrapping version. @@ -671,14 +670,14 @@ static void interp_cube_wrapped(FLT *FINUFFT_RESTRICT target, const FLT *du, const auto in_bounds_1 = (i1 >= 0) & (i1 + ns <= N1); const auto in_bounds_2 = (i2 >= 0) & (i2 + ns <= N2); const auto in_bounds_3 = (i3 >= 0) & (i3 + ns <= N3); - std::array out{0}; + std::array out{0}; // case no wrapping needed but padding spills over du array. // Hence, no explicit vectorization but the code is still faster if (FINUFFT_LIKELY(in_bounds_1 && in_bounds_2 && in_bounds_3)) { // no wrapping: avoid ptrs (by far the most common case) // store a horiz line (interleaved real,imag) // initialize line with zeros; hard to avoid here, but overhead small in 3D - alignas(alignment) std::array line{0}; + alignas(alignment) std::array line{0}; // co-add y and z contributions to line in x; do not apply x kernel yet // This is expensive innermost loop for (uint8_t dz{0}; dz < ns; ++dz) { @@ -730,9 +729,9 @@ static void interp_cube_wrapped(FLT *FINUFFT_RESTRICT target, const FLT *du, target[1] = out[1]; } -template> -static void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, - const FLT *ker2, const FLT *ker3, BIGINT i1, BIGINT i2, BIGINT i3, +template> +static void interp_cube(T *FINUFFT_RESTRICT target, const T *du, const T *ker1, + const T *ker2, const T *ker3, BIGINT i1, BIGINT i2, BIGINT i3, UBIGINT N1, UBIGINT N2, UBIGINT N3) /* 3D interpolate complex values from a ns*ns*ns block of the du (uniform grid data) array to a single complex output value "target", using as weights the @@ -764,7 +763,7 @@ static void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT * */ { using arch_t = typename simd_type::arch_type; - static constexpr auto padding = get_padding(); + static constexpr auto padding = get_padding(); static constexpr auto alignment = arch_t::alignment(); static constexpr auto simd_size = simd_type::size; static constexpr auto ker23_size = (ns + simd_size - 1) & -simd_size; @@ -772,7 +771,7 @@ static void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT * const auto in_bounds_1 = (i1 >= 0) & (i1 + ns <= N1); const auto in_bounds_2 = (i2 >= 0) & (i2 + ns <= N2); const auto in_bounds_3 = (i3 >= 0) & (i3 + ns <= N3); - std::array out{0}; + std::array out{0}; if (in_bounds_1 && in_bounds_2 && in_bounds_3 && (i1 + ns + (padding + 1) / 2 < N1)) { const auto line = [N1, N2, i1 = UBIGINT(i1), i2 = UBIGINT(i2), i3 = UBIGINT(i3), ker2, ker3, du]() constexpr noexcept { @@ -797,15 +796,15 @@ static void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT * for (uint8_t i{0}; i < (line_vectors & ~1); // NOLINT(*-too-small-loop-variable) i += 2) { const auto ker1_v = simd_type::load_aligned(ker1 + i * 
simd_size / 2); - const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index); - const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index); + const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index); + const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index); res_low = xsimd::fma(ker1low, line[i], res_low); res_hi = xsimd::fma(ker1hi, line[i + 1], res_hi); } if constexpr (line_vectors % 2) { const auto ker1_v = simd_type::load_aligned(ker1 + (line_vectors - 1) * simd_size / 2); - const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index); + const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index); res_low = xsimd::fma(ker1low, line.back(), res_low); } return res_low + res_hi; @@ -816,8 +815,8 @@ static void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT * out[1] += res_array[i + 1]; } } else { - return interp_cube_wrapped(target, du, ker1, ker2, ker3, i1, i2, i3, - N1, N2, N3); + return interp_cube_wrapped(target, du, ker1, ker2, ker3, i1, i2, i3, + N1, N2, N3); } target[0] = out[0]; target[1] = out[1]; @@ -826,9 +825,8 @@ static void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT * template()>, typename... V> -static FINUFFT_ALWAYS_INLINE auto ker_eval(FLT *FINUFFT_RESTRICT ker, - const finufft_spread_opts &opts, - const V... elems) noexcept { +static FINUFFT_ALWAYS_INLINE auto ker_eval( + T *FINUFFT_RESTRICT ker, const finufft_spread_opts &opts, const V... elems) noexcept { /* Utility function that allows to move the kernel evaluation outside the spreader for clarity Inputs are: @@ -851,27 +849,27 @@ static FINUFFT_ALWAYS_INLINE auto ker_eval(FLT *FINUFFT_RESTRICT ker, // compile time branch no performance overhead if constexpr (kerevalmeth == 1) { if (opts.upsampfac == 2.0) { - eval_kernel_vec_Horner(ker + (i * MAX_NSPREAD), inputs[i], - opts); + eval_kernel_vec_Horner(ker + (i * MAX_NSPREAD), inputs[i], + opts); } if (opts.upsampfac == 1.25) { - eval_kernel_vec_Horner(ker + (i * MAX_NSPREAD), inputs[i], - opts); + eval_kernel_vec_Horner(ker + (i * MAX_NSPREAD), inputs[i], + opts); } } if constexpr (kerevalmeth == 0) { alignas(simd_type::arch_type::alignment()) std::array kernel_args{}; - set_kernel_args(kernel_args.data(), inputs[i]); - evaluate_kernel_vector(ker + (i * MAX_NSPREAD), kernel_args.data(), opts); + set_kernel_args(kernel_args.data(), inputs[i]); + evaluate_kernel_vector(ker + (i * MAX_NSPREAD), kernel_args.data(), opts); } } return ker; } -template +template FINUFFT_NEVER_INLINE void spread_subproblem_1d_kernel( - const BIGINT off1, const UBIGINT size1, FLT *FINUFFT_RESTRICT du, const UBIGINT M, - const FLT *const kx, const FLT *const dd, const finufft_spread_opts &opts) noexcept { + const BIGINT off1, const UBIGINT size1, T *FINUFFT_RESTRICT du, const UBIGINT M, + const T *const kx, const T *const dd, const finufft_spread_opts &opts) noexcept { /* 1D spreader from nonuniform to uniform subproblem grid, without wrapping. Inputs: off1 - integer offset of left end of du subgrid from that of overall fine @@ -892,15 +890,15 @@ FINUFFT_NEVER_INLINE void spread_subproblem_1d_kernel( This needed off1 as extra arg. AHB 11/30/20. Vectorized using xsimd by M. Barbone 06/24. 
*/ - using simd_type = PaddedSIMD; + using simd_type = PaddedSIMD; using arch_t = typename simd_type::arch_type; - static constexpr auto padding = get_padding(); + static constexpr auto padding = get_padding(); static constexpr auto alignment = arch_t::alignment(); static constexpr auto simd_size = simd_type::size; - static constexpr auto ns2 = ns * FLT(0.5); // half spread width + static constexpr auto ns2 = ns * T(0.5); // half spread width // something weird here. Reversing ker{0} and std fill causes ker // to be zeroed inside the loop GCC uses AVX, clang AVX2 - alignas(alignment) std::array ker{0}; + alignas(alignment) std::array ker{0}; std::fill(du, du + 2 * size1, 0); // zero output // no padding needed if MAX_NSPREAD is 16 // the largest read is 16 floats with avx512 @@ -920,7 +918,7 @@ FINUFFT_NEVER_INLINE void spread_subproblem_1d_kernel( const auto dd_pt = initialize_complex_register(dd[i * 2], dd[i * 2 + 1]); // ceil offset, hence rounding, must match that in get_subgrid... const auto i1 = BIGINT(std::ceil(kx[i] - ns2)); // fine grid start index - // FLT(i1) has different semantics and results an extra cast + // T(i1) has different semantics and results an extra cast const auto x1 = [i, kx]() constexpr noexcept { auto x1 = std::ceil(kx[i] - ns2) - kx[i]; // x1 in [-w/2,-w/2+1], up to rounding // However if N1*epsmach>O(1) then can cause O(1) errors in x1, hence ppoly @@ -932,8 +930,8 @@ FINUFFT_NEVER_INLINE void spread_subproblem_1d_kernel( }(); // Libin improvement: pass ker as a parameter and allocate it outside the loop // gcc13 + 10% speedup - ker_eval(ker.data(), opts, x1); - // const auto ker = ker_eval(opts, x1); + ker_eval(ker.data(), opts, x1); + // const auto ker = ker_eval(opts, x1); const auto j = i1 - off1; // offset rel to subgrid, starts the output indices auto *FINUFFT_RESTRICT trg = du + 2 * j; // restrict helps compiler to vectorize // du is padded, so we can use SIMD even if we write more than ns values in du @@ -969,12 +967,12 @@ FINUFFT_NEVER_INLINE void spread_subproblem_1d_kernel( const auto du_pt1 = simd_type::load_unaligned(trg + dx + simd_size); // swizzle is faster than zip_lo(ker_v, ker_v) and zip_hi(ker_v, ker_v) // swizzle in this case is equivalent to zip_lo and zip_hi respectively - const auto ker0low = xsimd::swizzle(ker_v, zip_low_index); + const auto ker0low = xsimd::swizzle(ker_v, zip_low_index); // ker 0 looks like this now: // +-----------------------+ // |y0|y0|y1|y1|y2|y2|y3|y3| // +-----------------------+ - const auto ker0hi = xsimd::swizzle(ker_v, zip_hi_index); + const auto ker0hi = xsimd::swizzle(ker_v, zip_hi_index); // ker 1 looks like this now: // +-----------------------+ // |y4|y4|y5|y5|y6|y6|y7|y7| @@ -1001,17 +999,17 @@ FINUFFT_NEVER_INLINE void spread_subproblem_1d_kernel( // the corresponding memory is not accessed const auto ker0 = simd_type::load_unaligned(ker.data() + (regular_part / 2)); const auto du_pt = simd_type::load_unaligned(trg + regular_part); - const auto ker0low = xsimd::swizzle(ker0, zip_low_index); + const auto ker0low = xsimd::swizzle(ker0, zip_low_index); const auto res = xsimd::fma(ker0low, dd_pt, du_pt); res.store_unaligned(trg + regular_part); } } } -template +template static void spread_subproblem_1d_dispatch( - const BIGINT off1, const UBIGINT size1, FLT *FINUFFT_RESTRICT du, const UBIGINT M, - const FLT *kx, const FLT *dd, const finufft_spread_opts &opts) noexcept { + const BIGINT off1, const UBIGINT size1, T *FINUFFT_RESTRICT du, const UBIGINT M, + const T *kx, const T *dd, const finufft_spread_opts 
&opts) noexcept { /* this is a dispatch function that calls the correct kernel based on ns; it recursively iterates from MAX_NSPREAD down to MIN_NSPREAD and generates the following code: @@ -1044,27 +1042,29 @@ static void spread_subproblem_1d_dispatch( "NS must be in the range (MIN_NSPREAD, MAX_NSPREAD)"); if constexpr (NS == MIN_NSPREAD) { // Base case if (opts.kerevalmeth) - return spread_subproblem_1d_kernel(off1, size1, du, M, kx, dd, - opts); + return spread_subproblem_1d_kernel(off1, size1, du, M, kx, dd, + opts); else { - return spread_subproblem_1d_kernel(off1, size1, du, M, kx, dd, - opts); + return spread_subproblem_1d_kernel(off1, size1, du, M, kx, + dd, opts); } } else { if (opts.nspread == NS) { if (opts.kerevalmeth) { - return spread_subproblem_1d_kernel(off1, size1, du, M, kx, dd, opts); + return spread_subproblem_1d_kernel(off1, size1, du, M, kx, dd, opts); } else { - return spread_subproblem_1d_kernel(off1, size1, du, M, kx, dd, opts); + return spread_subproblem_1d_kernel(off1, size1, du, M, kx, dd, + opts); } } else { - return spread_subproblem_1d_dispatch(off1, size1, du, M, kx, dd, opts); + return spread_subproblem_1d_dispatch(off1, size1, du, M, kx, dd, opts); } } } -static void spread_subproblem_1d(BIGINT off1, UBIGINT size1, FLT *du, UBIGINT M, FLT *kx, - FLT *dd, const finufft_spread_opts &opts) noexcept +template +static void spread_subproblem_1d(BIGINT off1, UBIGINT size1, T *du, UBIGINT M, T *kx, + T *dd, const finufft_spread_opts &opts) noexcept /* spreader from dd (NU) to du (uniform) in 1D without wrapping. kx (size M) are NU locations in [off+ns/2,off+size-1-ns/2]. @@ -1073,14 +1073,14 @@ static void spread_subproblem_1d(BIGINT off1, UBIGINT size1, FLT *du, UBIGINT M, For algorithmic details see spread_subproblem_1d_kernel. */ { - spread_subproblem_1d_dispatch(off1, size1, du, M, kx, dd, opts); + spread_subproblem_1d_dispatch(off1, size1, du, M, kx, dd, opts); } -template +template FINUFFT_NEVER_INLINE static void spread_subproblem_2d_kernel( const BIGINT off1, const BIGINT off2, const UBIGINT size1, const UBIGINT size2, - FLT *FINUFFT_RESTRICT du, const UBIGINT M, const FLT *kx, const FLT *ky, - const FLT *dd, const finufft_spread_opts &opts) noexcept + T *FINUFFT_RESTRICT du, const UBIGINT M, const T *kx, const T *ky, const T *dd, + const finufft_spread_opts &opts) noexcept /* spreader from dd (NU) to du (uniform) in 2D without wrapping. See above docs/notes for spread_subproblem_2d. kx,ky (size M) are NU locations in [off+ns/2,off+size-1-ns/2] in both dims. @@ -1089,24 +1089,24 @@ FINUFFT_NEVER_INLINE static void spread_subproblem_2d_kernel( For algorithmic details see spread_subproblem_1d_kernel. */ { - using simd_type = PaddedSIMD; + using simd_type = PaddedSIMD; using arch_t = typename simd_type::arch_type; - static constexpr auto padding = get_padding(); + static constexpr auto padding = get_padding(); static constexpr auto simd_size = simd_type::size; static constexpr auto alignment = arch_t::alignment(); // Kernel values stored in consecutive memory. This allows us to compute // values in all three directions in a single kernel evaluation call.
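// A standalone sketch of the *_dispatch technique used above (hypothetical
// names, not part of the patch): the runtime kernel width ns is unrolled into
// a chain of compile-time candidates, so each supported width gets a fully
// specialized kernel without a hand-written switch.
//
//   template<int NS, typename T>
//   void do_work_kernel(const T * /*in*/) { /* NS known at compile time */ }
//
//   template<int NS, typename T>
//   void do_work_dispatch(int ns, const T *in) {
//     static_assert(NS >= 2, "NS exhausted");
//     if constexpr (NS == 2) {          // base case, like MIN_NSPREAD
//       do_work_kernel<2, T>(in);
//     } else {
//       if (ns == NS)                   // runtime value matches this candidate
//         do_work_kernel<NS, T>(in);
//       else                            // try the next smaller width
//         do_work_dispatch<NS - 1, T>(ns, in);
//     }
//   }
//
// The compiler flattens the recursion into a chain of integer comparisons,
// one per width between MIN_NSPREAD and MAX_NSPREAD.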
- static constexpr auto ns2 = ns * FLT(0.5); // half spread width - alignas(alignment) std::array kernel_values{0}; - std::fill(du, du + 2 * size1 * size2, 0); // initialized to 0 due to the padding - for (uint64_t pt = 0; pt < M; pt++) { // loop over NU pts + static constexpr auto ns2 = ns * T(0.5); // half spread width + alignas(alignment) std::array kernel_values{0}; + std::fill(du, du + 2 * size1 * size2, 0); // initialized to 0 due to the padding + for (uint64_t pt = 0; pt < M; pt++) { // loop over NU pts const auto dd_pt = initialize_complex_register(dd[pt * 2], dd[pt * 2 + 1]); // ceil offset, hence rounding, must match that in get_subgrid... const auto i1 = (BIGINT)std::ceil(kx[pt] - ns2); // fine grid start indices const auto i2 = (BIGINT)std::ceil(ky[pt] - ns2); - const auto x1 = (FLT)std::ceil(kx[pt] - ns2) - kx[pt]; - const auto x2 = (FLT)std::ceil(ky[pt] - ns2) - ky[pt]; - ker_eval(kernel_values.data(), opts, x1, x2); + const auto x1 = (T)std::ceil(kx[pt] - ns2) - kx[pt]; + const auto x2 = (T)std::ceil(ky[pt] - ns2) - ky[pt]; + ker_eval(kernel_values.data(), opts, x1, x2); const auto *ker1 = kernel_values.data(); const auto *ker2 = kernel_values.data() + MAX_NSPREAD; // Combine kernel with complex source value to simplify inner loop @@ -1136,8 +1136,8 @@ FINUFFT_NEVER_INLINE static void spread_subproblem_2d_kernel( for (uint8_t i = 0; i < (kerval_vectors & ~1); // NOLINT(*-too-small-loop-variable) i += 2) { const auto ker1_v = simd_type::load_aligned(ker1 + i * simd_size / 2); - const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index); - const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index); + const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index); + const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index); // this initializes the entire vector registers with the same value // the ker1val_v[i] looks like this: // +-----------------------+ @@ -1149,7 +1149,7 @@ FINUFFT_NEVER_INLINE static void spread_subproblem_2d_kernel( if constexpr (kerval_vectors % 2) { const auto ker1_v = simd_type::load_unaligned(ker1 + (kerval_vectors - 1) * simd_size / 2); - const auto res = xsimd::swizzle(ker1_v, zip_low_index) * dd_pt; + const auto res = xsimd::swizzle(ker1_v, zip_low_index) * dd_pt; ker1val_v[kerval_vectors - 1] = res; } return ker1val_v; @@ -1169,40 +1169,41 @@ FINUFFT_NEVER_INLINE static void spread_subproblem_2d_kernel( } } -template +template void spread_subproblem_2d_dispatch( const BIGINT off1, const BIGINT off2, const UBIGINT size1, const UBIGINT size2, - FLT *FINUFFT_RESTRICT du, const UBIGINT M, const FLT *kx, const FLT *ky, - const FLT *dd, const finufft_spread_opts &opts) { + T *FINUFFT_RESTRICT du, const UBIGINT M, const T *kx, const T *ky, const T *dd, + const finufft_spread_opts &opts) { static_assert(MIN_NSPREAD <= NS && NS <= MAX_NSPREAD, "NS must be in the range (MIN_NSPREAD, MAX_NSPREAD)"); if constexpr (NS == MIN_NSPREAD) { // Base case if (opts.kerevalmeth) - return spread_subproblem_2d_kernel(off1, off2, size1, size2, du, - M, kx, ky, dd, opts); + return spread_subproblem_2d_kernel(off1, off2, size1, size2, + du, M, kx, ky, dd, opts); else { - return spread_subproblem_2d_kernel(off1, off2, size1, size2, du, - M, kx, ky, dd, opts); + return spread_subproblem_2d_kernel(off1, off2, size1, size2, + du, M, kx, ky, dd, opts); } } else { if (opts.nspread == NS) { if (opts.kerevalmeth) { - return spread_subproblem_2d_kernel(off1, off2, size1, size2, du, M, kx, - ky, dd, opts); + return spread_subproblem_2d_kernel(off1, off2, size1, size2, du, M, + kx, ky, dd, 
opts); } else { - return spread_subproblem_2d_kernel(off1, off2, size1, size2, du, M, kx, - ky, dd, opts); + return spread_subproblem_2d_kernel(off1, off2, size1, size2, du, M, + kx, ky, dd, opts); } } else { - return spread_subproblem_2d_dispatch(off1, off2, size1, size2, du, M, kx, - ky, dd, opts); + return spread_subproblem_2d_dispatch(off1, off2, size1, size2, du, M, kx, + ky, dd, opts); } } } +template static void spread_subproblem_2d(BIGINT off1, BIGINT off2, UBIGINT size1, UBIGINT size2, - FLT *FINUFFT_RESTRICT du, UBIGINT M, const FLT *kx, - const FLT *ky, const FLT *dd, + T *FINUFFT_RESTRICT du, UBIGINT M, const T *kx, + const T *ky, const T *dd, const finufft_spread_opts &opts) noexcept /* spreader from dd (NU) to du (uniform) in 2D without wrapping. See above docs/notes for spread_subproblem_2d_kernel. @@ -1212,24 +1213,24 @@ static void spread_subproblem_2d(BIGINT off1, BIGINT off2, UBIGINT size1, UBIGIN For algorithmic details see spread_subproblem_1d_kernel. */ { - spread_subproblem_2d_dispatch(off1, off2, size1, size2, du, M, kx, ky, dd, - opts); + spread_subproblem_2d_dispatch(off1, off2, size1, size2, du, M, kx, ky, + dd, opts); } -template +template FINUFFT_NEVER_INLINE void spread_subproblem_3d_kernel( const BIGINT off1, const BIGINT off2, const BIGINT off3, const UBIGINT size1, - const UBIGINT size2, const UBIGINT size3, FLT *FINUFFT_RESTRICT du, const UBIGINT M, - const FLT *kx, const FLT *ky, const FLT *kz, const FLT *dd, + const UBIGINT size2, const UBIGINT size3, T *FINUFFT_RESTRICT du, const UBIGINT M, + const T *kx, const T *ky, const T *kz, const T *dd, const finufft_spread_opts &opts) noexcept { - using simd_type = PaddedSIMD; + using simd_type = PaddedSIMD; using arch_t = typename simd_type::arch_type; - static constexpr auto padding = get_padding(); + static constexpr auto padding = get_padding(); static constexpr auto simd_size = simd_type::size; static constexpr auto alignment = arch_t::alignment(); - static constexpr auto ns2 = ns * FLT(0.5); // half spread width - alignas(alignment) std::array kernel_values{0}; + static constexpr auto ns2 = ns * T(0.5); // half spread width + alignas(alignment) std::array kernel_values{0}; std::fill(du, du + 2 * size1 * size2 * size3, 0); for (uint64_t pt = 0; pt < M; pt++) { // loop over NU pts @@ -1242,7 +1243,7 @@ FINUFFT_NEVER_INLINE void spread_subproblem_3d_kernel( const auto x2 = std::ceil(ky[pt] - ns2) - ky[pt]; const auto x3 = std::ceil(kz[pt] - ns2) - kz[pt]; - ker_eval(kernel_values.data(), opts, x1, x2, x3); + ker_eval(kernel_values.data(), opts, x1, x2, x3); const auto *ker1 = kernel_values.data(); const auto *ker2 = kernel_values.data() + MAX_NSPREAD; const auto *ker3 = kernel_values.data() + 2 * MAX_NSPREAD; @@ -1260,8 +1261,8 @@ FINUFFT_NEVER_INLINE void spread_subproblem_3d_kernel( for (uint8_t i = 0; i < (kerval_vectors & ~1); // NOLINT(*-too-small-loop-variable i += 2) { const auto ker1_v = simd_type::load_aligned(ker1 + i * simd_size / 2); - const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index); - const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index); + const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index); + const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index); ker1val_v[i] = ker1low * dd_pt; ker1val_v[i + 1] = ker1hi * dd_pt; } @@ -1270,7 +1271,7 @@ if constexpr (kerval_vectors % 2) { const auto ker1_v = simd_type::load_unaligned(ker1 + (kerval_vectors - 1) * simd_size / 2); - const auto res = xsimd::swizzle(ker1_v, zip_low_index) *
dd_pt; + const auto res = xsimd::swizzle(ker1_v, zip_low_index) * dd_pt; ker1val_v[kerval_vectors - 1] = res; } return ker1val_v; @@ -1292,40 +1293,41 @@ FINUFFT_NEVER_INLINE void spread_subproblem_3d_kernel( } } -template -void spread_subproblem_3d_dispatch( - BIGINT off1, BIGINT off2, BIGINT off3, UBIGINT size1, UBIGINT size2, UBIGINT size3, - FLT *du, UBIGINT M, const FLT *kx, const FLT *ky, const FLT *kz, const FLT *dd, - const finufft_spread_opts &opts) noexcept { +template +void spread_subproblem_3d_dispatch(BIGINT off1, BIGINT off2, BIGINT off3, UBIGINT size1, + UBIGINT size2, UBIGINT size3, T *du, UBIGINT M, + const T *kx, const T *ky, const T *kz, const T *dd, + const finufft_spread_opts &opts) noexcept { static_assert(MIN_NSPREAD <= NS && NS <= MAX_NSPREAD, "NS must be in the range (MIN_NSPREAD, MAX_NSPREAD)"); if constexpr (NS == MIN_NSPREAD) { // Base case if (opts.kerevalmeth) - return spread_subproblem_3d_kernel( + return spread_subproblem_3d_kernel( off1, off2, off3, size1, size2, size3, du, M, kx, ky, kz, dd, opts); else { - return spread_subproblem_3d_kernel( + return spread_subproblem_3d_kernel( off1, off2, off3, size1, size2, size3, du, M, kx, ky, kz, dd, opts); } } else { if (opts.nspread == NS) { if (opts.kerevalmeth) { - return spread_subproblem_3d_kernel(off1, off2, off3, size1, size2, - size3, du, M, kx, ky, kz, dd, opts); + return spread_subproblem_3d_kernel( + off1, off2, off3, size1, size2, size3, du, M, kx, ky, kz, dd, opts); } else { - return spread_subproblem_3d_kernel(off1, off2, off3, size1, size2, - size3, du, M, kx, ky, kz, dd, opts); + return spread_subproblem_3d_kernel( + off1, off2, off3, size1, size2, size3, du, M, kx, ky, kz, dd, opts); } } else { - return spread_subproblem_3d_dispatch(off1, off2, off3, size1, size2, size3, - du, M, kx, ky, kz, dd, opts); + return spread_subproblem_3d_dispatch(off1, off2, off3, size1, size2, + size3, du, M, kx, ky, kz, dd, opts); } } } +template static void spread_subproblem_3d(BIGINT off1, BIGINT off2, BIGINT off3, UBIGINT size1, - UBIGINT size2, UBIGINT size3, FLT *du, UBIGINT M, - FLT *kx, FLT *ky, FLT *kz, FLT *dd, + UBIGINT size2, UBIGINT size3, T *du, UBIGINT M, T *kx, + T *ky, T *kz, T *dd, const finufft_spread_opts &opts) noexcept /* spreader from dd (NU) to du (uniform) in 3D without wrapping. See above docs/notes for spread_subproblem_2d. @@ -1334,15 +1336,15 @@ dd (size M complex) are complex source strengths du (size size1*size2*size3) is uniform complex output array */ { - spread_subproblem_3d_dispatch(off1, off2, off3, size1, size2, size3, du, M, - kx, ky, kz, dd, opts); + spread_subproblem_3d_dispatch(off1, off2, off3, size1, size2, size3, du, + M, kx, ky, kz, dd, opts); } -template +template static void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3, UBIGINT padded_size1, UBIGINT size1, UBIGINT size2, UBIGINT size3, UBIGINT N1, UBIGINT N2, UBIGINT N3, - FLT *FINUFFT_RESTRICT data_uniform, const FLT *du0) + T *FINUFFT_RESTRICT data_uniform, const T *du0) /* Add a large subgrid (du0) to output grid (data_uniform), with periodic wrapping to N1,N2,N3 box. offset1,2,3 give the offset of the subgrid from the lowest corner of output. 
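Concretely, the wrapped add has to map every subgrid coordinate to a fine-grid index modulo the box size; this is presumably the job of the o2/o3 index tables declared just below. A minimal sketch of that mapping, with a hypothetical helper name:

#include <cstdint>
#include <vector>

// For each subgrid index k in [0, size), return (offset + k) mod N, i.e. the
// wrapped destination index on the periodic fine grid.
std::vector<std::int64_t> wrapped_indices(std::int64_t offset,
                                          std::int64_t size, std::int64_t N) {
  std::vector<std::int64_t> idx(static_cast<std::size_t>(size));
  std::int64_t y = offset % N; // C++ '%' can be negative for negative offsets...
  if (y < 0) y += N;           // ...so shift into [0, N)
  for (std::int64_t k = 0; k < size; ++k) {
    idx[static_cast<std::size_t>(k)] = y;
    if (++y == N) y = 0;       // wrap around the box
  }
  return idx;
}

Precomputing these tables once per subgrid keeps the modulo logic out of the innermost accumulate loop.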
@@ -1354,7 +1356,7 @@ static void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3, */ { std::vector o2(size2), o3(size3); - static auto accumulate = [](FLT &a, FLT b) { + static auto accumulate = [](T &a, T b) { if constexpr (thread_safe) { // NOLINT(*-branch-clone) #pragma omp atomic a += b; @@ -1399,10 +1401,10 @@ static void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3, } } -static void bin_sort_singlethread(BIGINT *ret, UBIGINT M, const FLT *kx, const FLT *ky, - const FLT *kz, UBIGINT N1, UBIGINT N2, UBIGINT N3, - double bin_size_x, double bin_size_y, double bin_size_z, - int debug) +template +static void bin_sort_singlethread( + BIGINT *ret, UBIGINT M, const T *kx, const T *ky, const T *kz, UBIGINT N1, UBIGINT N2, + UBIGINT N3, double bin_size_x, double bin_size_y, double bin_size_z, int debug) /* Returns permutation of all nonuniform points with good RAM access, * ie less cache misses for spreading, in 1D, 2D, or 3D. Single-threaded version * @@ -1435,21 +1437,21 @@ static void bin_sort_singlethread(BIGINT *ret, UBIGINT M, const FLT *kx, const F // here the +1 is needed to allow round-off error causing i1=N1/bin_size_x, // for kx near +pi, ie foldrescale gives N1 (exact arith would be 0 to N1-1). // Note that round-off near kx=-pi stably rounds negative to i1=0. - const auto nbins1 = BIGINT(FLT(N1) / bin_size_x + 1); - const auto nbins2 = isky ? BIGINT(FLT(N2) / bin_size_y + 1) : 1; - const auto nbins3 = iskz ? BIGINT(FLT(N3) / bin_size_z + 1) : 1; + const auto nbins1 = BIGINT(T(N1) / bin_size_x + 1); + const auto nbins2 = isky ? BIGINT(T(N2) / bin_size_y + 1) : 1; + const auto nbins3 = iskz ? BIGINT(T(N3) / bin_size_z + 1) : 1; const auto nbins = nbins1 * nbins2 * nbins3; - const auto inv_bin_size_x = FLT(1.0 / bin_size_x); - const auto inv_bin_size_y = FLT(1.0 / bin_size_y); - const auto inv_bin_size_z = FLT(1.0 / bin_size_z); + const auto inv_bin_size_x = T(1.0 / bin_size_x); + const auto inv_bin_size_y = T(1.0 / bin_size_y); + const auto inv_bin_size_z = T(1.0 / bin_size_z); // count how many pts in each bin std::vector counts(nbins, 0); for (auto i = 0; i < M; i++) { // find the bin index in however many dims are needed - const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); - const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; - const auto i3 = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; + const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); + const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; + const auto i3 = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); ++counts[bin]; } @@ -1464,17 +1466,18 @@ static void bin_sort_singlethread(BIGINT *ret, UBIGINT M, const FLT *kx, const F for (auto i = 0; i < M; i++) { // find the bin index (again! but better than using RAM) - const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); - const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; - const auto i3 = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; + const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); + const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; + const auto i3 = iskz ? 
BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); ret[counts[bin]] = BIGINT(i); // fill the inverse map on the fly ++counts[bin]; // update the offsets } } +template static void bin_sort_multithread( - BIGINT *ret, UBIGINT M, FLT *kx, FLT *ky, FLT *kz, UBIGINT N1, UBIGINT N2, UBIGINT N3, + BIGINT *ret, UBIGINT M, T *kx, T *ky, T *kz, UBIGINT N1, UBIGINT N2, UBIGINT N3, double bin_size_x, double bin_size_y, double bin_size_z, int debug, int nthr) /* Mostly-OpenMP'ed version of bin_sort. For documentation see: bin_sort_singlethread. @@ -1510,9 +1513,9 @@ static void bin_sort_multithread( my_counts.resize(nbins, 0); // allocate counts[t], now in parallel region for (auto i = brk[t]; i < brk[t + 1]; i++) { // find the bin index in however many dims are needed - BIGINT i1 = fold_rescale(kx[i], N1) / bin_size_x, i2 = 0, i3 = 0; - if (isky) i2 = fold_rescale(ky[i], N2) / bin_size_y; - if (iskz) i3 = fold_rescale(kz[i], N3) / bin_size_z; + BIGINT i1 = fold_rescale(kx[i], N1) / bin_size_x, i2 = 0, i3 = 0; + if (isky) i2 = fold_rescale(ky[i], N2) / bin_size_y; + if (iskz) i3 = fold_rescale(kz[i], N3) / bin_size_z; const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); ++my_counts[bin]; // no clash btw threads } @@ -1533,9 +1536,9 @@ static void bin_sort_multithread( auto &my_counts(counts[t]); for (UBIGINT i = brk[t]; i < brk[t + 1]; i++) { // find the bin index (again! but better than using RAM) - UBIGINT i1 = fold_rescale(kx[i], N1) / bin_size_x, i2 = 0, i3 = 0; - if (isky) i2 = fold_rescale(ky[i], N2) / bin_size_y; - if (iskz) i3 = fold_rescale(kz[i], N3) / bin_size_z; + UBIGINT i1 = fold_rescale(kx[i], N1) / bin_size_x, i2 = 0, i3 = 0; + if (isky) i2 = fold_rescale(ky[i], N2) / bin_size_y; + if (iskz) i3 = fold_rescale(kz[i], N3) / bin_size_z; UBIGINT bin = i1 + nbins1 * (i2 + nbins2 * i3); ret[my_counts[bin]] = i; // inverse is offset for this NU pt and thread ++my_counts[bin]; // update the offsets; no thread clash @@ -1543,9 +1546,10 @@ static void bin_sort_multithread( } } +template static void get_subgrid(BIGINT &offset1, BIGINT &offset2, BIGINT &offset3, BIGINT &padded_size1, BIGINT &size1, BIGINT &size2, BIGINT &size3, - UBIGINT M, FLT *kx, FLT *ky, FLT *kz, int ns, int ndims) + UBIGINT M, T *kx, T *ky, T *kz, int ns, int ndims) /* Writes out the integer offsets and sizes of a "subgrid" (cuboid subset of Z^ndims) large enough to enclose all of the nonuniform points with (non-periodic) padding of half the kernel width ns to each side in @@ -1589,14 +1593,14 @@ static void get_subgrid(BIGINT &offset1, BIGINT &offset2, BIGINT &offset3, tests. */ { - FLT ns2 = (FLT)ns / 2; - FLT min_kx, max_kx; // 1st (x) dimension: get min/max of nonuniform points + T ns2 = (T)ns / 2; + T min_kx, max_kx; // 1st (x) dimension: get min/max of nonuniform points arrayrange(M, kx, &min_kx, &max_kx); offset1 = (BIGINT)std::ceil(min_kx - ns2); // min index touched by kernel size1 = (BIGINT)std::ceil(max_kx - ns2) - offset1 + ns; // int(ceil) first! 
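// Worked example of the extent arithmetic above (illustration only): with
// ns = 4 (so ns2 = 2) and nonuniform x-coords spanning min_kx = 0.3,
// max_kx = 10.7:
//   offset1 = ceil(0.3 - 2)             = ceil(-1.7) = -1  // leftmost touched index
//   size1   = ceil(10.7 - 2) - (-1) + 4 = 9 + 1 + 4   = 14
// so the subgrid covers indices -1 .. 12; the rightmost point's kernel starts
// at ceil(10.7 - 2) = 9 and touches 9 .. 12, as required. Taking int(ceil(.))
// before subtracting is what keeps both ends consistent with the identical
// rounding done in the spreading kernels.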
- padded_size1 = size1 + get_padding(2 * ns) / 2; + padded_size1 = size1 + get_padding(2 * ns) / 2; if (ndims > 1) { - FLT min_ky, max_ky; // 2nd (y) dimension: get min/max of nonuniform points + T min_ky, max_ky; // 2nd (y) dimension: get min/max of nonuniform points arrayrange(M, ky, &min_ky, &max_ky); offset2 = (BIGINT)std::ceil(min_ky - ns2); size2 = (BIGINT)std::ceil(max_ky - ns2) - offset2 + ns; @@ -1605,7 +1609,7 @@ static void get_subgrid(BIGINT &offset1, BIGINT &offset2, BIGINT &offset3, size2 = 1; } if (ndims > 2) { - FLT min_kz, max_kz; // 3rd (z) dimension: get min/max of nonuniform points + T min_kz, max_kz; // 3rd (z) dimension: get min/max of nonuniform points arrayrange(M, kz, &min_kz, &max_kz); offset3 = (BIGINT)std::ceil(min_kz - ns2); size3 = (BIGINT)std::ceil(max_kz - ns2) - offset3 + ns; @@ -1616,9 +1620,9 @@ static void get_subgrid(BIGINT &offset1, BIGINT &offset2, BIGINT &offset3, } // ========================================================================== -int spreadinterp(UBIGINT N1, UBIGINT N2, UBIGINT N3, FLT *data_uniform, UBIGINT M, - FLT *kx, FLT *ky, FLT *kz, FLT *data_nonuniform, - const finufft_spread_opts &opts) +template +int spreadinterp(UBIGINT N1, UBIGINT N2, UBIGINT N3, T *data_uniform, UBIGINT M, T *kx, + T *ky, T *kz, T *data_nonuniform, const finufft_spread_opts &opts) /* ------------Spreader/interpolator for 1, 2, or 3 dimensions -------------- If opts.spread_direction=1, evaluate, in the 1D case, @@ -1692,12 +1696,19 @@ int spreadinterp(UBIGINT N1, UBIGINT N2, UBIGINT N3, FLT *data_uniform, UBIGINT return FINUFFT_ERR_SPREAD_ALLOC; } int did_sort = indexSort(sort_indices, N1, N2, N3, M, kx, ky, kz, opts); - spreadinterpSorted(sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, - data_nonuniform, opts, did_sort); + spreadinterpSorted(sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, + data_nonuniform, opts, did_sort); free(sort_indices); return 0; } +template int spreadinterp(UBIGINT N1, UBIGINT N2, UBIGINT N3, float *data_uniform, + UBIGINT M, float *kx, float *ky, float *kz, + float *data_nonuniform, const finufft_spread_opts &opts); +template int spreadinterp(UBIGINT N1, UBIGINT N2, UBIGINT N3, double *data_uniform, + UBIGINT M, double *kx, double *ky, double *kz, + double *data_nonuniform, const finufft_spread_opts &opts); + static constexpr uint8_t ndims_from_Ns(const UBIGINT N1, const UBIGINT N2, const UBIGINT N3) /* rule for getting number of spreading dimensions from the list of Ns per dim. @@ -1707,7 +1718,8 @@ static constexpr uint8_t ndims_from_Ns(const UBIGINT N1, const UBIGINT N2, return 1 + (N2 > 1) + (N3 > 1); } -int spreadcheck(UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT M, FLT *kx, FLT *ky, FLT *kz, +template +int spreadcheck(UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT M, T *kx, T *ky, T *kz, const finufft_spread_opts &opts) /* This does just the input checking and reporting for the spreader. See spreadinterp() for input arguments and meaning of returned value. 
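The "template int spreadinterp(...)" lines above, one per precision, are the explicit-instantiation idiom this patch leans on: the templated definitions stay in the one compiled spreadinterp.cpp, and exactly the float and double symbols are emitted, replacing the old compile-twice-with-precision-macros scheme. A self-contained sketch of the idiom (hypothetical names):

// demo.cpp -- compiled once; no -DSINGLE / precision macros needed.
#include <cmath>

template<typename T>
T twonorm_demo(const T *a, long n) {
  T s = T(0);
  for (long i = 0; i < n; ++i) s += a[i] * a[i];
  return std::sqrt(s);
}

// Explicit instantiations: emit the float and double symbols so that callers
// seeing only a declaration in the header still link against them.
template float  twonorm_demo<float>(const float *, long);
template double twonorm_demo<double>(const double *, long);

The trade-off is that every supported precision must be listed by hand, but in exchange the template bodies never leak into the public header.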
@@ -1730,9 +1742,14 @@ int spreadcheck(UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT M, FLT *kx, FLT *ky, } return 0; } - -int indexSort(BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT M, - FLT *kx, FLT *ky, FLT *kz, const finufft_spread_opts &opts) +template int spreadcheck(UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT M, float *kx, + float *ky, float *kz, const finufft_spread_opts &opts); +template int spreadcheck(UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT M, double *kx, + double *ky, double *kz, const finufft_spread_opts &opts); + +template +int indexSort(BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT M, T *kx, + T *ky, T *kz, const finufft_spread_opts &opts) /* This makes a decision whether or not to sort the NU pts (influenced by opts.sort), and if yes, calls either single- or multi-threaded bin sort, writing reordered index list to sort_indices. If decided not to sort, the @@ -1807,12 +1824,19 @@ int indexSort(BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT } return did_sort; } +template int indexSort(BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, + UBIGINT M, float *kx, float *ky, float *kz, + const finufft_spread_opts &opts); +template int indexSort(BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, + UBIGINT M, double *kx, double *ky, double *kz, + const finufft_spread_opts &opts); // -------------------------------------------------------------------------- +template static int spreadSorted(const BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, - FLT *FINUFFT_RESTRICT data_uniform, UBIGINT M, - FLT *FINUFFT_RESTRICT kx, FLT *FINUFFT_RESTRICT ky, - FLT *FINUFFT_RESTRICT kz, const FLT *data_nonuniform, + T *FINUFFT_RESTRICT data_uniform, UBIGINT M, + T *FINUFFT_RESTRICT kx, T *FINUFFT_RESTRICT ky, + T *FINUFFT_RESTRICT kz, const T *data_nonuniform, const finufft_spread_opts &opts, int did_sort) // Spread NU pts in sorted order to a uniform grid. See spreadinterp() for doc. { @@ -1870,7 +1894,7 @@ static int spreadSorted(const BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIG #pragma omp parallel num_threads(nthr) { // local copies of NU pts and data for each subproblem - std::vector kx0{}, ky0{}, kz0{}, dd0{}, du0{}; + std::vector kx0{}, ky0{}, kz0{}, dd0{}, du0{}; #pragma omp for schedule(dynamic, 1) // each is big for (int isub = 0; isub < nb; isub++) { // Main loop through the subproblems const auto M0 = brk[isub + 1] - brk[isub]; // # NU pts in this subproblem @@ -1881,9 +1905,9 @@ static int spreadSorted(const BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIG dd0.resize(2 * M0); // complex strength data for (auto j = 0; j < M0; j++) { // todo: can avoid this copying? 
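// Worked example (illustration only): fold_rescale, used just below to build
// kx0/ky0/kz0, maps any real coordinate into the periodic grid range [0, N):
//   result = (x/(2*pi) + 0.5 - floor(x/(2*pi) + 0.5)) * N
// With N = 100:
//   x = 0    -> (0.5 - 0) * 100 = 50
//   x = -pi  -> (0.0 - 0) * 100 =  0
//   x = 3*pi -> (2.0 - 2) * 100 =  0   (wraps around)
// so the subproblem arrays always hold grid-frame coordinates in [0, N).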
const auto kk = sort_indices[j + brk[isub]]; // NU pt from subprob index list - kx0[j] = fold_rescale(kx[kk], N1); - if (N2 > 1) ky0[j] = fold_rescale(ky[kk], N2); - if (N3 > 1) kz0[j] = fold_rescale(kz[kk], N3); + kx0[j] = fold_rescale(kx[kk], N1); + if (N2 > 1) ky0[j] = fold_rescale(ky[kk], N2); + if (N3 > 1) kz0[j] = fold_rescale(kz[kk], N3); dd0[j * 2] = data_nonuniform[kk * 2]; // real part dd0[j * 2 + 1] = data_nonuniform[kk * 2 + 1]; // imag part } @@ -1915,14 +1939,14 @@ static int spreadSorted(const BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIG // do the adding of subgrid to output if (!(opts.flags & TF_OMIT_WRITE_TO_GRID)) { if (nthr > opts.atomic_threshold) { // see above for debug reporting - add_wrapped_subgrid(offset1, offset2, offset3, padded_size1, size1, - size2, size3, N1, N2, N3, data_uniform, - du0.data()); // R Blackwell's atomic version + add_wrapped_subgrid(offset1, offset2, offset3, padded_size1, size1, + size2, size3, N1, N2, N3, data_uniform, + du0.data()); // R Blackwell's atomic version } else { #pragma omp critical - add_wrapped_subgrid(offset1, offset2, offset3, padded_size1, size1, - size2, size3, N1, N2, N3, data_uniform, - du0.data()); + add_wrapped_subgrid(offset1, offset2, offset3, padded_size1, size1, + size2, size3, N1, N2, N3, data_uniform, + du0.data()); } } } // end main loop over subprobs @@ -1934,20 +1958,20 @@ static int spreadSorted(const BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIG }; // -------------------------------------------------------------------------- -template +template FINUFFT_NEVER_INLINE static int interpSorted_kernel( const BIGINT *sort_indices, const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, - const FLT *data_uniform, const UBIGINT M, FLT *FINUFFT_RESTRICT kx, - FLT *FINUFFT_RESTRICT ky, FLT *FINUFFT_RESTRICT kz, - FLT *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts) + const T *data_uniform, const UBIGINT M, T *FINUFFT_RESTRICT kx, + T *FINUFFT_RESTRICT ky, T *FINUFFT_RESTRICT kz, T *FINUFFT_RESTRICT data_nonuniform, + const finufft_spread_opts &opts) // Interpolate to NU pts in sorted order from a uniform grid. // See spreadinterp() for doc. { - using simd_type = PaddedSIMD; + using simd_type = PaddedSIMD; using arch_t = typename simd_type::arch_type; static constexpr auto alignment = arch_t::alignment(); static constexpr auto simd_size = simd_type::size; - static constexpr auto ns2 = ns * FLT(0.5); // half spread width, used as stencil shift + static constexpr auto ns2 = ns * T(0.5); // half spread width, used as stencil shift CNTime timer{}; const auto ndims = ndims_from_Ns(N1, N2, N3); @@ -1964,10 +1988,10 @@ FINUFFT_NEVER_INLINE static int interpSorted_kernel( { static constexpr auto CHUNKSIZE = simd_size; // number of targets per chunk alignas(alignment) UBIGINT jlist[CHUNKSIZE]; - alignas(alignment) FLT xjlist[CHUNKSIZE], yjlist[CHUNKSIZE], zjlist[CHUNKSIZE]; - alignas(alignment) FLT outbuf[2 * CHUNKSIZE]; + alignas(alignment) T xjlist[CHUNKSIZE], yjlist[CHUNKSIZE], zjlist[CHUNKSIZE]; + alignas(alignment) T outbuf[2 * CHUNKSIZE]; // Kernels: static alloc is faster, so we do it for up to 3D... 
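The nthr > opts.atomic_threshold branch above picks between two ways of accumulating a subproblem's wrapped subgrid into the global output. A rough sketch of the trade-off (assumption: the atomic version updates per element, as in "R Blackwell's atomic version"; index wrapping is omitted here):

    // per-element atomics: scales to many threads, more per-write overhead
    void add_atomic(double *dst, const double *src, long n) {
      for (long i = 0; i < n; ++i) {
    #pragma omp atomic
        dst[i] += src[i];
      }
    }

    // one critical section: cheap for few threads, serializes whole copies
    void add_critical(double *dst, const double *src, long n) {
    #pragma omp critical
      for (long i = 0; i < n; ++i) dst[i] += src[i];
    }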
- alignas(alignment) std::array kernel_values{0}; + alignas(alignment) std::array kernel_values{0}; auto *FINUFFT_RESTRICT ker1 = kernel_values.data(); auto *FINUFFT_RESTRICT ker2 = kernel_values.data() + MAX_NSPREAD; auto *FINUFFT_RESTRICT ker3 = kernel_values.data() + 2 * MAX_NSPREAD; @@ -1982,9 +2006,9 @@ FINUFFT_NEVER_INLINE static int interpSorted_kernel( for (int ibuf = 0; ibuf < bufsize; ibuf++) { UBIGINT j = sort_indices[i + ibuf]; jlist[ibuf] = j; - xjlist[ibuf] = fold_rescale(kx[j], N1); - if (ndims >= 2) yjlist[ibuf] = fold_rescale(ky[j], N2); - if (ndims == 3) zjlist[ibuf] = fold_rescale(kz[j], N3); + xjlist[ibuf] = fold_rescale(kx[j], N1); + if (ndims >= 2) yjlist[ibuf] = fold_rescale(ky[j], N2); + if (ndims == 3) zjlist[ibuf] = fold_rescale(kz[j], N3); } // Loop over targets in chunk @@ -2008,19 +2032,19 @@ FINUFFT_NEVER_INLINE static int interpSorted_kernel( if (!(opts.flags & TF_OMIT_SPREADING)) { switch (ndims) { case 1: - ker_eval(kernel_values.data(), opts, x1); - interp_line(target, data_uniform, ker1, i1, N1); + ker_eval(kernel_values.data(), opts, x1); + interp_line(target, data_uniform, ker1, i1, N1); break; case 2: - ker_eval(kernel_values.data(), opts, x1, x2); - interp_square(target, data_uniform, ker1, ker2, i1, i2, N1, - N2); + ker_eval(kernel_values.data(), opts, x1, x2); + interp_square(target, data_uniform, ker1, ker2, i1, i2, N1, + N2); break; case 3: - ker_eval(kernel_values.data(), opts, x1, x2, - x3); - interp_cube(target, data_uniform, ker1, ker2, ker3, i1, i2, i3, - N1, N2, N3); + ker_eval(kernel_values.data(), opts, x1, x2, + x3); + interp_cube(target, data_uniform, ker1, ker2, ker3, i1, i2, + i3, N1, N2, N3); break; default: // can't get here FINUFFT_UNREACHABLE; @@ -2042,51 +2066,53 @@ FINUFFT_NEVER_INLINE static int interpSorted_kernel( return 0; } -template +template static int interpSorted_dispatch( const BIGINT *sort_indices, const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, - FLT *FINUFFT_RESTRICT data_uniform, const UBIGINT M, FLT *FINUFFT_RESTRICT kx, - FLT *FINUFFT_RESTRICT ky, FLT *FINUFFT_RESTRICT kz, - FLT *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts) { + T *FINUFFT_RESTRICT data_uniform, const UBIGINT M, T *FINUFFT_RESTRICT kx, + T *FINUFFT_RESTRICT ky, T *FINUFFT_RESTRICT kz, T *FINUFFT_RESTRICT data_nonuniform, + const finufft_spread_opts &opts) { static_assert(MIN_NSPREAD <= NS && NS <= MAX_NSPREAD, "NS must be in the range (MIN_NSPREAD, MAX_NSPREAD)"); if constexpr (NS == MIN_NSPREAD) { // Base case if (opts.kerevalmeth) - return interpSorted_kernel( + return interpSorted_kernel( sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, data_nonuniform, opts); else { - return interpSorted_kernel( + return interpSorted_kernel( sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, data_nonuniform, opts); } } else { if (opts.nspread == NS) { if (opts.kerevalmeth) { - return interpSorted_kernel(sort_indices, N1, N2, N3, data_uniform, M, - kx, ky, kz, data_nonuniform, opts); + return interpSorted_kernel(sort_indices, N1, N2, N3, data_uniform, M, + kx, ky, kz, data_nonuniform, opts); } else { - return interpSorted_kernel(sort_indices, N1, N2, N3, data_uniform, M, - kx, ky, kz, data_nonuniform, opts); + return interpSorted_kernel(sort_indices, N1, N2, N3, data_uniform, + M, kx, ky, kz, data_nonuniform, opts); } } else { - return interpSorted_dispatch(sort_indices, N1, N2, N3, data_uniform, M, kx, - ky, kz, data_nonuniform, opts); + return interpSorted_dispatch(sort_indices, N1, N2, N3, data_uniform, M, + kx, ky, kz, 
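interpSorted_dispatch above walks the template parameter NS down from MAX_NSPREAD until it matches the runtime opts.nspread, so the kernel body is compiled with the spread width (and the kerevalmeth flag) as compile-time constants. The idiom in isolation (generic stand-in names):

    #include <cstdio>

    constexpr int MIN_W = 2, MAX_W = 16;

    template<int W> void kernel() { std::printf("compiled for width %d\n", W); }

    template<int W = MAX_W> void dispatch(int w) {
      if constexpr (W == MIN_W) {
        kernel<MIN_W>();          // base case, like NS == MIN_NSPREAD above
      } else {
        if (w == W) kernel<W>();  // runtime value meets compile-time value
        else dispatch<W - 1>(w);  // otherwise recurse one width down
      }
    }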
data_nonuniform, opts); } } } -static int interpSorted( - const BIGINT *sort_indices, const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, - FLT *FINUFFT_RESTRICT data_uniform, const UBIGINT M, FLT *FINUFFT_RESTRICT kx, - FLT *FINUFFT_RESTRICT ky, FLT *FINUFFT_RESTRICT kz, - FLT *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts) { - return interpSorted_dispatch(sort_indices, N1, N2, N3, data_uniform, M, kx, - ky, kz, data_nonuniform, opts); +template +static int interpSorted(const BIGINT *sort_indices, const UBIGINT N1, const UBIGINT N2, + const UBIGINT N3, T *FINUFFT_RESTRICT data_uniform, + const UBIGINT M, T *FINUFFT_RESTRICT kx, T *FINUFFT_RESTRICT ky, + T *FINUFFT_RESTRICT kz, T *FINUFFT_RESTRICT data_nonuniform, + const finufft_spread_opts &opts) { + return interpSorted_dispatch(sort_indices, N1, N2, N3, data_uniform, M, + kx, ky, kz, data_nonuniform, opts); } +template int spreadinterpSorted(const BIGINT *sort_indices, const UBIGINT N1, const UBIGINT N2, - const UBIGINT N3, FLT *data_uniform, const UBIGINT M, - FLT *FINUFFT_RESTRICT kx, FLT *FINUFFT_RESTRICT ky, - FLT *FINUFFT_RESTRICT kz, FLT *FINUFFT_RESTRICT data_nonuniform, + const UBIGINT N3, T *data_uniform, const UBIGINT M, + T *FINUFFT_RESTRICT kx, T *FINUFFT_RESTRICT ky, + T *FINUFFT_RESTRICT kz, T *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts, int did_sort) /* Logic to select the main spreading (dir=1) vs interpolation (dir=2) routine. See spreadinterp() above for inputs arguments and definitions. @@ -2104,10 +2130,23 @@ int spreadinterpSorted(const BIGINT *sort_indices, const UBIGINT N1, const UBIGI return 0; } +template int spreadinterpSorted(const BIGINT *sort_indices, const UBIGINT N1, + const UBIGINT N2, const UBIGINT N3, float *data_uniform, + const UBIGINT M, float *FINUFFT_RESTRICT kx, + float *FINUFFT_RESTRICT ky, float *FINUFFT_RESTRICT kz, + float *FINUFFT_RESTRICT data_nonuniform, + const finufft_spread_opts &opts, int did_sort); +template int spreadinterpSorted(const BIGINT *sort_indices, const UBIGINT N1, + const UBIGINT N2, const UBIGINT N3, double *data_uniform, + const UBIGINT M, double *FINUFFT_RESTRICT kx, + double *FINUFFT_RESTRICT ky, double *FINUFFT_RESTRICT kz, + double *FINUFFT_RESTRICT data_nonuniform, + const finufft_spread_opts &opts, int did_sort); /////////////////////////////////////////////////////////////////////////// -int setup_spreader(finufft_spread_opts &opts, FLT eps, double upsampfac, int kerevalmeth, +template +int setup_spreader(finufft_spread_opts &opts, T eps, double upsampfac, int kerevalmeth, int debug, int showwarn, int dim) /* Initializes spreader kernel parameters given desired NUFFT tolerance eps, upsampling factor (=sigma in paper, or R in Dutt-Rokhlin), ker eval meth @@ -2164,9 +2203,9 @@ int setup_spreader(finufft_spread_opts &opts, FLT eps, double upsampfac, int ker eps = EPSILON; // only changes local copy (not any opts) ier = FINUFFT_WARN_EPS_TOO_SMALL; } - if (upsampfac == 2.0) // standard sigma (see SISC paper) - ns = std::ceil(-log10(eps / (FLT)10.0)); // 1 digit per power of 10 - else // custom sigma + if (upsampfac == 2.0) // standard sigma (see SISC paper) + ns = std::ceil(-log10(eps / (T)10.0)); // 1 digit per power of 10 + else // custom sigma ns = std::ceil(-log(eps) / (PI * sqrt(1.0 - 1.0 / upsampfac))); // formula, gam=1 ns = max(2, ns); // (we don't have ns=1 version yet) if (ns > MAX_NSPREAD) { // clip to fit allocated arrays, Horner rules @@ -2188,7 +2227,7 @@ int setup_spreader(finufft_spread_opts &opts, FLT eps, 
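For reference, the two kernel-width rules quoted in setup_spreader above, evaluated numerically (stand-alone sketch; the clip to MAX_NSPREAD and the Horner-rule caveat are omitted):

    #include <cmath>
    #include <cstdio>

    int main() {
      const double pi = 3.14159265358979323846;
      double eps = 1e-6;
      // standard sigma = 2: about one digit per power of ten (here ns = 7)
      int ns_std = (int)std::ceil(-std::log10(eps / 10.0));
      // custom sigma, e.g. 1.25: wider kernel needed for the same tolerance
      double sigma = 1.25;
      int ns_custom =
          (int)std::ceil(-std::log(eps) / (pi * std::sqrt(1.0 - 1.0 / sigma)));
      std::printf("ns(sigma=2.0)=%d  ns(sigma=1.25)=%d\n", ns_std, ns_custom);
    }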
double upsampfac, int ker if (ns == 3) betaoverns = 2.26; if (ns == 4) betaoverns = 2.38; if (upsampfac != 2.0) { // again, override beta for custom sigma - FLT gamma = 0.97; // must match devel/gen_all_horner_C_code.m ! + T gamma = 0.97; // must match devel/gen_all_horner_C_code.m ! betaoverns = gamma * PI * (1.0 - 1.0 / (2 * upsampfac)); // formula based on cutoff } opts.ES_beta = betaoverns * ns; // set the kernel beta parameter @@ -2198,8 +2237,13 @@ int setup_spreader(finufft_spread_opts &opts, FLT eps, double upsampfac, int ker return ier; } +template int setup_spreader(finufft_spread_opts &opts, float eps, double upsampfac, + int kerevalmeth, int debug, int showwarn, int dim); +template int setup_spreader(finufft_spread_opts &opts, double eps, double upsampfac, + int kerevalmeth, int debug, int showwarn, int dim); -FLT evaluate_kernel(FLT x, const finufft_spread_opts &opts) +template +T evaluate_kernel(T x, const finufft_spread_opts &opts) /* ES ("exp sqrt") kernel evaluation at single real argument: phi(x) = exp(beta.(sqrt(1 - (2x/n_s)^2) - 1)), for |x| < nspread/2 related to an asymptotic approximation to the Kaiser--Bessel, itself an @@ -2208,11 +2252,14 @@ FLT evaluate_kernel(FLT x, const finufft_spread_opts &opts) Rescaled so max is 1, Barnett 7/21/24 */ { - if (abs(x) >= (FLT)opts.ES_halfwidth) + if (abs(x) >= (T)opts.ES_halfwidth) // if spreading/FT careful, shouldn't need this if, but causes no speed hit return 0.0; else - return exp((FLT)opts.ES_beta * (sqrt((FLT)1.0 - (FLT)opts.ES_c * x * x) - (FLT)1.0)); + return exp((T)opts.ES_beta * (sqrt((T)1.0 - (T)opts.ES_c * x * x) - (T)1.0)); } +template float evaluate_kernel(float x, const finufft_spread_opts &opts); +template double evaluate_kernel(double x, const finufft_spread_opts &opts); + } // namespace finufft::spreadinterp From dab8745890b54bca8705b5f9e3e7438ec95540a8 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Tue, 24 Sep 2024 16:14:00 +0200 Subject: [PATCH 04/20] templatize utils.cpp --- CMakeLists.txt | 2 +- include/finufft/utils.h | 71 +++++++++++++++++++++++++++++----- src/utils.cpp | 86 ----------------------------------------- 3 files changed, 63 insertions(+), 96 deletions(-) delete mode 100644 src/utils.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 8446d5500..0bae95ad5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -122,7 +122,7 @@ endif() # This set of sources is compiled twice, once in single precision and once in # double precision The single precision compilation is done with -DSINGLE set(FINUFFT_PRECISION_DEPENDENT_SOURCES src/finufft.cpp src/fft.cpp - src/simpleinterfaces.cpp src/utils.cpp) + src/simpleinterfaces.cpp) # If we're building for Fortran, make sure we also include the translation # layer. 
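A stand-alone sanity check of the ES kernel formula documented in evaluate_kernel above: phi(0) = exp(beta*(sqrt(1)-1)) = 1 after the 7/21/24 rescaling, and phi drops to exp(-beta) at the support edge (parameter values below are invented for illustration):

    #include <cmath>
    #include <cstdio>

    double es_kernel(double x, double beta, double c, double halfwidth) {
      if (std::abs(x) >= halfwidth) return 0.0;
      return std::exp(beta * (std::sqrt(1.0 - c * x * x) - 1.0));
    }

    int main() {
      double ns = 7.0, beta = 2.30 * ns, halfwidth = ns / 2.0;
      double c = 4.0 / (ns * ns); // so that c*x^2 = (2x/ns)^2
      std::printf("phi(0)=%g  phi(edge)=%g\n",
                  es_kernel(0.0, beta, c, halfwidth),
                  es_kernel(halfwidth * (1 - 1e-12), beta, c, halfwidth));
    }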
diff --git a/include/finufft/utils.h b/include/finufft/utils.h index 2758c726e..b4fe64681 100644 --- a/include/finufft/utils.h +++ b/include/finufft/utils.h @@ -10,13 +10,52 @@ namespace finufft { namespace utils { // ahb's low-level array helpers -FINUFFT_EXPORT FLT FINUFFT_CDECL relerrtwonorm(BIGINT n, CPX *a, CPX *b); -FINUFFT_EXPORT FLT FINUFFT_CDECL errtwonorm(BIGINT n, CPX *a, CPX *b); -FINUFFT_EXPORT FLT FINUFFT_CDECL twonorm(BIGINT n, CPX *a); -FINUFFT_EXPORT FLT FINUFFT_CDECL infnorm(BIGINT n, CPX *a); -FINUFFT_EXPORT void FINUFFT_CDECL arrayrange(BIGINT n, FLT *a, FLT *lo, FLT *hi); template -void arrayrange(BIGINT n, T *a, T *lo, T *hi) +FINUFFT_EXPORT T FINUFFT_CDECL relerrtwonorm(BIGINT n, std::complex *a, + std::complex *b) +// ||a-b||_2 / ||a||_2 +{ + T err = 0.0, nrm = 0.0; + for (BIGINT m = 0; m < n; ++m) { + nrm += real(conj(a[m]) * a[m]); + std::complex diff = a[m] - b[m]; + err += real(conj(diff) * diff); + } + return sqrt(err / nrm); +} +template +FINUFFT_EXPORT T FINUFFT_CDECL errtwonorm(BIGINT n, std::complex *a, + std::complex *b) +// ||a-b||_2 +{ + T err = 0.0; // compute error 2-norm + for (BIGINT m = 0; m < n; ++m) { + std::complex diff = a[m] - b[m]; + err += real(conj(diff) * diff); + } + return sqrt(err); +} +template +FINUFFT_EXPORT T FINUFFT_CDECL twonorm(BIGINT n, std::complex *a) +// ||a||_2 +{ + T nrm = 0.0; + for (BIGINT m = 0; m < n; ++m) nrm += real(conj(a[m]) * a[m]); + return sqrt(nrm); +} +template +FINUFFT_EXPORT T FINUFFT_CDECL infnorm(BIGINT n, std::complex *a) +// ||a||_infty +{ + T nrm = 0.0; + for (BIGINT m = 0; m < n; ++m) { + T aa = real(conj(a[m]) * a[m]); + if (aa > nrm) nrm = aa; + } + return sqrt(nrm); +} +template +FINUFFT_EXPORT void FINUFFT_CDECL arrayrange(BIGINT n, T *a, T *lo, T *hi) // With a a length-n array, writes out min(a) to lo and max(a) to hi, // so that all a values lie in [lo,hi]. // If n==0, lo and hi are not finite. @@ -28,9 +67,23 @@ void arrayrange(BIGINT n, T *a, T *lo, T *hi) if (a[m] > *hi) *hi = a[m]; } } -FINUFFT_EXPORT void FINUFFT_CDECL indexedarrayrange(BIGINT n, BIGINT *i, FLT *a, FLT *lo, - FLT *hi); -FINUFFT_EXPORT void FINUFFT_CDECL arraywidcen(BIGINT n, FLT *a, FLT *w, FLT *c); +template +FINUFFT_EXPORT void FINUFFT_CDECL arraywidcen(BIGINT n, T *a, T *w, T *c) +// Writes out w = half-width and c = center of an interval enclosing all a[n]'s +// Only chooses a nonzero center if this increases w by less than fraction +// ARRAYWIDCEN_GROWFRAC defined in defs.h. +// This prevents rephasings which don't grow nf by much. 6/8/17 +// If n==0, w and c are not finite. +{ + T lo, hi; + arrayrange(n, a, &lo, &hi); + *w = (hi - lo) / 2; + *c = (hi + lo) / 2; + if (std::abs(*c) < ARRAYWIDCEN_GROWFRAC * (*w)) { + *w += std::abs(*c); + *c = 0.0; + } +} } // namespace utils } // namespace finufft diff --git a/src/utils.cpp b/src/utils.cpp deleted file mode 100644 index 8df6ed665..000000000 --- a/src/utils.cpp +++ /dev/null @@ -1,86 +0,0 @@ -// Low-level array manipulations, timer, and OMP helpers, that need separate -// single/double routines (FLT must be an arg). Others are in utils_precindep - -// For self-test see ../test/testutils.cpp Barnett 2017-2020. 
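With the change above these helpers become header-only templates, so callers only need the include. Possible usage (sketch; assumes the finufft include path and the BIGINT/FINUFFT_EXPORT definitions the header relies on are visible, as in the library's own tests):

    #include <complex>
    #include <cstdio>
    #include <vector>
    #include "finufft/utils.h"

    int main() {
      using C = std::complex<double>;
      std::vector<C> a{{1, 0}, {0, 1}};    // reference vector
      std::vector<C> b{{1, 1e-9}, {0, 1}}; // perturbed copy
      double rel = finufft::utils::relerrtwonorm(2, a.data(), b.data());
      std::printf("relative 2-norm error = %g\n", rel); // ~7e-10
    }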
- -#include "finufft/utils.h" -#include "finufft/defs.h" - -namespace finufft { -namespace utils { - -// ------------ complex array utils --------------------------------- - -FLT relerrtwonorm(BIGINT n, CPX *a, CPX *b) -// ||a-b||_2 / ||a||_2 -{ - FLT err = 0.0, nrm = 0.0; - for (BIGINT m = 0; m < n; ++m) { - nrm += real(conj(a[m]) * a[m]); - CPX diff = a[m] - b[m]; - err += real(conj(diff) * diff); - } - return sqrt(err / nrm); -} -FLT errtwonorm(BIGINT n, CPX *a, CPX *b) -// ||a-b||_2 -{ - FLT err = 0.0; // compute error 2-norm - for (BIGINT m = 0; m < n; ++m) { - CPX diff = a[m] - b[m]; - err += real(conj(diff) * diff); - } - return sqrt(err); -} -FLT twonorm(BIGINT n, CPX *a) -// ||a||_2 -{ - FLT nrm = 0.0; - for (BIGINT m = 0; m < n; ++m) nrm += real(conj(a[m]) * a[m]); - return sqrt(nrm); -} -FLT infnorm(BIGINT n, CPX *a) -// ||a||_infty -{ - FLT nrm = 0.0; - for (BIGINT m = 0; m < n; ++m) { - FLT aa = real(conj(a[m]) * a[m]); - if (aa > nrm) nrm = aa; - } - return sqrt(nrm); -} - -// ------------ real array utils --------------------------------- - -void arrayrange(BIGINT n, FLT *a, FLT *lo, FLT *hi) -// With a a length-n array, writes out min(a) to lo and max(a) to hi, -// so that all a values lie in [lo,hi]. -// If n==0, lo and hi are not finite. -{ - *lo = INFINITY; - *hi = -INFINITY; - for (BIGINT m = 0; m < n; ++m) { - if (a[m] < *lo) *lo = a[m]; - if (a[m] > *hi) *hi = a[m]; - } -} - -void arraywidcen(BIGINT n, FLT *a, FLT *w, FLT *c) -// Writes out w = half-width and c = center of an interval enclosing all a[n]'s -// Only chooses a nonzero center if this increases w by less than fraction -// ARRAYWIDCEN_GROWFRAC defined in defs.h. -// This prevents rephasings which don't grow nf by much. 6/8/17 -// If n==0, w and c are not finite. -{ - FLT lo, hi; - arrayrange(n, a, &lo, &hi); - *w = (hi - lo) / 2; - *c = (hi + lo) / 2; - if (std::abs(*c) < ARRAYWIDCEN_GROWFRAC * (*w)) { - *w += std::abs(*c); - *c = 0.0; - } -} - -} // namespace utils -} // namespace finufft From f6fab70362fed12a84d53c4ff70312fd21faefad Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Tue, 24 Sep 2024 16:26:08 +0200 Subject: [PATCH 05/20] adjust makefile --- makefile | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/makefile b/makefile index 4a91506db..85a6a9a78 100644 --- a/makefile +++ b/makefile @@ -135,11 +135,11 @@ ABSDYNLIB = $(FINUFFT)$(DYNLIB) # spreader is subset of the library with self-contained testing, hence own objs: # double-prec spreader object files that also need single precision... -SOBJS = src/spreadinterp.o src/utils.o +SOBJS = # their single-prec versions SOBJSF = $(SOBJS:%.o=%_32.o) # precision-dependent spreader object files (compiled & linked only once)... -SOBJS_PI = src/utils_precindep.o +SOBJS_PI = src/utils_precindep.o src/spreadinterp.o # spreader dual-precision objs SOBJSD = $(SOBJS) $(SOBJSF) $(SOBJS_PI) @@ -209,7 +209,6 @@ HEADERS = $(wildcard include/*.h include/finufft/*.h) $(DUCC_HEADERS) include/finufft/fft.h: $(DUCC_SETUP) SHEAD = $(wildcard src/*.h) $(XSIMD_DIR)/include/xsimd/xsimd.hpp src/spreadinterp.o: $(SHEAD) -src/spreadinterp_32.o: $(SHEAD) # lib ----------------------------------------------------------------------- @@ -277,10 +276,10 @@ test/%: test/%.cpp $(DYNLIB) test/%f: test/%.cpp $(DYNLIB) $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(ABSDYNLIB) $(LIBSFFT) -o $@ # low-level tests that are cleaner if depend on only specific objects... 
-test/testutils: test/testutils.cpp src/utils.o src/utils_precindep.o - $(CXX) $(CXXFLAGS) ${LDFLAGS} test/testutils.cpp src/utils.o src/utils_precindep.o $(LIBS) -o test/testutils -test/testutilsf: test/testutils.cpp src/utils_32.o src/utils_precindep.o - $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE test/testutils.cpp src/utils_32.o src/utils_precindep.o $(LIBS) -o test/testutilsf +test/testutils: test/testutils.cpp src/utils_precindep.o + $(CXX) $(CXXFLAGS) ${LDFLAGS} test/testutils.cpp src/utils_precindep.o $(LIBS) -o test/testutils +test/testutilsf: test/testutils.cpp src/utils_precindep.o + $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE test/testutils.cpp src/utils_precindep.o $(LIBS) -o test/testutilsf # make sure all double-prec test executables ready for testing TESTS := $(basename $(wildcard test/*.cpp)) From 3b120a76fd4cd951c94612fd6d3cd12a8910b969 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Tue, 24 Sep 2024 16:36:14 +0200 Subject: [PATCH 06/20] fix inconsistent prototype --- include/finufft/spreadinterp.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/finufft/spreadinterp.h b/include/finufft/spreadinterp.h index 6851b14d7..f17d05651 100644 --- a/include/finufft/spreadinterp.h +++ b/include/finufft/spreadinterp.h @@ -45,9 +45,9 @@ FINUFFT_EXPORT int FINUFFT_CDECL indexSort(BIGINT *sort_indices, UBIGINT N1, UBI const finufft_spread_opts &opts); template FINUFFT_EXPORT int FINUFFT_CDECL spreadinterpSorted( - const BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, - T *FINUFFT_RESTRICT data_uniform, UBIGINT N, T *FINUFFT_RESTRICT kx, - T *FINUFFT_RESTRICT ky, T *FINUFFT_RESTRICT kz, T *FINUFFT_RESTRICT data_nonuniform, + const BIGINT *sort_indices, const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, + T *data_uniform, const UBIGINT M, T *FINUFFT_RESTRICT kx, T *FINUFFT_RESTRICT ky, + T *FINUFFT_RESTRICT kz, T *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts, int did_sort); template FINUFFT_EXPORT T FINUFFT_CDECL evaluate_kernel(T x, const finufft_spread_opts &opts); From 0faa04982674507decc59c35d8345a2e64dcd4ae Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Tue, 24 Sep 2024 16:54:45 +0200 Subject: [PATCH 07/20] another attempt --- include/finufft/spreadinterp.h | 2 +- src/spreadinterp.cpp | 65 ++++++++++++++++++---------------- 2 files changed, 35 insertions(+), 32 deletions(-) diff --git a/include/finufft/spreadinterp.h b/include/finufft/spreadinterp.h index f17d05651..56d705563 100644 --- a/include/finufft/spreadinterp.h +++ b/include/finufft/spreadinterp.h @@ -33,7 +33,7 @@ namespace spreadinterp { // things external (spreadinterp) interface needs... 
template FINUFFT_EXPORT int FINUFFT_CDECL spreadinterp( - UBIGINT N1, UBIGINT N2, UBIGINT N3, T *data_uniform, UBIGINT N, T *kx, T *ky, T *kz, + UBIGINT N1, UBIGINT N2, UBIGINT N3, T *data_uniform, UBIGINT M, T *kx, T *ky, T *kz, T *data_nonuniform, const finufft_spread_opts &opts); template FINUFFT_EXPORT int FINUFFT_CDECL spreadcheck(UBIGINT N1, UBIGINT N2, UBIGINT N3, diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index c98a5801a..5117ea8fb 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -1621,8 +1621,9 @@ static void get_subgrid(BIGINT &offset1, BIGINT &offset2, BIGINT &offset3, // ========================================================================== template -int spreadinterp(UBIGINT N1, UBIGINT N2, UBIGINT N3, T *data_uniform, UBIGINT M, T *kx, - T *ky, T *kz, T *data_nonuniform, const finufft_spread_opts &opts) +FINUFFT_EXPORT int FINUFFT_CDECL spreadinterp( + UBIGINT N1, UBIGINT N2, UBIGINT N3, T *data_uniform, UBIGINT M, T *kx, T *ky, T *kz, + T *data_nonuniform, const finufft_spread_opts &opts) /* ------------Spreader/interpolator for 1, 2, or 3 dimensions -------------- If opts.spread_direction=1, evaluate, in the 1D case, @@ -1702,12 +1703,12 @@ int spreadinterp(UBIGINT N1, UBIGINT N2, UBIGINT N3, T *data_uniform, UBIGINT M, return 0; } -template int spreadinterp(UBIGINT N1, UBIGINT N2, UBIGINT N3, float *data_uniform, - UBIGINT M, float *kx, float *ky, float *kz, - float *data_nonuniform, const finufft_spread_opts &opts); -template int spreadinterp(UBIGINT N1, UBIGINT N2, UBIGINT N3, double *data_uniform, - UBIGINT M, double *kx, double *ky, double *kz, - double *data_nonuniform, const finufft_spread_opts &opts); +template FINUFFT_EXPORT int FINUFFT_CDECL spreadinterp( + UBIGINT N1, UBIGINT N2, UBIGINT N3, float *data_uniform, UBIGINT M, float *kx, + float *ky, float *kz, float *data_nonuniform, const finufft_spread_opts &opts); +template FINUFFT_EXPORT int FINUFFT_CDECL spreadinterp( + UBIGINT N1, UBIGINT N2, UBIGINT N3, double *data_uniform, UBIGINT M, double *kx, + double *ky, double *kz, double *data_nonuniform, const finufft_spread_opts &opts); static constexpr uint8_t ndims_from_Ns(const UBIGINT N1, const UBIGINT N2, const UBIGINT N3) @@ -1742,10 +1743,11 @@ int spreadcheck(UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT M, T *kx, T *ky, T * } return 0; } -template int spreadcheck(UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT M, float *kx, - float *ky, float *kz, const finufft_spread_opts &opts); -template int spreadcheck(UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT M, double *kx, - double *ky, double *kz, const finufft_spread_opts &opts); +template int spreadcheck(UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT M, float *kx, + float *ky, float *kz, const finufft_spread_opts &opts); +template int spreadcheck(UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT M, + double *kx, double *ky, double *kz, + const finufft_spread_opts &opts); template int indexSort(BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT M, T *kx, @@ -1824,12 +1826,12 @@ int indexSort(BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT } return did_sort; } -template int indexSort(BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, - UBIGINT M, float *kx, float *ky, float *kz, - const finufft_spread_opts &opts); -template int indexSort(BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, - UBIGINT M, double *kx, double *ky, double *kz, - const finufft_spread_opts &opts); +template int indexSort(BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, 
+ UBIGINT M, float *kx, float *ky, float *kz, + const finufft_spread_opts &opts); +template int indexSort(BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, + UBIGINT M, double *kx, double *ky, double *kz, + const finufft_spread_opts &opts); // -------------------------------------------------------------------------- template @@ -2130,18 +2132,18 @@ int spreadinterpSorted(const BIGINT *sort_indices, const UBIGINT N1, const UBIGI return 0; } -template int spreadinterpSorted(const BIGINT *sort_indices, const UBIGINT N1, +template int spreadinterpSorted( const BIGINT *sort_indices, const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, float *data_uniform, const UBIGINT M, float *FINUFFT_RESTRICT kx, float *FINUFFT_RESTRICT ky, float *FINUFFT_RESTRICT kz, float *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts, int did_sort); -template int spreadinterpSorted(const BIGINT *sort_indices, const UBIGINT N1, - const UBIGINT N2, const UBIGINT N3, double *data_uniform, - const UBIGINT M, double *FINUFFT_RESTRICT kx, - double *FINUFFT_RESTRICT ky, double *FINUFFT_RESTRICT kz, - double *FINUFFT_RESTRICT data_nonuniform, - const finufft_spread_opts &opts, int did_sort); +template int spreadinterpSorted( + const BIGINT *sort_indices, const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, + double *data_uniform, const UBIGINT M, double *FINUFFT_RESTRICT kx, + double *FINUFFT_RESTRICT ky, double *FINUFFT_RESTRICT kz, + double *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts, + int did_sort); /////////////////////////////////////////////////////////////////////////// @@ -2237,10 +2239,11 @@ int setup_spreader(finufft_spread_opts &opts, T eps, double upsampfac, int kerev return ier; } -template int setup_spreader(finufft_spread_opts &opts, float eps, double upsampfac, - int kerevalmeth, int debug, int showwarn, int dim); -template int setup_spreader(finufft_spread_opts &opts, double eps, double upsampfac, - int kerevalmeth, int debug, int showwarn, int dim); +template int setup_spreader(finufft_spread_opts &opts, float eps, double upsampfac, + int kerevalmeth, int debug, int showwarn, int dim); +template int setup_spreader(finufft_spread_opts &opts, double eps, + double upsampfac, int kerevalmeth, int debug, + int showwarn, int dim); template T evaluate_kernel(T x, const finufft_spread_opts &opts) @@ -2259,7 +2262,7 @@ T evaluate_kernel(T x, const finufft_spread_opts &opts) return exp((T)opts.ES_beta * (sqrt((T)1.0 - (T)opts.ES_c * x * x) - (T)1.0)); } -template float evaluate_kernel(float x, const finufft_spread_opts &opts); -template double evaluate_kernel(double x, const finufft_spread_opts &opts); +template float evaluate_kernel(float x, const finufft_spread_opts &opts); +template double evaluate_kernel(double x, const finufft_spread_opts &opts); } // namespace finufft::spreadinterp From d136d81ed6e30ac9b0c4a593cb3da08811e53fbe Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Tue, 24 Sep 2024 16:59:11 +0200 Subject: [PATCH 08/20] another attempt --- src/spreadinterp.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 5117ea8fb..94a5822ab 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -2132,12 +2132,12 @@ int spreadinterpSorted(const BIGINT *sort_indices, const UBIGINT N1, const UBIGI return 0; } -template int spreadinterpSorted( const BIGINT *sort_indices, const UBIGINT N1, - const UBIGINT N2, const UBIGINT N3, float *data_uniform, - const UBIGINT M, float 
*FINUFFT_RESTRICT kx, - float *FINUFFT_RESTRICT ky, float *FINUFFT_RESTRICT kz, - float *FINUFFT_RESTRICT data_nonuniform, - const finufft_spread_opts &opts, int did_sort); +template int spreadinterpSorted( + const BIGINT *sort_indices, const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, + float *data_uniform, const UBIGINT M, float *FINUFFT_RESTRICT kx, + float *FINUFFT_RESTRICT ky, float *FINUFFT_RESTRICT kz, + float *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts, + int did_sort); template int spreadinterpSorted( const BIGINT *sort_indices, const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, double *data_uniform, const UBIGINT M, double *FINUFFT_RESTRICT kx, From efc3592aa6fc5ab0e9a6e123ffebbdce63db62cc Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Tue, 24 Sep 2024 17:07:36 +0200 Subject: [PATCH 09/20] another attempt --- src/spreadinterp.cpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 94a5822ab..b2b5ea8b0 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -2148,8 +2148,9 @@ template int spreadinterpSorted( /////////////////////////////////////////////////////////////////////////// template -int setup_spreader(finufft_spread_opts &opts, T eps, double upsampfac, int kerevalmeth, - int debug, int showwarn, int dim) +FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(finufft_spread_opts &opts, T eps, + double upsampfac, int kerevalmeth, + int debug, int showwarn, int dim) /* Initializes spreader kernel parameters given desired NUFFT tolerance eps, upsampling factor (=sigma in paper, or R in Dutt-Rokhlin), ker eval meth (either 0:exp(sqrt()), 1: Horner ppval), and some debug-level flags. @@ -2239,11 +2240,12 @@ int setup_spreader(finufft_spread_opts &opts, T eps, double upsampfac, int kerev return ier; } -template int setup_spreader(finufft_spread_opts &opts, float eps, double upsampfac, - int kerevalmeth, int debug, int showwarn, int dim); -template int setup_spreader(finufft_spread_opts &opts, double eps, - double upsampfac, int kerevalmeth, int debug, - int showwarn, int dim); +template FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader( + finufft_spread_opts &opts, float eps, double upsampfac, int kerevalmeth, int debug, + int showwarn, int dim); +template FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader( + finufft_spread_opts &opts, double eps, double upsampfac, int kerevalmeth, int debug, + int showwarn, int dim); template T evaluate_kernel(T x, const finufft_spread_opts &opts) From bbdaa7014b47195086127c604029031bbc734ff8 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Wed, 25 Sep 2024 10:15:11 +0200 Subject: [PATCH 10/20] more templatizing --- include/finufft/defs.h | 2 +- src/finufft.cpp | 144 ++++++++++++++++++++++------------------- 2 files changed, 78 insertions(+), 68 deletions(-) diff --git a/include/finufft/defs.h b/include/finufft/defs.h index 42e5e7ff8..4265ced01 100644 --- a/include/finufft/defs.h +++ b/include/finufft/defs.h @@ -279,7 +279,7 @@ struct FINUFFT_PLAN_S { // the main plan object, fully C++ type3params t3P; // groups together type 3 shift, scale, phase, parameters FINUFFT_PLAN innerT2plan; // ptr used for type 2 in step 2 of type 3 - // other internal structs; each is C-compatible of course + // other internal structs std::unique_ptr> fftPlan; finufft_opts opts; // this and spopts could be made ptrs finufft_spread_opts spopts; diff --git a/src/finufft.cpp b/src/finufft.cpp index f80fc0bf3..96e986a5d 100644 --- a/src/finufft.cpp +++ 
b/src/finufft.cpp @@ -105,8 +105,9 @@ static int set_nf_type12(BIGINT ms, finufft_opts opts, finufft_spread_opts spopt } } -int setup_spreader_for_nufft(finufft_spread_opts &spopts, FLT eps, finufft_opts opts, - int dim) +template +static int setup_spreader_for_nufft(finufft_spread_opts &spopts, T eps, finufft_opts opts, + int dim) // Set up the spreader parameters given eps, and pass across various nufft // options. Return status of setup_spreader. Uses pass-by-ref. Barnett 10/30/17 { @@ -130,8 +131,9 @@ int setup_spreader_for_nufft(finufft_spread_opts &spopts, FLT eps, finufft_opts return ier; } -void set_nhg_type3(FLT S, FLT X, finufft_opts opts, finufft_spread_opts spopts, - BIGINT *nf, FLT *h, FLT *gam) +template +static void set_nhg_type3(T S, T X, finufft_opts opts, finufft_spread_opts spopts, + BIGINT *nf, T *h, T *gam) /* sets nf, h (upsampled grid spacing), and gamma (x_j rescaling factor), for type 3 only. Inputs: @@ -145,9 +147,9 @@ void set_nhg_type3(FLT S, FLT X, finufft_opts opts, finufft_spread_opts spopts, New logic 6/12/17 */ { - int nss = spopts.nspread + 1; // since ns may be odd - FLT Xsafe = X, Ssafe = S; // may be tweaked locally - if (X == 0.0) // logic ensures XS>=1, handle X=0 a/o S=0 + int nss = spopts.nspread + 1; // since ns may be odd + T Xsafe = X, Ssafe = S; // may be tweaked locally + if (X == 0.0) // logic ensures XS>=1, handle X=0 a/o S=0 if (S == 0.0) { Xsafe = 1.0; Ssafe = 1.0; @@ -156,19 +158,20 @@ void set_nhg_type3(FLT S, FLT X, finufft_opts opts, finufft_spread_opts spopts, else Ssafe = max(Ssafe, 1 / X); // use the safe X and S... - auto nfd = FLT(2.0 * opts.upsampfac * Ssafe * Xsafe / PI + nss); - if (!isfinite(nfd)) nfd = 0.0; // use FLT to catch inf + auto nfd = T(2.0 * opts.upsampfac * Ssafe * Xsafe / PI + nss); + if (!isfinite(nfd)) nfd = 0.0; // use T to catch inf *nf = (BIGINT)nfd; // printf("initial nf=%lld, ns=%d\n",*nf,spopts.nspread); // catch too small nf, and nan or +-inf, otherwise spread fails... if (*nf < 2 * spopts.nspread) *nf = 2 * spopts.nspread; - if (*nf < MAX_NF) // otherwise will fail anyway - *nf = next235even(*nf); // expensive at huge nf - *h = FLT(2.0 * PI / *nf); // upsampled grid spacing - *gam = FLT(*nf / (2.0 * opts.upsampfac * Ssafe)); // x scale fac to x' + if (*nf < MAX_NF) // otherwise will fail anyway + *nf = next235even(*nf); // expensive at huge nf + *h = T(2.0 * PI / *nf); // upsampled grid spacing + *gam = T(*nf / (2.0 * opts.upsampfac * Ssafe)); // x scale fac to x' } -void onedim_fseries_kernel(BIGINT nf, FLT *fwkerhalf, finufft_spread_opts opts) +template +static void onedim_fseries_kernel(BIGINT nf, T *fwkerhalf, finufft_spread_opts opts) /* Approximates exact Fourier series coeffs of cnufftspread's real symmetric kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting @@ -187,7 +190,7 @@ void onedim_fseries_kernel(BIGINT nf, FLT *fwkerhalf, finufft_spread_opts opts) Outputs: fwkerhalf - real Fourier series coeffs from indices 0 to nf/2 inclusive, divided by h = 2pi/n. - (should be allocated for at least nf/2+1 FLTs) + (should be allocated for at least nf/2+1 Ts) Compare onedim_dct_kernel which has same interface, but computes DFT of sampled kernel, not quite the same object. @@ -196,17 +199,17 @@ void onedim_fseries_kernel(BIGINT nf, FLT *fwkerhalf, finufft_spread_opts opts) Fixed num_threads 7/20/20. Reduced rounding error in a[n] calc 8/20/24. 
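A worked numeric instance of the type-3 sizing logic in set_nhg_type3 above (values invented; the next235even rounding and the MAX_NF guard are skipped):

    #include <cstdio>

    int main() {
      const double pi = 3.14159265358979323846;
      double X = 10.0, S = 50.0, upsampfac = 2.0; // src half-width, max target freq
      int nspread = 7, nss = nspread + 1;         // ns may be odd, hence ns+1
      double nfd = 2.0 * upsampfac * S * X / pi + nss; // ~644.6 here
      long nf = (long)nfd;                        // then rounded to 235-smooth even
      double h = 2.0 * pi / nf;                   // upsampled grid spacing
      double gam = nf / (2.0 * upsampfac * S);    // x -> x' rescale factor
      std::printf("nf>=%ld  h=%g  gam=%g\n", nf, h, gam);
    }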
*/ { - FLT J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support + T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support // # quadr nodes in z (from 0 to J/2; reflections will be added)... int q = (int)(2 + 3.0 * J2); // not sure why so large? cannot exceed MAX_NQUAD - FLT f[MAX_NQUAD]; + T f[MAX_NQUAD]; double z[2 * MAX_NQUAD], w[2 * MAX_NQUAD]; legendre_compute_glr(2 * q, z, w); // only half the nodes used, eg on (0,1) - CPX a[MAX_NQUAD]; + std::complex a[MAX_NQUAD]; for (int n = 0; n < q; ++n) { // set up nodes z_n and vals f_n z[n] *= J2; // rescale nodes - f[n] = J2 * (FLT)w[n] * evaluate_kernel((FLT)z[n], opts); // vals & quadr wei - a[n] = -exp(2 * PI * IMA * (FLT)z[n] / (FLT)nf); // phase winding rates + f[n] = J2 * (T)w[n] * evaluate_kernel((T)z[n], opts); // vals & quadr wei + a[n] = -exp(2 * PI * IMA * (T)z[n] / (T)nf); // phase winding rates } BIGINT nout = nf / 2 + 1; // how many values we're writing to int nt = min(nout, (BIGINT)opts.nthreads); // how many chunks @@ -216,11 +219,11 @@ void onedim_fseries_kernel(BIGINT nf, FLT *fwkerhalf, finufft_spread_opts opts) #pragma omp parallel num_threads(nt) { // each thread gets own chunk to do int t = MY_OMP_GET_THREAD_NUM(); - CPX aj[MAX_NQUAD]; // phase rotator for this thread + std::complex aj[MAX_NQUAD]; // phase rotator for this thread for (int n = 0; n < q; ++n) - aj[n] = pow(a[n], (FLT)brk[t]); // init phase factors for chunk + aj[n] = pow(a[n], (T)brk[t]); // init phase factors for chunk for (BIGINT j = brk[t]; j < brk[t + 1]; ++j) { // loop along output array - FLT x = 0.0; // accumulator for answer at this j + T x = 0.0; // accumulator for answer at this j for (int n = 0; n < q; ++n) { x += f[n] * 2 * real(aj[n]); // include the negative freq aj[n] *= a[n]; // wind the phases @@ -230,7 +233,8 @@ void onedim_fseries_kernel(BIGINT nf, FLT *fwkerhalf, finufft_spread_opts opts) } } -void onedim_nuft_kernel(BIGINT nk, FLT *k, FLT *phihat, finufft_spread_opts opts) +template +static void onedim_nuft_kernel(BIGINT nk, T *k, T *phihat, finufft_spread_opts opts) /* Approximates exact 1D Fourier transform of cnufftspread's real symmetric kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting @@ -246,33 +250,34 @@ void onedim_nuft_kernel(BIGINT nk, FLT *k, FLT *phihat, finufft_spread_opts opts opts - spreading opts object, needed to eval kernel (must be already set up) Outputs: - phihat - real Fourier transform evaluated at freqs (alloc for nk FLTs) + phihat - real Fourier transform evaluated at freqs (alloc for nk Ts) Barnett 2/8/17. openmp since cos slow 2/9/17 */ { - FLT J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support + T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support // # quadr nodes in z (from 0 to J/2; reflections will be added)... int q = (int)(2 + 2.0 * J2); // > pi/2 ratio. 
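The a[n] / aj[n] arrays above implement a phase-winding recurrence: to evaluate x_j = sum_n f_n * 2*Re(a_n^j) over consecutive j, each thread keeps a running power aj[n] and multiplies by a[n] per step, instead of calling exp or cos for every (n, j) pair. The recurrence in isolation (sketch with made-up phase rates):

    #include <complex>
    #include <cstdio>

    int main() {
      const int q = 3, nout = 5;
      double f[q] = {0.5, 0.3, 0.2};
      std::complex<double> a[q], aj[q];
      for (int n = 0; n < q; ++n) {
        a[n] = std::polar(1.0, 0.1 * (n + 1)); // stand-in winding rates
        aj[n] = 1.0;                           // a[n]^0, start of the chunk
      }
      for (int j = 0; j < nout; ++j) {
        double x = 0.0;
        for (int n = 0; n < q; ++n) {
          x += f[n] * 2.0 * std::real(aj[n]); // includes the negative freq
          aj[n] *= a[n];                      // wind: now holds a[n]^(j+1)
        }
        std::printf("x[%d] = %g\n", j, x);
      }
    }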
cannot exceed MAX_NQUAD if (opts.debug) printf("q (# ker FT quadr pts) = %d\n", q); - FLT f[MAX_NQUAD]; + T f[MAX_NQUAD]; double z[2 * MAX_NQUAD], w[2 * MAX_NQUAD]; // glr needs double legendre_compute_glr(2 * q, z, w); // only half the nodes used, eg on (0,1) for (int n = 0; n < q; ++n) { - z[n] *= (FLT)J2; // quadr nodes for [0,J/2] - f[n] = J2 * (FLT)w[n] * evaluate_kernel((FLT)z[n], opts); // w/ quadr weights + z[n] *= (T)J2; // quadr nodes for [0,J/2] + f[n] = J2 * (T)w[n] * evaluate_kernel((T)z[n], opts); // w/ quadr weights } #pragma omp parallel for num_threads(opts.nthreads) - for (BIGINT j = 0; j < nk; ++j) { // loop along output array - FLT x = 0.0; // register + for (BIGINT j = 0; j < nk; ++j) { // loop along output array + T x = 0.0; // register for (int n = 0; n < q; ++n) - x += f[n] * 2 * cos(k[j] * (FLT)z[n]); // pos & neg freq pair. use FLT cos! + x += f[n] * 2 * cos(k[j] * (T)z[n]); // pos & neg freq pair. use T cos! phihat[j] = x; } } -void deconvolveshuffle1d(int dir, FLT prefac, FLT *ker, BIGINT ms, FLT *fk, BIGINT nf1, - CPX *fw, int modeord) +template +static void deconvolveshuffle1d(int dir, T prefac, T *ker, BIGINT ms, T *fk, BIGINT nf1, + std::complex *fw, int modeord) /* if dir==1: copies fw to fk with amplification by prefac/ker if dir==2: copies fk to fw (and zero pads rest of it), same amplification. @@ -280,9 +285,9 @@ void deconvolveshuffle1d(int dir, FLT prefac, FLT *ker, BIGINT ms, FLT *fk, BIGI modeord=0: use CMCL-compatible mode ordering in fk (from -N/2 up to N/2-1) 1: use FFT-style (from 0 to N/2-1, then -N/2 up to -1). - fk is a size-ms FLT complex array (2*ms FLTs alternating re,im parts) - fw is a size-nf1 complex array (2*nf1 FLTs alternating re,im parts) - ker is real-valued FLT array of length nf1/2+1. + fk is a size-ms T complex array (2*ms Ts alternating re,im parts) + fw is a size-nf1 complex array (2*nf1 Ts alternating re,im parts) + ker is real-valued T array of length nf1/2+1. Single thread only, but shouldn't matter since mostly data movement. @@ -329,8 +334,10 @@ void deconvolveshuffle1d(int dir, FLT prefac, FLT *ker, BIGINT ms, FLT *fk, BIGI } } -void deconvolveshuffle2d(int dir, FLT prefac, FLT *ker1, FLT *ker2, BIGINT ms, BIGINT mt, - FLT *fk, BIGINT nf1, BIGINT nf2, CPX *fw, int modeord) +template +static void deconvolveshuffle2d(int dir, T prefac, T *ker1, T *ker2, BIGINT ms, BIGINT mt, + T *fk, BIGINT nf1, BIGINT nf2, std::complex *fw, + int modeord) /* 2D version of deconvolveshuffle1d, calls it on each x-line using 1/ker2 fac. @@ -340,11 +347,11 @@ void deconvolveshuffle2d(int dir, FLT prefac, FLT *ker1, FLT *ker2, BIGINT ms, B modeord=0: use CMCL-compatible mode ordering in fk (each dim increasing) 1: use FFT-style (pos then negative, on each dim) - fk is a complex array stored as 2*ms*mt FLTs alternating re,im parts, with + fk is a complex array stored as 2*ms*mt Ts alternating re,im parts, with ms looped over fast and mt slow. - fw is a complex array stored as 2*nf1*nf2] FLTs alternating re,im parts, with + fw is a complex array stored as 2*nf1*nf2] Ts alternating re,im parts, with nf1 looped over fast and nf2 slow. - ker1, ker2 are real-valued FLT arrays of lengths nf1/2+1, nf2/2+1 + ker1, ker2 are real-valued T arrays of lengths nf1/2+1, nf2/2+1 respectively. Barnett 2/1/17, Fixed mt=0 case 3/14/17. 
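In other words, onedim_nuft_kernel approximates the kernel's Fourier transform as phihat(k) ~= sum_n J2*w_n*phi(z_n) * 2*cos(k*z_n), with Gauss-Legendre nodes z_n on (0, J/2) and the factor 2 supplying the reflected negative-z half of the even kernel. The inner evaluation, isolated (sketch; fw[n] holds weight times kernel value, as f[] does above):

    #include <cmath>

    double nuft_of_even_kernel(double k, int q, const double *z, const double *fw) {
      double x = 0.0;
      for (int n = 0; n < q; ++n) x += fw[n] * 2.0 * std::cos(k * z[n]);
      return x;
    }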
modeord 10/25/17 @@ -371,9 +378,10 @@ void deconvolveshuffle2d(int dir, FLT prefac, FLT *ker1, FLT *ker2, BIGINT ms, B &fw[nf1 * (nf2 + k2)], modeord); } -void deconvolveshuffle3d(int dir, FLT prefac, FLT *ker1, FLT *ker2, FLT *ker3, BIGINT ms, - BIGINT mt, BIGINT mu, FLT *fk, BIGINT nf1, BIGINT nf2, - BIGINT nf3, CPX *fw, int modeord) +template +static void deconvolveshuffle3d(int dir, T prefac, T *ker1, T *ker2, T *ker3, BIGINT ms, + BIGINT mt, BIGINT mu, T *fk, BIGINT nf1, BIGINT nf2, + BIGINT nf3, std::complex *fw, int modeord) /* 3D version of deconvolveshuffle2d, calls it on each xy-plane using 1/ker3 fac. @@ -383,11 +391,11 @@ void deconvolveshuffle3d(int dir, FLT prefac, FLT *ker1, FLT *ker2, FLT *ker3, B modeord=0: use CMCL-compatible mode ordering in fk (each dim increasing) 1: use FFT-style (pos then negative, on each dim) - fk is a complex array stored as 2*ms*mt*mu FLTs alternating re,im parts, with + fk is a complex array stored as 2*ms*mt*mu Ts alternating re,im parts, with ms looped over fastest and mu slowest. - fw is a complex array stored as 2*nf1*nf2*nf3 FLTs alternating re,im parts, with + fw is a complex array stored as 2*nf1*nf2*nf3 Ts alternating re,im parts, with nf1 looped over fastest and nf3 slowest. - ker1, ker2, ker3 are real-valued FLT arrays of lengths nf1/2+1, nf2/2+1, + ker1, ker2, ker3 are real-valued T arrays of lengths nf1/2+1, nf2/2+1, and nf3/2+1 respectively. Barnett 2/1/17, Fixed mu=0 case 3/14/17. modeord 10/25/17 @@ -416,7 +424,8 @@ void deconvolveshuffle3d(int dir, FLT prefac, FLT *ker1, FLT *ker2, FLT *ker3, B // --------- batch helper functions for t1,2 exec: --------------------------- -int spreadinterpSortedBatch(int batchSize, FINUFFT_PLAN p, CPX *cBatch) +template +static int spreadinterpSortedBatch(int batchSize, FINUFFT_PLAN p, std::complex *cBatch) /* Spreads (or interpolates) a batch of batchSize strength vectors in cBatch to (or from) the batch of fine working grids p->fwBatch, using the same set of @@ -438,15 +447,16 @@ int spreadinterpSortedBatch(int batchSize, FINUFFT_PLAN p, CPX *cBatch) #endif #pragma omp parallel for num_threads(nthr_outer) for (int i = 0; i < batchSize; i++) { - CPX *fwi = p->fwBatch + i * p->nf; // start of i'th fw array in wkspace - CPX *ci = cBatch + i * p->nj; // start of i'th c array in cBatch - spreadinterpSorted(p->sortIndices, p->nf1, p->nf2, p->nf3, (FLT *)fwi, p->nj, p->X, - p->Y, p->Z, (FLT *)ci, p->spopts, p->didSort); + std::complex *fwi = p->fwBatch + i * p->nf; // start of i'th fw array in wkspace + std::complex *ci = cBatch + i * p->nj; // start of i'th c array in cBatch + spreadinterpSorted(p->sortIndices, p->nf1, p->nf2, p->nf3, (T *)fwi, p->nj, p->X, + p->Y, p->Z, (T *)ci, p->spopts, p->didSort); } return 0; } -int deconvolveBatch(int batchSize, FINUFFT_PLAN p, CPX *fkBatch) +template +static int deconvolveBatch(int batchSize, FINUFFT_PLAN p, std::complex *fkBatch) /* Type 1: deconvolves (amplifies) from each interior fw array in p->fwBatch into each output array fk in fkBatch. @@ -461,19 +471,19 @@ int deconvolveBatch(int batchSize, FINUFFT_PLAN p, CPX *fkBatch) // since deconvolveshuffle?d are single-thread, omp par seems to help here... 
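The modeord flag honored by the deconvolveshuffle routines above selects between the two mode orderings; for ms = 8 the slot-to-frequency maps look like this (stand-alone sketch):

    #include <cstdio>

    int main() {
      const int ms = 8;
      // modeord = 0 (CMCL): k = -4,-3,-2,-1,0,1,2,3 (increasing)
      // modeord = 1 (FFT):  k =  0,1,2,3,-4,-3,-2,-1 (non-negative first)
      for (int i = 0; i < ms; ++i) {
        int k_cmcl = i - ms / 2;
        int k_fft  = (i < (ms + 1) / 2) ? i : i - ms;
        std::printf("slot %d: cmcl k=%+d  fft k=%+d\n", i, k_cmcl, k_fft);
      }
    }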
#pragma omp parallel for num_threads(batchSize) for (int i = 0; i < batchSize; i++) { - CPX *fwi = p->fwBatch + i * p->nf; // start of i'th fw array in wkspace - CPX *fki = fkBatch + i * p->N; // start of i'th fk array in fkBatch + std::complex *fwi = p->fwBatch + i * p->nf; // start of i'th fw array in wkspace + std::complex *fki = fkBatch + i * p->N; // start of i'th fk array in fkBatch // Call routine from common.cpp for the dim; prefactors hardcoded to 1.0... if (p->dim == 1) - deconvolveshuffle1d(p->spopts.spread_direction, 1.0, p->phiHat1, p->ms, (FLT *)fki, + deconvolveshuffle1d(p->spopts.spread_direction, T(1), p->phiHat1, p->ms, (T *)fki, p->nf1, fwi, p->opts.modeord); else if (p->dim == 2) - deconvolveshuffle2d(p->spopts.spread_direction, 1.0, p->phiHat1, p->phiHat2, p->ms, - p->mt, (FLT *)fki, p->nf1, p->nf2, fwi, p->opts.modeord); + deconvolveshuffle2d(p->spopts.spread_direction, T(1), p->phiHat1, p->phiHat2, p->ms, + p->mt, (T *)fki, p->nf1, p->nf2, fwi, p->opts.modeord); else - deconvolveshuffle3d(p->spopts.spread_direction, 1.0, p->phiHat1, p->phiHat2, - p->phiHat3, p->ms, p->mt, p->mu, (FLT *)fki, p->nf1, p->nf2, + deconvolveshuffle3d(p->spopts.spread_direction, T(1), p->phiHat1, p->phiHat2, + p->phiHat3, p->ms, p->mt, p->mu, (T *)fki, p->nf1, p->nf2, p->nf3, fwi, p->opts.modeord); } return 0; @@ -1063,10 +1073,10 @@ int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX *cj, CPX *fk) { // STEP 1: (varies by type) timer.restart(); if (p->type == 1) { // type 1: spread NU pts p->X, weights cj, to fw grid - spreadinterpSortedBatch(thisBatchSize, p, cjb); + spreadinterpSortedBatch(thisBatchSize, p, cjb); t_sprint += timer.elapsedsec(); } else { // type 2: amplify Fourier coeffs fk into 0-padded fw - deconvolveBatch(thisBatchSize, p, fkb); + deconvolveBatch(thisBatchSize, p, fkb); t_deconv += timer.elapsedsec(); } @@ -1079,10 +1089,10 @@ int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX *cj, CPX *fk) { // STEP 3: (varies by type) timer.restart(); if (p->type == 1) { // type 1: deconvolve (amplify) fw and shuffle to fk - deconvolveBatch(thisBatchSize, p, fkb); + deconvolveBatch(thisBatchSize, p, fkb); t_deconv += timer.elapsedsec(); } else { // type 2: interpolate unif fw grid to NU target pts - spreadinterpSortedBatch(thisBatchSize, p, cjb); + spreadinterpSortedBatch(thisBatchSize, p, cjb); t_sprint += timer.elapsedsec(); } } // ........end b loop @@ -1134,8 +1144,8 @@ int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX *cj, CPX *fk) { // STEP 1: spread c'_j batch (x'_j NU pts) into fw batch grid... timer.restart(); - p->spopts.spread_direction = 1; // spread - spreadinterpSortedBatch(thisBatchSize, p, p->CpBatch); // p->X are primed + p->spopts.spread_direction = 1; // spread + spreadinterpSortedBatch(thisBatchSize, p, p->CpBatch); // p->X are primed t_spr += timer.elapsedsec(); // STEP 2: type 2 NUFFT from fw batch to user output fk array batch... 
From 90743223b8fc7def2fbf97254139dacdae46ef59 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Wed, 25 Sep 2024 15:10:37 +0200 Subject: [PATCH 11/20] more templatizing --- CMakeLists.txt | 14 +- include/finufft/defs.h | 157 +--- include/finufft/fft.h | 17 +- include/finufft/finufft_core.h | 210 +++++ include/finufft/spreadinterp.h | 1 - include/finufft/utils.h | 2 +- include/finufft/utils_precindep.h | 6 +- makefile | 12 +- src/fft.cpp | 8 +- src/finufft.cpp | 1194 +--------------------------- src/finufft_core.cpp | 1237 +++++++++++++++++++++++++++++ src/simpleinterfaces.cpp | 4 +- src/spreadinterp.cpp | 1 + test/testutils.cpp | 3 +- 14 files changed, 1501 insertions(+), 1365 deletions(-) create mode 100644 include/finufft/finufft_core.h create mode 100644 src/finufft_core.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 0bae95ad5..435bcd8c9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -121,7 +121,7 @@ endif() # This set of sources is compiled twice, once in single precision and once in # double precision The single precision compilation is done with -DSINGLE -set(FINUFFT_PRECISION_DEPENDENT_SOURCES src/finufft.cpp src/fft.cpp +set(FINUFFT_PRECISION_DEPENDENT_SOURCES src/finufft.cpp src/simpleinterfaces.cpp) # If we're building for Fortran, make sure we also include the translation @@ -258,11 +258,15 @@ if(FINUFFT_USE_CPU) add_library(finufft_f64 OBJECT ${FINUFFT_PRECISION_DEPENDENT_SOURCES}) set_finufft_options(finufft_f64) if(NOT FINUFFT_STATIC_LINKING) - add_library(finufft SHARED src/spreadinterp.cpp src/utils_precindep.cpp - contrib/legendre_rule_fast.cpp) + add_library( + finufft SHARED + src/spreadinterp.cpp src/utils_precindep.cpp + contrib/legendre_rule_fast.cpp src/fft.cpp src/finufft_core.cpp) else() - add_library(finufft STATIC src/spreadinterp.cpp src/utils_precindep.cpp - contrib/legendre_rule_fast.cpp) + add_library( + finufft STATIC + src/spreadinterp.cpp src/utils_precindep.cpp + contrib/legendre_rule_fast.cpp src/fft.cpp src/finufft_core.cpp) endif() target_link_libraries(finufft PRIVATE finufft_f32 finufft_f64) set_finufft_options(finufft) diff --git a/include/finufft/defs.h b/include/finufft/defs.h index 4265ced01..084ffa41c 100644 --- a/include/finufft/defs.h +++ b/include/finufft/defs.h @@ -18,6 +18,7 @@ // public header gives access to f_opts, f_spread_opts, f_plan... // (and clobbers FINUFFT* macros; watch out!) #include +#include #include // --------------- Private data types for compilation in either prec --------- @@ -25,8 +26,8 @@ // All indexing in library that potentially can exceed 2^31 uses 64-bit signed. // This includes all calling arguments (eg M,N) that could be huge someday. 
-using BIGINT = int64_t; -using UBIGINT = uint64_t; +// using BIGINT = int64_t; +// using UBIGINT = uint64_t; // Precision-independent real and complex types, for private lib/test compile #ifdef SINGLE using FLT = float; @@ -36,59 +37,6 @@ using FLT = double; #include // we define C++ complex type only using CPX = std::complex; -// inline macro, to force inlining of small functions -// this avoids the use of macros to implement functions -#if defined(_MSC_VER) -#define FINUFFT_ALWAYS_INLINE __forceinline inline -#define FINUFFT_NEVER_INLINE __declspec(noinline) -#define FINUFFT_RESTRICT __restrict -#define FINUFFT_UNREACHABLE __assume(0) -#define FINUFFT_UNLIKELY(x) (x) -#define FINUFFT_LIKELY(x) (x) -#elif defined(__GNUC__) || defined(__clang__) -#define FINUFFT_ALWAYS_INLINE __attribute__((always_inline)) inline -#define FINUFFT_NEVER_INLINE __attribute__((noinline)) -#define FINUFFT_RESTRICT __restrict__ -#define FINUFFT_UNREACHABLE __builtin_unreachable() -#define FINUFFT_UNLIKELY(x) __builtin_expect(!!(x), 0) -#define FINUFFT_LIKELY(x) __builtin_expect(!!(x), 1) -#else -#define FINUFFT_ALWAYS_INLINE inline -#define FINUFFT_NEVER_INLINE -#define FINUFFT_RESTRICT -#define FINUFFT_UNREACHABLE -#define FINUFFT_UNLIKELY(x) (x) -#define FINUFFT_LIKELY(x) (x) -#endif - -// ------------- Library-wide algorithm parameter settings ---------------- - -// Library version (is a string) -#define FINUFFT_VER "2.3.0" - -// Smallest possible kernel spread width per dimension, in fine grid points -// (used only in spreadinterp.cpp) -inline constexpr int MIN_NSPREAD = 2; - -// Largest possible kernel spread width per dimension, in fine grid points -// (used only in spreadinterp.cpp) -inline constexpr int MAX_NSPREAD = 16; - -// Fraction growth cut-off in utils:arraywidcen, sets when translate in type-3 -inline constexpr double ARRAYWIDCEN_GROWFRAC = 0.1; - -// Max number of positive quadr nodes for kernel FT (used only in common.cpp) -inline constexpr int MAX_NQUAD = 100; - -// Internal (nf1 etc) array allocation size that immediately raises error. -// (Note: next235 takes 1s for 1e11, so it is also to prevent hang here.) -// Increase this if you need >10TB (!) RAM... -inline constexpr BIGINT MAX_NF = BIGINT(1e12); - -// Maximum allowed number M of NU points; useful to catch incorrectly cast int32 -// values for M = nj (also nk in type 3)... -inline constexpr BIGINT MAX_NU_PTS = BIGINT(1e14); - // -------------- Math consts (not in math.h) and useful math macros ---------- #include @@ -108,13 +56,6 @@ inline constexpr BIGINT MAX_NU_PTS = BIGINT(1e14); // to avoid mixed precision operators in eg i*pi, an either-prec PI... #define PI FLT(M_PI) -// machine epsilon for decisions of achievable tolerance... -#ifdef SINGLE -#define EPSILON (float)6e-08 -#else -#define EPSILON (double)1.1e-16 -#endif - // Random numbers: crappy unif random number generator in [0,1). // These macros should probably be replaced by modern C++ std lib or random123. 
// (RAND_MAX is in stdlib.h) @@ -148,32 +89,6 @@ static inline CPX crandm11r [[maybe_unused]] (unsigned int *x) { } #endif -// ----- OpenMP macros which also work when omp not present ----- -// Allows compile-time switch off of openmp, so compilation without any openmp -// is done (Note: _OPENMP is automatically set by -fopenmp compile flag) -#ifdef _OPENMP -#include -// point to actual omp utils -static inline int MY_OMP_GET_NUM_THREADS [[maybe_unused]] () { - return omp_get_num_threads(); -} -static inline int MY_OMP_GET_MAX_THREADS [[maybe_unused]] () { - return omp_get_max_threads(); -} -static inline int MY_OMP_GET_THREAD_NUM [[maybe_unused]] () { - return omp_get_thread_num(); -} -static inline void MY_OMP_SET_NUM_THREADS [[maybe_unused]] (int x) { - omp_set_num_threads(x); -} -#else -// non-omp safe dummy versions of omp utils... -static inline int MY_OMP_GET_NUM_THREADS [[maybe_unused]] () { return 1; } -static inline int MY_OMP_GET_MAX_THREADS [[maybe_unused]] () { return 1; } -static inline int MY_OMP_GET_THREAD_NUM [[maybe_unused]] () { return 0; } -static inline void MY_OMP_SET_NUM_THREADS [[maybe_unused]] (int) {} -#endif - // Prec-switching name macros (respond to SINGLE), used in lib & test sources // and the plan object below. // Note: crucially, these are now indep of macros used to gen public finufft.h! @@ -219,70 +134,6 @@ static inline void MY_OMP_SET_NUM_THREADS [[maybe_unused]] (int) {} // NB: now private (the public C++ or C etc user sees an opaque pointer to it) #include // (must come after complex.h) - -// group together a bunch of type 3 rescaling/centering/phasing parameters: -template struct type3params { - T X1, C1, D1, h1, gam1; // x dim: X=halfwid C=center D=freqcen h,gam=rescale - T X2, C2, D2, h2, gam2; // y - T X3, C3, D3, h3, gam3; // z -}; - -struct FINUFFT_PLAN_S { // the main plan object, fully C++ - // These default and delete specifications just state the obvious, - // but are here to silence compiler warnings. - FINUFFT_PLAN_S() = default; - // Copy construction and assignent are already deleted implicitly - // because of the unique_ptr member. - FINUFFT_PLAN_S(const FINUFFT_PLAN_S &) = delete; - FINUFFT_PLAN_S &operator=(const FINUFFT_PLAN_S &) = delete; - - int type; // transform type (Rokhlin naming): 1,2 or 3 - int dim; // overall dimension: 1,2 or 3 - int ntrans; // how many transforms to do at once (vector or "many" mode) - BIGINT nj; // num of NU pts in type 1,2 (for type 3, num input x pts) - BIGINT nk; // number of NU freq pts (type 3 only) - FLT tol; // relative user tolerance - int batchSize; // # strength vectors to group together for FFTW, etc - int nbatch; // how many batches done to cover all ntrans vectors - - BIGINT ms; // number of modes in x (1) dir (historical CMCL name) = N1 - BIGINT mt; // number of modes in y (2) direction = N2 - BIGINT mu; // number of modes in z (3) direction = N3 - BIGINT N; // total # modes (prod of above three) - - BIGINT nf1; // size of internal fine grid in x (1) direction - BIGINT nf2; // " y (2) - BIGINT nf3; // " z (3) - BIGINT nf; // total # fine grid points (product of the above three) - - int fftSign; // sign in exponential for NUFFT defn, guaranteed to be +-1 - - FLT *phiHat1; // FT of kernel in t1,2, on x-axis mode grid - FLT *phiHat2; // " y-axis. - FLT *phiHat3; // " z-axis. - - CPX *fwBatch; // (batches of) fine grid(s) for FFTW to plan - // & act on. 
Usually the largest working array
-
-  BIGINT *sortIndices; // precomputed NU pt permutation, speeds spread/interp
-  bool didSort;        // whether binsorting used (false: identity perm used)
-
-  FLT *X, *Y, *Z; // for t1,2: ptr to user-supplied NU pts (no new allocs).
-                  // for t3: allocated as "primed" (scaled) src pts x'_j, etc
-
-  // type 3 specific
-  FLT *S, *T, *U; // pointers to user's target NU pts arrays (no new allocs)
-  CPX *prephase;  // pre-phase, for all input NU pts
-  CPX *deconv;    // reciprocal of kernel FT, phase, all output NU pts
-  CPX *CpBatch;   // working array of prephased strengths
-  FLT *Sp, *Tp, *Up;    // internal primed targs (s'_k, etc), allocated
-  type3params<FLT> t3P; // groups together type 3 shift, scale, phase, parameters
-  FINUFFT_PLAN innerT2plan; // ptr used for type 2 in step 2 of type 3
-
-  // other internal structs
-  std::unique_ptr<Finufft_FFT_plan<FLT>> fftPlan;
-  finufft_opts opts; // this and spopts could be made ptrs
-  finufft_spread_opts spopts;
-};
+struct FINUFFT_PLAN_S : public FINUFFT_PLAN_T<FLT> {};

 #endif // DEFS_H
diff --git a/include/finufft/fft.h b/include/finufft/fft.h
index bab43966c..c6d5de7a5 100644
--- a/include/finufft/fft.h
+++ b/include/finufft/fft.h
@@ -171,19 +171,22 @@ template<> struct Finufft_FFT_plan<double> {
 #endif

-#include <finufft/defs.h>
+#include <finufft/finufft_core.h>

 static inline void finufft_fft_forget_wisdom [[maybe_unused]] () {
-  Finufft_FFT_plan<FLT>::forget_wisdom();
+  Finufft_FFT_plan<float>::forget_wisdom();
+  Finufft_FFT_plan<double>::forget_wisdom();
 }
 static inline void finufft_fft_cleanup [[maybe_unused]] () {
-  Finufft_FFT_plan<FLT>::cleanup();
+  Finufft_FFT_plan<float>::cleanup();
+  Finufft_FFT_plan<double>::cleanup();
 }
 static inline void finufft_fft_cleanup_threads [[maybe_unused]] () {
-  Finufft_FFT_plan<FLT>::cleanup_threads();
+  Finufft_FFT_plan<float>::cleanup_threads();
+  Finufft_FFT_plan<double>::cleanup_threads();
 }
-
-std::vector<int> gridsize_for_fft(FINUFFT_PLAN p);
-void do_fft(FINUFFT_PLAN p);
+template<typename TF> struct FINUFFT_PLAN_T;
+template<typename TF> std::vector<int> gridsize_for_fft(FINUFFT_PLAN_T<TF> *p);
+template<typename TF> void do_fft(FINUFFT_PLAN_T<TF> *p);

 #endif // FINUFFT_INCLUDE_FINUFFT_FFT_H
diff --git a/include/finufft/finufft_core.h b/include/finufft/finufft_core.h
new file mode 100644
index 000000000..afc6ef864
--- /dev/null
+++ b/include/finufft/finufft_core.h
@@ -0,0 +1,210 @@
+#ifndef FINUFFT_CORE_H
+#define FINUFFT_CORE_H
+
+/* IMPORTANT: for Windows compilers, you should add a line
+     #define FINUFFT_DLL
+   here if you are compiling/using FINUFFT as a DLL,
+   in order to do the proper importing/exporting, or
+   alternatively compile with -DFINUFFT_DLL or the equivalent
+   command-line flag. This is not necessary under MinGW/Cygwin, where
+   libtool does the imports/exports automatically.
+   Alternatively use include(GenerateExportHeader) and
+   generate_export_header(finufft) to auto generate a header containing
+   these defines. The main reason is that if MSVC changes the way it deals
+   with it in the future we just need to update cmake for it to work
+   instead of having a check on the MSVC version. */
+#if defined(FINUFFT_DLL) && (defined(_WIN32) || defined(__WIN32__))
+#if defined(dll_EXPORTS)
+#define FINUFFT_EXPORT __declspec(dllexport)
+#else
+#define FINUFFT_EXPORT __declspec(dllimport)
+#endif
+#else
+#define FINUFFT_EXPORT
+#endif
+
+/* specify calling convention (Windows only)
+   The cdecl calling convention is actually not the default in all but a very
+   few C/C++ compilers.
+   If the user code changes the default compiler calling convention, it may
+   need this when generating a DLL.
*/ +#if defined(_WIN32) || defined(__WIN32__) +#define FINUFFT_CDECL __cdecl +#else +#define FINUFFT_CDECL +#endif + +// inline macro, to force inlining of small functions +// this avoids the use of macros to implement functions +#if defined(_MSC_VER) +#define FINUFFT_ALWAYS_INLINE __forceinline inline +#define FINUFFT_NEVER_INLINE __declspec(noinline) +#define FINUFFT_RESTRICT __restrict +#define FINUFFT_UNREACHABLE __assume(0) +#define FINUFFT_UNLIKELY(x) (x) +#define FINUFFT_LIKELY(x) (x) +#elif defined(__GNUC__) || defined(__clang__) +#define FINUFFT_ALWAYS_INLINE __attribute__((always_inline)) inline +#define FINUFFT_NEVER_INLINE __attribute__((noinline)) +#define FINUFFT_RESTRICT __restrict__ +#define FINUFFT_UNREACHABLE __builtin_unreachable() +#define FINUFFT_UNLIKELY(x) __builtin_expect(!!(x), 0) +#define FINUFFT_LIKELY(x) __builtin_expect(!!(x), 1) +#else +#define FINUFFT_ALWAYS_INLINE inline +#define FINUFFT_NEVER_INLINE +#define FINUFFT_RESTRICT +#define FINUFFT_UNREACHABLE +#define FINUFFT_UNLIKELY(x) (x) +#define FINUFFT_LIKELY(x) (x) +#endif + +#include +#include + +// All indexing in library that potentially can exceed 2^31 uses 64-bit signed. +// This includes all calling arguments (eg M,N) that could be huge someday. +using BIGINT = int64_t; +using UBIGINT = uint64_t; + +// ------------- Library-wide algorithm parameter settings ---------------- + +// Library version (is a string) +#define FINUFFT_VER "2.3.0" + +// Smallest possible kernel spread width per dimension, in fine grid points +// (used only in spreadinterp.cpp) +inline constexpr int MIN_NSPREAD = 2; + +// Largest possible kernel spread width per dimension, in fine grid points +// (used only in spreadinterp.cpp) +inline constexpr int MAX_NSPREAD = 16; + +// Fraction growth cut-off in utils:arraywidcen, sets when translate in type-3 +inline constexpr double ARRAYWIDCEN_GROWFRAC = 0.1; + +// Max number of positive quadr nodes for kernel FT (used only in common.cpp) +inline constexpr int MAX_NQUAD = 100; + +// Internal (nf1 etc) array allocation size that immediately raises error. +// (Note: next235 takes 1s for 1e11, so it is also to prevent hang here.) +// Increase this if you need >10TB (!) RAM... +inline constexpr BIGINT MAX_NF = BIGINT(1e12); + +// Maximum allowed number M of NU points; useful to catch incorrectly cast int32 +// values for M = nj (also nk in type 3)... +inline constexpr BIGINT MAX_NU_PTS = BIGINT(1e14); + +// ----- OpenMP macros which also work when omp not present ----- +// Allows compile-time switch off of openmp, so compilation without any openmp +// is done (Note: _OPENMP is automatically set by -fopenmp compile flag) +#ifdef _OPENMP +#include +// point to actual omp utils +static inline int MY_OMP_GET_NUM_THREADS [[maybe_unused]] () { + return omp_get_num_threads(); +} +static inline int MY_OMP_GET_MAX_THREADS [[maybe_unused]] () { + return omp_get_max_threads(); +} +static inline int MY_OMP_GET_THREAD_NUM [[maybe_unused]] () { + return omp_get_thread_num(); +} +static inline void MY_OMP_SET_NUM_THREADS [[maybe_unused]] (int x) { + omp_set_num_threads(x); +} +#else +// non-omp safe dummy versions of omp utils... 
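// (Why the wrappers above and the serial dummies just below matter: call sites
// can be written without any #ifdef _OPENMP guards, e.g.
//   int nthr = MY_OMP_GET_MAX_THREADS(); // == 1 when built without OpenMP
// and remain compiling and correct in both builds.)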
+static inline int MY_OMP_GET_NUM_THREADS [[maybe_unused]] () { return 1; }
+static inline int MY_OMP_GET_MAX_THREADS [[maybe_unused]] () { return 1; }
+static inline int MY_OMP_GET_THREAD_NUM [[maybe_unused]] () { return 0; }
+static inline void MY_OMP_SET_NUM_THREADS [[maybe_unused]] (int) {}
+#endif
+
+#include <finufft/fft.h> // (must come after complex.h)
+#include <finufft_opts.h>
+#include <finufft_spread_opts.h>
+
+// group together a bunch of type 3 rescaling/centering/phasing parameters:
+template<typename T> struct type3params {
+  T X1, C1, D1, h1, gam1; // x dim: X=halfwid C=center D=freqcen h,gam=rescale
+  T X2, C2, D2, h2, gam2; // y
+  T X3, C3, D3, h3, gam3; // z
+};
+
+template<typename TF> struct FINUFFT_PLAN_T { // the main plan object, fully C++
+
+  using TC = std::complex<TF>;
+
+  // These default and delete specifications just state the obvious,
+  // but are here to silence compiler warnings.
+  FINUFFT_PLAN_T() = default;
+  // Copy construction and assignment are already deleted implicitly
+  // because of the unique_ptr member.
+  FINUFFT_PLAN_T(const FINUFFT_PLAN_T &) = delete;
+  FINUFFT_PLAN_T &operator=(const FINUFFT_PLAN_T &) = delete;
+  ~FINUFFT_PLAN_T();
+
+  int type;      // transform type (Rokhlin naming): 1,2 or 3
+  int dim;       // overall dimension: 1,2 or 3
+  int ntrans;    // how many transforms to do at once (vector or "many" mode)
+  BIGINT nj;     // num of NU pts in type 1,2 (for type 3, num input x pts)
+  BIGINT nk;     // number of NU freq pts (type 3 only)
+  TF tol;        // relative user tolerance
+  int batchSize; // # strength vectors to group together for FFTW, etc
+  int nbatch;    // how many batches done to cover all ntrans vectors
+
+  BIGINT ms; // number of modes in x (1) dir (historical CMCL name) = N1
+  BIGINT mt; // number of modes in y (2) direction = N2
+  BIGINT mu; // number of modes in z (3) direction = N3
+  BIGINT N;  // total # modes (prod of above three)
+
+  BIGINT nf1; // size of internal fine grid in x (1) direction
+  BIGINT nf2; // " y (2)
+  BIGINT nf3; // " z (3)
+  BIGINT nf;  // total # fine grid points (product of the above three)
+
+  int fftSign; // sign in exponential for NUFFT defn, guaranteed to be +-1
+
+  TF *phiHat1 = nullptr; // FT of kernel in t1,2, on x-axis mode grid
+  TF *phiHat2 = nullptr; // " y-axis.
+  TF *phiHat3 = nullptr; // " z-axis.
+
+  TC *fwBatch = nullptr; // (batches of) fine grid(s) for FFTW to plan
+                         // & act on. Usually the largest working array
+
+  BIGINT *sortIndices = nullptr; // precomputed NU pt permutation, speeds spread/interp
+  bool didSort;                  // whether binsorting used (false: identity perm used)
+
+  TF *X = nullptr, *Y = nullptr, *Z = nullptr; // for t1,2: ptr to user-supplied NU pts
+                                               // (no new allocs). for t3: allocated as
+                                               // "primed" (scaled) src pts x'_j, etc
+
+  // type 3 specific
+  TF *S = nullptr, *T = nullptr, *U = nullptr; // pointers to user's target NU pts arrays
+                                               // (no new allocs)
+  TC *prephase = nullptr; // pre-phase, for all input NU pts
+  TC *deconv = nullptr;   // reciprocal of kernel FT, phase, all output NU pts
+  TC *CpBatch = nullptr;  // working array of prephased strengths
+  TF *Sp = nullptr, *Tp = nullptr, *Up = nullptr; // internal primed targs (s'_k, etc),
+                                                  // allocated
+  type3params<TF> t3P; // groups together type 3 shift, scale, phase, parameters
+  FINUFFT_PLAN_T<TF> *innerT2plan = nullptr; // ptr used for type 2 in step 2 of type 3
+
+  // other internal structs
+  std::unique_ptr<Finufft_FFT_plan<TF>> fftPlan;
+  finufft_opts opts; // this and spopts could be made ptrs
+  finufft_spread_opts spopts;
+};
+
+void finufft_default_opts_t(finufft_opts *o);
+template<typename TF>
+int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, int iflag, int ntrans,
+                       TF tol, FINUFFT_PLAN_T<TF> **pp, finufft_opts *opts);
+template<typename TF>
+int finufft_setpts_t(FINUFFT_PLAN_T<TF> *p, BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk,
+                     TF *s, TF *t, TF *u);
+template<typename TF>
+int finufft_execute_t(FINUFFT_PLAN_T<TF> *p, std::complex<TF> *cj, std::complex<TF> *fk);
+
+#endif // FINUFFT_CORE_H
diff --git a/include/finufft/spreadinterp.h b/include/finufft/spreadinterp.h
index 56d705563..779101be8 100644
--- a/include/finufft/spreadinterp.h
+++ b/include/finufft/spreadinterp.h
@@ -7,7 +7,6 @@
 #ifndef SPREADINTERP_H
 #define SPREADINTERP_H

-#include <finufft/defs.h>
 #include <finufft_spread_opts.h>

 /* Bitwise debugging timing flag (TF) defs; see finufft_spread_opts.flags.
diff --git a/include/finufft/utils.h b/include/finufft/utils.h
index b4fe64681..132fafb53 100644
--- a/include/finufft/utils.h
+++ b/include/finufft/utils.h
@@ -4,7 +4,7 @@
 #ifndef UTILS_H
 #define UTILS_H

-#include "finufft/defs.h"
+#include "finufft/finufft_core.h"

 namespace finufft {
 namespace utils {
diff --git a/include/finufft/utils_precindep.h b/include/finufft/utils_precindep.h
index 0504bb8df..8dd3839d8 100644
--- a/include/finufft/utils_precindep.h
+++ b/include/finufft/utils_precindep.h
@@ -4,9 +4,9 @@
 #ifndef UTILS_PRECINDEP_H
 #define UTILS_PRECINDEP_H

-#include "defs.h"
-// for CNTime...
-// using chrono since the interface is portable between linux and windows
+// #include "defs.h"
+// for CNTime...
+// using chrono since the interface is portable between linux and windows
 #include <chrono>

 namespace finufft {
diff --git a/makefile b/makefile
index 85a6a9a78..9d3e4c29c 100644
--- a/makefile
+++ b/makefile
@@ -31,7 +31,7 @@ PYTHON = python3
 # they allow gcc to vectorize the code more effectively
 CFLAGS := -O3 -funroll-loops -march=native -fcx-limited-range -ffp-contract=fast\
 	-fno-math-errno -fno-signed-zeros -fno-trapping-math -fassociative-math\
-	-freciprocal-math -fmerge-all-constants -ftree-vectorize $(CFLAGS)
+	-freciprocal-math -fmerge-all-constants -ftree-vectorize $(CFLAGS) -Wfatal-errors
 FFLAGS := $(CFLAGS) $(FFLAGS)
 CXXFLAGS := $(CFLAGS) $(CXXFLAGS)
 # FFTW base name, and math linking...
@@ -138,17 +138,17 @@ ABSDYNLIB = $(FINUFFT)$(DYNLIB)
 SOBJS =
 # their single-prec versions
 SOBJSF = $(SOBJS:%.o=%_32.o)
-# precision-dependent spreader object files (compiled & linked only once)...
+# precision-independent spreader object files (compiled & linked only once)...
 SOBJS_PI = src/utils_precindep.o src/spreadinterp.o
 # spreader dual-precision objs
 SOBJSD = $(SOBJS) $(SOBJSF) $(SOBJS_PI)
 # double-prec library object files that also need single precision...
-OBJS = $(SOBJS) src/finufft.o src/simpleinterfaces.o fortran/finufftfort.o src/fft.o +OBJS = $(SOBJS) src/finufft.o src/simpleinterfaces.o fortran/finufftfort.o # their single-prec versions OBJSF = $(OBJS:%.o=%_32.o) -# precision-dependent library object files (compiled & linked only once)... -OBJS_PI = $(SOBJS_PI) contrib/legendre_rule_fast.o +# precision-independent library object files (compiled & linked only once)... +OBJS_PI = $(SOBJS_PI) contrib/legendre_rule_fast.o src/fft.o src/finufft_core.o # all lib dual-precision objs (note DUCC_OBJS empty if unused) OBJSD = $(OBJS) $(OBJSF) $(OBJS_PI) $(DUCC_OBJS) @@ -435,7 +435,7 @@ endif # python --------------------------------------------------------------------- python: $(STATICLIB) $(DYNLIB) - FINUFFT_DIR=$(FINUFFT) $(PYTHON) -m pip -v install python/finufft + FINUFFT_DIR=$(FINUFFT) $(PYTHON) -m pip -v install --break-system-packages python/finufft # note to devs: if trouble w/ NumPy, use: pip install ./python --no-deps $(PYTHON) python/finufft/test/run_accuracy_tests.py $(PYTHON) python/finufft/examples/simple1d1.py diff --git a/src/fft.cpp b/src/fft.cpp index bb7e32442..3a7fbf2f6 100644 --- a/src/fft.cpp +++ b/src/fft.cpp @@ -7,7 +7,7 @@ using namespace std; #include "ducc0/fft/fftnd_impl.h" #endif -std::vector gridsize_for_fft(FINUFFT_PLAN p) { +template std::vector gridsize_for_fft(FINUFFT_PLAN_T *p) { // local helper func returns a new int array of length dim, extracted from // the finufft plan, that fftw_plan_many_dft needs as its 2nd argument. if (p->dim == 1) return {(int)p->nf1}; @@ -15,8 +15,10 @@ std::vector gridsize_for_fft(FINUFFT_PLAN p) { // if (p->dim == 3) return {(int)p->nf3, (int)p->nf2, (int)p->nf1}; } +template std::vector gridsize_for_fft(FINUFFT_PLAN_T *p); +template std::vector gridsize_for_fft(FINUFFT_PLAN_T *p); -void do_fft(FINUFFT_PLAN p) { +template void do_fft(FINUFFT_PLAN_T *p) { #ifdef FINUFFT_USE_DUCC0 size_t nthreads = min(MY_OMP_GET_MAX_THREADS(), p->opts.nthreads); const auto ns = gridsize_for_fft(p); @@ -106,3 +108,5 @@ void do_fft(FINUFFT_PLAN p) { p->fftPlan->execute(); // if thisBatchSize(FINUFFT_PLAN_T *p); +template void do_fft(FINUFFT_PLAN_T *p); diff --git a/src/finufft.cpp b/src/finufft.cpp index 96e986a5d..fddb3fb6e 100644 --- a/src/finufft.cpp +++ b/src/finufft.cpp @@ -4,1182 +4,25 @@ // private headers for lib build // (must come after finufft.h which clobbers FINUFFT* macros) #include -#include -#include -#include -#include -#include "../contrib/legendre_rule_fast.h" -#include -#include -#include -#include -#include -#include -#include +void FINUFFT_DEFAULT_OPTS(finufft_opts *o) { finufft_default_opts_t(o); } -using namespace std; -using namespace finufft; -using namespace finufft::utils; -using namespace finufft::spreadinterp; -using namespace finufft::quadrature; - -/* Computational core for FINUFFT. - - Based on Barnett 2017-2018 finufft?d.cpp containing nine drivers, plus - 2d1/2d2 many-vector drivers by Melody Shih, summer 2018. - Original guru interface written by Andrea Malleo, summer 2019, mentored - by Alex Barnett. Many rewrites in early 2020 by Alex Barnett & Libin Lu. - - As of v1.2 these replace the old hand-coded separate 9 finufft?d?() functions - and the two finufft2d?many() functions. The (now 18) simple C++ interfaces - are in simpleinterfaces.cpp. - -Algorithm summaries taken from old finufft?d?() documentation, Feb-Jun 2017: - - TYPE 1: - The type 1 NUFFT proceeds in three main steps: - 1) spread data to oversampled regular mesh using kernel. 
- 2) compute FFT on uniform mesh - 3) deconvolve by division of each Fourier mode independently by the kernel - Fourier series coeffs (not merely FFT of kernel), shuffle to output. - The kernel coeffs are precomputed in what is called step 0 in the code. - - TYPE 2: - The type 2 algorithm proceeds in three main steps: - 1) deconvolve (amplify) each Fourier mode, dividing by kernel Fourier coeff - 2) compute inverse FFT on uniform fine grid - 3) spread (dir=2, ie interpolate) data to regular mesh - The kernel coeffs are precomputed in what is called step 0 in the code. - - TYPE 3: - The type 3 algorithm is basically a type 2 (which is implemented precisely - as call to type 2) replacing the middle FFT (Step 2) of a type 1. - Beyond this, the new twists are: - i) nf1, number of upsampled points for the type-1, depends on the product - of interval widths containing input and output points (X*S). - ii) The deconvolve (post-amplify) step is division by the Fourier transform - of the scaled kernel, evaluated on the *nonuniform* output frequency - grid; this is done by direct approximation of the Fourier integral - using quadrature of the kernel function times exponentials. - iii) Shifts in x (real) and s (Fourier) are done to minimize the interval - half-widths X and S, hence nf1. - - MULTIPLE STRENGTH VECTORS FOR THE SAME NONUNIFORM POINTS (n_transf>1): - maxBatchSize (set to max_num_omp_threads) times the RAM is needed, so - this is good only for small problems. - - -Design notes for guru interface implementation: - -* Since finufft_plan is C-compatible, we need to use malloc/free for its - allocatable arrays, keeping it quite low-level. We can't use std::vector - since that would only survive in the scope of each function. - -* Thread-safety: FINUFFT plans are passed as pointers, so it has no global - state apart from that associated with FFTW (and the did_fftw_init). -*/ - -// ---------- local math routines (were in common.cpp; no need now): -------- - -namespace finufft { -namespace common { - -static int set_nf_type12(BIGINT ms, finufft_opts opts, finufft_spread_opts spopts, - BIGINT *nf) -// Type 1 & 2 recipe for how to set 1d size of upsampled array, nf, given opts -// and requested number of Fourier modes ms. Returns 0 if success, else an -// error code if nf was unreasonably big (& tell the world). -{ - *nf = BIGINT(opts.upsampfac * double(ms)); // manner of rounding not crucial - if (*nf < 2 * spopts.nspread) *nf = 2 * spopts.nspread; // otherwise spread fails - if (*nf < MAX_NF) { - *nf = next235even(*nf); // expensive at huge nf - return 0; - } else { - fprintf(stderr, - "[%s] nf=%.3g exceeds MAX_NF of %.3g, so exit without attempting even a " - "malloc\n", - __func__, (double)*nf, (double)MAX_NF); - return FINUFFT_ERR_MAXNALLOC; - } -} - -template -static int setup_spreader_for_nufft(finufft_spread_opts &spopts, T eps, finufft_opts opts, - int dim) -// Set up the spreader parameters given eps, and pass across various nufft -// options. Return status of setup_spreader. Uses pass-by-ref. Barnett 10/30/17 -{ - // this calls spreadinterp.cpp... - int ier = setup_spreader(spopts, eps, opts.upsampfac, opts.spread_kerevalmeth, - opts.spread_debug, opts.showwarn, dim); - // override various spread opts from their defaults... - spopts.debug = opts.spread_debug; - spopts.sort = opts.spread_sort; // could make dim or CPU choices here? 
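// (Worked example for set_nf_type12 above, assuming upsampfac=2.0, ms=1000,
//  nspread=8: nf = 2.0*1000 = 2000 = 2^4 * 5^3, already even and 2,3,5-smooth,
//  so next235even(2000) returns 2000; the 2*nspread=16 lower bound is inactive.)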
- spopts.kerpad = opts.spread_kerpad; // (only applies to kerevalmeth=0) - spopts.chkbnds = opts.chkbnds; - spopts.nthreads = opts.nthreads; // 0 passed in becomes omp max by here - if (opts.spread_nthr_atomic >= 0) // overrides - spopts.atomic_threshold = opts.spread_nthr_atomic; - if (opts.spread_max_sp_size > 0) // overrides - spopts.max_subproblem_size = opts.spread_max_sp_size; - if (opts.chkbnds != 1) // deprecated default value hardcoded here - fprintf(stderr, - "[%s] opts.chkbnds is deprecated; ignoring change from default value.\n", - __func__); - return ier; -} - -template -static void set_nhg_type3(T S, T X, finufft_opts opts, finufft_spread_opts spopts, - BIGINT *nf, T *h, T *gam) -/* sets nf, h (upsampled grid spacing), and gamma (x_j rescaling factor), - for type 3 only. - Inputs: - X and S are the xj and sk interval half-widths respectively. - opts and spopts are the NUFFT and spreader opts strucs, respectively. - Outputs: - nf is the size of upsampled grid for a given single dimension. - h is the grid spacing = 2pi/nf - gam is the x rescale factor, ie x'_j = x_j/gam (modulo shifts). - Barnett 2/13/17. Caught inf/nan 3/14/17. io int types changed 3/28/17 - New logic 6/12/17 -*/ -{ - int nss = spopts.nspread + 1; // since ns may be odd - T Xsafe = X, Ssafe = S; // may be tweaked locally - if (X == 0.0) // logic ensures XS>=1, handle X=0 a/o S=0 - if (S == 0.0) { - Xsafe = 1.0; - Ssafe = 1.0; - } else - Xsafe = max(Xsafe, 1 / S); - else - Ssafe = max(Ssafe, 1 / X); - // use the safe X and S... - auto nfd = T(2.0 * opts.upsampfac * Ssafe * Xsafe / PI + nss); - if (!isfinite(nfd)) nfd = 0.0; // use T to catch inf - *nf = (BIGINT)nfd; - // printf("initial nf=%lld, ns=%d\n",*nf,spopts.nspread); - // catch too small nf, and nan or +-inf, otherwise spread fails... - if (*nf < 2 * spopts.nspread) *nf = 2 * spopts.nspread; - if (*nf < MAX_NF) // otherwise will fail anyway - *nf = next235even(*nf); // expensive at huge nf - *h = T(2.0 * PI / *nf); // upsampled grid spacing - *gam = T(*nf / (2.0 * opts.upsampfac * Ssafe)); // x scale fac to x' -} - -template -static void onedim_fseries_kernel(BIGINT nf, T *fwkerhalf, finufft_spread_opts opts) -/* - Approximates exact Fourier series coeffs of cnufftspread's real symmetric - kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting - narrowness of kernel. Uses phase winding for cheap eval on the regular freq - grid. Note that this is also the Fourier transform of the non-periodized - kernel. The FT definition is f(k) = int e^{-ikx} f(x) dx. The output has an - overall prefactor of 1/h, which is needed anyway for the correction, and - arises because the quadrature weights are scaled for grid units not x units. - The kernel is actually centered at nf/2, related to the centering of the grid; - this is now achieved by the sign flip in a[n] below. - - Inputs: - nf - size of 1d uniform spread grid, must be even. - opts - spreading opts object, needed to eval kernel (must be already set up) - - Outputs: - fwkerhalf - real Fourier series coeffs from indices 0 to nf/2 inclusive, - divided by h = 2pi/n. - (should be allocated for at least nf/2+1 Ts) - - Compare onedim_dct_kernel which has same interface, but computes DFT of - sampled kernel, not quite the same object. - - Barnett 2/7/17. openmp (since slow vs fftw in 1D large-N case) 3/3/18. - Fixed num_threads 7/20/20. Reduced rounding error in a[n] calc 8/20/24. 
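  (Quadrature sketch in the code's own notation: with 2q-point Gauss-Legendre
  nodes z_n and weights w_n rescaled to the kernel half-support J/2, set
  f_n = (J/2) w_n phi(z_n) and a_n = -exp(2 pi i z_n / nf); then

      fwkerhalf[j] ~= sum_{n=1..q} 2 f_n Re(a_n^j),   j = 0 ... nf/2,

  where the power a_n^j is built up by repeated multiplication, the "phase
  winding", rather than recomputed from scratch for each j.)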
- */ -{ - T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support - // # quadr nodes in z (from 0 to J/2; reflections will be added)... - int q = (int)(2 + 3.0 * J2); // not sure why so large? cannot exceed MAX_NQUAD - T f[MAX_NQUAD]; - double z[2 * MAX_NQUAD], w[2 * MAX_NQUAD]; - legendre_compute_glr(2 * q, z, w); // only half the nodes used, eg on (0,1) - std::complex a[MAX_NQUAD]; - for (int n = 0; n < q; ++n) { // set up nodes z_n and vals f_n - z[n] *= J2; // rescale nodes - f[n] = J2 * (T)w[n] * evaluate_kernel((T)z[n], opts); // vals & quadr wei - a[n] = -exp(2 * PI * IMA * (T)z[n] / (T)nf); // phase winding rates - } - BIGINT nout = nf / 2 + 1; // how many values we're writing to - int nt = min(nout, (BIGINT)opts.nthreads); // how many chunks - std::vector brk(nt + 1); // start indices for each thread - for (int t = 0; t <= nt; ++t) // split nout mode indices btw threads - brk[t] = (BIGINT)(0.5 + nout * t / (double)nt); -#pragma omp parallel num_threads(nt) - { // each thread gets own chunk to do - int t = MY_OMP_GET_THREAD_NUM(); - std::complex aj[MAX_NQUAD]; // phase rotator for this thread - for (int n = 0; n < q; ++n) - aj[n] = pow(a[n], (T)brk[t]); // init phase factors for chunk - for (BIGINT j = brk[t]; j < brk[t + 1]; ++j) { // loop along output array - T x = 0.0; // accumulator for answer at this j - for (int n = 0; n < q; ++n) { - x += f[n] * 2 * real(aj[n]); // include the negative freq - aj[n] *= a[n]; // wind the phases - } - fwkerhalf[j] = x; - } - } -} - -template -static void onedim_nuft_kernel(BIGINT nk, T *k, T *phihat, finufft_spread_opts opts) -/* - Approximates exact 1D Fourier transform of cnufftspread's real symmetric - kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting - narrowness of kernel. Evaluates at set of arbitrary freqs k in [-pi, pi), - for a kernel with x measured in grid-spacings. (See previous routine for - FT definition). - - Inputs: - nk - number of freqs - k - frequencies, dual to the kernel's natural argument, ie exp(i.k.z) - Note, z is in grid-point units, and k values must be in [-pi, pi) for - accuracy. - opts - spreading opts object, needed to eval kernel (must be already set up) - - Outputs: - phihat - real Fourier transform evaluated at freqs (alloc for nk Ts) - - Barnett 2/8/17. openmp since cos slow 2/9/17 - */ -{ - T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support - // # quadr nodes in z (from 0 to J/2; reflections will be added)... - int q = (int)(2 + 2.0 * J2); // > pi/2 ratio. cannot exceed MAX_NQUAD - if (opts.debug) printf("q (# ker FT quadr pts) = %d\n", q); - T f[MAX_NQUAD]; - double z[2 * MAX_NQUAD], w[2 * MAX_NQUAD]; // glr needs double - legendre_compute_glr(2 * q, z, w); // only half the nodes used, eg on (0,1) - for (int n = 0; n < q; ++n) { - z[n] *= (T)J2; // quadr nodes for [0,J/2] - f[n] = J2 * (T)w[n] * evaluate_kernel((T)z[n], opts); // w/ quadr weights - } -#pragma omp parallel for num_threads(opts.nthreads) - for (BIGINT j = 0; j < nk; ++j) { // loop along output array - T x = 0.0; // register - for (int n = 0; n < q; ++n) - x += f[n] * 2 * cos(k[j] * (T)z[n]); // pos & neg freq pair. use T cos! - phihat[j] = x; - } -} - -template -static void deconvolveshuffle1d(int dir, T prefac, T *ker, BIGINT ms, T *fk, BIGINT nf1, - std::complex *fw, int modeord) -/* - if dir==1: copies fw to fk with amplification by prefac/ker - if dir==2: copies fk to fw (and zero pads rest of it), same amplification. 
- - modeord=0: use CMCL-compatible mode ordering in fk (from -N/2 up to N/2-1) - 1: use FFT-style (from 0 to N/2-1, then -N/2 up to -1). - - fk is a size-ms T complex array (2*ms Ts alternating re,im parts) - fw is a size-nf1 complex array (2*nf1 Ts alternating re,im parts) - ker is real-valued T array of length nf1/2+1. - - Single thread only, but shouldn't matter since mostly data movement. - - It has been tested that the repeated floating division in this inner loop - only contributes at the <3% level in 3D relative to the FFT cost (8 threads). - This could be removed by passing in an inverse kernel and doing mults. - - todo: rewrite w/ C++-complex I/O, check complex divide not slower than - real divide, or is there a way to force a real divide? - - Barnett 1/25/17. Fixed ms=0 case 3/14/17. modeord flag & clean 10/25/17 -*/ -{ - BIGINT kmin = -ms / 2, kmax = (ms - 1) / 2; // inclusive range of k indices - if (ms == 0) kmax = -1; // fixes zero-pad for trivial no-mode case - // set up pp & pn as ptrs to start of pos(ie nonneg) & neg chunks of fk array - BIGINT pp = -2 * kmin, pn = 0; // CMCL mode-ordering case (2* since cmplx) - if (modeord == 1) { - pp = 0; - pn = 2 * (kmax + 1); - } // or, instead, FFT ordering - if (dir == 1) { // read fw, write out to fk... - for (BIGINT k = 0; k <= kmax; ++k) { // non-neg freqs k - fk[pp++] = prefac * fw[k].real() / ker[k]; // re - fk[pp++] = prefac * fw[k].imag() / ker[k]; // im - } - for (BIGINT k = kmin; k < 0; ++k) { // neg freqs k - fk[pn++] = prefac * fw[nf1 + k].real() / ker[-k]; // re - fk[pn++] = prefac * fw[nf1 + k].imag() / ker[-k]; // im - } - } else { // read fk, write out to fw w/ zero padding... - for (BIGINT k = kmax + 1; k < nf1 + kmin; ++k) { // zero pad precisely where - // needed - fw[k] = 0.0; - } - for (BIGINT k = 0; k <= kmax; ++k) { // non-neg freqs k - fw[k].real(prefac * fk[pp++] / ker[k]); // re - fw[k].imag(prefac * fk[pp++] / ker[k]); // im - } - for (BIGINT k = kmin; k < 0; ++k) { // neg freqs k - fw[nf1 + k].real(prefac * fk[pn++] / ker[-k]); // re - fw[nf1 + k].imag(prefac * fk[pn++] / ker[-k]); // im - } - } -} - -template -static void deconvolveshuffle2d(int dir, T prefac, T *ker1, T *ker2, BIGINT ms, BIGINT mt, - T *fk, BIGINT nf1, BIGINT nf2, std::complex *fw, - int modeord) -/* - 2D version of deconvolveshuffle1d, calls it on each x-line using 1/ker2 fac. - - if dir==1: copies fw to fk with amplification by prefac/(ker1(k1)*ker2(k2)). - if dir==2: copies fk to fw (and zero pads rest of it), same amplification. - - modeord=0: use CMCL-compatible mode ordering in fk (each dim increasing) - 1: use FFT-style (pos then negative, on each dim) - - fk is a complex array stored as 2*ms*mt Ts alternating re,im parts, with - ms looped over fast and mt slow. - fw is a complex array stored as 2*nf1*nf2] Ts alternating re,im parts, with - nf1 looped over fast and nf2 slow. - ker1, ker2 are real-valued T arrays of lengths nf1/2+1, nf2/2+1 - respectively. - - Barnett 2/1/17, Fixed mt=0 case 3/14/17. 
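  (Worked modeord example, same convention in the 1d/2d/3d shufflers: for
  ms=6, modeord=0 (CMCL) stores k = -3,-2,-1,0,1,2, so the nonnegative block
  starts at interleaved offset pp = -2*kmin = 6 with pn = 0, while modeord=1
  (FFT-style) stores k = 0,1,2,-3,-2,-1, giving pp = 0 and pn = 2*(kmax+1) = 6.)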
modeord 10/25/17 -*/ -{ - BIGINT k2min = -mt / 2, k2max = (mt - 1) / 2; // inclusive range of k2 indices - if (mt == 0) k2max = -1; // fixes zero-pad for trivial no-mode case - // set up pp & pn as ptrs to start of pos(ie nonneg) & neg chunks of fk array - BIGINT pp = -2 * k2min * ms, pn = 0; // CMCL mode-ordering case (2* since cmplx) - if (modeord == 1) { - pp = 0; - pn = 2 * (k2max + 1) * ms; - } // or, instead, FFT ordering - if (dir == 2) // zero pad needed x-lines (contiguous in memory) - for (BIGINT j = nf1 * (k2max + 1); j < nf1 * (nf2 + k2min); ++j) // sweeps all - // dims - fw[j] = 0.0; - for (BIGINT k2 = 0; k2 <= k2max; ++k2, pp += 2 * ms) // non-neg y-freqs - // point fk and fw to the start of this y value's row (2* is for complex): - common::deconvolveshuffle1d(dir, prefac / ker2[k2], ker1, ms, fk + pp, nf1, - &fw[nf1 * k2], modeord); - for (BIGINT k2 = k2min; k2 < 0; ++k2, pn += 2 * ms) // neg y-freqs - common::deconvolveshuffle1d(dir, prefac / ker2[-k2], ker1, ms, fk + pn, nf1, - &fw[nf1 * (nf2 + k2)], modeord); -} - -template -static void deconvolveshuffle3d(int dir, T prefac, T *ker1, T *ker2, T *ker3, BIGINT ms, - BIGINT mt, BIGINT mu, T *fk, BIGINT nf1, BIGINT nf2, - BIGINT nf3, std::complex *fw, int modeord) -/* - 3D version of deconvolveshuffle2d, calls it on each xy-plane using 1/ker3 fac. - - if dir==1: copies fw to fk with ampl by prefac/(ker1(k1)*ker2(k2)*ker3(k3)). - if dir==2: copies fk to fw (and zero pads rest of it), same amplification. - - modeord=0: use CMCL-compatible mode ordering in fk (each dim increasing) - 1: use FFT-style (pos then negative, on each dim) - - fk is a complex array stored as 2*ms*mt*mu Ts alternating re,im parts, with - ms looped over fastest and mu slowest. - fw is a complex array stored as 2*nf1*nf2*nf3 Ts alternating re,im parts, with - nf1 looped over fastest and nf3 slowest. - ker1, ker2, ker3 are real-valued T arrays of lengths nf1/2+1, nf2/2+1, - and nf3/2+1 respectively. - - Barnett 2/1/17, Fixed mu=0 case 3/14/17. 
modeord 10/25/17 -*/ -{ - BIGINT k3min = -mu / 2, k3max = (mu - 1) / 2; // inclusive range of k3 indices - if (mu == 0) k3max = -1; // fixes zero-pad for trivial no-mode case - // set up pp & pn as ptrs to start of pos(ie nonneg) & neg chunks of fk array - BIGINT pp = -2 * k3min * ms * mt, pn = 0; // CMCL mode-ordering (2* since cmplx) - if (modeord == 1) { - pp = 0; - pn = 2 * (k3max + 1) * ms * mt; - } // or FFT ordering - BIGINT np = nf1 * nf2; // # pts in an upsampled Fourier xy-plane - if (dir == 2) // zero pad needed xy-planes (contiguous in memory) - for (BIGINT j = np * (k3max + 1); j < np * (nf3 + k3min); ++j) // sweeps all dims - fw[j] = 0.0; - for (BIGINT k3 = 0; k3 <= k3max; ++k3, pp += 2 * ms * mt) // non-neg z-freqs - // point fk and fw to the start of this z value's plane (2* is for complex): - common::deconvolveshuffle2d(dir, prefac / ker3[k3], ker1, ker2, ms, mt, fk + pp, nf1, - nf2, &fw[np * k3], modeord); - for (BIGINT k3 = k3min; k3 < 0; ++k3, pn += 2 * ms * mt) // neg z-freqs - common::deconvolveshuffle2d(dir, prefac / ker3[-k3], ker1, ker2, ms, mt, fk + pn, nf1, - nf2, &fw[np * (nf3 + k3)], modeord); -} - -// --------- batch helper functions for t1,2 exec: --------------------------- - -template -static int spreadinterpSortedBatch(int batchSize, FINUFFT_PLAN p, std::complex *cBatch) -/* - Spreads (or interpolates) a batch of batchSize strength vectors in cBatch - to (or from) the batch of fine working grids p->fwBatch, using the same set of - (index-sorted) NU points p->X,Y,Z for each vector in the batch. - The direction (spread vs interpolate) is set by p->spopts.spread_direction. - Returns 0 (no error reporting for now). - Notes: - 1) cBatch is already assumed to have the correct offset, ie here we - read from the start of cBatch (unlike Malleo). fwBatch also has zero offset - 2) this routine is a batched version of spreadinterpSorted in spreadinterp.cpp - Barnett 5/19/20, based on Malleo 2019. -*/ -{ - // opts.spread_thread: 1 sequential multithread, 2 parallel single-thread. - // omp_sets_nested deprecated, so don't use; assume not nested for 2 to work. - // But when nthr_outer=1 here, omp par inside the loop sees all threads... -#ifdef _OPENMP - int nthr_outer = p->opts.spread_thread == 1 ? 1 : batchSize; -#endif -#pragma omp parallel for num_threads(nthr_outer) - for (int i = 0; i < batchSize; i++) { - std::complex *fwi = p->fwBatch + i * p->nf; // start of i'th fw array in wkspace - std::complex *ci = cBatch + i * p->nj; // start of i'th c array in cBatch - spreadinterpSorted(p->sortIndices, p->nf1, p->nf2, p->nf3, (T *)fwi, p->nj, p->X, - p->Y, p->Z, (T *)ci, p->spopts, p->didSort); - } - return 0; -} - -template -static int deconvolveBatch(int batchSize, FINUFFT_PLAN p, std::complex *fkBatch) -/* - Type 1: deconvolves (amplifies) from each interior fw array in p->fwBatch - into each output array fk in fkBatch. - Type 2: deconvolves from user-supplied input fk to 0-padded interior fw, - again looping over fk in fkBatch and fw in p->fwBatch. - The direction (spread vs interpolate) is set by p->spopts.spread_direction. - This is mostly a loop calling deconvolveshuffle?d for the needed dim batchSize - times. - Barnett 5/21/20, simplified from Malleo 2019 (eg t3 logic won't be in here) -*/ -{ - // since deconvolveshuffle?d are single-thread, omp par seems to help here... 
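  // (Layout note: vector i of the batch lives at offset i*nf in fwBatch and
  //  offset i*N in fkBatch, so each loop iteration below touches disjoint
  //  memory and the parallel-for needs no synchronization.)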
-#pragma omp parallel for num_threads(batchSize) - for (int i = 0; i < batchSize; i++) { - std::complex *fwi = p->fwBatch + i * p->nf; // start of i'th fw array in wkspace - std::complex *fki = fkBatch + i * p->N; // start of i'th fk array in fkBatch - - // Call routine from common.cpp for the dim; prefactors hardcoded to 1.0... - if (p->dim == 1) - deconvolveshuffle1d(p->spopts.spread_direction, T(1), p->phiHat1, p->ms, (T *)fki, - p->nf1, fwi, p->opts.modeord); - else if (p->dim == 2) - deconvolveshuffle2d(p->spopts.spread_direction, T(1), p->phiHat1, p->phiHat2, p->ms, - p->mt, (T *)fki, p->nf1, p->nf2, fwi, p->opts.modeord); - else - deconvolveshuffle3d(p->spopts.spread_direction, T(1), p->phiHat1, p->phiHat2, - p->phiHat3, p->ms, p->mt, p->mu, (T *)fki, p->nf1, p->nf2, - p->nf3, fwi, p->opts.modeord); - } - return 0; -} - -} // namespace common -} // namespace finufft - -// --------------- rest is the 5 user guru (plan) interface drivers: --------- -// (not namespaced since have safe names finufft{f}_* ) -using namespace finufft::common; // accesses routines defined above - -// Marco Barbone: 5.8.2024 -// These are user-facing. -// The various options could be macros to follow c standard library conventions. -// Question: would these be enums? - -// OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO -void FINUFFT_DEFAULT_OPTS(finufft_opts *o) -// Sets default nufft opts (referenced by all language interfaces too). -// See finufft_opts.h for meanings. -// This was created to avoid uncertainty about C++11 style static initialization -// when called from MEX, but now is generally used. Barnett 10/30/17 onwards. -// Sphinx sucks the below code block into the web docs, hence keep it clean... -{ - // sphinx tag (don't remove): @defopts_start - o->modeord = 0; - o->chkbnds = 1; - - o->debug = 0; - o->spread_debug = 0; - o->showwarn = 1; - - o->nthreads = 0; -#ifdef FINUFFT_USE_DUCC0 - o->fftw = 0; -#else - o->fftw = FFTW_ESTIMATE; -#endif - o->spread_sort = 2; - o->spread_kerevalmeth = 1; - o->spread_kerpad = 1; - o->upsampfac = 0.0; - o->spread_thread = 0; - o->maxbatchsize = 0; - o->spread_nthr_atomic = -1; - o->spread_max_sp_size = 0; - o->fftw_lock_fun = nullptr; - o->fftw_unlock_fun = nullptr; - o->fftw_lock_data = nullptr; - // sphinx tag (don't remove): @defopts_end -} - -// PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP int FINUFFT_MAKEPLAN(int type, int dim, const BIGINT *n_modes, int iflag, int ntrans, - FLT tol, FINUFFT_PLAN *pp, finufft_opts *opts) -// Populates the fields of finufft_plan which is pointed to by "pp". -// opts is ptr to a finufft_opts to set options, or NULL to use defaults. -// For some of the fields (if "auto" selected) here choose the actual setting. 
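// (Worked example of the batch-size logic chosen further below, which uses
//  the integer ceiling trick ceil(b/a) = 1 + (b-1)/a: for ntrans=7 and nthr=4,
//  nbatch = 1 + (7-1)/4 = 2 and batchSize = 1 + (7-1)/2 = 4, so the transforms
//  are done as one batch of 4 followed by one of 3.)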
-// For types 1,2 allocates memory for internal working arrays, -// evaluates spreading kernel coefficients, and instantiates the fftw_plan -{ - FINUFFT_PLAN p; - p = new FINUFFT_PLAN_S; // allocate fresh plan struct - *pp = p; // pass out plan as ptr to plan struct - - if (opts == NULL) // use default opts - FINUFFT_DEFAULT_OPTS(&(p->opts)); - else // or read from what's passed in - p->opts = *opts; // keep a deep copy; changing *opts now has no effect - - if (p->opts.debug) // do a hello world - printf("[%s] new plan: FINUFFT version " FINUFFT_VER " .................\n", - __func__); - - p->fftPlan = std::make_unique>( - p->opts.fftw_lock_fun, p->opts.fftw_unlock_fun, p->opts.fftw_lock_data); - - if ((type != 1) && (type != 2) && (type != 3)) { - fprintf(stderr, "[%s] Invalid type (%d), should be 1, 2 or 3.\n", __func__, type); - return FINUFFT_ERR_TYPE_NOTVALID; - } - if ((dim != 1) && (dim != 2) && (dim != 3)) { - fprintf(stderr, "[%s] Invalid dim (%d), should be 1, 2 or 3.\n", __func__, dim); - return FINUFFT_ERR_DIM_NOTVALID; - } - if (ntrans < 1) { - fprintf(stderr, "[%s] ntrans (%d) should be at least 1.\n", __func__, ntrans); - return FINUFFT_ERR_NTRANS_NOTVALID; - } - if (!p->opts.fftw_lock_fun != !p->opts.fftw_unlock_fun) { - fprintf(stderr, "[%s] fftw_(un)lock functions should be both null or both set\n", - __func__); - return FINUFFT_ERR_LOCK_FUNS_INVALID; - ; - } - - // get stuff from args... - p->type = type; - p->dim = dim; - p->ntrans = ntrans; - p->tol = tol; - p->fftSign = (iflag >= 0) ? 1 : -1; // clean up flag input - - // choose overall # threads... -#ifdef _OPENMP - int ompmaxnthr = MY_OMP_GET_MAX_THREADS(); - int nthr = ompmaxnthr; // default: use as many as OMP gives us - // (the above could be set, or suggested set, to 1 for small enough problems...) - if (p->opts.nthreads > 0) { - nthr = p->opts.nthreads; // user override, now without limit - if (p->opts.showwarn && (nthr > ompmaxnthr)) - fprintf(stderr, - "%s warning: using opts.nthreads=%d, more than the %d OpenMP claims " - "available; note large nthreads can be slower.\n", - __func__, nthr, ompmaxnthr); - } -#else - int nthr = 1; // always 1 thread (avoid segfault) - if (p->opts.nthreads > 1) - fprintf(stderr, - "%s warning: opts.nthreads=%d but library is single-threaded; ignoring!\n", - __func__, p->opts.nthreads); -#endif - p->opts.nthreads = nthr; // store actual # thr planned for - // (this sets/limits all downstream spread/interp, 1dkernel, and FFT thread counts...) - - // choose batchSize for types 1,2 or 3... (uses int ceil(b/a)=1+(b-1)/a trick) - if (p->opts.maxbatchsize == 0) { // logic to auto-set best batchsize - p->nbatch = 1 + (ntrans - 1) / nthr; // min # batches poss - p->batchSize = 1 + (ntrans - 1) / p->nbatch; // then cut # thr in each b - } else { // batchSize override by user - p->batchSize = min(p->opts.maxbatchsize, ntrans); - p->nbatch = 1 + (ntrans - 1) / p->batchSize; // resulting # batches - } - if (p->opts.spread_thread == 0) p->opts.spread_thread = 2; // our auto choice - if (p->opts.spread_thread != 1 && p->opts.spread_thread != 2) { - fprintf(stderr, "[%s] illegal opts.spread_thread!\n", __func__); - return FINUFFT_ERR_SPREAD_THREAD_NOTVALID; - } - - if (type != 3) { // read in user Fourier mode array sizes... - p->ms = n_modes[0]; - p->mt = (dim > 1) ? n_modes[1] : 1; // leave as 1 for unused dims - p->mu = (dim > 2) ? n_modes[2] : 1; - p->N = p->ms * p->mt * p->mu; // N = total # modes - } - - // heuristic to choose default upsampfac... 
(currently two poss) - if (p->opts.upsampfac == 0.0) { // indicates auto-choose - p->opts.upsampfac = 2.0; // default, and need for tol small - if (tol >= (FLT)1E-9) { // the tol sigma=5/4 can reach - if (type == 3) // could move to setpts, more known? - p->opts.upsampfac = 1.25; // faster b/c smaller RAM & FFT - else if ((dim == 1 && p->N > 10000000) || (dim == 2 && p->N > 300000) || - (dim == 3 && p->N > 3000000)) // type 1,2 heuristic cutoffs, double, - // typ tol, 12-core xeon - p->opts.upsampfac = 1.25; - } - if (p->opts.debug > 1) - printf("[%s] set auto upsampfac=%.2f\n", __func__, p->opts.upsampfac); - } - // use opts to choose and write into plan's spread options... - int ier = setup_spreader_for_nufft(p->spopts, tol, p->opts, dim); - if (ier > 1) // proceed if success or warning - return ier; - - // set others as defaults (or unallocated for arrays)... - p->X = NULL; - p->Y = NULL; - p->Z = NULL; - p->phiHat1 = NULL; - p->phiHat2 = NULL; - p->phiHat3 = NULL; - p->nf1 = 1; - p->nf2 = 1; - p->nf3 = 1; // crucial to leave as 1 for unused dims - p->sortIndices = NULL; // used in all three types - - // ------------------------ types 1,2: planning needed --------------------- - if (type == 1 || type == 2) { - - int nthr_fft = nthr; // give FFTW all threads (or use o.spread_thread?) - // Note: batchSize not used since might be only 1. - - p->spopts.spread_direction = type; - - if (p->opts.showwarn) { // user warn round-off error... - if (EPSILON * p->ms > 1.0) - fprintf(stderr, "%s warning: rounding err predicted eps_mach*N1 = %.3g > 1 !\n", - __func__, (double)(EPSILON * p->ms)); - if (EPSILON * p->mt > 1.0) - fprintf(stderr, "%s warning: rounding err predicted eps_mach*N2 = %.3g > 1 !\n", - __func__, (double)(EPSILON * p->mt)); - if (EPSILON * p->mu > 1.0) - fprintf(stderr, "%s warning: rounding err predicted eps_mach*N3 = %.3g > 1 !\n", - __func__, (double)(EPSILON * p->mu)); - } - - // determine fine grid sizes, sanity check.. - int nfier = set_nf_type12(p->ms, p->opts, p->spopts, &(p->nf1)); - if (nfier) return nfier; // nf too big; we're done - p->phiHat1 = (FLT *)malloc(sizeof(FLT) * (p->nf1 / 2 + 1)); - if (dim > 1) { - nfier = set_nf_type12(p->mt, p->opts, p->spopts, &(p->nf2)); - if (nfier) return nfier; - p->phiHat2 = (FLT *)malloc(sizeof(FLT) * (p->nf2 / 2 + 1)); - } - if (dim > 2) { - nfier = set_nf_type12(p->mu, p->opts, p->spopts, &(p->nf3)); - if (nfier) return nfier; - p->phiHat3 = (FLT *)malloc(sizeof(FLT) * (p->nf3 / 2 + 1)); - } - - if (p->opts.debug) { // "long long" here is to avoid warnings with printf... 
- printf("[%s] %dd%d: (ms,mt,mu)=(%lld,%lld,%lld) " - "(nf1,nf2,nf3)=(%lld,%lld,%lld)\n ntrans=%d nthr=%d " - "batchSize=%d ", - __func__, dim, type, (long long)p->ms, (long long)p->mt, (long long)p->mu, - (long long)p->nf1, (long long)p->nf2, (long long)p->nf3, ntrans, nthr, - p->batchSize); - if (p->batchSize == 1) // spread_thread has no effect in this case - printf("\n"); - else - printf(" spread_thread=%d\n", p->opts.spread_thread); - } - - // STEP 0: get Fourier coeffs of spreading kernel along each fine grid dim - CNTime timer; - timer.start(); - onedim_fseries_kernel(p->nf1, p->phiHat1, p->spopts); - if (dim > 1) onedim_fseries_kernel(p->nf2, p->phiHat2, p->spopts); - if (dim > 2) onedim_fseries_kernel(p->nf3, p->phiHat3, p->spopts); - if (p->opts.debug) - printf("[%s] kernel fser (ns=%d):\t\t%.3g s\n", __func__, p->spopts.nspread, - timer.elapsedsec()); - - p->nf = p->nf1 * p->nf2 * p->nf3; // fine grid total number of points - if (p->nf * p->batchSize > MAX_NF) { - fprintf(stderr, - "[%s] fwBatch would be bigger than MAX_NF, not attempting malloc!\n", - __func__); - // FIXME: this error causes memory leaks. We should free phiHat1, phiHat2, phiHat3 - return FINUFFT_ERR_MAXNALLOC; - } - - timer.restart(); - p->fwBatch = p->fftPlan->alloc_complex(p->nf * p->batchSize); // the big workspace - if (p->opts.debug) - printf("[%s] fwBatch %.2fGB alloc: \t%.3g s\n", __func__, - (double)1E-09 * sizeof(CPX) * p->nf * p->batchSize, timer.elapsedsec()); - if (!p->fwBatch) { // we don't catch all such mallocs, just this big one - fprintf(stderr, "[%s] FFTW malloc failed for fwBatch (working fine grids)!\n", - __func__); - free(p->phiHat1); - free(p->phiHat2); - free(p->phiHat3); - return FINUFFT_ERR_ALLOC; - } - - timer.restart(); // plan the FFTW - const auto ns = gridsize_for_fft(p); - p->fftPlan->plan(ns, p->batchSize, p->fwBatch, p->fftSign, p->opts.fftw, nthr_fft); - if (p->opts.debug) - printf("[%s] FFT plan (mode %d, nthr=%d):\t%.3g s\n", __func__, p->opts.fftw, - nthr_fft, timer.elapsedsec()); - - } else { // -------------------------- type 3 (no planning) ------------ - - if (p->opts.debug) printf("[%s] %dd%d: ntrans=%d\n", __func__, dim, type, ntrans); - // in case destroy occurs before setpts, need safe dummy ptrs/plans... - p->CpBatch = NULL; - p->fwBatch = NULL; - p->Sp = NULL; - p->Tp = NULL; - p->Up = NULL; - p->prephase = NULL; - p->deconv = NULL; - p->innerT2plan = NULL; - // Type 3 will call finufft_makeplan for type 2; no need to init FFTW - // Note we don't even know nj or nk yet, so can't do anything else! - } - return ier; // report setup_spreader status (could be warning) + FLT tol, FINUFFT_PLAN *pp, finufft_opts *opts) { + return finufft_makeplan_t(type, dim, n_modes, iflag, ntrans, tol, + reinterpret_cast **>(pp), opts); } -// SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, BIGINT nk, - FLT *s, FLT *t, FLT *u) -/* For type 1,2: just checks and (possibly) sorts the NU xyz points, in prep for - spreading. (The last 4 arguments are ignored.) - For type 3: allocates internal working arrays, scales/centers the NU points - and NU target freqs (stu), evaluates spreading kernel FT at all target freqs. 
-*/ -{ - int d = p->dim; // abbrev for spatial dim - CNTime timer; - timer.start(); - p->nj = nj; // the user only now chooses how many NU (x,y,z) pts - if (nj < 0) { - fprintf(stderr, "[%s] nj (%lld) cannot be negative!\n", __func__, (long long)nj); - return FINUFFT_ERR_NUM_NU_PTS_INVALID; - } else if (nj > MAX_NU_PTS) { - fprintf(stderr, "[%s] nj (%lld) exceeds MAX_NU_PTS\n", __func__, (long long)nj); - return FINUFFT_ERR_NUM_NU_PTS_INVALID; - } - - if (p->type != 3) { // ------------------ TYPE 1,2 SETPTS ------------------- - // (all we can do is check and maybe bin-sort the NU pts) - p->X = xj; // plan must keep pointers to user's fixed NU pts - p->Y = yj; - p->Z = zj; - int ier = spreadcheck(p->nf1, p->nf2, p->nf3, p->nj, xj, yj, zj, p->spopts); - if (p->opts.debug > 1) - printf("[%s] spreadcheck (%d):\t%.3g s\n", __func__, p->spopts.chkbnds, - timer.elapsedsec()); - if (ier) // no warnings allowed here - return ier; - timer.restart(); - // Free sortIndices if it has been allocated before in case of repeated setpts - // calls causing memory leak. We don't know it is the same size as before, so we - // have to malloc each time. - if (p->sortIndices) free(p->sortIndices); - p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT) * p->nj); - if (!p->sortIndices) { - fprintf(stderr, "[%s] failed to allocate sortIndices!\n", __func__); - return FINUFFT_ERR_SPREAD_ALLOC; - } - p->didSort = - indexSort(p->sortIndices, p->nf1, p->nf2, p->nf3, p->nj, xj, yj, zj, p->spopts); - if (p->opts.debug) - printf("[%s] sort (didSort=%d):\t\t%.3g s\n", __func__, p->didSort, - timer.elapsedsec()); - - } else { // ------------------------- TYPE 3 SETPTS ----------------------- - // (here we can precompute pre/post-phase factors and plan the t2) - - if (nk < 0) { - fprintf(stderr, "[%s] nk (%lld) cannot be negative!\n", __func__, (long long)nk); - return FINUFFT_ERR_NUM_NU_PTS_INVALID; - } else if (nk > MAX_NU_PTS) { - fprintf(stderr, "[%s] nk (%lld) exceeds MAX_NU_PTS\n", __func__, (long long)nk); - return FINUFFT_ERR_NUM_NU_PTS_INVALID; - } - p->nk = nk; // user set # targ freq pts - p->S = s; // keep pointers to user's input target pts - p->T = t; - p->U = u; - - // pick x, s intervals & shifts & # fine grid pts (nf) in each dim... - FLT S1, S2, S3; // get half-width X, center C, which contains {x_j}... - arraywidcen(nj, xj, &(p->t3P.X1), &(p->t3P.C1)); - arraywidcen(nk, s, &S1, &(p->t3P.D1)); // same D, S, but for {s_k} - set_nhg_type3(S1, p->t3P.X1, p->opts, p->spopts, &(p->nf1), &(p->t3P.h1), - &(p->t3P.gam1)); // applies twist i) - p->t3P.C2 = 0.0; // their defaults if dim 2 unused, etc - p->t3P.D2 = 0.0; - if (d > 1) { - arraywidcen(nj, yj, &(p->t3P.X2), &(p->t3P.C2)); // {y_j} - arraywidcen(nk, t, &S2, &(p->t3P.D2)); // {t_k} - set_nhg_type3(S2, p->t3P.X2, p->opts, p->spopts, &(p->nf2), &(p->t3P.h2), - &(p->t3P.gam2)); - } - p->t3P.C3 = 0.0; - p->t3P.D3 = 0.0; - if (d > 2) { - arraywidcen(nj, zj, &(p->t3P.X3), &(p->t3P.C3)); // {z_j} - arraywidcen(nk, u, &S3, &(p->t3P.D3)); // {u_k} - set_nhg_type3(S3, p->t3P.X3, p->opts, p->spopts, &(p->nf3), &(p->t3P.h3), - &(p->t3P.gam3)); - } - - if (p->opts.debug) { // report on choices of shifts, centers, etc... 
- printf("\tM=%lld N=%lld\n", (long long)nj, (long long)nk); - printf("\tX1=%.3g C1=%.3g S1=%.3g D1=%.3g gam1=%g nf1=%lld h1=%.3g\t\n", p->t3P.X1, - p->t3P.C1, S1, p->t3P.D1, p->t3P.gam1, (long long)p->nf1, p->t3P.h1); - if (d > 1) - printf("\tX2=%.3g C2=%.3g S2=%.3g D2=%.3g gam2=%g nf2=%lld h2=%.3g\n", p->t3P.X2, - p->t3P.C2, S2, p->t3P.D2, p->t3P.gam2, (long long)p->nf2, p->t3P.h2); - if (d > 2) - printf("\tX3=%.3g C3=%.3g S3=%.3g D3=%.3g gam3=%g nf3=%lld h3=%.3g\n", p->t3P.X3, - p->t3P.C3, S3, p->t3P.D3, p->t3P.gam3, (long long)p->nf3, p->t3P.h3); - } - p->nf = p->nf1 * p->nf2 * p->nf3; // fine grid total number of points - if (p->nf * p->batchSize > MAX_NF) { - fprintf(stderr, - "[%s t3] fwBatch would be bigger than MAX_NF, not attempting malloc!\n", - __func__); - return FINUFFT_ERR_MAXNALLOC; - } - p->fftPlan->free(p->fwBatch); - p->fwBatch = p->fftPlan->alloc_complex(p->nf * p->batchSize); // maybe big workspace - - // (note FFTW_ALLOC is not needed over malloc, but matches its type) - if (p->CpBatch) free(p->CpBatch); - p->CpBatch = (CPX *)malloc(sizeof(CPX) * nj * p->batchSize); // batch c' work - - if (p->opts.debug) - printf("[%s t3] widcen, batch %.2fGB alloc:\t%.3g s\n", __func__, - (double)1E-09 * sizeof(CPX) * (p->nf + nj) * p->batchSize, - timer.elapsedsec()); - if (!p->fwBatch || !p->CpBatch) { - fprintf(stderr, "[%s t3] malloc fail for fwBatch or CpBatch!\n", __func__); - return FINUFFT_ERR_ALLOC; - } - // printf("fwbatch, cpbatch ptrs: %llx %llx\n",p->fwBatch,p->CpBatch); - - // alloc rescaled NU src pts x'_j (in X etc), rescaled NU targ pts s'_k ... - // FIXME: should use realloc - if (p->X) free(p->X); - if (p->Sp) free(p->Sp); - p->X = (FLT *)malloc(sizeof(FLT) * nj); - p->Sp = (FLT *)malloc(sizeof(FLT) * nk); - if (d > 1) { - if (p->Y) free(p->Y); - if (p->Tp) free(p->Tp); - p->Y = (FLT *)malloc(sizeof(FLT) * nj); - p->Tp = (FLT *)malloc(sizeof(FLT) * nk); - } - if (d > 2) { - if (p->Z) free(p->Z); - if (p->Up) free(p->Up); - p->Z = (FLT *)malloc(sizeof(FLT) * nj); - p->Up = (FLT *)malloc(sizeof(FLT) * nk); - } - - // always shift as use gam to rescale x_j to x'_j, etc (twist iii)... - FLT ig1 = 1.0 / p->t3P.gam1, ig2 = 0.0, ig3 = 0.0; // "reciprocal-math" optim - if (d > 1) ig2 = 1.0 / p->t3P.gam2; - if (d > 2) ig3 = 1.0 / p->t3P.gam3; -#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) - for (BIGINT j = 0; j < nj; ++j) { - p->X[j] = (xj[j] - p->t3P.C1) * ig1; // rescale x_j - if (d > 1) // (ok to do inside loop because of branch predict) - p->Y[j] = (yj[j] - p->t3P.C2) * ig2; // rescale y_j - if (d > 2) p->Z[j] = (zj[j] - p->t3P.C3) * ig3; // rescale z_j - } - - // set up prephase array... - CPX imasign = (p->fftSign >= 0) ? IMA : -IMA; // +-i - if (p->prephase) free(p->prephase); - p->prephase = (CPX *)malloc(sizeof(CPX) * nj); - if (p->t3P.D1 != 0.0 || p->t3P.D2 != 0.0 || p->t3P.D3 != 0.0) { -#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) - for (BIGINT j = 0; j < nj; ++j) { // ... loop over src NU locs - FLT phase = p->t3P.D1 * xj[j]; - if (d > 1) phase += p->t3P.D2 * yj[j]; - if (d > 2) phase += p->t3P.D3 * zj[j]; - p->prephase[j] = cos(phase) + imasign * sin(phase); // Euler - // e^{+-i.phase} - } - } else - for (BIGINT j = 0; j < nj; ++j) - p->prephase[j] = (CPX)1.0; // *** or keep flag so no mult in exec?? - - // rescale the target s_k etc to s'_k etc... 
-#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) - for (BIGINT k = 0; k < nk; ++k) { - p->Sp[k] = p->t3P.h1 * p->t3P.gam1 * (s[k] - p->t3P.D1); // so |s'_k| < pi/R - if (d > 1) - p->Tp[k] = p->t3P.h2 * p->t3P.gam2 * (t[k] - p->t3P.D2); // so |t'_k| < - // pi/R - if (d > 2) - p->Up[k] = p->t3P.h3 * p->t3P.gam3 * (u[k] - p->t3P.D3); // so |u'_k| < - // pi/R - } - // (old STEP 3a) Compute deconvolution post-factors array (per targ pt)... - // (exploits that FT separates because kernel is prod of 1D funcs) - if (p->deconv) free(p->deconv); - p->deconv = (CPX *)malloc(sizeof(CPX) * nk); - FLT *phiHatk1 = (FLT *)malloc(sizeof(FLT) * nk); // don't confuse w/ p->phiHat - onedim_nuft_kernel(nk, p->Sp, phiHatk1, p->spopts); // fill phiHat1 - FLT *phiHatk2 = NULL, *phiHatk3 = NULL; - if (d > 1) { - phiHatk2 = (FLT *)malloc(sizeof(FLT) * nk); - onedim_nuft_kernel(nk, p->Tp, phiHatk2, p->spopts); // fill phiHat2 - } - if (d > 2) { - phiHatk3 = (FLT *)malloc(sizeof(FLT) * nk); - onedim_nuft_kernel(nk, p->Up, phiHatk3, p->spopts); // fill phiHat3 - } - int Cfinite = - isfinite(p->t3P.C1) && isfinite(p->t3P.C2) && isfinite(p->t3P.C3); // C can be nan - // or inf if - // M=0, no - // input NU pts - int Cnonzero = p->t3P.C1 != 0.0 || p->t3P.C2 != 0.0 || p->t3P.C3 != 0.0; // cen -#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) - for (BIGINT k = 0; k < nk; ++k) { // .... loop over NU targ freqs - FLT phiHat = phiHatk1[k]; - if (d > 1) phiHat *= phiHatk2[k]; - if (d > 2) phiHat *= phiHatk3[k]; - p->deconv[k] = (CPX)(1.0 / phiHat); - if (Cfinite && Cnonzero) { - FLT phase = (s[k] - p->t3P.D1) * p->t3P.C1; - if (d > 1) phase += (t[k] - p->t3P.D2) * p->t3P.C2; - if (d > 2) phase += (u[k] - p->t3P.D3) * p->t3P.C3; - p->deconv[k] *= cos(phase) + imasign * sin(phase); // Euler e^{+-i.phase} - } - } - free(phiHatk1); - free(phiHatk2); - free(phiHatk3); // done w/ deconv fill - if (p->opts.debug) - printf("[%s t3] phase & deconv factors:\t%.3g s\n", __func__, timer.elapsedsec()); - - // Set up sort for spreading Cp (from primed NU src pts X, Y, Z) to fw... - timer.restart(); - // Free sortIndices if it has been allocated before in case of repeated setpts - // calls causing memory leak. We don't know it is the same size as before, so we - // have to malloc each time. - if (p->sortIndices) free(p->sortIndices); - p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT) * p->nj); - if (!p->sortIndices) { - fprintf(stderr, "[%s t3] failed to allocate sortIndices!\n", __func__); - return FINUFFT_ERR_SPREAD_ALLOC; - } - p->didSort = indexSort(p->sortIndices, p->nf1, p->nf2, p->nf3, p->nj, p->X, p->Y, - p->Z, p->spopts); - if (p->opts.debug) - printf("[%s t3] sort (didSort=%d):\t\t%.3g s\n", __func__, p->didSort, - timer.elapsedsec()); - - // Plan and setpts once, for the (repeated) inner type 2 finufft call... - timer.restart(); - BIGINT t2nmodes[] = {p->nf1, p->nf2, p->nf3}; // t2 input is actually fw - finufft_opts t2opts = p->opts; // deep copy, since not ptrs - t2opts.modeord = 0; // needed for correct t3! - t2opts.debug = max(0, p->opts.debug - 1); // don't print as much detail - t2opts.spread_debug = max(0, p->opts.spread_debug - 1); - t2opts.showwarn = 0; // so don't see warnings 2x - // (...could vary other t2opts here?) 
- if (p->innerT2plan) FINUFFT_DESTROY(p->innerT2plan); - int ier = FINUFFT_MAKEPLAN(2, d, t2nmodes, p->fftSign, p->batchSize, p->tol, - &p->innerT2plan, &t2opts); - if (ier > 1) { // if merely warning, still proceed - fprintf(stderr, "[%s t3]: inner type 2 plan creation failed with ier=%d!\n", - __func__, ier); - return ier; - } - ier = FINUFFT_SETPTS(p->innerT2plan, nk, p->Sp, p->Tp, p->Up, 0, NULL, NULL, - NULL); // note nk = # output points (not nj) - if (ier > 1) { - fprintf(stderr, "[%s t3]: inner type 2 setpts failed, ier=%d!\n", __func__, ier); - return ier; - } - if (p->opts.debug) - printf("[%s t3] inner t2 plan & setpts: \t%.3g s\n", __func__, timer.elapsedsec()); - } - return 0; + FLT *s, FLT *t, FLT *u) { + return finufft_setpts_t(reinterpret_cast *>(p), nj, xj, yj, zj, + nk, s, t, u); } -// ............ end setpts .................................................. -// EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX *cj, CPX *fk) { - /* See ../docs/cguru.doc for current documentation. - - For given (stack of) weights cj or coefficients fk, performs NUFFTs with - existing (sorted) NU pts and existing plan. - For type 1 and 3: cj is input, fk is output. - For type 2: fk is input, cj is output. - Performs spread/interp, pre/post deconvolve, and FFT as appropriate - for each of the 3 types. - For cases of ntrans>1, performs work in blocks of size up to batchSize. - Return value 0 (no error diagnosis yet). - Barnett 5/20/20, based on Malleo 2019. -*/ - CNTime timer; - timer.start(); - - if (p->type != 3) { // --------------------- TYPE 1,2 EXEC ------------------ - - double t_sprint = 0.0, t_fft = 0.0, t_deconv = 0.0; // accumulated timing - if (p->opts.debug) - printf("[%s] start ntrans=%d (%d batches, bsize=%d)...\n", __func__, p->ntrans, - p->nbatch, p->batchSize); - - for (int b = 0; b * p->batchSize < p->ntrans; b++) { // .....loop b over batches - - // current batch is either batchSize, or possibly truncated if last one - int thisBatchSize = min(p->ntrans - b * p->batchSize, p->batchSize); - int bB = b * p->batchSize; // index of vector, since batchsizes same - CPX *cjb = cj + bB * p->nj; // point to batch of weights - CPX *fkb = fk + bB * p->N; // point to batch of mode coeffs - if (p->opts.debug > 1) - printf("[%s] start batch %d (size %d):\n", __func__, b, thisBatchSize); - - // STEP 1: (varies by type) - timer.restart(); - if (p->type == 1) { // type 1: spread NU pts p->X, weights cj, to fw grid - spreadinterpSortedBatch(thisBatchSize, p, cjb); - t_sprint += timer.elapsedsec(); - } else { // type 2: amplify Fourier coeffs fk into 0-padded fw - deconvolveBatch(thisBatchSize, p, fkb); - t_deconv += timer.elapsedsec(); - } - - // STEP 2: call the FFT on this batch - timer.restart(); - do_fft(p); - t_fft += timer.elapsedsec(); - if (p->opts.debug > 1) printf("\tFFT exec:\t\t%.3g s\n", timer.elapsedsec()); - - // STEP 3: (varies by type) - timer.restart(); - if (p->type == 1) { // type 1: deconvolve (amplify) fw and shuffle to fk - deconvolveBatch(thisBatchSize, p, fkb); - t_deconv += timer.elapsedsec(); - } else { // type 2: interpolate unif fw grid to NU target pts - spreadinterpSortedBatch(thisBatchSize, p, cjb); - t_sprint += timer.elapsedsec(); - } - } // ........end b loop - - if (p->opts.debug) { // report total times in their natural order... - if (p->type == 1) { - printf("[%s] done. 
tot spread:\t\t%.3g s\n", __func__, t_sprint); - printf(" tot FFT:\t\t\t\t%.3g s\n", t_fft); - printf(" tot deconvolve:\t\t\t%.3g s\n", t_deconv); - } else { - printf("[%s] done. tot deconvolve:\t\t%.3g s\n", __func__, t_deconv); - printf(" tot FFT:\t\t\t\t%.3g s\n", t_fft); - printf(" tot interp:\t\t\t%.3g s\n", t_sprint); - } - } - } - - else { // ----------------------------- TYPE 3 EXEC --------------------- - - // for (BIGINT j=0;j<10;++j) printf("\tcj[%ld]=%.15g+%.15gi\n",(long - // int)j,(double)real(cj[j]),(double)imag(cj[j])); // debug - - double t_pre = 0.0, t_spr = 0.0, t_t2 = 0.0, - t_deconv = 0.0; // accumulated timings - if (p->opts.debug) - printf("[%s t3] start ntrans=%d (%d batches, bsize=%d)...\n", __func__, p->ntrans, - p->nbatch, p->batchSize); - - for (int b = 0; b * p->batchSize < p->ntrans; b++) { // .....loop b over batches - - // batching and pointers to this batch, identical to t1,2 above... - int thisBatchSize = min(p->ntrans - b * p->batchSize, p->batchSize); - int bB = b * p->batchSize; - CPX *cjb = cj + bB * p->nj; // batch of input strengths - CPX *fkb = fk + bB * p->nk; // batch of output strengths - if (p->opts.debug > 1) - printf("[%s t3] start batch %d (size %d):\n", __func__, b, thisBatchSize); - - // STEP 0: pre-phase (possibly) the c_j input strengths into c'_j batch... - timer.restart(); -#pragma omp parallel for num_threads(p->opts.nthreads) // or p->batchSize? - for (int i = 0; i < thisBatchSize; i++) { - BIGINT ioff = i * p->nj; - for (BIGINT j = 0; j < p->nj; ++j) { - p->CpBatch[ioff + j] = p->prephase[j] * cjb[ioff + j]; - } - } - t_pre += timer.elapsedsec(); - - // STEP 1: spread c'_j batch (x'_j NU pts) into fw batch grid... - timer.restart(); - p->spopts.spread_direction = 1; // spread - spreadinterpSortedBatch(thisBatchSize, p, p->CpBatch); // p->X are primed - t_spr += timer.elapsedsec(); - - // STEP 2: type 2 NUFFT from fw batch to user output fk array batch... - timer.restart(); - // illegal possible shrink of ntrans *after* plan for smaller last batch: - p->innerT2plan->ntrans = thisBatchSize; // do not try this at home! - /* (alarming that FFT not shrunk, but safe, because t2's fwBatch array - still the same size, as Andrea explained; just wastes a few flops) */ - FINUFFT_EXECUTE(p->innerT2plan, fkb, p->fwBatch); - t_t2 += timer.elapsedsec(); - // STEP 3: apply deconvolve (precomputed 1/phiHat(targ_k), phasing too)... - timer.restart(); -#pragma omp parallel for num_threads(p->opts.nthreads) - for (int i = 0; i < thisBatchSize; i++) { - BIGINT ioff = i * p->nk; - for (BIGINT k = 0; k < p->nk; ++k) fkb[ioff + k] *= p->deconv[k]; - } - t_deconv += timer.elapsedsec(); - } // ........end b loop - - if (p->opts.debug) { // report total times in their natural order... - printf("[%s t3] done. tot prephase:\t\t%.3g s\n", __func__, t_pre); - printf(" tot spread:\t\t\t%.3g s\n", t_spr); - printf(" tot type 2:\t\t\t%.3g s\n", t_t2); - printf(" tot deconvolve:\t\t%.3g s\n", t_deconv); - } - } - // for (BIGINT k=0;k<10;++k) printf("\tfk[%ld]=%.15g+%.15gi\n",(long - // int)k,(double)real(fk[k]),(double)imag(fk[k])); // debug - - return 0; + return finufft_execute_t(reinterpret_cast *>(p), cj, fk); } -// DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD int FINUFFT_DESTROY(FINUFFT_PLAN p) // Free everything we allocated inside of finufft_plan pointed to by p. // Also must not crash if called immediately after finufft_makeplan. 
@@ -1189,24 +32,7 @@ int FINUFFT_DESTROY(FINUFFT_PLAN p) if (!p) // NULL ptr, so not a ptr to a plan, report error return 1; - p->fftPlan->free(p->fwBatch); // free the big FFTW (or t3 spread) working array - free(p->sortIndices); - if (p->type == 1 || p->type == 2) { - free(p->phiHat1); - free(p->phiHat2); - free(p->phiHat3); - } else { // free the stuff alloc for type 3 only - FINUFFT_DESTROY(p->innerT2plan); // if NULL, ignore its error code - free(p->CpBatch); - free(p->Sp); - free(p->Tp); - free(p->Up); - free(p->X); - free(p->Y); - free(p->Z); - free(p->prephase); - free(p->deconv); - } - delete p; + delete reinterpret_cast *>(p); + p = nullptr; return 0; // success } diff --git a/src/finufft_core.cpp b/src/finufft_core.cpp new file mode 100644 index 000000000..70a52afa8 --- /dev/null +++ b/src/finufft_core.cpp @@ -0,0 +1,1237 @@ +#include +#include +#include +#include +#include + +#include "../contrib/legendre_rule_fast.h" +#include +#include +#include +#include +#include +#include +#include + +using namespace std; +using namespace finufft; +using namespace finufft::utils; +using namespace finufft::spreadinterp; +using namespace finufft::quadrature; + +/* Computational core for FINUFFT. + + Based on Barnett 2017-2018 finufft?d.cpp containing nine drivers, plus + 2d1/2d2 many-vector drivers by Melody Shih, summer 2018. + Original guru interface written by Andrea Malleo, summer 2019, mentored + by Alex Barnett. Many rewrites in early 2020 by Alex Barnett & Libin Lu. + + As of v1.2 these replace the old hand-coded separate 9 finufft?d?() functions + and the two finufft2d?many() functions. The (now 18) simple C++ interfaces + are in simpleinterfaces.cpp. + +Algorithm summaries taken from old finufft?d?() documentation, Feb-Jun 2017: + + TYPE 1: + The type 1 NUFFT proceeds in three main steps: + 1) spread data to oversampled regular mesh using kernel. + 2) compute FFT on uniform mesh + 3) deconvolve by division of each Fourier mode independently by the kernel + Fourier series coeffs (not merely FFT of kernel), shuffle to output. + The kernel coeffs are precomputed in what is called step 0 in the code. + + TYPE 2: + The type 2 algorithm proceeds in three main steps: + 1) deconvolve (amplify) each Fourier mode, dividing by kernel Fourier coeff + 2) compute inverse FFT on uniform fine grid + 3) spread (dir=2, ie interpolate) data to regular mesh + The kernel coeffs are precomputed in what is called step 0 in the code. + + TYPE 3: + The type 3 algorithm is basically a type 2 (which is implemented precisely + as call to type 2) replacing the middle FFT (Step 2) of a type 1. + Beyond this, the new twists are: + i) nf1, number of upsampled points for the type-1, depends on the product + of interval widths containing input and output points (X*S). + ii) The deconvolve (post-amplify) step is division by the Fourier transform + of the scaled kernel, evaluated on the *nonuniform* output frequency + grid; this is done by direct approximation of the Fourier integral + using quadrature of the kernel function times exponentials. + iii) Shifts in x (real) and s (Fourier) are done to minimize the interval + half-widths X and S, hence nf1. + + MULTIPLE STRENGTH VECTORS FOR THE SAME NONUNIFORM POINTS (n_transf>1): + maxBatchSize (set to max_num_omp_threads) times the RAM is needed, so + this is good only for small problems. + + +Design notes for guru interface implementation: + +* Since finufft_plan is C-compatible, we need to use malloc/free for its + allocatable arrays, keeping it quite low-level. 
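+  (Aside, for reference against the three summaries above: in the same e^{...}
+  shorthand the comments below use, with the exponent sign chosen by iflag,
+  the transforms being computed are
+
+    type 1:  f_k = sum_{j=1..M} c_j e^{+-i k.x_j},   k = integer modes on the mode grid
+    type 2:  c_j = sum_k  f_k e^{+-i k.x_j}          (same mode grid, input/output roles swapped)
+    type 3:  f_k = sum_{j=1..M} c_j e^{+-i s_k.x_j}, s_k = arbitrary real target frequencies
+
+  so types 1 and 2 share the kernel-coefficient deconvolution on a fixed mode
+  grid, while type 3 must evaluate the kernel FT at the scattered s_k, which
+  is exactly point ii) above.)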
We can't use std::vector + since that would only survive in the scope of each function. + +* Thread-safety: FINUFFT plans are passed as pointers, so it has no global + state apart from that associated with FFTW (and the did_fftw_init). +*/ + +// ---------- local math routines (were in common.cpp; no need now): -------- + +namespace finufft { +namespace common { + +static constexpr double PI = 3.14159265358979329; + +static int set_nf_type12(BIGINT ms, finufft_opts opts, finufft_spread_opts spopts, + BIGINT *nf) +// Type 1 & 2 recipe for how to set 1d size of upsampled array, nf, given opts +// and requested number of Fourier modes ms. Returns 0 if success, else an +// error code if nf was unreasonably big (& tell the world). +{ + *nf = BIGINT(opts.upsampfac * double(ms)); // manner of rounding not crucial + if (*nf < 2 * spopts.nspread) *nf = 2 * spopts.nspread; // otherwise spread fails + if (*nf < MAX_NF) { + *nf = next235even(*nf); // expensive at huge nf + return 0; + } else { + fprintf(stderr, + "[%s] nf=%.3g exceeds MAX_NF of %.3g, so exit without attempting even a " + "malloc\n", + __func__, (double)*nf, (double)MAX_NF); + return FINUFFT_ERR_MAXNALLOC; + } +} + +template +static int setup_spreader_for_nufft(finufft_spread_opts &spopts, T eps, finufft_opts opts, + int dim) +// Set up the spreader parameters given eps, and pass across various nufft +// options. Return status of setup_spreader. Uses pass-by-ref. Barnett 10/30/17 +{ + // this calls spreadinterp.cpp... + int ier = setup_spreader(spopts, eps, opts.upsampfac, opts.spread_kerevalmeth, + opts.spread_debug, opts.showwarn, dim); + // override various spread opts from their defaults... + spopts.debug = opts.spread_debug; + spopts.sort = opts.spread_sort; // could make dim or CPU choices here? + spopts.kerpad = opts.spread_kerpad; // (only applies to kerevalmeth=0) + spopts.chkbnds = opts.chkbnds; + spopts.nthreads = opts.nthreads; // 0 passed in becomes omp max by here + if (opts.spread_nthr_atomic >= 0) // overrides + spopts.atomic_threshold = opts.spread_nthr_atomic; + if (opts.spread_max_sp_size > 0) // overrides + spopts.max_subproblem_size = opts.spread_max_sp_size; + if (opts.chkbnds != 1) // deprecated default value hardcoded here + fprintf(stderr, + "[%s] opts.chkbnds is deprecated; ignoring change from default value.\n", + __func__); + return ier; +} + +template +static void set_nhg_type3(T S, T X, finufft_opts opts, finufft_spread_opts spopts, + BIGINT *nf, T *h, T *gam) +/* sets nf, h (upsampled grid spacing), and gamma (x_j rescaling factor), + for type 3 only. + Inputs: + X and S are the xj and sk interval half-widths respectively. + opts and spopts are the NUFFT and spreader opts strucs, respectively. + Outputs: + nf is the size of upsampled grid for a given single dimension. + h is the grid spacing = 2pi/nf + gam is the x rescale factor, ie x'_j = x_j/gam (modulo shifts). + Barnett 2/13/17. Caught inf/nan 3/14/17. io int types changed 3/28/17 + New logic 6/12/17 +*/ +{ + int nss = spopts.nspread + 1; // since ns may be odd + T Xsafe = X, Ssafe = S; // may be tweaked locally + if (X == 0.0) // logic ensures XS>=1, handle X=0 a/o S=0 + if (S == 0.0) { + Xsafe = 1.0; + Ssafe = 1.0; + } else + Xsafe = max(Xsafe, 1 / S); + else + Ssafe = max(Ssafe, 1 / X); + // use the safe X and S... 
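+  // Aside: the rounding in set_nf_type12 above (and at the end of this
+  // function) relies on next235even() from utils.cpp. A minimal standalone
+  // sketch of that rounding, illustrative only (the library routine is the
+  // authoritative version):
+  //
+  //   #include <cstdint>
+  //   // smallest even number >= n whose only prime factors are 2, 3 and 5,
+  //   // i.e. an FFT-friendly size; sketch of what next235even() computes
+  //   static int64_t next_235_even(int64_t n) {
+  //     if (n % 2) ++n;                // force even
+  //     for (;; n += 2) {              // try successive even candidates
+  //       int64_t m = n;
+  //       while (m % 2 == 0) m /= 2;   // strip factors of 2
+  //       while (m % 3 == 0) m /= 3;   // ... of 3
+  //       while (m % 5 == 0) m /= 5;   // ... of 5
+  //       if (m == 1) return n;        // nothing else left: accept
+  //     }
+  //   }
+  //
+  // e.g. upsampfac=2.0 with ms=1001 gives 2002, which rounds up to 2048
+  // (2002 = 2*7*11*13 is not 2,3,5-smooth, and no smooth even number lies
+  // in between).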
+ auto nfd = T(2.0 * opts.upsampfac * Ssafe * Xsafe / PI + nss); + if (!isfinite(nfd)) nfd = 0.0; // use T to catch inf + *nf = (BIGINT)nfd; + // printf("initial nf=%lld, ns=%d\n",*nf,spopts.nspread); + // catch too small nf, and nan or +-inf, otherwise spread fails... + if (*nf < 2 * spopts.nspread) *nf = 2 * spopts.nspread; + if (*nf < MAX_NF) // otherwise will fail anyway + *nf = next235even(*nf); // expensive at huge nf + *h = T(2.0 * PI / *nf); // upsampled grid spacing + *gam = T(*nf / (2.0 * opts.upsampfac * Ssafe)); // x scale fac to x' +} + +template +static void onedim_fseries_kernel(BIGINT nf, T *fwkerhalf, finufft_spread_opts opts) +/* + Approximates exact Fourier series coeffs of cnufftspread's real symmetric + kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting + narrowness of kernel. Uses phase winding for cheap eval on the regular freq + grid. Note that this is also the Fourier transform of the non-periodized + kernel. The FT definition is f(k) = int e^{-ikx} f(x) dx. The output has an + overall prefactor of 1/h, which is needed anyway for the correction, and + arises because the quadrature weights are scaled for grid units not x units. + The kernel is actually centered at nf/2, related to the centering of the grid; + this is now achieved by the sign flip in a[n] below. + + Inputs: + nf - size of 1d uniform spread grid, must be even. + opts - spreading opts object, needed to eval kernel (must be already set up) + + Outputs: + fwkerhalf - real Fourier series coeffs from indices 0 to nf/2 inclusive, + divided by h = 2pi/n. + (should be allocated for at least nf/2+1 Ts) + + Compare onedim_dct_kernel which has same interface, but computes DFT of + sampled kernel, not quite the same object. + + Barnett 2/7/17. openmp (since slow vs fftw in 1D large-N case) 3/3/18. + Fixed num_threads 7/20/20. Reduced rounding error in a[n] calc 8/20/24. + */ +{ + T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support + // # quadr nodes in z (from 0 to J/2; reflections will be added)... + int q = (int)(2 + 3.0 * J2); // not sure why so large? 
cannot exceed MAX_NQUAD + T f[MAX_NQUAD]; + double z[2 * MAX_NQUAD], w[2 * MAX_NQUAD]; + legendre_compute_glr(2 * q, z, w); // only half the nodes used, eg on (0,1) + std::complex a[MAX_NQUAD]; + for (int n = 0; n < q; ++n) { // set up nodes z_n and vals f_n + z[n] *= J2; // rescale nodes + f[n] = J2 * (T)w[n] * evaluate_kernel((T)z[n], opts); // vals & quadr wei + a[n] = -exp(2 * PI * std::complex(0, 1) * z[n] / double(nf)); // phase winding + // rates + } + BIGINT nout = nf / 2 + 1; // how many values we're writing to + int nt = min(nout, (BIGINT)opts.nthreads); // how many chunks + std::vector brk(nt + 1); // start indices for each thread + for (int t = 0; t <= nt; ++t) // split nout mode indices btw threads + brk[t] = (BIGINT)(0.5 + nout * t / (double)nt); +#pragma omp parallel num_threads(nt) + { // each thread gets own chunk to do + int t = MY_OMP_GET_THREAD_NUM(); + std::complex aj[MAX_NQUAD]; // phase rotator for this thread + for (int n = 0; n < q; ++n) + aj[n] = pow(a[n], (T)brk[t]); // init phase factors for chunk + for (BIGINT j = brk[t]; j < brk[t + 1]; ++j) { // loop along output array + T x = 0.0; // accumulator for answer at this j + for (int n = 0; n < q; ++n) { + x += f[n] * 2 * real(aj[n]); // include the negative freq + aj[n] *= a[n]; // wind the phases + } + fwkerhalf[j] = x; + } + } +} + +template +static void onedim_nuft_kernel(BIGINT nk, T *k, T *phihat, finufft_spread_opts opts) +/* + Approximates exact 1D Fourier transform of cnufftspread's real symmetric + kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting + narrowness of kernel. Evaluates at set of arbitrary freqs k in [-pi, pi), + for a kernel with x measured in grid-spacings. (See previous routine for + FT definition). + + Inputs: + nk - number of freqs + k - frequencies, dual to the kernel's natural argument, ie exp(i.k.z) + Note, z is in grid-point units, and k values must be in [-pi, pi) for + accuracy. + opts - spreading opts object, needed to eval kernel (must be already set up) + + Outputs: + phihat - real Fourier transform evaluated at freqs (alloc for nk Ts) + + Barnett 2/8/17. openmp since cos slow 2/9/17 + */ +{ + T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support + // # quadr nodes in z (from 0 to J/2; reflections will be added)... + int q = (int)(2 + 2.0 * J2); // > pi/2 ratio. cannot exceed MAX_NQUAD + if (opts.debug) printf("q (# ker FT quadr pts) = %d\n", q); + T f[MAX_NQUAD]; + double z[2 * MAX_NQUAD], w[2 * MAX_NQUAD]; // glr needs double + legendre_compute_glr(2 * q, z, w); // only half the nodes used, eg on (0,1) + for (int n = 0; n < q; ++n) { + z[n] *= (T)J2; // quadr nodes for [0,J/2] + f[n] = J2 * (T)w[n] * evaluate_kernel((T)z[n], opts); // w/ quadr weights + } +#pragma omp parallel for num_threads(opts.nthreads) + for (BIGINT j = 0; j < nk; ++j) { // loop along output array + T x = 0.0; // register + for (int n = 0; n < q; ++n) + x += f[n] * 2 * cos(k[j] * (T)z[n]); // pos & neg freq pair. use T cos! + phihat[j] = x; + } +} + +template +static void deconvolveshuffle1d(int dir, T prefac, T *ker, BIGINT ms, T *fk, BIGINT nf1, + std::complex *fw, int modeord) +/* + if dir==1: copies fw to fk with amplification by prefac/ker + if dir==2: copies fk to fw (and zero pads rest of it), same amplification. + + modeord=0: use CMCL-compatible mode ordering in fk (from -N/2 up to N/2-1) + 1: use FFT-style (from 0 to N/2-1, then -N/2 up to -1). 
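+  (Aside on the two quadrature routines above: both implement the same
+  Euler-Fourier idea. With Gauss-Legendre nodes z_n and weights w_n rescaled
+  to (0, J/2), and f_n = (J/2) w_n phi(z_n) as in the code,
+
+    onedim_nuft_kernel:    phihat(k)   = int_{-J/2}^{J/2} phi(z) e^{-ikz} dz
+                                       = 2 int_0^{J/2} phi(z) cos(kz) dz
+                                      ~= sum_{n=1..q} 2 f_n cos(k z_n)
+    onedim_fseries_kernel: fwkerhalf_j ~= sum_{n=1..q} 2 f_n Re(a_n^j),
+                           a_n = -e^{2 pi i z_n / nf}
+
+  using evenness of phi to fold the negative nodes into the factor 2. The
+  fseries version replaces the cosine by powers of a_n, so each output index
+  costs one complex multiply per node (the phase winding), and the sign flip
+  in a_n shifts the kernel center to nf/2 as noted in its header comment.)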
+ + fk is a size-ms T complex array (2*ms Ts alternating re,im parts) + fw is a size-nf1 complex array (2*nf1 Ts alternating re,im parts) + ker is real-valued T array of length nf1/2+1. + + Single thread only, but shouldn't matter since mostly data movement. + + It has been tested that the repeated floating division in this inner loop + only contributes at the <3% level in 3D relative to the FFT cost (8 threads). + This could be removed by passing in an inverse kernel and doing mults. + + todo: rewrite w/ C++-complex I/O, check complex divide not slower than + real divide, or is there a way to force a real divide? + + Barnett 1/25/17. Fixed ms=0 case 3/14/17. modeord flag & clean 10/25/17 +*/ +{ + BIGINT kmin = -ms / 2, kmax = (ms - 1) / 2; // inclusive range of k indices + if (ms == 0) kmax = -1; // fixes zero-pad for trivial no-mode case + // set up pp & pn as ptrs to start of pos(ie nonneg) & neg chunks of fk array + BIGINT pp = -2 * kmin, pn = 0; // CMCL mode-ordering case (2* since cmplx) + if (modeord == 1) { + pp = 0; + pn = 2 * (kmax + 1); + } // or, instead, FFT ordering + if (dir == 1) { // read fw, write out to fk... + for (BIGINT k = 0; k <= kmax; ++k) { // non-neg freqs k + fk[pp++] = prefac * fw[k].real() / ker[k]; // re + fk[pp++] = prefac * fw[k].imag() / ker[k]; // im + } + for (BIGINT k = kmin; k < 0; ++k) { // neg freqs k + fk[pn++] = prefac * fw[nf1 + k].real() / ker[-k]; // re + fk[pn++] = prefac * fw[nf1 + k].imag() / ker[-k]; // im + } + } else { // read fk, write out to fw w/ zero padding... + for (BIGINT k = kmax + 1; k < nf1 + kmin; ++k) { // zero pad precisely where + // needed + fw[k] = 0.0; + } + for (BIGINT k = 0; k <= kmax; ++k) { // non-neg freqs k + fw[k].real(prefac * fk[pp++] / ker[k]); // re + fw[k].imag(prefac * fk[pp++] / ker[k]); // im + } + for (BIGINT k = kmin; k < 0; ++k) { // neg freqs k + fw[nf1 + k].real(prefac * fk[pn++] / ker[-k]); // re + fw[nf1 + k].imag(prefac * fk[pn++] / ker[-k]); // im + } + } +} + +template +static void deconvolveshuffle2d(int dir, T prefac, T *ker1, T *ker2, BIGINT ms, BIGINT mt, + T *fk, BIGINT nf1, BIGINT nf2, std::complex *fw, + int modeord) +/* + 2D version of deconvolveshuffle1d, calls it on each x-line using 1/ker2 fac. + + if dir==1: copies fw to fk with amplification by prefac/(ker1(k1)*ker2(k2)). + if dir==2: copies fk to fw (and zero pads rest of it), same amplification. + + modeord=0: use CMCL-compatible mode ordering in fk (each dim increasing) + 1: use FFT-style (pos then negative, on each dim) + + fk is a complex array stored as 2*ms*mt Ts alternating re,im parts, with + ms looped over fast and mt slow. + fw is a complex array stored as 2*nf1*nf2] Ts alternating re,im parts, with + nf1 looped over fast and nf2 slow. + ker1, ker2 are real-valued T arrays of lengths nf1/2+1, nf2/2+1 + respectively. + + Barnett 2/1/17, Fixed mt=0 case 3/14/17. 
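+  (Concrete instance of the 1D index arithmetic above: for ms = 5, kmin = -2
+  and kmax = 2, so modeord=0 lays fk out in the order k = -2,-1,0,1,2 (pp
+  starts at complex element 2, pn at 0), while modeord=1 gives k = 0,1,2,-2,-1
+  (pp at complex element 0, pn at 3). The 2D and 3D routines below just apply
+  this same arithmetic once per axis.)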
modeord 10/25/17 +*/ +{ + BIGINT k2min = -mt / 2, k2max = (mt - 1) / 2; // inclusive range of k2 indices + if (mt == 0) k2max = -1; // fixes zero-pad for trivial no-mode case + // set up pp & pn as ptrs to start of pos(ie nonneg) & neg chunks of fk array + BIGINT pp = -2 * k2min * ms, pn = 0; // CMCL mode-ordering case (2* since cmplx) + if (modeord == 1) { + pp = 0; + pn = 2 * (k2max + 1) * ms; + } // or, instead, FFT ordering + if (dir == 2) // zero pad needed x-lines (contiguous in memory) + for (BIGINT j = nf1 * (k2max + 1); j < nf1 * (nf2 + k2min); ++j) // sweeps all + // dims + fw[j] = 0.0; + for (BIGINT k2 = 0; k2 <= k2max; ++k2, pp += 2 * ms) // non-neg y-freqs + // point fk and fw to the start of this y value's row (2* is for complex): + common::deconvolveshuffle1d(dir, prefac / ker2[k2], ker1, ms, fk + pp, nf1, + &fw[nf1 * k2], modeord); + for (BIGINT k2 = k2min; k2 < 0; ++k2, pn += 2 * ms) // neg y-freqs + common::deconvolveshuffle1d(dir, prefac / ker2[-k2], ker1, ms, fk + pn, nf1, + &fw[nf1 * (nf2 + k2)], modeord); +} + +template +static void deconvolveshuffle3d(int dir, T prefac, T *ker1, T *ker2, T *ker3, BIGINT ms, + BIGINT mt, BIGINT mu, T *fk, BIGINT nf1, BIGINT nf2, + BIGINT nf3, std::complex *fw, int modeord) +/* + 3D version of deconvolveshuffle2d, calls it on each xy-plane using 1/ker3 fac. + + if dir==1: copies fw to fk with ampl by prefac/(ker1(k1)*ker2(k2)*ker3(k3)). + if dir==2: copies fk to fw (and zero pads rest of it), same amplification. + + modeord=0: use CMCL-compatible mode ordering in fk (each dim increasing) + 1: use FFT-style (pos then negative, on each dim) + + fk is a complex array stored as 2*ms*mt*mu Ts alternating re,im parts, with + ms looped over fastest and mu slowest. + fw is a complex array stored as 2*nf1*nf2*nf3 Ts alternating re,im parts, with + nf1 looped over fastest and nf3 slowest. + ker1, ker2, ker3 are real-valued T arrays of lengths nf1/2+1, nf2/2+1, + and nf3/2+1 respectively. + + Barnett 2/1/17, Fixed mu=0 case 3/14/17. 
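+  (Worked zero-padding range for the 2D dir==2 branch above: with mt = 4 and
+  nf2 = 8, k2min = -2 and k2max = 1, so the pad loop clears x-lines j in
+  [2*nf1, 6*nf1); rows 0-1 then receive k2 = 0,1 and rows 6-7 receive
+  k2 = -2,-1, leaving the high-frequency middle of the fine grid exactly zero,
+  as the type 2 pre-FFT step requires.)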
modeord 10/25/17 +*/ +{ + BIGINT k3min = -mu / 2, k3max = (mu - 1) / 2; // inclusive range of k3 indices + if (mu == 0) k3max = -1; // fixes zero-pad for trivial no-mode case + // set up pp & pn as ptrs to start of pos(ie nonneg) & neg chunks of fk array + BIGINT pp = -2 * k3min * ms * mt, pn = 0; // CMCL mode-ordering (2* since cmplx) + if (modeord == 1) { + pp = 0; + pn = 2 * (k3max + 1) * ms * mt; + } // or FFT ordering + BIGINT np = nf1 * nf2; // # pts in an upsampled Fourier xy-plane + if (dir == 2) // zero pad needed xy-planes (contiguous in memory) + for (BIGINT j = np * (k3max + 1); j < np * (nf3 + k3min); ++j) // sweeps all dims + fw[j] = 0.0; + for (BIGINT k3 = 0; k3 <= k3max; ++k3, pp += 2 * ms * mt) // non-neg z-freqs + // point fk and fw to the start of this z value's plane (2* is for complex): + common::deconvolveshuffle2d(dir, prefac / ker3[k3], ker1, ker2, ms, mt, fk + pp, nf1, + nf2, &fw[np * k3], modeord); + for (BIGINT k3 = k3min; k3 < 0; ++k3, pn += 2 * ms * mt) // neg z-freqs + common::deconvolveshuffle2d(dir, prefac / ker3[-k3], ker1, ker2, ms, mt, fk + pn, nf1, + nf2, &fw[np * (nf3 + k3)], modeord); +} + +// --------- batch helper functions for t1,2 exec: --------------------------- + +template +static int spreadinterpSortedBatch(int batchSize, FINUFFT_PLAN_T *p, + std::complex *cBatch) +/* + Spreads (or interpolates) a batch of batchSize strength vectors in cBatch + to (or from) the batch of fine working grids p->fwBatch, using the same set of + (index-sorted) NU points p->X,Y,Z for each vector in the batch. + The direction (spread vs interpolate) is set by p->spopts.spread_direction. + Returns 0 (no error reporting for now). + Notes: + 1) cBatch is already assumed to have the correct offset, ie here we + read from the start of cBatch (unlike Malleo). fwBatch also has zero offset + 2) this routine is a batched version of spreadinterpSorted in spreadinterp.cpp + Barnett 5/19/20, based on Malleo 2019. +*/ +{ + // opts.spread_thread: 1 sequential multithread, 2 parallel single-thread. + // omp_sets_nested deprecated, so don't use; assume not nested for 2 to work. + // But when nthr_outer=1 here, omp par inside the loop sees all threads... +#ifdef _OPENMP + int nthr_outer = p->opts.spread_thread == 1 ? 1 : batchSize; +#endif +#pragma omp parallel for num_threads(nthr_outer) + for (int i = 0; i < batchSize; i++) { + std::complex *fwi = p->fwBatch + i * p->nf; // start of i'th fw array in wkspace + std::complex *ci = cBatch + i * p->nj; // start of i'th c array in cBatch + spreadinterpSorted(p->sortIndices, p->nf1, p->nf2, p->nf3, (T *)fwi, p->nj, p->X, + p->Y, p->Z, (T *)ci, p->spopts, p->didSort); + } + return 0; +} + +template +static int deconvolveBatch(int batchSize, FINUFFT_PLAN_T *p, std::complex *fkBatch) +/* + Type 1: deconvolves (amplifies) from each interior fw array in p->fwBatch + into each output array fk in fkBatch. + Type 2: deconvolves from user-supplied input fk to 0-padded interior fw, + again looping over fk in fkBatch and fw in p->fwBatch. + The direction (spread vs interpolate) is set by p->spopts.spread_direction. + This is mostly a loop calling deconvolveshuffle?d for the needed dim batchSize + times. + Barnett 5/21/20, simplified from Malleo 2019 (eg t3 logic won't be in here) +*/ +{ + // since deconvolveshuffle?d are single-thread, omp par seems to help here... 
+#pragma omp parallel for num_threads(batchSize) + for (int i = 0; i < batchSize; i++) { + std::complex *fwi = p->fwBatch + i * p->nf; // start of i'th fw array in wkspace + std::complex *fki = fkBatch + i * p->N; // start of i'th fk array in fkBatch + + // Call routine from common.cpp for the dim; prefactors hardcoded to 1.0... + if (p->dim == 1) + deconvolveshuffle1d(p->spopts.spread_direction, T(1), p->phiHat1, p->ms, (T *)fki, + p->nf1, fwi, p->opts.modeord); + else if (p->dim == 2) + deconvolveshuffle2d(p->spopts.spread_direction, T(1), p->phiHat1, p->phiHat2, p->ms, + p->mt, (T *)fki, p->nf1, p->nf2, fwi, p->opts.modeord); + else + deconvolveshuffle3d(p->spopts.spread_direction, T(1), p->phiHat1, p->phiHat2, + p->phiHat3, p->ms, p->mt, p->mu, (T *)fki, p->nf1, p->nf2, + p->nf3, fwi, p->opts.modeord); + } + return 0; +} + +} // namespace common +} // namespace finufft + +// --------------- rest is the 5 user guru (plan) interface drivers: --------- +// (not namespaced since have safe names finufft{f}_* ) +using namespace finufft::common; // accesses routines defined above + +// Marco Barbone: 5.8.2024 +// These are user-facing. +// The various options could be macros to follow c standard library conventions. +// Question: would these be enums? + +// OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO +void finufft_default_opts_t(finufft_opts *o) +// Sets default nufft opts (referenced by all language interfaces too). +// See finufft_opts.h for meanings. +// This was created to avoid uncertainty about C++11 style static initialization +// when called from MEX, but now is generally used. Barnett 10/30/17 onwards. +// Sphinx sucks the below code block into the web docs, hence keep it clean... +{ + // sphinx tag (don't remove): @defopts_start + o->modeord = 0; + o->chkbnds = 1; + + o->debug = 0; + o->spread_debug = 0; + o->showwarn = 1; + + o->nthreads = 0; +#ifdef FINUFFT_USE_DUCC0 + o->fftw = 0; +#else + o->fftw = FFTW_ESTIMATE; +#endif + o->spread_sort = 2; + o->spread_kerevalmeth = 1; + o->spread_kerpad = 1; + o->upsampfac = 0.0; + o->spread_thread = 0; + o->maxbatchsize = 0; + o->spread_nthr_atomic = -1; + o->spread_max_sp_size = 0; + o->fftw_lock_fun = nullptr; + o->fftw_unlock_fun = nullptr; + o->fftw_lock_data = nullptr; + // sphinx tag (don't remove): @defopts_end +} + +// PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP +template +int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, int iflag, int ntrans, + TF tol, FINUFFT_PLAN_T **pp, finufft_opts *opts) +// Populates the fields of finufft_plan which is pointed to by "pp". +// opts is ptr to a finufft_opts to set options, or NULL to use defaults. +// For some of the fields (if "auto" selected) here choose the actual setting. 
+// For types 1,2 allocates memory for internal working arrays, +// evaluates spreading kernel coefficients, and instantiates the fftw_plan +{ + FINUFFT_PLAN_T *p; + p = new FINUFFT_PLAN_T; // allocate fresh plan struct + *pp = p; // pass out plan as ptr to plan struct + + if (opts == NULL) // use default opts + finufft_default_opts_t(&(p->opts)); + else // or read from what's passed in + p->opts = *opts; // keep a deep copy; changing *opts now has no effect + + if (p->opts.debug) // do a hello world + printf("[%s] new plan: FINUFFT version " FINUFFT_VER " .................\n", + __func__); + + p->fftPlan = std::make_unique>( + p->opts.fftw_lock_fun, p->opts.fftw_unlock_fun, p->opts.fftw_lock_data); + + if ((type != 1) && (type != 2) && (type != 3)) { + fprintf(stderr, "[%s] Invalid type (%d), should be 1, 2 or 3.\n", __func__, type); + return FINUFFT_ERR_TYPE_NOTVALID; + } + if ((dim != 1) && (dim != 2) && (dim != 3)) { + fprintf(stderr, "[%s] Invalid dim (%d), should be 1, 2 or 3.\n", __func__, dim); + return FINUFFT_ERR_DIM_NOTVALID; + } + if (ntrans < 1) { + fprintf(stderr, "[%s] ntrans (%d) should be at least 1.\n", __func__, ntrans); + return FINUFFT_ERR_NTRANS_NOTVALID; + } + if (!p->opts.fftw_lock_fun != !p->opts.fftw_unlock_fun) { + fprintf(stderr, "[%s] fftw_(un)lock functions should be both null or both set\n", + __func__); + return FINUFFT_ERR_LOCK_FUNS_INVALID; + ; + } + + // get stuff from args... + p->type = type; + p->dim = dim; + p->ntrans = ntrans; + p->tol = tol; + p->fftSign = (iflag >= 0) ? 1 : -1; // clean up flag input + + // choose overall # threads... +#ifdef _OPENMP + int ompmaxnthr = MY_OMP_GET_MAX_THREADS(); + int nthr = ompmaxnthr; // default: use as many as OMP gives us + // (the above could be set, or suggested set, to 1 for small enough problems...) + if (p->opts.nthreads > 0) { + nthr = p->opts.nthreads; // user override, now without limit + if (p->opts.showwarn && (nthr > ompmaxnthr)) + fprintf(stderr, + "%s warning: using opts.nthreads=%d, more than the %d OpenMP claims " + "available; note large nthreads can be slower.\n", + __func__, nthr, ompmaxnthr); + } +#else + int nthr = 1; // always 1 thread (avoid segfault) + if (p->opts.nthreads > 1) + fprintf(stderr, + "%s warning: opts.nthreads=%d but library is single-threaded; ignoring!\n", + __func__, p->opts.nthreads); +#endif + p->opts.nthreads = nthr; // store actual # thr planned for + // (this sets/limits all downstream spread/interp, 1dkernel, and FFT thread counts...) + + // choose batchSize for types 1,2 or 3... (uses int ceil(b/a)=1+(b-1)/a trick) + if (p->opts.maxbatchsize == 0) { // logic to auto-set best batchsize + p->nbatch = 1 + (ntrans - 1) / nthr; // min # batches poss + p->batchSize = 1 + (ntrans - 1) / p->nbatch; // then cut # thr in each b + } else { // batchSize override by user + p->batchSize = min(p->opts.maxbatchsize, ntrans); + p->nbatch = 1 + (ntrans - 1) / p->batchSize; // resulting # batches + } + if (p->opts.spread_thread == 0) p->opts.spread_thread = 2; // our auto choice + if (p->opts.spread_thread != 1 && p->opts.spread_thread != 2) { + fprintf(stderr, "[%s] illegal opts.spread_thread!\n", __func__); + return FINUFFT_ERR_SPREAD_THREAD_NOTVALID; + } + + if (type != 3) { // read in user Fourier mode array sizes... + p->ms = n_modes[0]; + p->mt = (dim > 1) ? n_modes[1] : 1; // leave as 1 for unused dims + p->mu = (dim > 2) ? n_modes[2] : 1; + p->N = p->ms * p->mt * p->mu; // N = total # modes + } + + // heuristic to choose default upsampfac... 
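+  // Aside, a worked instance of the ceil-trick batch sizing above (auto mode,
+  // opts.maxbatchsize = 0): ntrans = 10 on nthr = 4 threads gives
+  //   nbatch    = 1 + (10-1)/4 = 3,   batchSize = 1 + (10-1)/3 = 4,
+  // so execute will run batches of 4, 4 and 2 transforms. The user-override
+  // path instead caps batchSize at min(maxbatchsize, ntrans) and derives
+  // nbatch from that.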
(currently two poss) + if (p->opts.upsampfac == 0.0) { // indicates auto-choose + p->opts.upsampfac = 2.0; // default, and need for tol small + if (tol >= (TF)1E-9) { // the tol sigma=5/4 can reach + if (type == 3) // could move to setpts, more known? + p->opts.upsampfac = 1.25; // faster b/c smaller RAM & FFT + else if ((dim == 1 && p->N > 10000000) || (dim == 2 && p->N > 300000) || + (dim == 3 && p->N > 3000000)) // type 1,2 heuristic cutoffs, double, + // typ tol, 12-core xeon + p->opts.upsampfac = 1.25; + } + if (p->opts.debug > 1) + printf("[%s] set auto upsampfac=%.2f\n", __func__, p->opts.upsampfac); + } + // use opts to choose and write into plan's spread options... + int ier = setup_spreader_for_nufft(p->spopts, tol, p->opts, dim); + if (ier > 1) // proceed if success or warning + return ier; + + // set others as defaults (or unallocated for arrays)... + p->X = NULL; + p->Y = NULL; + p->Z = NULL; + p->phiHat1 = NULL; + p->phiHat2 = NULL; + p->phiHat3 = NULL; + p->nf1 = 1; + p->nf2 = 1; + p->nf3 = 1; // crucial to leave as 1 for unused dims + p->sortIndices = NULL; // used in all three types + + // ------------------------ types 1,2: planning needed --------------------- + if (type == 1 || type == 2) { + + int nthr_fft = nthr; // give FFTW all threads (or use o.spread_thread?) + // Note: batchSize not used since might be only 1. + + p->spopts.spread_direction = type; + + constexpr TF EPSILON = std::numeric_limits::epsilon(); + if (p->opts.showwarn) { // user warn round-off error... + if (EPSILON * p->ms > 1.0) + fprintf(stderr, "%s warning: rounding err predicted eps_mach*N1 = %.3g > 1 !\n", + __func__, (double)(EPSILON * p->ms)); + if (EPSILON * p->mt > 1.0) + fprintf(stderr, "%s warning: rounding err predicted eps_mach*N2 = %.3g > 1 !\n", + __func__, (double)(EPSILON * p->mt)); + if (EPSILON * p->mu > 1.0) + fprintf(stderr, "%s warning: rounding err predicted eps_mach*N3 = %.3g > 1 !\n", + __func__, (double)(EPSILON * p->mu)); + } + + // determine fine grid sizes, sanity check.. + int nfier = set_nf_type12(p->ms, p->opts, p->spopts, &(p->nf1)); + if (nfier) return nfier; // nf too big; we're done + p->phiHat1 = (TF *)malloc(sizeof(TF) * (p->nf1 / 2 + 1)); + if (dim > 1) { + nfier = set_nf_type12(p->mt, p->opts, p->spopts, &(p->nf2)); + if (nfier) return nfier; + p->phiHat2 = (TF *)malloc(sizeof(TF) * (p->nf2 / 2 + 1)); + } + if (dim > 2) { + nfier = set_nf_type12(p->mu, p->opts, p->spopts, &(p->nf3)); + if (nfier) return nfier; + p->phiHat3 = (TF *)malloc(sizeof(TF) * (p->nf3 / 2 + 1)); + } + + if (p->opts.debug) { // "long long" here is to avoid warnings with printf... 
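+  // Aside: the auto-upsampfac branch above is compact but easy to misread;
+  // the same decision restated as a free function (illustrative only, the
+  // plan code above is authoritative; N means the total mode count p->N):
+  //
+  //   static double auto_upsampfac(int dim, int type, double tol, long long N) {
+  //     double sigma = 2.0;                      // default; needed for small tol
+  //     if (tol >= 1e-9) {                       // accuracy reachable w/ sigma=5/4
+  //       if (type == 3)                         // type 3: smaller RAM & FFT
+  //         sigma = 1.25;
+  //       else if ((dim == 1 && N > 10000000) || (dim == 2 && N > 300000) ||
+  //                (dim == 3 && N > 3000000))    // large type 1,2 problems
+  //         sigma = 1.25;
+  //     }
+  //     return sigma;                            // e.g. tol=1e-6, 1D, N=2e7 -> 1.25
+  //   }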
+ printf("[%s] %dd%d: (ms,mt,mu)=(%lld,%lld,%lld) " + "(nf1,nf2,nf3)=(%lld,%lld,%lld)\n ntrans=%d nthr=%d " + "batchSize=%d ", + __func__, dim, type, (long long)p->ms, (long long)p->mt, (long long)p->mu, + (long long)p->nf1, (long long)p->nf2, (long long)p->nf3, ntrans, nthr, + p->batchSize); + if (p->batchSize == 1) // spread_thread has no effect in this case + printf("\n"); + else + printf(" spread_thread=%d\n", p->opts.spread_thread); + } + + // STEP 0: get Fourier coeffs of spreading kernel along each fine grid dim + CNTime timer; + timer.start(); + onedim_fseries_kernel(p->nf1, p->phiHat1, p->spopts); + if (dim > 1) onedim_fseries_kernel(p->nf2, p->phiHat2, p->spopts); + if (dim > 2) onedim_fseries_kernel(p->nf3, p->phiHat3, p->spopts); + if (p->opts.debug) + printf("[%s] kernel fser (ns=%d):\t\t%.3g s\n", __func__, p->spopts.nspread, + timer.elapsedsec()); + + p->nf = p->nf1 * p->nf2 * p->nf3; // fine grid total number of points + if (p->nf * p->batchSize > MAX_NF) { + fprintf(stderr, + "[%s] fwBatch would be bigger than MAX_NF, not attempting malloc!\n", + __func__); + // FIXME: this error causes memory leaks. We should free phiHat1, phiHat2, phiHat3 + return FINUFFT_ERR_MAXNALLOC; + } + + timer.restart(); + p->fwBatch = p->fftPlan->alloc_complex(p->nf * p->batchSize); // the big workspace + if (p->opts.debug) + printf("[%s] fwBatch %.2fGB alloc: \t%.3g s\n", __func__, + (double)1E-09 * sizeof(std::complex) * p->nf * p->batchSize, + timer.elapsedsec()); + if (!p->fwBatch) { // we don't catch all such mallocs, just this big one + fprintf(stderr, "[%s] FFTW malloc failed for fwBatch (working fine grids)!\n", + __func__); + free(p->phiHat1); + free(p->phiHat2); + free(p->phiHat3); + return FINUFFT_ERR_ALLOC; + } + + timer.restart(); // plan the FFTW + const auto ns = gridsize_for_fft(p); + p->fftPlan->plan(ns, p->batchSize, p->fwBatch, p->fftSign, p->opts.fftw, nthr_fft); + if (p->opts.debug) + printf("[%s] FFT plan (mode %d, nthr=%d):\t%.3g s\n", __func__, p->opts.fftw, + nthr_fft, timer.elapsedsec()); + + } else { // -------------------------- type 3 (no planning) ------------ + + if (p->opts.debug) printf("[%s] %dd%d: ntrans=%d\n", __func__, dim, type, ntrans); + // in case destroy occurs before setpts, need safe dummy ptrs/plans... + p->CpBatch = NULL; + p->fwBatch = NULL; + p->Sp = NULL; + p->Tp = NULL; + p->Up = NULL; + p->prephase = NULL; + p->deconv = NULL; + p->innerT2plan = NULL; + // Type 3 will call finufft_makeplan for type 2; no need to init FFTW + // Note we don't even know nj or nk yet, so can't do anything else! + } + return ier; // report setup_spreader status (could be warning) +} +template int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, + int iflag, int ntrans, float tol, + FINUFFT_PLAN_T **pp, finufft_opts *opts); +template int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, + int iflag, int ntrans, double tol, + FINUFFT_PLAN_T **pp, finufft_opts *opts); + +// SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS +template +int finufft_setpts_t(FINUFFT_PLAN_T *p, BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, + TF *s, TF *t, TF *u) +/* For type 1,2: just checks and (possibly) sorts the NU xyz points, in prep for + spreading. (The last 4 arguments are ignored.) + For type 3: allocates internal working arrays, scales/centers the NU points + and NU target freqs (stu), evaluates spreading kernel FT at all target freqs. 
+*/ +{ + int d = p->dim; // abbrev for spatial dim + CNTime timer; + timer.start(); + p->nj = nj; // the user only now chooses how many NU (x,y,z) pts + if (nj < 0) { + fprintf(stderr, "[%s] nj (%lld) cannot be negative!\n", __func__, (long long)nj); + return FINUFFT_ERR_NUM_NU_PTS_INVALID; + } else if (nj > MAX_NU_PTS) { + fprintf(stderr, "[%s] nj (%lld) exceeds MAX_NU_PTS\n", __func__, (long long)nj); + return FINUFFT_ERR_NUM_NU_PTS_INVALID; + } + + if (p->type != 3) { // ------------------ TYPE 1,2 SETPTS ------------------- + // (all we can do is check and maybe bin-sort the NU pts) + p->X = xj; // plan must keep pointers to user's fixed NU pts + p->Y = yj; + p->Z = zj; + int ier = spreadcheck(p->nf1, p->nf2, p->nf3, p->nj, xj, yj, zj, p->spopts); + if (p->opts.debug > 1) + printf("[%s] spreadcheck (%d):\t%.3g s\n", __func__, p->spopts.chkbnds, + timer.elapsedsec()); + if (ier) // no warnings allowed here + return ier; + timer.restart(); + // Free sortIndices if it has been allocated before in case of repeated setpts + // calls causing memory leak. We don't know it is the same size as before, so we + // have to malloc each time. + if (p->sortIndices) free(p->sortIndices); + p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT) * p->nj); + if (!p->sortIndices) { + fprintf(stderr, "[%s] failed to allocate sortIndices!\n", __func__); + return FINUFFT_ERR_SPREAD_ALLOC; + } + p->didSort = + indexSort(p->sortIndices, p->nf1, p->nf2, p->nf3, p->nj, xj, yj, zj, p->spopts); + if (p->opts.debug) + printf("[%s] sort (didSort=%d):\t\t%.3g s\n", __func__, p->didSort, + timer.elapsedsec()); + + } else { // ------------------------- TYPE 3 SETPTS ----------------------- + // (here we can precompute pre/post-phase factors and plan the t2) + + if (nk < 0) { + fprintf(stderr, "[%s] nk (%lld) cannot be negative!\n", __func__, (long long)nk); + return FINUFFT_ERR_NUM_NU_PTS_INVALID; + } else if (nk > MAX_NU_PTS) { + fprintf(stderr, "[%s] nk (%lld) exceeds MAX_NU_PTS\n", __func__, (long long)nk); + return FINUFFT_ERR_NUM_NU_PTS_INVALID; + } + p->nk = nk; // user set # targ freq pts + p->S = s; // keep pointers to user's input target pts + p->T = t; + p->U = u; + + // pick x, s intervals & shifts & # fine grid pts (nf) in each dim... + TF S1, S2, S3; // get half-width X, center C, which contains {x_j}... + arraywidcen(nj, xj, &(p->t3P.X1), &(p->t3P.C1)); + arraywidcen(nk, s, &S1, &(p->t3P.D1)); // same D, S, but for {s_k} + set_nhg_type3(S1, p->t3P.X1, p->opts, p->spopts, &(p->nf1), &(p->t3P.h1), + &(p->t3P.gam1)); // applies twist i) + p->t3P.C2 = 0.0; // their defaults if dim 2 unused, etc + p->t3P.D2 = 0.0; + if (d > 1) { + arraywidcen(nj, yj, &(p->t3P.X2), &(p->t3P.C2)); // {y_j} + arraywidcen(nk, t, &S2, &(p->t3P.D2)); // {t_k} + set_nhg_type3(S2, p->t3P.X2, p->opts, p->spopts, &(p->nf2), &(p->t3P.h2), + &(p->t3P.gam2)); + } + p->t3P.C3 = 0.0; + p->t3P.D3 = 0.0; + if (d > 2) { + arraywidcen(nj, zj, &(p->t3P.X3), &(p->t3P.C3)); // {z_j} + arraywidcen(nk, u, &S3, &(p->t3P.D3)); // {u_k} + set_nhg_type3(S3, p->t3P.X3, p->opts, p->spopts, &(p->nf3), &(p->t3P.h3), + &(p->t3P.gam3)); + } + + if (p->opts.debug) { // report on choices of shifts, centers, etc... 
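+    // Aside, a hand-worked check of set_nhg_type3 with representative numbers
+    // (assuming upsampfac = 2, nspread = 8, and half-widths X1 = pi, S1 = 50):
+    // nss = 9, nfd = 2*2*50*pi/pi + 9 = 209, which next235even rounds up to
+    // nf1 = 216 (the next even number with only factors 2,3,5), giving
+    // h1 = 2*pi/216 ~= 0.0291 and gam1 = 216/(2*2*50) = 1.08; these are the
+    // nf1/h1/gam1 values the debug block just below would print.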
+ printf("\tM=%lld N=%lld\n", (long long)nj, (long long)nk); + printf("\tX1=%.3g C1=%.3g S1=%.3g D1=%.3g gam1=%g nf1=%lld h1=%.3g\t\n", p->t3P.X1, + p->t3P.C1, S1, p->t3P.D1, p->t3P.gam1, (long long)p->nf1, p->t3P.h1); + if (d > 1) + printf("\tX2=%.3g C2=%.3g S2=%.3g D2=%.3g gam2=%g nf2=%lld h2=%.3g\n", p->t3P.X2, + p->t3P.C2, S2, p->t3P.D2, p->t3P.gam2, (long long)p->nf2, p->t3P.h2); + if (d > 2) + printf("\tX3=%.3g C3=%.3g S3=%.3g D3=%.3g gam3=%g nf3=%lld h3=%.3g\n", p->t3P.X3, + p->t3P.C3, S3, p->t3P.D3, p->t3P.gam3, (long long)p->nf3, p->t3P.h3); + } + p->nf = p->nf1 * p->nf2 * p->nf3; // fine grid total number of points + if (p->nf * p->batchSize > MAX_NF) { + fprintf(stderr, + "[%s t3] fwBatch would be bigger than MAX_NF, not attempting malloc!\n", + __func__); + return FINUFFT_ERR_MAXNALLOC; + } + p->fftPlan->free(p->fwBatch); + p->fwBatch = p->fftPlan->alloc_complex(p->nf * p->batchSize); // maybe big workspace + + // (note FFTW_ALLOC is not needed over malloc, but matches its type) + if (p->CpBatch) free(p->CpBatch); + p->CpBatch = + (std::complex *)malloc(sizeof(std::complex) * nj * p->batchSize); // batch + // c' + // work + + if (p->opts.debug) + printf("[%s t3] widcen, batch %.2fGB alloc:\t%.3g s\n", __func__, + (double)1E-09 * sizeof(std::complex) * (p->nf + nj) * p->batchSize, + timer.elapsedsec()); + if (!p->fwBatch || !p->CpBatch) { + fprintf(stderr, "[%s t3] malloc fail for fwBatch or CpBatch!\n", __func__); + return FINUFFT_ERR_ALLOC; + } + // printf("fwbatch, cpbatch ptrs: %llx %llx\n",p->fwBatch,p->CpBatch); + + // alloc rescaled NU src pts x'_j (in X etc), rescaled NU targ pts s'_k ... + // FIXME: should use realloc + if (p->X) free(p->X); + if (p->Sp) free(p->Sp); + p->X = (TF *)malloc(sizeof(TF) * nj); + p->Sp = (TF *)malloc(sizeof(TF) * nk); + if (d > 1) { + if (p->Y) free(p->Y); + if (p->Tp) free(p->Tp); + p->Y = (TF *)malloc(sizeof(TF) * nj); + p->Tp = (TF *)malloc(sizeof(TF) * nk); + } + if (d > 2) { + if (p->Z) free(p->Z); + if (p->Up) free(p->Up); + p->Z = (TF *)malloc(sizeof(TF) * nj); + p->Up = (TF *)malloc(sizeof(TF) * nk); + } + + // always shift as use gam to rescale x_j to x'_j, etc (twist iii)... + TF ig1 = 1.0 / p->t3P.gam1, ig2 = 0.0, ig3 = 0.0; // "reciprocal-math" optim + if (d > 1) ig2 = 1.0 / p->t3P.gam2; + if (d > 2) ig3 = 1.0 / p->t3P.gam3; +#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) + for (BIGINT j = 0; j < nj; ++j) { + p->X[j] = (xj[j] - p->t3P.C1) * ig1; // rescale x_j + if (d > 1) // (ok to do inside loop because of branch predict) + p->Y[j] = (yj[j] - p->t3P.C2) * ig2; // rescale y_j + if (d > 2) p->Z[j] = (zj[j] - p->t3P.C3) * ig3; // rescale z_j + } + + // set up prephase array... + std::complex imasign = + (p->fftSign >= 0) ? std::complex(0, 1) : std::complex(0, -1); // +-i + if (p->prephase) free(p->prephase); + p->prephase = (std::complex *)malloc(sizeof(std::complex) * nj); + if (p->t3P.D1 != 0.0 || p->t3P.D2 != 0.0 || p->t3P.D3 != 0.0) { +#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) + for (BIGINT j = 0; j < nj; ++j) { // ... loop over src NU locs + TF phase = p->t3P.D1 * xj[j]; + if (d > 1) phase += p->t3P.D2 * yj[j]; + if (d > 2) phase += p->t3P.D3 * zj[j]; + p->prephase[j] = cos(phase) + imasign * sin(phase); // Euler + // e^{+-i.phase} + } + } else + for (BIGINT j = 0; j < nj; ++j) + p->prephase[j] = (std::complex)1.0; // *** or keep flag so no mult in exec?? + + // rescale the target s_k etc to s'_k etc... 
+#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) + for (BIGINT k = 0; k < nk; ++k) { + p->Sp[k] = p->t3P.h1 * p->t3P.gam1 * (s[k] - p->t3P.D1); // so |s'_k| < pi/R + if (d > 1) + p->Tp[k] = p->t3P.h2 * p->t3P.gam2 * (t[k] - p->t3P.D2); // so |t'_k| < + // pi/R + if (d > 2) + p->Up[k] = p->t3P.h3 * p->t3P.gam3 * (u[k] - p->t3P.D3); // so |u'_k| < + // pi/R + } + // (old STEP 3a) Compute deconvolution post-factors array (per targ pt)... + // (exploits that FT separates because kernel is prod of 1D funcs) + if (p->deconv) free(p->deconv); + p->deconv = (std::complex *)malloc(sizeof(std::complex) * nk); + TF *phiHatk1 = (TF *)malloc(sizeof(TF) * nk); // don't confuse w/ p->phiHat + onedim_nuft_kernel(nk, p->Sp, phiHatk1, p->spopts); // fill phiHat1 + TF *phiHatk2 = NULL, *phiHatk3 = NULL; + if (d > 1) { + phiHatk2 = (TF *)malloc(sizeof(TF) * nk); + onedim_nuft_kernel(nk, p->Tp, phiHatk2, p->spopts); // fill phiHat2 + } + if (d > 2) { + phiHatk3 = (TF *)malloc(sizeof(TF) * nk); + onedim_nuft_kernel(nk, p->Up, phiHatk3, p->spopts); // fill phiHat3 + } + int Cfinite = + isfinite(p->t3P.C1) && isfinite(p->t3P.C2) && isfinite(p->t3P.C3); // C can be nan + // or inf if + // M=0, no + // input NU pts + int Cnonzero = p->t3P.C1 != 0.0 || p->t3P.C2 != 0.0 || p->t3P.C3 != 0.0; // cen +#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) + for (BIGINT k = 0; k < nk; ++k) { // .... loop over NU targ freqs + TF phiHat = phiHatk1[k]; + if (d > 1) phiHat *= phiHatk2[k]; + if (d > 2) phiHat *= phiHatk3[k]; + p->deconv[k] = (std::complex)(1.0 / phiHat); + if (Cfinite && Cnonzero) { + TF phase = (s[k] - p->t3P.D1) * p->t3P.C1; + if (d > 1) phase += (t[k] - p->t3P.D2) * p->t3P.C2; + if (d > 2) phase += (u[k] - p->t3P.D3) * p->t3P.C3; + p->deconv[k] *= cos(phase) + imasign * sin(phase); // Euler e^{+-i.phase} + } + } + free(phiHatk1); + free(phiHatk2); + free(phiHatk3); // done w/ deconv fill + if (p->opts.debug) + printf("[%s t3] phase & deconv factors:\t%.3g s\n", __func__, timer.elapsedsec()); + + // Set up sort for spreading Cp (from primed NU src pts X, Y, Z) to fw... + timer.restart(); + // Free sortIndices if it has been allocated before in case of repeated setpts + // calls causing memory leak. We don't know it is the same size as before, so we + // have to malloc each time. + if (p->sortIndices) free(p->sortIndices); + p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT) * p->nj); + if (!p->sortIndices) { + fprintf(stderr, "[%s t3] failed to allocate sortIndices!\n", __func__); + return FINUFFT_ERR_SPREAD_ALLOC; + } + p->didSort = indexSort(p->sortIndices, p->nf1, p->nf2, p->nf3, p->nj, p->X, p->Y, + p->Z, p->spopts); + if (p->opts.debug) + printf("[%s t3] sort (didSort=%d):\t\t%.3g s\n", __func__, p->didSort, + timer.elapsedsec()); + + // Plan and setpts once, for the (repeated) inner type 2 finufft call... + timer.restart(); + BIGINT t2nmodes[] = {p->nf1, p->nf2, p->nf3}; // t2 input is actually fw + finufft_opts t2opts = p->opts; // deep copy, since not ptrs + t2opts.modeord = 0; // needed for correct t3! + t2opts.debug = max(0, p->opts.debug - 1); // don't print as much detail + t2opts.spread_debug = max(0, p->opts.spread_debug - 1); + t2opts.showwarn = 0; // so don't see warnings 2x + // (...could vary other t2opts here?) 
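+    // Aside: written out in the e^{...} shorthand, the two arrays assembled
+    // above are (sign from fftSign; unused dims simply drop out)
+    //
+    //   prephase_j = e^{+-i (D1 x_j + D2 y_j + D3 z_j)}
+    //   deconv_k   = e^{+-i ((s_k-D1) C1 + (t_k-D2) C2 + (u_k-D3) C3)}
+    //                / ( phihat1(s'_k) phihat2(t'_k) phihat3(u'_k) )
+    //
+    // and execute then forms f_k as deconv_k times an inner type 2 NUFFT, at
+    // the primed targets (s',t',u'), of the fine grid spread from the
+    // prephased strengths prephase_j * c_j at the primed sources (x',y',z').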
+ if (p->innerT2plan) { + delete p->innerT2plan; + p->innerT2plan = nullptr; + } + int ier = finufft_makeplan_t(2, d, t2nmodes, p->fftSign, p->batchSize, p->tol, + &p->innerT2plan, &t2opts); + if (ier > 1) { // if merely warning, still proceed + fprintf(stderr, "[%s t3]: inner type 2 plan creation failed with ier=%d!\n", + __func__, ier); + return ier; + } + ier = finufft_setpts_t(p->innerT2plan, nk, p->Sp, p->Tp, p->Up, 0, NULL, NULL, + NULL); // note nk = # output points (not nj) + if (ier > 1) { + fprintf(stderr, "[%s t3]: inner type 2 setpts failed, ier=%d!\n", __func__, ier); + return ier; + } + if (p->opts.debug) + printf("[%s t3] inner t2 plan & setpts: \t%.3g s\n", __func__, timer.elapsedsec()); + } + return 0; +} +template int finufft_setpts_t(FINUFFT_PLAN_T *p, BIGINT nj, float *xj, + float *yj, float *zj, BIGINT nk, float *s, float *t, + float *u); +template int finufft_setpts_t(FINUFFT_PLAN_T *p, BIGINT nj, double *xj, + double *yj, double *zj, BIGINT nk, double *s, + double *t, double *u); + +// ............ end setpts .................................................. + +// EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE +template +int finufft_execute_t(FINUFFT_PLAN_T *p, std::complex *cj, std::complex *fk) { + /* See ../docs/cguru.doc for current documentation. + + For given (stack of) weights cj or coefficients fk, performs NUFFTs with + existing (sorted) NU pts and existing plan. + For type 1 and 3: cj is input, fk is output. + For type 2: fk is input, cj is output. + Performs spread/interp, pre/post deconvolve, and FFT as appropriate + for each of the 3 types. + For cases of ntrans>1, performs work in blocks of size up to batchSize. + Return value 0 (no error diagnosis yet). + Barnett 5/20/20, based on Malleo 2019. 
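+  Between setpts above and execute below, the full guru life cycle seen from
+  user code looks like the following minimal sketch against the public C/C++
+  API (error checks elided; names as declared in finufft.h):
+
+    #include <finufft.h>
+    #include <complex>
+    #include <cstdint>
+    // 1D type 1, double precision: M strengths c at points x -> 1000 modes f
+    void demo(int64_t M, double *x, std::complex<double> *c,
+              std::complex<double> *f) {
+      int64_t n_modes[3] = {1000, 1, 1};      // mt, mu unused in 1D
+      finufft_plan plan;
+      finufft_makeplan(1, 1, n_modes, +1, 1, 1e-9, &plan, NULL); // type 1, dim 1
+      finufft_setpts(plan, M, x, NULL, NULL, 0, NULL, NULL, NULL);
+      finufft_execute(plan, c, f);            // c in, f out (type 1)
+      finufft_destroy(plan);
+    }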
+*/ + CNTime timer; + timer.start(); + + if (p->type != 3) { // --------------------- TYPE 1,2 EXEC ------------------ + + double t_sprint = 0.0, t_fft = 0.0, t_deconv = 0.0; // accumulated timing + if (p->opts.debug) + printf("[%s] start ntrans=%d (%d batches, bsize=%d)...\n", __func__, p->ntrans, + p->nbatch, p->batchSize); + + for (int b = 0; b * p->batchSize < p->ntrans; b++) { // .....loop b over batches + + // current batch is either batchSize, or possibly truncated if last one + int thisBatchSize = min(p->ntrans - b * p->batchSize, p->batchSize); + int bB = b * p->batchSize; // index of vector, since batchsizes same + std::complex *cjb = cj + bB * p->nj; // point to batch of weights + std::complex *fkb = fk + bB * p->N; // point to batch of mode coeffs + if (p->opts.debug > 1) + printf("[%s] start batch %d (size %d):\n", __func__, b, thisBatchSize); + + // STEP 1: (varies by type) + timer.restart(); + if (p->type == 1) { // type 1: spread NU pts p->X, weights cj, to fw grid + spreadinterpSortedBatch(thisBatchSize, p, cjb); + t_sprint += timer.elapsedsec(); + } else { // type 2: amplify Fourier coeffs fk into 0-padded fw + deconvolveBatch(thisBatchSize, p, fkb); + t_deconv += timer.elapsedsec(); + } + + // STEP 2: call the FFT on this batch + timer.restart(); + do_fft(p); + t_fft += timer.elapsedsec(); + if (p->opts.debug > 1) printf("\tFFT exec:\t\t%.3g s\n", timer.elapsedsec()); + + // STEP 3: (varies by type) + timer.restart(); + if (p->type == 1) { // type 1: deconvolve (amplify) fw and shuffle to fk + deconvolveBatch(thisBatchSize, p, fkb); + t_deconv += timer.elapsedsec(); + } else { // type 2: interpolate unif fw grid to NU target pts + spreadinterpSortedBatch(thisBatchSize, p, cjb); + t_sprint += timer.elapsedsec(); + } + } // ........end b loop + + if (p->opts.debug) { // report total times in their natural order... + if (p->type == 1) { + printf("[%s] done. tot spread:\t\t%.3g s\n", __func__, t_sprint); + printf(" tot FFT:\t\t\t\t%.3g s\n", t_fft); + printf(" tot deconvolve:\t\t\t%.3g s\n", t_deconv); + } else { + printf("[%s] done. tot deconvolve:\t\t%.3g s\n", __func__, t_deconv); + printf(" tot FFT:\t\t\t\t%.3g s\n", t_fft); + printf(" tot interp:\t\t\t%.3g s\n", t_sprint); + } + } + } + + else { // ----------------------------- TYPE 3 EXEC --------------------- + + // for (BIGINT j=0;j<10;++j) printf("\tcj[%ld]=%.15g+%.15gi\n",(long + // int)j,(double)real(cj[j]),(double)imag(cj[j])); // debug + + double t_pre = 0.0, t_spr = 0.0, t_t2 = 0.0, + t_deconv = 0.0; // accumulated timings + if (p->opts.debug) + printf("[%s t3] start ntrans=%d (%d batches, bsize=%d)...\n", __func__, p->ntrans, + p->nbatch, p->batchSize); + + for (int b = 0; b * p->batchSize < p->ntrans; b++) { // .....loop b over batches + + // batching and pointers to this batch, identical to t1,2 above... + int thisBatchSize = min(p->ntrans - b * p->batchSize, p->batchSize); + int bB = b * p->batchSize; + std::complex *cjb = cj + bB * p->nj; // batch of input strengths + std::complex *fkb = fk + bB * p->nk; // batch of output strengths + if (p->opts.debug > 1) + printf("[%s t3] start batch %d (size %d):\n", __func__, b, thisBatchSize); + + // STEP 0: pre-phase (possibly) the c_j input strengths into c'_j batch... + timer.restart(); +#pragma omp parallel for num_threads(p->opts.nthreads) // or p->batchSize? 
+ for (int i = 0; i < thisBatchSize; i++) { + BIGINT ioff = i * p->nj; + for (BIGINT j = 0; j < p->nj; ++j) { + p->CpBatch[ioff + j] = p->prephase[j] * cjb[ioff + j]; + } + } + t_pre += timer.elapsedsec(); + + // STEP 1: spread c'_j batch (x'_j NU pts) into fw batch grid... + timer.restart(); + p->spopts.spread_direction = 1; // spread + spreadinterpSortedBatch(thisBatchSize, p, p->CpBatch); // p->X are primed + t_spr += timer.elapsedsec(); + + // STEP 2: type 2 NUFFT from fw batch to user output fk array batch... + timer.restart(); + // illegal possible shrink of ntrans *after* plan for smaller last batch: + p->innerT2plan->ntrans = thisBatchSize; // do not try this at home! + /* (alarming that FFT not shrunk, but safe, because t2's fwBatch array + still the same size, as Andrea explained; just wastes a few flops) */ + finufft_execute_t(p->innerT2plan, fkb, p->fwBatch); + t_t2 += timer.elapsedsec(); + // STEP 3: apply deconvolve (precomputed 1/phiHat(targ_k), phasing too)... + timer.restart(); +#pragma omp parallel for num_threads(p->opts.nthreads) + for (int i = 0; i < thisBatchSize; i++) { + BIGINT ioff = i * p->nk; + for (BIGINT k = 0; k < p->nk; ++k) fkb[ioff + k] *= p->deconv[k]; + } + t_deconv += timer.elapsedsec(); + } // ........end b loop + + if (p->opts.debug) { // report total times in their natural order... + printf("[%s t3] done. tot prephase:\t\t%.3g s\n", __func__, t_pre); + printf(" tot spread:\t\t\t%.3g s\n", t_spr); + printf(" tot type 2:\t\t\t%.3g s\n", t_t2); + printf(" tot deconvolve:\t\t%.3g s\n", t_deconv); + } + } + // for (BIGINT k=0;k<10;++k) printf("\tfk[%ld]=%.15g+%.15gi\n",(long + // int)k,(double)real(fk[k]),(double)imag(fk[k])); // debug + + return 0; +} +template int finufft_execute_t(FINUFFT_PLAN_T *p, std::complex *cj, + std::complex *fk); +template int finufft_execute_t( + FINUFFT_PLAN_T *p, std::complex *cj, std::complex *fk); + +// DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD +template FINUFFT_PLAN_T::~FINUFFT_PLAN_T() { + // Free everything we allocated inside of finufft_plan pointed to by p. + // Also must not crash if called immediately after finufft_makeplan. + // Thus either each thing free'd here is guaranteed to be NULL or correctly + // allocated. + if (fftPlan) fftPlan->free(fwBatch); // free the big FFTW (or t3 spread) working array + free(sortIndices); + if (type == 1 || type == 2) { + free(phiHat1); + free(phiHat2); + free(phiHat3); + } else { // free the stuff alloc for type 3 only + delete innerT2plan; + innerT2plan = nullptr; // if NULL, ignore its error code + free(CpBatch); + free(Sp); + free(Tp); + free(Up); + free(X); + free(Y); + free(Z); + free(prephase); + free(deconv); + } +} +template FINUFFT_PLAN_T::~FINUFFT_PLAN_T(); +template FINUFFT_PLAN_T::~FINUFFT_PLAN_T(); diff --git a/src/simpleinterfaces.cpp b/src/simpleinterfaces.cpp index 1fb49db9e..454f22af7 100644 --- a/src/simpleinterfaces.cpp +++ b/src/simpleinterfaces.cpp @@ -31,12 +31,12 @@ static int invokeGuruInterface(int n_dims, int type, int n_transf, BIGINT nj, FL // Helper layer between simple interfaces (with opts) and the guru functions. // Author: Andrea Malleo, 2019. { - FINUFFT_PLAN plan; + FINUFFT_PLAN plan = nullptr; int ier = FINUFFT_MAKEPLAN(type, n_dims, n_modes.data(), iflag, n_transf, eps, &plan, popts); // popts (ptr to opts) can be NULL if (ier > 1) { // since 1 (a warning) still allows proceeding... 
fprintf(stderr, "FINUFFT invokeGuru: plan error (ier=%d)!\n", ier); - delete plan; + FINUFFT_DESTROY(plan); return ier; } diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index b2b5ea8b0..126bcd2d7 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -2162,6 +2162,7 @@ FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(finufft_spread_opts &opts, T eps spreading must not proceed Barnett 2017. debug, loosened eps logic 6/14/20. */ { + constexpr T EPSILON = std::numeric_limits::epsilon(); if (upsampfac != 2.0 && upsampfac != 1.25) { // nonstandard sigma if (kerevalmeth == 1) { fprintf(stderr, diff --git a/test/testutils.cpp b/test/testutils.cpp index 64b5d7a0a..7b550ebff 100644 --- a/test/testutils.cpp +++ b/test/testutils.cpp @@ -57,7 +57,8 @@ int main(int argc, char *argv[]) { a[j] = CPX(1.0, 0.0); b[j] = a[j]; } - FLT relerr = 2.0 * EPSILON; // 1 ULP, fine since 1.0 rep exactly + constexpr FLT EPSILON = std::numeric_limits::epsilon(); + FLT relerr = 2.0 * EPSILON; // 1 ULP, fine since 1.0 rep exactly if (abs(infnorm(M, &a[0]) - 1.0) > relerr) return 1; if (abs(twonorm(M, &a[0]) - sqrt((FLT)M)) > relerr * sqrt((FLT)M)) return 1; b[0] = CPX(0.0, 0.0); // perturb b from a From aca3e5bbc8c8fb1a5faca959fe0dc46212781c1c Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Wed, 25 Sep 2024 15:19:18 +0200 Subject: [PATCH 12/20] fixes --- include/finufft/utils_precindep.h | 2 +- src/fft.cpp | 38 +++++++++++++++---------------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/include/finufft/utils_precindep.h b/include/finufft/utils_precindep.h index 8dd3839d8..41726eba8 100644 --- a/include/finufft/utils_precindep.h +++ b/include/finufft/utils_precindep.h @@ -4,7 +4,7 @@ #ifndef UTILS_PRECINDEP_H #define UTILS_PRECINDEP_H -// #include "defs.h" +#include "finufft/finufft_core.h" // for CNTime... // using chrono since the interface is portable between linux and windows #include diff --git a/src/fft.cpp b/src/fft.cpp index 3a7fbf2f6..68877cacd 100644 --- a/src/fft.cpp +++ b/src/fft.cpp @@ -34,9 +34,9 @@ template void do_fft(FINUFFT_PLAN_T *p) { arrdims.push_back(size_t(ns[2])); axes.push_back(3); } - ducc0::vfmav data(p->fwBatch, arrdims); + ducc0::vfmav> data(p->fwBatch, arrdims); #ifdef FINUFFT_NO_DUCC0_TWEAKS - ducc0::c2c(data, data, axes, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(data, data, axes, p->fftSign < 0, TF(1), nthreads); #else /* For type 1 NUFFTs, only the low-frequency parts of the output fine grid are going to be used, and for type 2 NUFFTs, the high frequency parts of the @@ -47,10 +47,10 @@ template void do_fft(FINUFFT_PLAN_T *p) { of all 1D FFTs, and for the last remaining axis the factor is 1/oversampling_factor^2. 
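     (Illustrative arithmetic, not from the patch: with oversampling factor
     sigma = 2 in 3D, the first axis transformed is done in full, the next on
     roughly 1/2 of the 1D lines, and the last on roughly 1/4 of them, so the
     total FFT work drops to about (1 + 1/2 + 1/4)/3, i.e. roughly 58% of the
     full-grid transform.)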
*/ if (p->dim == 1) // 1D: no chance for FFT shortcuts - ducc0::c2c(data, data, axes, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(data, data, axes, p->fftSign < 0, TF(1), nthreads); else if (p->dim == 2) { // 2D: do partial FFTs if (p->ms < 2) // something is weird, do standard FFT - ducc0::c2c(data, data, axes, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(data, data, axes, p->fftSign < 0, TF(1), nthreads); else { size_t y_lo = size_t((p->ms + 1) / 2); size_t y_hi = size_t(ns[1] - p->ms / 2); @@ -60,17 +60,17 @@ template void do_fft(FINUFFT_PLAN_T *p) { auto sub2 = ducc0::subarray(data, {{}, {}, {y_hi, ducc0::MAXIDX}}); if (p->type == 1) // spreading, not all parts of the output array are needed // do axis 2 in full - ducc0::c2c(data, data, {2}, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(data, data, {2}, p->fftSign < 0, TF(1), nthreads); // do only parts of axis 1 - ducc0::c2c(sub1, sub1, {1}, p->fftSign < 0, FLT(1), nthreads); - ducc0::c2c(sub2, sub2, {1}, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(sub1, sub1, {1}, p->fftSign < 0, TF(1), nthreads); + ducc0::c2c(sub2, sub2, {1}, p->fftSign < 0, TF(1), nthreads); if (p->type == 2) // interpolation, parts of the input array are zero // do axis 2 in full - ducc0::c2c(data, data, {2}, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(data, data, {2}, p->fftSign < 0, TF(1), nthreads); } } else { // 3D if ((p->ms < 2) || (p->mt < 2)) // something is weird, do standard FFT - ducc0::c2c(data, data, axes, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(data, data, axes, p->fftSign < 0, TF(1), nthreads); else { size_t z_lo = size_t((p->ms + 1) / 2); size_t z_hi = size_t(ns[2] - p->ms / 2); @@ -84,22 +84,22 @@ template void do_fft(FINUFFT_PLAN_T *p) { auto sub6 = ducc0::subarray(sub2, {{}, {}, {y_hi, ducc0::MAXIDX}, {}}); if (p->type == 1) { // spreading, not all parts of the output array are needed // do axis 3 in full - ducc0::c2c(data, data, {3}, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(data, data, {3}, p->fftSign < 0, TF(1), nthreads); // do only parts of axis 2 - ducc0::c2c(sub1, sub1, {2}, p->fftSign < 0, FLT(1), nthreads); - ducc0::c2c(sub2, sub2, {2}, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(sub1, sub1, {2}, p->fftSign < 0, TF(1), nthreads); + ducc0::c2c(sub2, sub2, {2}, p->fftSign < 0, TF(1), nthreads); } // do even smaller parts of axis 1 - ducc0::c2c(sub3, sub3, {1}, p->fftSign < 0, FLT(1), nthreads); - ducc0::c2c(sub4, sub4, {1}, p->fftSign < 0, FLT(1), nthreads); - ducc0::c2c(sub5, sub5, {1}, p->fftSign < 0, FLT(1), nthreads); - ducc0::c2c(sub6, sub6, {1}, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(sub3, sub3, {1}, p->fftSign < 0, TF(1), nthreads); + ducc0::c2c(sub4, sub4, {1}, p->fftSign < 0, TF(1), nthreads); + ducc0::c2c(sub5, sub5, {1}, p->fftSign < 0, TF(1), nthreads); + ducc0::c2c(sub6, sub6, {1}, p->fftSign < 0, TF(1), nthreads); if (p->type == 2) { // interpolation, parts of the input array are zero // do only parts of axis 2 - ducc0::c2c(sub1, sub1, {2}, p->fftSign < 0, FLT(1), nthreads); - ducc0::c2c(sub2, sub2, {2}, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(sub1, sub1, {2}, p->fftSign < 0, TF(1), nthreads); + ducc0::c2c(sub2, sub2, {2}, p->fftSign < 0, TF(1), nthreads); // do axis 3 in full - ducc0::c2c(data, data, {3}, p->fftSign < 0, FLT(1), nthreads); + ducc0::c2c(data, data, {3}, p->fftSign < 0, TF(1), nthreads); } } } From f6da37f09596d7540d155618e4190303f7e03b7b Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Wed, 25 Sep 2024 16:39:43 +0200 Subject: [PATCH 13/20] more 
templatizing
---
 CMakeLists.txt              |   9 +-
 perftest/manysmallprobs.cpp |   1 +
 src/simpleinterfaces.cpp    | 368 ++++++++++++++++++++++++++++--------
 src/utils_precindep.cpp     |   1 -
 4 files changed, 295 insertions(+), 84 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 435bcd8c9..0002da9e4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -121,8 +121,7 @@ endif()

 # This set of sources is compiled twice, once in single precision and once in
 # double precision The single precision compilation is done with -DSINGLE
-set(FINUFFT_PRECISION_DEPENDENT_SOURCES src/finufft.cpp
-    src/simpleinterfaces.cpp)
+set(FINUFFT_PRECISION_DEPENDENT_SOURCES)

 # If we're building for Fortran, make sure we also include the translation
 # layer.
@@ -261,12 +260,14 @@ if(FINUFFT_USE_CPU)
     add_library(
       finufft SHARED
       src/spreadinterp.cpp src/utils_precindep.cpp
-      contrib/legendre_rule_fast.cpp src/fft.cpp src/finufft_core.cpp)
+      contrib/legendre_rule_fast.cpp src/fft.cpp src/finufft_core.cp
+      src/simpleinterfaces.cpp)
   else()
     add_library(
       finufft STATIC
       src/spreadinterp.cpp src/utils_precindep.cpp
-      contrib/legendre_rule_fast.cpp src/fft.cpp src/finufft_core.cpp)
+      contrib/legendre_rule_fast.cpp src/fft.cpp src/finufft_core.cpp
+      src/simpleinterfaces.cpp)
   endif()
   target_link_libraries(finufft PRIVATE finufft_f32 finufft_f64)
   set_finufft_options(finufft)
diff --git a/perftest/manysmallprobs.cpp b/perftest/manysmallprobs.cpp
index 0f2c9d0bb..8bc379f3c 100644
--- a/perftest/manysmallprobs.cpp
+++ b/perftest/manysmallprobs.cpp
@@ -1,5 +1,6 @@
 // public header
 #include "finufft.h"
+#include "finufft/defs.h"

 // private access to timer
 #include "finufft/utils_precindep.h"
diff --git a/src/simpleinterfaces.cpp b/src/simpleinterfaces.cpp
index 454f22af7..43d9806aa 100644
--- a/src/simpleinterfaces.cpp
+++ b/src/simpleinterfaces.cpp
@@ -3,7 +3,8 @@
 // private headers
 #include <array>
 #include <complex>
-#include <finufft/defs.h>
+#include <finufft/finufft_core.h> // (must come after complex.h)
+
 using namespace std;

 /* ---------------------------------------------------------------------------
@@ -19,42 +20,103 @@

    ---------------------------------------------------------------------------
*/
+void finufft_default_opts(finufft_opts *o) { finufft_default_opts_t(o); }
+void finufftf_default_opts(finufft_opts *o) { finufft_default_opts_t(o); }
+
+int finufft_makeplan(int type, int dim, const BIGINT *n_modes, int iflag, int ntrans,
+                     double tol, finufft_plan *pp, finufft_opts *opts) {
+  return finufft_makeplan_t(type, dim, n_modes, iflag, ntrans, tol,
+                            reinterpret_cast<FINUFFT_PLAN_T<double> **>(pp),
+                            opts);
+}
+int finufftf_makeplan(int type, int dim, const BIGINT *n_modes, int iflag, int ntrans,
+                      float tol, finufftf_plan *pp, finufft_opts *opts) {
+  return finufft_makeplan_t(type, dim, n_modes, iflag, ntrans, tol,
+                            reinterpret_cast<FINUFFT_PLAN_T<float> **>(pp), opts);
+}
+
+int finufft_setpts(finufft_plan p, BIGINT nj, double *xj, double *yj, double *zj,
+                   BIGINT nk, double *s, double *t, double *u) {
+  return finufft_setpts_t(reinterpret_cast<FINUFFT_PLAN_T<double> *>(p), nj, xj,
+                          yj, zj, nk, s, t, u);
+}
+int finufftf_setpts(finufftf_plan p, BIGINT nj, float *xj, float *yj, float *zj,
+                    BIGINT nk, float *s, float *t, float *u) {
+  return finufft_setpts_t(reinterpret_cast<FINUFFT_PLAN_T<float> *>(p), nj, xj, yj,
+                          zj, nk, s, t, u);
+}
+
+int finufft_execute(finufft_plan p, std::complex<double> *cj, std::complex<double> *fk) {
+  return finufft_execute_t(reinterpret_cast<FINUFFT_PLAN_T<double> *>(p), cj, fk);
+}
+int finufftf_execute(finufftf_plan p, std::complex<float> *cj, std::complex<float> *fk) {
+  return finufft_execute_t(reinterpret_cast<FINUFFT_PLAN_T<float> *>(p), cj, fk);
+}
+
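// (Illustrative lifecycle for the C wrappers above; sketch only, not part of
// the patch. M, x, c, F are assumed user arrays of the right types and sizes.)
//
//   finufft_plan p;
//   BIGINT n_modes[3] = {1000, 1, 1};
//   finufft_makeplan(1, 1, n_modes, +1, 1, 1e-9, &p, NULL);   // type 1, 1D
//   finufft_setpts(p, M, x, NULL, NULL, 0, NULL, NULL, NULL); // NU points
//   finufft_execute(p, c, F); // c: M strengths in, F: 1000 modes out
//   finufft_destroy(p);
//
// Each wrapper forwards to the finufft_*_t templates with TF = double; the
// finufftf_* variants do the same with TF = float.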
+int finufft_destroy(finufft_plan p)
+// Free everything we allocated inside of finufft_plan pointed to by p.
+// Also must not crash if called immediately after finufft_makeplan.
+// Thus either each thing free'd here is guaranteed to be NULL or correctly
+// allocated.
+{
+  if (!p) // NULL ptr, so not a ptr to a plan, report error
+    return 1;
+
+  delete reinterpret_cast<FINUFFT_PLAN_T<double> *>(p);
+  p = nullptr;
+  return 0; // success
+}
+int finufftf_destroy(finufftf_plan p)
+// Free everything we allocated inside of finufft_plan pointed to by p.
+// Also must not crash if called immediately after finufft_makeplan.
+// Thus either each thing free'd here is guaranteed to be NULL or correctly
+// allocated.
+{
+  if (!p) // NULL ptr, so not a ptr to a plan, report error
+    return 1;
+
+  delete reinterpret_cast<FINUFFT_PLAN_T<float> *>(p);
+  p = nullptr;
+  return 0; // success
+}
 // Helper layer ...........................................................

 namespace finufft {
 namespace common {

-static int invokeGuruInterface(int n_dims, int type, int n_transf, BIGINT nj, FLT *xj,
-                               FLT *yj, FLT *zj, CPX *cj, int iflag, FLT eps,
-                               const std::array<BIGINT, 3> &n_modes, BIGINT nk, FLT *s,
-                               FLT *t, FLT *u, CPX *fk, finufft_opts *popts)
+template<typename T>
+static int invokeGuruInterface(int n_dims, int type, int n_transf, BIGINT nj, T *xj,
+                               T *yj, T *zj, std::complex<T> *cj, int iflag, T eps,
+                               const std::array<BIGINT, 3> &n_modes, BIGINT nk, T *s,
+                               T *t, T *u, std::complex<T> *fk, finufft_opts *popts)
 // Helper layer between simple interfaces (with opts) and the guru functions.
 // Author: Andrea Malleo, 2019.
 {
-  FINUFFT_PLAN plan = nullptr;
-  int ier = FINUFFT_MAKEPLAN(type, n_dims, n_modes.data(), iflag, n_transf, eps, &plan,
-                             popts); // popts (ptr to opts) can be NULL
-  if (ier > 1) { // since 1 (a warning) still allows proceeding...
+  FINUFFT_PLAN_T<T> *plan = nullptr;
+  int ier =
+      finufft_makeplan_t(type, n_dims, n_modes.data(), iflag, n_transf, eps, &plan,
+                         popts); // popts (ptr to opts) can be NULL
+  if (ier > 1) {                 // since 1 (a warning) still allows proceeding...
     fprintf(stderr, "FINUFFT invokeGuru: plan error (ier=%d)!\n", ier);
-    FINUFFT_DESTROY(plan);
+    delete plan;
     return ier;
   }
-  int ier2 = FINUFFT_SETPTS(plan, nj, xj, yj, zj, nk, s, t, u);
+  int ier2 = finufft_setpts_t(plan, nj, xj, yj, zj, nk, s, t, u);
   if (ier2 > 1) {
     fprintf(stderr, "FINUFFT invokeGuru: setpts error (ier=%d)!\n", ier2);
-    FINUFFT_DESTROY(plan);
+    delete plan;
     return ier2;
   }
-  int ier3 = FINUFFT_EXECUTE(plan, cj, fk);
+  int ier3 = finufft_execute_t(plan, cj, fk);
   if (ier3 > 1) {
     fprintf(stderr, "FINUFFT invokeGuru: execute error (ier=%d)!\n", ier3);
-    FINUFFT_DESTROY(plan);
+    delete plan;
     return ier3;
   }
-  FINUFFT_DESTROY(plan);
+  delete plan;

   return max(max(ier, ier2), ier3); // in case any one gave a (positive!) warning
 }
@@ -65,135 +127,283 @@ using namespace finufft::common;

 // Dimension 1111111111111111111111111111111111111111111111111111111111111111

-int FINUFFT1D1MANY(int n_transf, BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps,
-                   BIGINT ms, CPX *fk, finufft_opts *opts)
+int finufft1d1many(int n_transf, BIGINT nj, double *xj, std::complex<double> *cj,
+                   int iflag, double eps, BIGINT ms, std::complex<double> *fk,
+                   finufft_opts *opts)
+// Type-1 1D complex nonuniform FFT for many vectors.
See ../docs/usage.rst +{ + return invokeGuruInterface(1, 1, n_transf, nj, xj, nullptr, nullptr, cj, iflag, + eps, {ms, 1, 1}, 0, nullptr, nullptr, nullptr, fk, + opts); +} +int finufftf1d1many(int n_transf, BIGINT nj, float *xj, std::complex *cj, + int iflag, float eps, BIGINT ms, std::complex *fk, + finufft_opts *opts) // Type-1 1D complex nonuniform FFT for many vectors. See ../docs/usage.rst { - return invokeGuruInterface(1, 1, n_transf, nj, xj, nullptr, nullptr, cj, iflag, eps, - {ms, 1, 1}, 0, nullptr, nullptr, nullptr, fk, opts); + return invokeGuruInterface(1, 1, n_transf, nj, xj, nullptr, nullptr, cj, iflag, + eps, {ms, 1, 1}, 0, nullptr, nullptr, nullptr, fk, + opts); } -int FINUFFT1D1(BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, BIGINT ms, CPX *fk, - finufft_opts *opts) +int finufft1d1(BIGINT nj, double *xj, std::complex *cj, int iflag, double eps, + BIGINT ms, std::complex *fk, finufft_opts *opts) +// Type-1 1D complex nonuniform FFT. See ../docs/usage.rst +{ + return finufft1d1many(1, nj, xj, cj, iflag, eps, ms, fk, opts); +} +int finufftf1d1(BIGINT nj, float *xj, std::complex *cj, int iflag, float eps, + BIGINT ms, std::complex *fk, finufft_opts *opts) // Type-1 1D complex nonuniform FFT. See ../docs/usage.rst { - return FINUFFT1D1MANY(1, nj, xj, cj, iflag, eps, ms, fk, opts); + return finufftf1d1many(1, nj, xj, cj, iflag, eps, ms, fk, opts); } -int FINUFFT1D2MANY(int n_transf, BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, - BIGINT ms, CPX *fk, finufft_opts *opts) +int finufft1d2many(int n_transf, BIGINT nj, double *xj, std::complex *cj, + int iflag, double eps, BIGINT ms, std::complex *fk, + finufft_opts *opts) // Type-2 1D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - return invokeGuruInterface(1, 2, n_transf, nj, xj, NULL, NULL, cj, iflag, eps, - {ms, 1, 1}, 0, NULL, NULL, NULL, fk, opts); + return invokeGuruInterface(1, 2, n_transf, nj, xj, NULL, NULL, cj, iflag, eps, + {ms, 1, 1}, 0, NULL, NULL, NULL, fk, opts); +} +int finufftf1d2many(int n_transf, BIGINT nj, float *xj, std::complex *cj, + int iflag, float eps, BIGINT ms, std::complex *fk, + finufft_opts *opts) +// Type-2 1D complex nonuniform FFT, many vectors. See ../docs/usage.rst +{ + return invokeGuruInterface(1, 2, n_transf, nj, xj, NULL, NULL, cj, iflag, eps, + {ms, 1, 1}, 0, NULL, NULL, NULL, fk, opts); } -int FINUFFT1D2(BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, BIGINT ms, CPX *fk, - finufft_opts *opts) +int finufft1d2(BIGINT nj, double *xj, std::complex *cj, int iflag, double eps, + BIGINT ms, std::complex *fk, finufft_opts *opts) // Type-2 1D complex nonuniform FFT. See ../docs/usage.rst { - return FINUFFT1D2MANY(1, nj, xj, cj, iflag, eps, ms, fk, opts); + return finufft1d2many(1, nj, xj, cj, iflag, eps, ms, fk, opts); +} +int finufftf1d2(BIGINT nj, float *xj, std::complex *cj, int iflag, float eps, + BIGINT ms, std::complex *fk, finufft_opts *opts) +// Type-2 1D complex nonuniform FFT. See ../docs/usage.rst +{ + return finufftf1d2many(1, nj, xj, cj, iflag, eps, ms, fk, opts); } -int FINUFFT1D3MANY(int n_transf, BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, - BIGINT nk, FLT *s, CPX *fk, finufft_opts *opts) +int finufft1d3many(int n_transf, BIGINT nj, double *xj, std::complex *cj, + int iflag, double eps, BIGINT nk, double *s, std::complex *fk, + finufft_opts *opts) +// Type-3 1D complex nonuniform FFT, many vectors. 
See ../docs/usage.rst +{ + return invokeGuruInterface(1, 3, n_transf, nj, xj, NULL, NULL, cj, iflag, eps, + {0, 0, 0}, nk, s, NULL, NULL, fk, opts); +} +int finufftf1d3many(int n_transf, BIGINT nj, float *xj, std::complex *cj, + int iflag, float eps, BIGINT nk, float *s, std::complex *fk, + finufft_opts *opts) // Type-3 1D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - return invokeGuruInterface(1, 3, n_transf, nj, xj, NULL, NULL, cj, iflag, eps, - {0, 0, 0}, nk, s, NULL, NULL, fk, opts); + return invokeGuruInterface(1, 3, n_transf, nj, xj, NULL, NULL, cj, iflag, eps, + {0, 0, 0}, nk, s, NULL, NULL, fk, opts); +} +int finufft1d3(BIGINT nj, double *xj, std::complex *cj, int iflag, double eps, + BIGINT nk, double *s, std::complex *fk, finufft_opts *opts) +// Type-3 1D complex nonuniform FFT. See ../docs/usage.rst +{ + return finufft1d3many(1, nj, xj, cj, iflag, eps, nk, s, fk, opts); } -int FINUFFT1D3(BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, BIGINT nk, FLT *s, - CPX *fk, finufft_opts *opts) +int finufftf1d3(BIGINT nj, float *xj, std::complex *cj, int iflag, float eps, + BIGINT nk, float *s, std::complex *fk, finufft_opts *opts) // Type-3 1D complex nonuniform FFT. See ../docs/usage.rst { - return FINUFFT1D3MANY(1, nj, xj, cj, iflag, eps, nk, s, fk, opts); + return finufftf1d3many(1, nj, xj, cj, iflag, eps, nk, s, fk, opts); } // Dimension 22222222222222222222222222222222222222222222222222222222222222222 -int FINUFFT2D1MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, CPX *c, int iflag, FLT eps, - BIGINT ms, BIGINT mt, CPX *fk, finufft_opts *opts) +int finufft2d1many(int n_transf, BIGINT nj, double *xj, double *yj, + std::complex *c, int iflag, double eps, BIGINT ms, BIGINT mt, + std::complex *fk, finufft_opts *opts) // Type-1 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - return invokeGuruInterface(2, 1, n_transf, nj, xj, yj, NULL, c, iflag, eps, {ms, mt, 1}, - 0, NULL, NULL, NULL, fk, opts); + return invokeGuruInterface(2, 1, n_transf, nj, xj, yj, NULL, c, iflag, eps, + {ms, mt, 1}, 0, NULL, NULL, NULL, fk, opts); } -int FINUFFT2D1(BIGINT nj, FLT *xj, FLT *yj, CPX *cj, int iflag, FLT eps, BIGINT ms, - BIGINT mt, CPX *fk, finufft_opts *opts) +int finufftf2d1many(int n_transf, BIGINT nj, float *xj, float *yj, std::complex *c, + int iflag, float eps, BIGINT ms, BIGINT mt, std::complex *fk, + finufft_opts *opts) +// Type-1 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst +{ + return invokeGuruInterface(2, 1, n_transf, nj, xj, yj, NULL, c, iflag, eps, + {ms, mt, 1}, 0, NULL, NULL, NULL, fk, opts); +} +int finufft2d1(BIGINT nj, double *xj, double *yj, std::complex *cj, int iflag, + double eps, BIGINT ms, BIGINT mt, std::complex *fk, + finufft_opts *opts) +// Type-1 2D complex nonuniform FFT. See ../docs/usage.rst +{ + return finufft2d1many(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts); +} +int finufftf2d1(BIGINT nj, float *xj, float *yj, std::complex *cj, int iflag, + float eps, BIGINT ms, BIGINT mt, std::complex *fk, + finufft_opts *opts) // Type-1 2D complex nonuniform FFT. 
See ../docs/usage.rst { - return FINUFFT2D1MANY(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts); + return finufftf2d1many(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts); } -int FINUFFT2D2MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, CPX *c, int iflag, FLT eps, - BIGINT ms, BIGINT mt, CPX *fk, finufft_opts *opts) +int finufft2d2many(int n_transf, BIGINT nj, double *xj, double *yj, + std::complex *c, int iflag, double eps, BIGINT ms, BIGINT mt, + std::complex *fk, finufft_opts *opts) +// Type-2 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst +{ + return invokeGuruInterface(2, 2, n_transf, nj, xj, yj, NULL, c, iflag, eps, + {ms, mt, 1}, 0, NULL, NULL, NULL, fk, opts); +} +int finufftf2d2many(int n_transf, BIGINT nj, float *xj, float *yj, std::complex *c, + int iflag, float eps, BIGINT ms, BIGINT mt, std::complex *fk, + finufft_opts *opts) // Type-2 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - return invokeGuruInterface(2, 2, n_transf, nj, xj, yj, NULL, c, iflag, eps, {ms, mt, 1}, - 0, NULL, NULL, NULL, fk, opts); + return invokeGuruInterface(2, 2, n_transf, nj, xj, yj, NULL, c, iflag, eps, + {ms, mt, 1}, 0, NULL, NULL, NULL, fk, opts); } -int FINUFFT2D2(BIGINT nj, FLT *xj, FLT *yj, CPX *cj, int iflag, FLT eps, BIGINT ms, - BIGINT mt, CPX *fk, finufft_opts *opts) +int finufft2d2(BIGINT nj, double *xj, double *yj, std::complex *cj, int iflag, + double eps, BIGINT ms, BIGINT mt, std::complex *fk, + finufft_opts *opts) // Type-2 2D complex nonuniform FFT. See ../docs/usage.rst { - return FINUFFT2D2MANY(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts); + return finufft2d2many(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts); +} +int finufftf2d2(BIGINT nj, float *xj, float *yj, std::complex *cj, int iflag, + float eps, BIGINT ms, BIGINT mt, std::complex *fk, + finufft_opts *opts) +// Type-2 2D complex nonuniform FFT. See ../docs/usage.rst +{ + return finufftf2d2many(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts); } -int FINUFFT2D3MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, CPX *cj, int iflag, FLT eps, - BIGINT nk, FLT *s, FLT *t, CPX *fk, finufft_opts *opts) +int finufft2d3many(int n_transf, BIGINT nj, double *xj, double *yj, + std::complex *cj, int iflag, double eps, BIGINT nk, double *s, + double *t, std::complex *fk, finufft_opts *opts) +// Type-3 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst +{ + return invokeGuruInterface(2, 3, n_transf, nj, xj, yj, NULL, cj, iflag, eps, + {0, 0, 0}, nk, s, t, NULL, fk, opts); +} +int finufftf2d3many(int n_transf, BIGINT nj, float *xj, float *yj, + std::complex *cj, int iflag, float eps, BIGINT nk, float *s, + float *t, std::complex *fk, finufft_opts *opts) // Type-3 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - return invokeGuruInterface(2, 3, n_transf, nj, xj, yj, NULL, cj, iflag, eps, {0, 0, 0}, - nk, s, t, NULL, fk, opts); + return invokeGuruInterface(2, 3, n_transf, nj, xj, yj, NULL, cj, iflag, eps, + {0, 0, 0}, nk, s, t, NULL, fk, opts); +} +int finufft2d3(BIGINT nj, double *xj, double *yj, std::complex *cj, int iflag, + double eps, BIGINT nk, double *s, double *t, std::complex *fk, + finufft_opts *opts) +// Type-3 2D complex nonuniform FFT. 
See ../docs/usage.rst +{ + return finufft2d3many(1, nj, xj, yj, cj, iflag, eps, nk, s, t, fk, opts); } -int FINUFFT2D3(BIGINT nj, FLT *xj, FLT *yj, CPX *cj, int iflag, FLT eps, BIGINT nk, - FLT *s, FLT *t, CPX *fk, finufft_opts *opts) +int finufftf2d3(BIGINT nj, float *xj, float *yj, std::complex *cj, int iflag, + float eps, BIGINT nk, float *s, float *t, std::complex *fk, + finufft_opts *opts) // Type-3 2D complex nonuniform FFT. See ../docs/usage.rst { - return FINUFFT2D3MANY(1, nj, xj, yj, cj, iflag, eps, nk, s, t, fk, opts); + return finufftf2d3many(1, nj, xj, yj, cj, iflag, eps, nk, s, t, fk, opts); } // Dimension 3333333333333333333333333333333333333333333333333333333333333333 -int FINUFFT3D1MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, - FLT eps, BIGINT ms, BIGINT mt, BIGINT mu, CPX *fk, finufft_opts *opts) +int finufft3d1many(int n_transf, BIGINT nj, double *xj, double *yj, double *zj, + std::complex *cj, int iflag, double eps, BIGINT ms, BIGINT mt, + BIGINT mu, std::complex *fk, finufft_opts *opts) // Type-1 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - return invokeGuruInterface(3, 1, n_transf, nj, xj, yj, zj, cj, iflag, eps, {ms, mt, mu}, - 0, NULL, NULL, NULL, fk, opts); + return invokeGuruInterface(3, 1, n_transf, nj, xj, yj, zj, cj, iflag, eps, + {ms, mt, mu}, 0, NULL, NULL, NULL, fk, opts); +} +int finufftf3d1many(int n_transf, BIGINT nj, float *xj, float *yj, float *zj, + std::complex *cj, int iflag, float eps, BIGINT ms, BIGINT mt, + BIGINT mu, std::complex *fk, finufft_opts *opts) +// Type-1 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst +{ + return invokeGuruInterface(3, 1, n_transf, nj, xj, yj, zj, cj, iflag, eps, + {ms, mt, mu}, 0, NULL, NULL, NULL, fk, opts); +} +int finufft3d1(BIGINT nj, double *xj, double *yj, double *zj, std::complex *cj, + int iflag, double eps, BIGINT ms, BIGINT mt, BIGINT mu, + std::complex *fk, finufft_opts *opts) +// Type-1 3D complex nonuniform FFT. See ../docs/usage.rst +{ + return finufft3d1many(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts); } -int FINUFFT3D1(BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, FLT eps, - BIGINT ms, BIGINT mt, BIGINT mu, CPX *fk, finufft_opts *opts) +int finufftf3d1(BIGINT nj, float *xj, float *yj, float *zj, std::complex *cj, + int iflag, float eps, BIGINT ms, BIGINT mt, BIGINT mu, + std::complex *fk, finufft_opts *opts) // Type-1 3D complex nonuniform FFT. See ../docs/usage.rst { - return FINUFFT3D1MANY(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts); + return finufftf3d1many(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts); } -int FINUFFT3D2MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, - FLT eps, BIGINT ms, BIGINT mt, BIGINT mu, CPX *fk, finufft_opts *opts) +int finufft3d2many(int n_transf, BIGINT nj, double *xj, double *yj, double *zj, + std::complex *cj, int iflag, double eps, BIGINT ms, BIGINT mt, + BIGINT mu, std::complex *fk, finufft_opts *opts) +// Type-2 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst +{ + return invokeGuruInterface(3, 2, n_transf, nj, xj, yj, zj, cj, iflag, eps, + {ms, mt, mu}, 0, NULL, NULL, NULL, fk, opts); +} +int finufftf3d2many(int n_transf, BIGINT nj, float *xj, float *yj, float *zj, + std::complex *cj, int iflag, float eps, BIGINT ms, BIGINT mt, + BIGINT mu, std::complex *fk, finufft_opts *opts) // Type-2 3D complex nonuniform FFT, many vectors. 
See ../docs/usage.rst { - return invokeGuruInterface(3, 2, n_transf, nj, xj, yj, zj, cj, iflag, eps, {ms, mt, mu}, - 0, NULL, NULL, NULL, fk, opts); + return invokeGuruInterface(3, 2, n_transf, nj, xj, yj, zj, cj, iflag, eps, + {ms, mt, mu}, 0, NULL, NULL, NULL, fk, opts); } -int FINUFFT3D2(BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, FLT eps, - BIGINT ms, BIGINT mt, BIGINT mu, CPX *fk, finufft_opts *opts) +int finufft3d2(BIGINT nj, double *xj, double *yj, double *zj, std::complex *cj, + int iflag, double eps, BIGINT ms, BIGINT mt, BIGINT mu, + std::complex *fk, finufft_opts *opts) // Type-2 3D complex nonuniform FFT. See ../docs/usage.rst { - return FINUFFT3D2MANY(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts); + return finufft3d2many(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts); +} +int finufftf3d2(BIGINT nj, float *xj, float *yj, float *zj, std::complex *cj, + int iflag, float eps, BIGINT ms, BIGINT mt, BIGINT mu, + std::complex *fk, finufft_opts *opts) +// Type-2 3D complex nonuniform FFT. See ../docs/usage.rst +{ + return finufftf3d2many(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts); } -int FINUFFT3D3MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, - FLT eps, BIGINT nk, FLT *s, FLT *t, FLT *u, CPX *fk, - finufft_opts *opts) +int finufft3d3many(int n_transf, BIGINT nj, double *xj, double *yj, double *zj, + std::complex *cj, int iflag, double eps, BIGINT nk, double *s, + double *t, double *u, std::complex *fk, finufft_opts *opts) +// Type-3 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst +{ + return invokeGuruInterface(3, 3, n_transf, nj, xj, yj, zj, cj, iflag, eps, + {0, 0, 0}, nk, s, t, u, fk, opts); +} +int finufftf3d3many(int n_transf, BIGINT nj, float *xj, float *yj, float *zj, + std::complex *cj, int iflag, float eps, BIGINT nk, float *s, + float *t, float *u, std::complex *fk, finufft_opts *opts) // Type-3 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - return invokeGuruInterface(3, 3, n_transf, nj, xj, yj, zj, cj, iflag, eps, {0, 0, 0}, - nk, s, t, u, fk, opts); + return invokeGuruInterface(3, 3, n_transf, nj, xj, yj, zj, cj, iflag, eps, + {0, 0, 0}, nk, s, t, u, fk, opts); +} +int finufft3d3(BIGINT nj, double *xj, double *yj, double *zj, std::complex *cj, + int iflag, double eps, BIGINT nk, double *s, double *t, double *u, + std::complex *fk, finufft_opts *opts) +// Type-3 3D complex nonuniform FFT. See ../docs/usage.rst +{ + return finufft3d3many(1, nj, xj, yj, zj, cj, iflag, eps, nk, s, t, u, fk, opts); } -int FINUFFT3D3(BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, FLT eps, - BIGINT nk, FLT *s, FLT *t, FLT *u, CPX *fk, finufft_opts *opts) +int finufftf3d3(BIGINT nj, float *xj, float *yj, float *zj, std::complex *cj, + int iflag, float eps, BIGINT nk, float *s, float *t, float *u, + std::complex *fk, finufft_opts *opts) // Type-3 3D complex nonuniform FFT. 
See ../docs/usage.rst { - return FINUFFT3D3MANY(1, nj, xj, yj, zj, cj, iflag, eps, nk, s, t, u, fk, opts); + return finufftf3d3many(1, nj, xj, yj, zj, cj, iflag, eps, nk, s, t, u, fk, opts); } diff --git a/src/utils_precindep.cpp b/src/utils_precindep.cpp index 194fae7f0..37693d424 100644 --- a/src/utils_precindep.cpp +++ b/src/utils_precindep.cpp @@ -5,7 +5,6 @@ #include -#include "finufft/defs.h" #include "finufft/utils_precindep.h" using namespace std; From 22baa470462d49ba8a7015a1c26d87df333c1119 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Wed, 25 Sep 2024 17:11:07 +0200 Subject: [PATCH 14/20] no more precision-dependent sources in library --- CMakeLists.txt | 29 ++-- fortran/finufftfort.cpp | 350 +++++++++++++++++++++++++++++----------- 2 files changed, 266 insertions(+), 113 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0002da9e4..85a514ebf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -250,31 +250,30 @@ endfunction() if(FINUFFT_USE_CPU) # Main finufft libraries - add_library(finufft_f32 OBJECT ${FINUFFT_PRECISION_DEPENDENT_SOURCES}) - target_compile_definitions(finufft_f32 PRIVATE SINGLE) - set_finufft_options(finufft_f32) - - add_library(finufft_f64 OBJECT ${FINUFFT_PRECISION_DEPENDENT_SOURCES}) - set_finufft_options(finufft_f64) if(NOT FINUFFT_STATIC_LINKING) add_library( finufft SHARED - src/spreadinterp.cpp src/utils_precindep.cpp - contrib/legendre_rule_fast.cpp src/fft.cpp src/finufft_core.cp - src/simpleinterfaces.cpp) + src/spreadinterp.cpp + src/utils_precindep.cpp + contrib/legendre_rule_fast.cpp + src/fft.cpp + src/finufft_core.cp + src/simpleinterfaces.cpp + fortran/finufftfort.cpp) else() add_library( finufft STATIC - src/spreadinterp.cpp src/utils_precindep.cpp - contrib/legendre_rule_fast.cpp src/fft.cpp src/finufft_core.cpp - src/simpleinterfaces.cpp) + src/spreadinterp.cpp + src/utils_precindep.cpp + contrib/legendre_rule_fast.cpp + src/fft.cpp + src/finufft_core.cpp + src/simpleinterfaces.cpp + fortran/finufftfort.cpp) endif() - target_link_libraries(finufft PRIVATE finufft_f32 finufft_f64) set_finufft_options(finufft) if(WIN32 AND FINUFFT_SHARED_LINKING) - target_compile_definitions(finufft_f32 PRIVATE dll_EXPORTS FINUFFT_DLL) - target_compile_definitions(finufft_f64 PRIVATE dll_EXPORTS FINUFFT_DLL) target_compile_definitions(finufft PRIVATE dll_EXPORTS FINUFFT_DLL) endif() find_library(MATH_LIBRARY m) diff --git a/fortran/finufftfort.cpp b/fortran/finufftfort.cpp index 799a10041..c95230c50 100644 --- a/fortran/finufftfort.cpp +++ b/fortran/finufftfort.cpp @@ -19,43 +19,15 @@ // public header #include - -// private headers needed... 
(must come after finufft.h which clobbers FINUFFT*) -#include - -// local prec-switching macros for fortran names, ie -// underscore-suffixed versions of those at end of defs.h -#define FINUFFT_DEFAULT_OPTS_ FINUFFTIFY(_default_opts_) -#define FINUFFT_MAKEPLAN_ FINUFFTIFY(_makeplan_) -#define FINUFFT_SETPTS_ FINUFFTIFY(_setpts_) -#define FINUFFT_EXECUTE_ FINUFFTIFY(_execute_) -#define FINUFFT_DESTROY_ FINUFFTIFY(_destroy_) -#define FINUFFT1D1_ FINUFFTIFY(1d1_) -#define FINUFFT1D2_ FINUFFTIFY(1d2_) -#define FINUFFT1D3_ FINUFFTIFY(1d3_) -#define FINUFFT2D1_ FINUFFTIFY(2d1_) -#define FINUFFT2D2_ FINUFFTIFY(2d2_) -#define FINUFFT2D3_ FINUFFTIFY(2d3_) -#define FINUFFT3D1_ FINUFFTIFY(3d1_) -#define FINUFFT3D2_ FINUFFTIFY(3d2_) -#define FINUFFT3D3_ FINUFFTIFY(3d3_) -#define FINUFFT1D1MANY_ FINUFFTIFY(1d1many_) -#define FINUFFT1D2MANY_ FINUFFTIFY(1d2many_) -#define FINUFFT1D3MANY_ FINUFFTIFY(1d3many_) -#define FINUFFT2D1MANY_ FINUFFTIFY(2d1many_) -#define FINUFFT2D2MANY_ FINUFFTIFY(2d2many_) -#define FINUFFT2D3MANY_ FINUFFTIFY(2d3many_) -#define FINUFFT3D1MANY_ FINUFFTIFY(3d1many_) -#define FINUFFT3D2MANY_ FINUFFTIFY(3d2many_) -#define FINUFFT3D3MANY_ FINUFFTIFY(3d3many_) +#include #ifdef __cplusplus extern "C" { #endif // --------------------- guru interface from fortran ------------------------ -void FINUFFT_MAKEPLAN_(int *type, int *n_dims, BIGINT *n_modes, int *iflag, int *n_transf, - FLT *tol, FINUFFT_PLAN *plan, finufft_opts *o, int *ier) { +void finufft_makeplan_(int *type, int *n_dims, BIGINT *n_modes, int *iflag, int *n_transf, + double *tol, finufft_plan *plan, finufft_opts *o, int *ier) { if (!plan) fprintf(stderr, "%s fortran: plan must be allocated as at least the size of a C pointer " @@ -63,143 +35,325 @@ void FINUFFT_MAKEPLAN_(int *type, int *n_dims, BIGINT *n_modes, int *iflag, int __func__); else { // pass o whether it's a NULL or pointer to a fortran-allocated finufft_opts: - *ier = FINUFFT_MAKEPLAN(*type, *n_dims, n_modes, *iflag, *n_transf, *tol, plan, o); + *ier = finufft_makeplan(*type, *n_dims, n_modes, *iflag, *n_transf, *tol, plan, o); } } -void FINUFFT_SETPTS_(FINUFFT_PLAN *plan, BIGINT *M, FLT *xj, FLT *yj, FLT *zj, BIGINT *nk, - FLT *s, FLT *t, FLT *u, int *ier) { +void finufft_setpts_(finufft_plan *plan, BIGINT *M, double *xj, double *yj, double *zj, + BIGINT *nk, double *s, double *t, double *u, int *ier) { if (!*plan) { fprintf(stderr, "%s fortran: finufft_plan unallocated!", __func__); return; } int nk_safe = 0; // catches the case where user passes NULL in if (nk) nk_safe = *nk; - *ier = FINUFFT_SETPTS(*plan, *M, xj, yj, zj, nk_safe, s, t, u); + *ier = finufft_setpts(*plan, *M, xj, yj, zj, nk_safe, s, t, u); } -void FINUFFT_EXECUTE_(FINUFFT_PLAN *plan, CPX *weights, CPX *result, int *ier) { +void finufft_execute_(finufft_plan *plan, std::complex *weights, + std::complex *result, int *ier) { if (!plan) fprintf(stderr, "%s fortran: finufft_plan unallocated!", __func__); else - *ier = FINUFFT_EXECUTE(*plan, weights, result); + *ier = finufft_execute(*plan, weights, result); } -void FINUFFT_DESTROY_(FINUFFT_PLAN *plan, int *ier) { +void finufft_destroy_(finufft_plan *plan, int *ier) { if (!plan) fprintf(stderr, "%s fortran: finufft_plan unallocated!", __func__); else - *ier = FINUFFT_DESTROY(*plan); + *ier = finufft_destroy(*plan); } // ------------ use FINUFFT to set the default options --------------------- // (Note the finufft_opts is created in f90-style derived types, not here) -void FINUFFT_DEFAULT_OPTS_(finufft_opts *o) { +void 
finufft_default_opts_(finufft_opts *o) { if (!o) fprintf(stderr, "%s fortran: opts must be allocated!\n", __func__); else // o is a ptr to already-allocated fortran finufft_opts derived type... - FINUFFT_DEFAULT_OPTS(o); + finufft_default_opts(o); } // -------------- simple and many-vector interfaces -------------------- // --- 1D --- -void FINUFFT1D1_(BIGINT *nj, FLT *xj, CPX *cj, int *iflag, FLT *eps, BIGINT *ms, CPX *fk, - finufft_opts *o, int *ier) { - *ier = FINUFFT1D1(*nj, xj, cj, *iflag, *eps, *ms, fk, o); +void finufft1d1_(BIGINT *nj, double *xj, std::complex *cj, int *iflag, + double *eps, BIGINT *ms, std::complex *fk, finufft_opts *o, + int *ier) { + *ier = finufft1d1(*nj, xj, cj, *iflag, *eps, *ms, fk, o); } -void FINUFFT1D1MANY_(int *ntransf, BIGINT *nj, FLT *xj, CPX *cj, int *iflag, FLT *eps, - BIGINT *ms, CPX *fk, finufft_opts *o, int *ier) { - *ier = FINUFFT1D1MANY(*ntransf, *nj, xj, cj, *iflag, *eps, *ms, fk, o); +void finufft1d1many_(int *ntransf, BIGINT *nj, double *xj, std::complex *cj, + int *iflag, double *eps, BIGINT *ms, std::complex *fk, + finufft_opts *o, int *ier) { + *ier = finufft1d1many(*ntransf, *nj, xj, cj, *iflag, *eps, *ms, fk, o); } -void FINUFFT1D2_(BIGINT *nj, FLT *xj, CPX *cj, int *iflag, FLT *eps, BIGINT *ms, CPX *fk, - finufft_opts *o, int *ier) { - *ier = FINUFFT1D2(*nj, xj, cj, *iflag, *eps, *ms, fk, o); +void finufft1d2_(BIGINT *nj, double *xj, std::complex *cj, int *iflag, + double *eps, BIGINT *ms, std::complex *fk, finufft_opts *o, + int *ier) { + *ier = finufft1d2(*nj, xj, cj, *iflag, *eps, *ms, fk, o); } -void FINUFFT1D2MANY_(int *ntransf, BIGINT *nj, FLT *xj, CPX *cj, int *iflag, FLT *eps, - BIGINT *ms, CPX *fk, finufft_opts *o, int *ier) { - *ier = FINUFFT1D2MANY(*ntransf, *nj, xj, cj, *iflag, *eps, *ms, fk, o); +void finufft1d2many_(int *ntransf, BIGINT *nj, double *xj, std::complex *cj, + int *iflag, double *eps, BIGINT *ms, std::complex *fk, + finufft_opts *o, int *ier) { + *ier = finufft1d2many(*ntransf, *nj, xj, cj, *iflag, *eps, *ms, fk, o); } -void FINUFFT1D3_(BIGINT *nj, FLT *x, CPX *c, int *iflag, FLT *eps, BIGINT *nk, FLT *s, - CPX *f, finufft_opts *o, int *ier) { - *ier = FINUFFT1D3(*nj, x, c, *iflag, *eps, *nk, s, f, o); +void finufft1d3_(BIGINT *nj, double *x, std::complex *c, int *iflag, double *eps, + BIGINT *nk, double *s, std::complex *f, finufft_opts *o, + int *ier) { + *ier = finufft1d3(*nj, x, c, *iflag, *eps, *nk, s, f, o); } -void FINUFFT1D3MANY_(int *ntransf, BIGINT *nj, FLT *x, CPX *c, int *iflag, FLT *eps, - BIGINT *nk, FLT *s, CPX *f, finufft_opts *o, int *ier) { - *ier = FINUFFT1D3MANY(*ntransf, *nj, x, c, *iflag, *eps, *nk, s, f, o); +void finufft1d3many_(int *ntransf, BIGINT *nj, double *x, std::complex *c, + int *iflag, double *eps, BIGINT *nk, double *s, + std::complex *f, finufft_opts *o, int *ier) { + *ier = finufft1d3many(*ntransf, *nj, x, c, *iflag, *eps, *nk, s, f, o); } // --- 2D --- -void FINUFFT2D1_(BIGINT *nj, FLT *xj, FLT *yj, CPX *cj, int *iflag, FLT *eps, BIGINT *ms, - BIGINT *mt, CPX *fk, finufft_opts *o, int *ier) { - *ier = FINUFFT2D1(*nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); +void finufft2d1_(BIGINT *nj, double *xj, double *yj, std::complex *cj, int *iflag, + double *eps, BIGINT *ms, BIGINT *mt, std::complex *fk, + finufft_opts *o, int *ier) { + *ier = finufft2d1(*nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); } -void FINUFFT2D1MANY_(int *ntransf, BIGINT *nj, FLT *xj, FLT *yj, CPX *cj, int *iflag, - FLT *eps, BIGINT *ms, BIGINT *mt, CPX *fk, finufft_opts *o, +void finufft2d1many_(int 
*ntransf, BIGINT *nj, double *xj, double *yj, + std::complex *cj, int *iflag, double *eps, BIGINT *ms, + BIGINT *mt, std::complex *fk, finufft_opts *o, int *ier) { + *ier = finufft2d1many(*ntransf, *nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); +} + +void finufft2d2_(BIGINT *nj, double *xj, double *yj, std::complex *cj, int *iflag, + double *eps, BIGINT *ms, BIGINT *mt, std::complex *fk, + finufft_opts *o, int *ier) { + *ier = finufft2d2(*nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); +} +void finufft2d2many_(int *ntransf, BIGINT *nj, double *xj, double *yj, + std::complex *cj, int *iflag, double *eps, BIGINT *ms, + BIGINT *mt, std::complex *fk, finufft_opts *o, int *ier) { + *ier = finufft2d2many(*ntransf, *nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); +} + +void finufft2d3_(BIGINT *nj, double *x, double *y, std::complex *c, int *iflag, + double *eps, BIGINT *nk, double *s, double *t, std::complex *f, + finufft_opts *o, int *ier) { + *ier = finufft2d3(*nj, x, y, c, *iflag, *eps, *nk, s, t, f, o); +} + +void finufft2d3many_(int *ntransf, BIGINT *nj, double *x, double *y, + std::complex *c, int *iflag, double *eps, BIGINT *nk, + double *s, double *t, std::complex *f, finufft_opts *o, int *ier) { - *ier = FINUFFT2D1MANY(*ntransf, *nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); + *ier = finufft2d3many(*ntransf, *nj, x, y, c, *iflag, *eps, *nk, s, t, f, o); } -void FINUFFT2D2_(BIGINT *nj, FLT *xj, FLT *yj, CPX *cj, int *iflag, FLT *eps, BIGINT *ms, - BIGINT *mt, CPX *fk, finufft_opts *o, int *ier) { - *ier = FINUFFT2D2(*nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); +// --- 3D --- +void finufft3d1_(BIGINT *nj, double *xj, double *yj, double *zj, std::complex *cj, + int *iflag, double *eps, BIGINT *ms, BIGINT *mt, BIGINT *mu, + std::complex *fk, finufft_opts *o, int *ier) { + *ier = finufft3d1(*nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -void FINUFFT2D2MANY_(int *ntransf, BIGINT *nj, FLT *xj, FLT *yj, CPX *cj, int *iflag, - FLT *eps, BIGINT *ms, BIGINT *mt, CPX *fk, finufft_opts *o, + +void finufft3d1many_(int *ntransf, BIGINT *nj, double *xj, double *yj, double *zj, + std::complex *cj, int *iflag, double *eps, BIGINT *ms, + BIGINT *mt, BIGINT *mu, std::complex *fk, finufft_opts *o, int *ier) { - *ier = FINUFFT2D2MANY(*ntransf, *nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); + *ier = + finufft3d1many(*ntransf, *nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -void FINUFFT2D3_(BIGINT *nj, FLT *x, FLT *y, CPX *c, int *iflag, FLT *eps, BIGINT *nk, - FLT *s, FLT *t, CPX *f, finufft_opts *o, int *ier) { - *ier = FINUFFT2D3(*nj, x, y, c, *iflag, *eps, *nk, s, t, f, o); +void finufft3d2_(BIGINT *nj, double *xj, double *yj, double *zj, std::complex *cj, + int *iflag, double *eps, BIGINT *ms, BIGINT *mt, BIGINT *mu, + std::complex *fk, finufft_opts *o, int *ier) { + *ier = finufft3d2(*nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -void FINUFFT2D3MANY_(int *ntransf, BIGINT *nj, FLT *x, FLT *y, CPX *c, int *iflag, - FLT *eps, BIGINT *nk, FLT *s, FLT *t, CPX *f, finufft_opts *o, +void finufft3d2many_(int *ntransf, BIGINT *nj, double *xj, double *yj, double *zj, + std::complex *cj, int *iflag, double *eps, BIGINT *ms, + BIGINT *mt, BIGINT *mu, std::complex *fk, finufft_opts *o, int *ier) { - *ier = FINUFFT2D3MANY(*ntransf, *nj, x, y, c, *iflag, *eps, *nk, s, t, f, o); + *ier = + finufft3d2many(*ntransf, *nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -// --- 3D --- -void FINUFFT3D1_(BIGINT *nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int *iflag, FLT 
*eps, - BIGINT *ms, BIGINT *mt, BIGINT *mu, CPX *fk, finufft_opts *o, int *ier) { - *ier = FINUFFT3D1(*nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); +void finufft3d3_(BIGINT *nj, double *x, double *y, double *z, std::complex *c, + int *iflag, double *eps, BIGINT *nk, double *s, double *t, double *u, + std::complex *f, finufft_opts *o, int *ier) { + *ier = finufft3d3(*nj, x, y, z, c, *iflag, *eps, *nk, s, t, u, f, o); } -void FINUFFT3D1MANY_(int *ntransf, BIGINT *nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, - int *iflag, FLT *eps, BIGINT *ms, BIGINT *mt, BIGINT *mu, CPX *fk, +void finufft3d3many_(int *ntransf, BIGINT *nj, double *x, double *y, double *z, + std::complex *c, int *iflag, double *eps, BIGINT *nk, + double *s, double *t, double *u, std::complex *f, finufft_opts *o, int *ier) { + *ier = finufft3d3many(*ntransf, *nj, x, y, z, c, *iflag, *eps, *nk, s, t, u, f, o); +} + +// --------------------- guru interface from fortran ------------------------ +void finufftf_makeplan_(int *type, int *n_dims, BIGINT *n_modes, int *iflag, + int *n_transf, float *tol, finufftf_plan *plan, finufft_opts *o, + int *ier) { + if (!plan) + fprintf(stderr, + "%s fortran: plan must be allocated as at least the size of a C pointer " + "(usually 8 bytes)!\n", + __func__); + else { + // pass o whether it's a NULL or pointer to a fortran-allocated finufft_opts: + *ier = finufftf_makeplan(*type, *n_dims, n_modes, *iflag, *n_transf, *tol, plan, o); + } +} + +void finufftf_setpts_(finufftf_plan *plan, BIGINT *M, float *xj, float *yj, float *zj, + BIGINT *nk, float *s, float *t, float *u, int *ier) { + if (!*plan) { + fprintf(stderr, "%s fortran: finufft_plan unallocated!", __func__); + return; + } + int nk_safe = 0; // catches the case where user passes NULL in + if (nk) nk_safe = *nk; + *ier = finufftf_setpts(*plan, *M, xj, yj, zj, nk_safe, s, t, u); +} + +void finufftf_execute_(finufftf_plan *plan, std::complex *weights, + std::complex *result, int *ier) { + if (!plan) + fprintf(stderr, "%s fortran: finufft_plan unallocated!", __func__); + else + *ier = finufftf_execute(*plan, weights, result); +} + +void finufftf_destroy_(finufftf_plan *plan, int *ier) { + if (!plan) + fprintf(stderr, "%s fortran: finufft_plan unallocated!", __func__); + else + *ier = finufftf_destroy(*plan); +} + +// ------------ use FINUFFT to set the default options --------------------- +// (Note the finufft_opts is created in f90-style derived types, not here) +void finufftf_default_opts_(finufft_opts *o) { + if (!o) + fprintf(stderr, "%s fortran: opts must be allocated!\n", __func__); + else + // o is a ptr to already-allocated fortran finufft_opts derived type... 
+ finufft_default_opts(o); +} + +// -------------- simple and many-vector interfaces -------------------- +// --- 1D --- +void finufftf1d1_(BIGINT *nj, float *xj, std::complex *cj, int *iflag, float *eps, + BIGINT *ms, std::complex *fk, finufft_opts *o, int *ier) { + *ier = finufftf1d1(*nj, xj, cj, *iflag, *eps, *ms, fk, o); +} + +void finufftf1d1many_(int *ntransf, BIGINT *nj, float *xj, std::complex *cj, + int *iflag, float *eps, BIGINT *ms, std::complex *fk, + finufft_opts *o, int *ier) { + *ier = finufftf1d1many(*ntransf, *nj, xj, cj, *iflag, *eps, *ms, fk, o); +} + +void finufftf1d2_(BIGINT *nj, float *xj, std::complex *cj, int *iflag, float *eps, + BIGINT *ms, std::complex *fk, finufft_opts *o, int *ier) { + *ier = finufftf1d2(*nj, xj, cj, *iflag, *eps, *ms, fk, o); +} + +void finufftf1d2many_(int *ntransf, BIGINT *nj, float *xj, std::complex *cj, + int *iflag, float *eps, BIGINT *ms, std::complex *fk, + finufft_opts *o, int *ier) { + *ier = finufftf1d2many(*ntransf, *nj, xj, cj, *iflag, *eps, *ms, fk, o); +} + +void finufftf1d3_(BIGINT *nj, float *x, std::complex *c, int *iflag, float *eps, + BIGINT *nk, float *s, std::complex *f, finufft_opts *o, + int *ier) { + *ier = finufftf1d3(*nj, x, c, *iflag, *eps, *nk, s, f, o); +} + +void finufftf1d3many_(int *ntransf, BIGINT *nj, float *x, std::complex *c, + int *iflag, float *eps, BIGINT *nk, float *s, + std::complex *f, finufft_opts *o, int *ier) { + *ier = finufftf1d3many(*ntransf, *nj, x, c, *iflag, *eps, *nk, s, f, o); +} + +// --- 2D --- +void finufftf2d1_(BIGINT *nj, float *xj, float *yj, std::complex *cj, int *iflag, + float *eps, BIGINT *ms, BIGINT *mt, std::complex *fk, + finufft_opts *o, int *ier) { + *ier = finufftf2d1(*nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); +} +void finufftf2d1many_(int *ntransf, BIGINT *nj, float *xj, float *yj, + std::complex *cj, int *iflag, float *eps, BIGINT *ms, + BIGINT *mt, std::complex *fk, finufft_opts *o, int *ier) { + *ier = finufftf2d1many(*ntransf, *nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); +} + +void finufftf2d2_(BIGINT *nj, float *xj, float *yj, std::complex *cj, int *iflag, + float *eps, BIGINT *ms, BIGINT *mt, std::complex *fk, + finufft_opts *o, int *ier) { + *ier = finufftf2d2(*nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); +} +void finufftf2d2many_(int *ntransf, BIGINT *nj, float *xj, float *yj, + std::complex *cj, int *iflag, float *eps, BIGINT *ms, + BIGINT *mt, std::complex *fk, finufft_opts *o, int *ier) { + *ier = finufftf2d2many(*ntransf, *nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); +} + +void finufftf2d3_(BIGINT *nj, float *x, float *y, std::complex *c, int *iflag, + float *eps, BIGINT *nk, float *s, float *t, std::complex *f, + finufft_opts *o, int *ier) { + *ier = finufftf2d3(*nj, x, y, c, *iflag, *eps, *nk, s, t, f, o); +} + +void finufftf2d3many_(int *ntransf, BIGINT *nj, float *x, float *y, + std::complex *c, int *iflag, float *eps, BIGINT *nk, + float *s, float *t, std::complex *f, finufft_opts *o, + int *ier) { + *ier = finufftf2d3many(*ntransf, *nj, x, y, c, *iflag, *eps, *nk, s, t, f, o); +} + +// --- 3D --- +void finufftf3d1_(BIGINT *nj, float *xj, float *yj, float *zj, std::complex *cj, + int *iflag, float *eps, BIGINT *ms, BIGINT *mt, BIGINT *mu, + std::complex *fk, finufft_opts *o, int *ier) { + *ier = finufftf3d1(*nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); +} + +void finufftf3d1many_(int *ntransf, BIGINT *nj, float *xj, float *yj, float *zj, + std::complex *cj, int *iflag, float *eps, BIGINT *ms, + BIGINT *mt, BIGINT *mu, std::complex 
*fk, finufft_opts *o, + int *ier) { *ier = - FINUFFT3D1MANY(*ntransf, *nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); + finufftf3d1many(*ntransf, *nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -void FINUFFT3D2_(BIGINT *nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int *iflag, FLT *eps, - BIGINT *ms, BIGINT *mt, BIGINT *mu, CPX *fk, finufft_opts *o, int *ier) { - *ier = FINUFFT3D2(*nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); +void finufftf3d2_(BIGINT *nj, float *xj, float *yj, float *zj, std::complex *cj, + int *iflag, float *eps, BIGINT *ms, BIGINT *mt, BIGINT *mu, + std::complex *fk, finufft_opts *o, int *ier) { + *ier = finufftf3d2(*nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -void FINUFFT3D2MANY_(int *ntransf, BIGINT *nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, - int *iflag, FLT *eps, BIGINT *ms, BIGINT *mt, BIGINT *mu, CPX *fk, - finufft_opts *o, int *ier) { +void finufftf3d2many_(int *ntransf, BIGINT *nj, float *xj, float *yj, float *zj, + std::complex *cj, int *iflag, float *eps, BIGINT *ms, + BIGINT *mt, BIGINT *mu, std::complex *fk, finufft_opts *o, + int *ier) { *ier = - FINUFFT3D2MANY(*ntransf, *nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); + finufftf3d2many(*ntransf, *nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -void FINUFFT3D3_(BIGINT *nj, FLT *x, FLT *y, FLT *z, CPX *c, int *iflag, FLT *eps, - BIGINT *nk, FLT *s, FLT *t, FLT *u, CPX *f, finufft_opts *o, int *ier) { - *ier = FINUFFT3D3(*nj, x, y, z, c, *iflag, *eps, *nk, s, t, u, f, o); +void finufftf3d3_(BIGINT *nj, float *x, float *y, float *z, std::complex *c, + int *iflag, float *eps, BIGINT *nk, float *s, float *t, float *u, + std::complex *f, finufft_opts *o, int *ier) { + *ier = finufftf3d3(*nj, x, y, z, c, *iflag, *eps, *nk, s, t, u, f, o); } -void FINUFFT3D3MANY_(int *ntransf, BIGINT *nj, FLT *x, FLT *y, FLT *z, CPX *c, int *iflag, - FLT *eps, BIGINT *nk, FLT *s, FLT *t, FLT *u, CPX *f, - finufft_opts *o, int *ier) { - *ier = FINUFFT3D3MANY(*ntransf, *nj, x, y, z, c, *iflag, *eps, *nk, s, t, u, f, o); +void finufftf3d3many_(int *ntransf, BIGINT *nj, float *x, float *y, float *z, + std::complex *c, int *iflag, float *eps, BIGINT *nk, + float *s, float *t, float *u, std::complex *f, + finufft_opts *o, int *ier) { + *ier = finufftf3d3many(*ntransf, *nj, x, y, z, c, *iflag, *eps, *nk, s, t, u, f, o); } #ifdef __cplusplus From 47a284a9284134857c9e905d321c87071012b40d Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Wed, 25 Sep 2024 17:19:00 +0200 Subject: [PATCH 15/20] update makefile --- makefile | 37 +++++++++++-------------------------- 1 file changed, 11 insertions(+), 26 deletions(-) diff --git a/makefile b/makefile index 9d3e4c29c..1a7b8967e 100644 --- a/makefile +++ b/makefile @@ -133,24 +133,13 @@ STATICLIB = lib-static/$(LIBNAME).a # absolute path to the .so, useful for linking so executables portable... ABSDYNLIB = $(FINUFFT)$(DYNLIB) -# spreader is subset of the library with self-contained testing, hence own objs: -# double-prec spreader object files that also need single precision... -SOBJS = -# their single-prec versions -SOBJSF = $(SOBJS:%.o=%_32.o) -# precision-independent spreader object files (compiled & linked only once)... -SOBJS_PI = src/utils_precindep.o src/spreadinterp.o # spreader dual-precision objs -SOBJSD = $(SOBJS) $(SOBJSF) $(SOBJS_PI) +SOBJSD = src/utils_precindep.o src/spreadinterp.o -# double-prec library object files that also need single precision... 
-OBJS = $(SOBJS) src/finufft.o src/simpleinterfaces.o fortran/finufftfort.o -# their single-prec versions -OBJSF = $(OBJS:%.o=%_32.o) # precision-independent library object files (compiled & linked only once)... -OBJS_PI = $(SOBJS_PI) contrib/legendre_rule_fast.o src/fft.o src/finufft_core.o +OBJS_PI = $(SOBJSD) contrib/legendre_rule_fast.o src/fft.o src/finufft_core.o src/simpleinterfaces.o fortran/finufftfort.o # all lib dual-precision objs (note DUCC_OBJS empty if unused) -OBJSD = $(OBJS) $(OBJSF) $(OBJS_PI) $(DUCC_OBJS) +OBJSD = $(OBJS_PI) $(DUCC_OBJS) .PHONY: usage lib examples test perftest spreadtest spreadtestall fortran matlab octave all mex python clean objclean pyclean mexclean wheel docker-wheel gurutime docs setup setupclean @@ -190,12 +179,8 @@ HEADERS = $(wildcard include/*.h include/finufft/*.h) $(DUCC_HEADERS) # implicit rules for objects (note -o ensures writes to correct dir) %.o: %.cpp $(HEADERS) $(CXX) -c $(CXXFLAGS) $< -o $@ -%_32.o: %.cpp $(HEADERS) - $(CXX) -DSINGLE -c $(CXXFLAGS) $< -o $@ %.o: %.c $(HEADERS) $(CC) -c $(CFLAGS) $< -o $@ -%_32.o: %.c $(HEADERS) - $(CC) -DSINGLE -c $(CFLAGS) $< -o $@ %.o: %.f $(FC) -c $(FFLAGS) $< -o $@ %_32.o: %.f @@ -324,14 +309,14 @@ ST=perftest/spreadtestnd STA=perftest/spreadtestndall STF=$(ST)f STAF=$(STA)f -$(ST): $(ST).cpp $(SOBJS) $(SOBJS_PI) - $(CXX) $(CXXFLAGS) ${LDFLAGS} $< $(SOBJS) $(SOBJS_PI) $(LIBS) -o $@ -$(STF): $(ST).cpp $(SOBJSF) $(SOBJS_PI) - $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(SOBJSF) $(SOBJS_PI) $(LIBS) -o $@ -$(STA): $(STA).cpp $(SOBJS) $(SOBJS_PI) - $(CXX) $(CXXFLAGS) ${LDFLAGS} $< $(SOBJS) $(SOBJS_PI) $(LIBS) -o $@ -$(STAF): $(STA).cpp $(SOBJSF) $(SOBJS_PI) - $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(SOBJSF) $(SOBJS_PI) $(LIBS) -o $@ +$(ST): $(ST).cpp $(SOBJSD) + $(CXX) $(CXXFLAGS) ${LDFLAGS} $< $(SOBJSD) $(LIBS) -o $@ +$(STF): $(ST).cpp $(SOBJSD) + $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(SOBJSD) $(LIBS) -o $@ +$(STA): $(STA).cpp $(SOBJSD) + $(CXX) $(CXXFLAGS) ${LDFLAGS} $< $(SOBJSD) $(LIBS) -o $@ +$(STAF): $(STA).cpp $(SOBJSD) + $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(SOBJSD) $(LIBS) -o $@ spreadtest: $(ST) $(STF) # run one thread per core... 
(escape the $ to get single $ in bash; one big cmd) (export OMP_NUM_THREADS=$$(perftest/mynumcores.sh) ;\ From 654da01e3d9a5ab1a6a722acbaadb37a053acf97 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Wed, 25 Sep 2024 17:23:41 +0200 Subject: [PATCH 16/20] fix typo --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 85a514ebf..b50486d0e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -257,7 +257,7 @@ if(FINUFFT_USE_CPU) src/utils_precindep.cpp contrib/legendre_rule_fast.cpp src/fft.cpp - src/finufft_core.cp + src/finufft_core.cpp src/simpleinterfaces.cpp fortran/finufftfort.cpp) else() From f83b7d6fc6f9a19cf28f5d95c611dd49724089e8 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Wed, 25 Sep 2024 19:46:57 +0200 Subject: [PATCH 17/20] get rid od utils_precindep --- CMakeLists.txt | 4 +-- include/finufft/test_defs.h | 1 - include/finufft/utils.h | 29 +++++++++++++++++ include/finufft/utils_precindep.h | 44 -------------------------- makefile | 10 +++--- perftest/manysmallprobs.cpp | 2 +- perftest/spreadtestnd.cpp | 1 - perftest/spreadtestndall.cpp | 2 +- src/finufft_core.cpp | 1 - src/spreadinterp.cpp | 1 - src/{utils_precindep.cpp => utils.cpp} | 2 +- test/testutils.cpp | 6 ++-- 12 files changed, 42 insertions(+), 61 deletions(-) delete mode 100644 include/finufft/utils_precindep.h rename src/{utils_precindep.cpp => utils.cpp} (98%) diff --git a/CMakeLists.txt b/CMakeLists.txt index b50486d0e..7e5e2cf5d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -254,7 +254,7 @@ if(FINUFFT_USE_CPU) add_library( finufft SHARED src/spreadinterp.cpp - src/utils_precindep.cpp + src/utils.cpp contrib/legendre_rule_fast.cpp src/fft.cpp src/finufft_core.cpp @@ -264,7 +264,7 @@ if(FINUFFT_USE_CPU) add_library( finufft STATIC src/spreadinterp.cpp - src/utils_precindep.cpp + src/utils.cpp contrib/legendre_rule_fast.cpp src/fft.cpp src/finufft_core.cpp diff --git a/include/finufft/test_defs.h b/include/finufft/test_defs.h index 387bef20d..bdd4cf147 100644 --- a/include/finufft/test_defs.h +++ b/include/finufft/test_defs.h @@ -17,7 +17,6 @@ // convenient private finufft internals (must come after finufft.h) #include -#include // prec-switching (via SINGLE) to set up FLT, CPX, BIGINT, FINUFFT1D1, etc... #include diff --git a/include/finufft/utils.h b/include/finufft/utils.h index 132fafb53..040f60543 100644 --- a/include/finufft/utils.h +++ b/include/finufft/utils.h @@ -5,6 +5,9 @@ #define UTILS_H #include "finufft/finufft_core.h" +// for CNTime... 
+// using chrono since the interface is portable between linux and windows +#include namespace finufft { namespace utils { @@ -85,7 +88,33 @@ FINUFFT_EXPORT void FINUFFT_CDECL arraywidcen(BIGINT n, T *a, T *w, T *c) } } +FINUFFT_EXPORT BIGINT FINUFFT_CDECL next235even(BIGINT n); + +// jfm's timer class +class FINUFFT_EXPORT CNTime { +public: + void start(); + double restart(); + double elapsedsec(); + +private: + double initial; +}; + +// openmp helpers +int get_num_threads_parallel_block(); + +} // namespace utils +} // namespace finufft + +// thread-safe rand number generator for Windows platform +#ifdef _WIN32 +#include +namespace finufft { +namespace utils { +FINUFFT_EXPORT int FINUFFT_CDECL rand_r(unsigned int *seedp); } // namespace utils } // namespace finufft +#endif #endif // UTILS_H diff --git a/include/finufft/utils_precindep.h b/include/finufft/utils_precindep.h deleted file mode 100644 index 41726eba8..000000000 --- a/include/finufft/utils_precindep.h +++ /dev/null @@ -1,44 +0,0 @@ -// Header for utils_precindep.cpp, a little library of array and timer stuff. -// Only the precision-independent routines here (get compiled once) - -#ifndef UTILS_PRECINDEP_H -#define UTILS_PRECINDEP_H - -#include "finufft/finufft_core.h" -// for CNTime... -// using chrono since the interface is portable between linux and windows -#include - -namespace finufft { -namespace utils { - -FINUFFT_EXPORT BIGINT FINUFFT_CDECL next235even(BIGINT n); - -// jfm's timer class -class FINUFFT_EXPORT CNTime { -public: - void start(); - double restart(); - double elapsedsec(); - -private: - double initial; -}; - -// openmp helpers -int get_num_threads_parallel_block(); - -} // namespace utils -} // namespace finufft - -// thread-safe rand number generator for Windows platform -#ifdef _WIN32 -#include -namespace finufft { -namespace utils { -FINUFFT_EXPORT int FINUFFT_CDECL rand_r(unsigned int *seedp); -} // namespace utils -} // namespace finufft -#endif - -#endif // UTILS_PRECINDEP_H diff --git a/makefile b/makefile index 1a7b8967e..7ad454198 100644 --- a/makefile +++ b/makefile @@ -134,7 +134,7 @@ STATICLIB = lib-static/$(LIBNAME).a ABSDYNLIB = $(FINUFFT)$(DYNLIB) # spreader dual-precision objs -SOBJSD = src/utils_precindep.o src/spreadinterp.o +SOBJSD = src/utils.o src/spreadinterp.o # precision-independent library object files (compiled & linked only once)... OBJS_PI = $(SOBJSD) contrib/legendre_rule_fast.o src/fft.o src/finufft_core.o src/simpleinterfaces.o fortran/finufftfort.o @@ -261,10 +261,10 @@ test/%: test/%.cpp $(DYNLIB) test/%f: test/%.cpp $(DYNLIB) $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(ABSDYNLIB) $(LIBSFFT) -o $@ # low-level tests that are cleaner if depend on only specific objects... 
-test/testutils: test/testutils.cpp src/utils_precindep.o - $(CXX) $(CXXFLAGS) ${LDFLAGS} test/testutils.cpp src/utils_precindep.o $(LIBS) -o test/testutils -test/testutilsf: test/testutils.cpp src/utils_precindep.o - $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE test/testutils.cpp src/utils_precindep.o $(LIBS) -o test/testutilsf +test/testutils: test/testutils.cpp src/utils.o + $(CXX) $(CXXFLAGS) ${LDFLAGS} test/testutils.cpp src/utils.o $(LIBS) -o test/testutils +test/testutilsf: test/testutils.cpp src/utils.o + $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE test/testutils.cpp src/utils.o $(LIBS) -o test/testutilsf # make sure all double-prec test executables ready for testing TESTS := $(basename $(wildcard test/*.cpp)) diff --git a/perftest/manysmallprobs.cpp b/perftest/manysmallprobs.cpp index 8bc379f3c..5e27289d8 100644 --- a/perftest/manysmallprobs.cpp +++ b/perftest/manysmallprobs.cpp @@ -3,7 +3,7 @@ #include "finufft/defs.h" // private access to timer -#include "finufft/utils_precindep.h" +#include "finufft/utils.h" using namespace finufft::utils; #include diff --git a/perftest/spreadtestnd.cpp b/perftest/spreadtestnd.cpp index 9b560a25e..d30626007 100644 --- a/perftest/spreadtestnd.cpp +++ b/perftest/spreadtestnd.cpp @@ -1,7 +1,6 @@ #include #include #include -#include #include #include diff --git a/perftest/spreadtestndall.cpp b/perftest/spreadtestndall.cpp index 666003137..14aad3420 100644 --- a/perftest/spreadtestndall.cpp +++ b/perftest/spreadtestndall.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include diff --git a/src/finufft_core.cpp b/src/finufft_core.cpp index 70a52afa8..5d6fe3a11 100644 --- a/src/finufft_core.cpp +++ b/src/finufft_core.cpp @@ -2,7 +2,6 @@ #include #include #include -#include #include "../contrib/legendre_rule_fast.h" #include diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 126bcd2d7..a7a9db467 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -3,7 +3,6 @@ #include #include #include -#include #include "ker_horner_allw_loop_constexpr.h" #include "ker_lowupsampfac_horner_allw_loop_constexpr.h" diff --git a/src/utils_precindep.cpp b/src/utils.cpp similarity index 98% rename from src/utils_precindep.cpp rename to src/utils.cpp index 37693d424..488792f78 100644 --- a/src/utils_precindep.cpp +++ b/src/utils.cpp @@ -5,7 +5,7 @@ #include -#include "finufft/utils_precindep.h" +#include "finufft/utils.h" using namespace std; namespace finufft { diff --git a/test/testutils.cpp b/test/testutils.cpp index 7b550ebff..6facb72cd 100644 --- a/test/testutils.cpp +++ b/test/testutils.cpp @@ -1,4 +1,4 @@ -/* unit tests for utils & utils_precindep modules. +/* unit tests for utils module. Usage: ./testutils{f} @@ -10,8 +10,8 @@ Suggested compile (double/float versions): g++ -std=c++14 -fopenmp testutils.cpp -I../include ../src/utils.o - ../src/utils_precindep.o -o testutils -lgomp g++ -std=c++14 -fopenmp testutils.cpp - -I../include ../src/utils_32.o ../src/utils_precindep.o -o testutilsf -lgomp -DSINGLE + ../src/utils.o -o testutils -lgomp g++ -std=c++14 -fopenmp testutils.cpp + -I../include ../src/utils.o -o testutilsf -lgomp -DSINGLE */ // This switches FLT macro from double to float if SINGLE is defined, etc... 
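
For context before the next patch: a minimal usage sketch of the CNTime timer
that patch 17 consolidates into <finufft/utils.h>. The start/restart/elapsedsec
interface is taken verbatim from the header diff above; the demo program around
it (its name and printed labels) is hypothetical and not part of the library.

    #include <finufft/utils.h> // CNTime now lives here (utils_precindep.h is gone)
    #include <cstdio>

    int main() {
      finufft::utils::CNTime timer;
      timer.start();                  // stamp the initial time
      // ... phase 1 work ...
      double t1 = timer.restart();    // elapsed seconds for phase 1, then re-stamp
      // ... phase 2 work ...
      double t2 = timer.elapsedsec(); // elapsed seconds for phase 2; no reset
      std::printf("phase 1: %.3g s, phase 2: %.3g s\n", t1, t2);
      return 0;
    }

This mirrors the timer.restart() / timer.elapsedsec() pattern used throughout
the setpts and execute code in the patches below.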
From 79b90803929161d8e18bb02263a24b7bcaf1ffc8 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Thu, 26 Sep 2024 08:17:57 +0200 Subject: [PATCH 18/20] start migrating to std::vector --- include/finufft/finufft_core.h | 54 ++++++++--------- include/finufft/spreadinterp.h | 12 ++-- src/finufft_core.cpp | 108 +++++++++++---------------------- src/spreadinterp.cpp | 73 +++++++++++----------- 4 files changed, 102 insertions(+), 145 deletions(-) diff --git a/include/finufft/finufft_core.h b/include/finufft/finufft_core.h index afc6ef864..4f81728dd 100644 --- a/include/finufft/finufft_core.h +++ b/include/finufft/finufft_core.h @@ -145,36 +145,36 @@ template struct FINUFFT_PLAN_T { // the main plan object, fully C++ FINUFFT_PLAN_T &operator=(const FINUFFT_PLAN_T &) = delete; ~FINUFFT_PLAN_T(); - int type; // transform type (Rokhlin naming): 1,2 or 3 - int dim; // overall dimension: 1,2 or 3 - int ntrans; // how many transforms to do at once (vector or "many" mode) - BIGINT nj; // num of NU pts in type 1,2 (for type 3, num input x pts) - BIGINT nk; // number of NU freq pts (type 3 only) - TF tol; // relative user tolerance - int batchSize; // # strength vectors to group together for FFTW, etc - int nbatch; // how many batches done to cover all ntrans vectors + int type; // transform type (Rokhlin naming): 1,2 or 3 + int dim; // overall dimension: 1,2 or 3 + int ntrans; // how many transforms to do at once (vector or "many" mode) + BIGINT nj; // num of NU pts in type 1,2 (for type 3, num input x pts) + BIGINT nk; // number of NU freq pts (type 3 only) + TF tol; // relative user tolerance + int batchSize; // # strength vectors to group together for FFTW, etc + int nbatch; // how many batches done to cover all ntrans vectors - BIGINT ms; // number of modes in x (1) dir (historical CMCL name) = N1 - BIGINT mt; // number of modes in y (2) direction = N2 - BIGINT mu; // number of modes in z (3) direction = N3 - BIGINT N; // total # modes (prod of above three) + BIGINT ms; // number of modes in x (1) dir (historical CMCL name) = N1 + BIGINT mt; // number of modes in y (2) direction = N2 + BIGINT mu; // number of modes in z (3) direction = N3 + BIGINT N; // total # modes (prod of above three) - BIGINT nf1; // size of internal fine grid in x (1) direction - BIGINT nf2; // " y (2) - BIGINT nf3; // " z (3) - BIGINT nf; // total # fine grid points (product of the above three) + BIGINT nf1; // size of internal fine grid in x (1) direction + BIGINT nf2; // " y (2) + BIGINT nf3; // " z (3) + BIGINT nf; // total # fine grid points (product of the above three) - int fftSign; // sign in exponential for NUFFT defn, guaranteed to be +-1 + int fftSign; // sign in exponential for NUFFT defn, guaranteed to be +-1 - TF *phiHat1 = nullptr; // FT of kernel in t1,2, on x-axis mode grid - TF *phiHat2 = nullptr; // " y-axis. - TF *phiHat3 = nullptr; // " z-axis. + std::vector phiHat1; // FT of kernel in t1,2, on x-axis mode grid + std::vector phiHat2; // " y-axis. + std::vector phiHat3; // " z-axis. - TC *fwBatch = nullptr; // (batches of) fine grid(s) for FFTW to plan - // & act on. Usually the largest working array + TC *fwBatch = nullptr; // (batches of) fine grid(s) for FFTW to plan + // & act on. 
Usually the largest working array - BIGINT *sortIndices = nullptr; // precomputed NU pt permutation, speeds spread/interp - bool didSort; // whether binsorting used (false: identity perm used) + std::vector sortIndices; // precomputed NU pt permutation, speeds spread/interp + bool didSort; // whether binsorting used (false: identity perm used) TF *X = nullptr, *Y = nullptr, *Z = nullptr; // for t1,2: ptr to user-supplied NU pts // (no new allocs). for t3: allocated as @@ -183,9 +183,9 @@ template struct FINUFFT_PLAN_T { // the main plan object, fully C++ // type 3 specific TF *S = nullptr, *T = nullptr, *U = nullptr; // pointers to user's target NU pts arrays // (no new allocs) - TC *prephase = nullptr; // pre-phase, for all input NU pts - TC *deconv = nullptr; // reciprocal of kernel FT, phase, all output NU pts - TC *CpBatch = nullptr; // working array of prephased strengths + std::vector prephase; // pre-phase, for all input NU pts + std::vector deconv; // reciprocal of kernel FT, phase, all output NU pts + std::vector CpBatch; // working array of prephased strengths TF *Sp = nullptr, *Tp = nullptr, *Up = nullptr; // internal primed targs (s'_k, etc), // allocated type3params t3P; // groups together type 3 shift, scale, phase, parameters diff --git a/include/finufft/spreadinterp.h b/include/finufft/spreadinterp.h index 779101be8..8a83af3ce 100644 --- a/include/finufft/spreadinterp.h +++ b/include/finufft/spreadinterp.h @@ -39,14 +39,14 @@ FINUFFT_EXPORT int FINUFFT_CDECL spreadcheck(UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT N, T *kx, T *ky, T *kz, const finufft_spread_opts &opts); template -FINUFFT_EXPORT int FINUFFT_CDECL indexSort(BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, - UBIGINT N3, UBIGINT N, T *kx, T *ky, T *kz, - const finufft_spread_opts &opts); +FINUFFT_EXPORT int FINUFFT_CDECL indexSort(std::vector &sort_indices, UBIGINT N1, + UBIGINT N2, UBIGINT N3, UBIGINT N, T *kx, + T *ky, T *kz, const finufft_spread_opts &opts); template FINUFFT_EXPORT int FINUFFT_CDECL spreadinterpSorted( - const BIGINT *sort_indices, const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, - T *data_uniform, const UBIGINT M, T *FINUFFT_RESTRICT kx, T *FINUFFT_RESTRICT ky, - T *FINUFFT_RESTRICT kz, T *FINUFFT_RESTRICT data_nonuniform, + const std::vector &sort_indices, const UBIGINT N1, const UBIGINT N2, + const UBIGINT N3, T *data_uniform, const UBIGINT M, T *FINUFFT_RESTRICT kx, + T *FINUFFT_RESTRICT ky, T *FINUFFT_RESTRICT kz, T *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts, int did_sort); template FINUFFT_EXPORT T FINUFFT_CDECL evaluate_kernel(T x, const finufft_spread_opts &opts); diff --git a/src/finufft_core.cpp b/src/finufft_core.cpp index 5d6fe3a11..8987a37a7 100644 --- a/src/finufft_core.cpp +++ b/src/finufft_core.cpp @@ -167,7 +167,8 @@ static void set_nhg_type3(T S, T X, finufft_opts opts, finufft_spread_opts spopt } template -static void onedim_fseries_kernel(BIGINT nf, T *fwkerhalf, finufft_spread_opts opts) +static void onedim_fseries_kernel(BIGINT nf, std::vector &fwkerhalf, + finufft_spread_opts opts) /* Approximates exact Fourier series coeffs of cnufftspread's real symmetric kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting @@ -231,7 +232,8 @@ static void onedim_fseries_kernel(BIGINT nf, T *fwkerhalf, finufft_spread_opts o } template -static void onedim_nuft_kernel(BIGINT nk, T *k, T *phihat, finufft_spread_opts opts) +static void onedim_nuft_kernel(BIGINT nk, T *k, std::vector &phihat, + finufft_spread_opts opts) /* Approximates exact 
1D Fourier transform of cnufftspread's real symmetric kernel, directly via q-node quadrature on Euler-Fourier formula, exploiting @@ -273,8 +275,8 @@ static void onedim_nuft_kernel(BIGINT nk, T *k, T *phihat, finufft_spread_opts o } template -static void deconvolveshuffle1d(int dir, T prefac, T *ker, BIGINT ms, T *fk, BIGINT nf1, - std::complex *fw, int modeord) +static void deconvolveshuffle1d(int dir, T prefac, const std::vector &ker, BIGINT ms, + T *fk, BIGINT nf1, std::complex *fw, int modeord) /* if dir==1: copies fw to fk with amplification by prefac/ker if dir==2: copies fk to fw (and zero pads rest of it), same amplification. @@ -332,9 +334,9 @@ static void deconvolveshuffle1d(int dir, T prefac, T *ker, BIGINT ms, T *fk, BIG } template -static void deconvolveshuffle2d(int dir, T prefac, T *ker1, T *ker2, BIGINT ms, BIGINT mt, - T *fk, BIGINT nf1, BIGINT nf2, std::complex *fw, - int modeord) +static void deconvolveshuffle2d(int dir, T prefac, const std::vector &ker1, + const std::vector &ker2, BIGINT ms, BIGINT mt, T *fk, + BIGINT nf1, BIGINT nf2, std::complex *fw, int modeord) /* 2D version of deconvolveshuffle1d, calls it on each x-line using 1/ker2 fac. @@ -376,7 +378,8 @@ static void deconvolveshuffle2d(int dir, T prefac, T *ker1, T *ker2, BIGINT ms, } template -static void deconvolveshuffle3d(int dir, T prefac, T *ker1, T *ker2, T *ker3, BIGINT ms, +static void deconvolveshuffle3d(int dir, T prefac, std::vector &ker1, + std::vector &ker2, std::vector &ker3, BIGINT ms, BIGINT mt, BIGINT mu, T *fk, BIGINT nf1, BIGINT nf2, BIGINT nf3, std::complex *fw, int modeord) /* @@ -651,16 +654,12 @@ int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, int iflag, int return ier; // set others as defaults (or unallocated for arrays)... - p->X = NULL; - p->Y = NULL; - p->Z = NULL; - p->phiHat1 = NULL; - p->phiHat2 = NULL; - p->phiHat3 = NULL; - p->nf1 = 1; - p->nf2 = 1; - p->nf3 = 1; // crucial to leave as 1 for unused dims - p->sortIndices = NULL; // used in all three types + p->X = NULL; + p->Y = NULL; + p->Z = NULL; + p->nf1 = 1; + p->nf2 = 1; + p->nf3 = 1; // crucial to leave as 1 for unused dims // ------------------------ types 1,2: planning needed --------------------- if (type == 1 || type == 2) { @@ -686,16 +685,16 @@ int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, int iflag, int // determine fine grid sizes, sanity check.. int nfier = set_nf_type12(p->ms, p->opts, p->spopts, &(p->nf1)); if (nfier) return nfier; // nf too big; we're done - p->phiHat1 = (TF *)malloc(sizeof(TF) * (p->nf1 / 2 + 1)); + p->phiHat1.resize(p->nf1 / 2 + 1); if (dim > 1) { nfier = set_nf_type12(p->mt, p->opts, p->spopts, &(p->nf2)); if (nfier) return nfier; - p->phiHat2 = (TF *)malloc(sizeof(TF) * (p->nf2 / 2 + 1)); + p->phiHat2.resize(p->nf2 / 2 + 1); } if (dim > 2) { nfier = set_nf_type12(p->mu, p->opts, p->spopts, &(p->nf3)); if (nfier) return nfier; - p->phiHat3 = (TF *)malloc(sizeof(TF) * (p->nf3 / 2 + 1)); + p->phiHat3.resize(p->nf3 / 2 + 1); } if (p->opts.debug) { // "long long" here is to avoid warnings with printf... 
@@ -739,9 +738,6 @@ int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, int iflag, int if (!p->fwBatch) { // we don't catch all such mallocs, just this big one fprintf(stderr, "[%s] FFTW malloc failed for fwBatch (working fine grids)!\n", __func__); - free(p->phiHat1); - free(p->phiHat2); - free(p->phiHat3); return FINUFFT_ERR_ALLOC; } @@ -756,13 +752,10 @@ int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, int iflag, int if (p->opts.debug) printf("[%s] %dd%d: ntrans=%d\n", __func__, dim, type, ntrans); // in case destroy occurs before setpts, need safe dummy ptrs/plans... - p->CpBatch = NULL; p->fwBatch = NULL; p->Sp = NULL; p->Tp = NULL; p->Up = NULL; - p->prephase = NULL; - p->deconv = NULL; p->innerT2plan = NULL; // Type 3 will call finufft_makeplan for type 2; no need to init FFTW // Note we don't even know nj or nk yet, so can't do anything else! @@ -810,15 +803,7 @@ int finufft_setpts_t(FINUFFT_PLAN_T *p, BIGINT nj, TF *xj, TF *yj, TF *zj, B if (ier) // no warnings allowed here return ier; timer.restart(); - // Free sortIndices if it has been allocated before in case of repeated setpts - // calls causing memory leak. We don't know it is the same size as before, so we - // have to malloc each time. - if (p->sortIndices) free(p->sortIndices); - p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT) * p->nj); - if (!p->sortIndices) { - fprintf(stderr, "[%s] failed to allocate sortIndices!\n", __func__); - return FINUFFT_ERR_SPREAD_ALLOC; - } + p->sortIndices.resize(p->nj); p->didSort = indexSort(p->sortIndices, p->nf1, p->nf2, p->nf3, p->nj, xj, yj, zj, p->spopts); if (p->opts.debug) @@ -884,18 +869,13 @@ int finufft_setpts_t(FINUFFT_PLAN_T *p, BIGINT nj, TF *xj, TF *yj, TF *zj, B p->fftPlan->free(p->fwBatch); p->fwBatch = p->fftPlan->alloc_complex(p->nf * p->batchSize); // maybe big workspace - // (note FFTW_ALLOC is not needed over malloc, but matches its type) - if (p->CpBatch) free(p->CpBatch); - p->CpBatch = - (std::complex *)malloc(sizeof(std::complex) * nj * p->batchSize); // batch - // c' - // work + p->CpBatch.resize(nj * p->batchSize); // batch c' work if (p->opts.debug) printf("[%s t3] widcen, batch %.2fGB alloc:\t%.3g s\n", __func__, (double)1E-09 * sizeof(std::complex) * (p->nf + nj) * p->batchSize, timer.elapsedsec()); - if (!p->fwBatch || !p->CpBatch) { + if (!p->fwBatch) { fprintf(stderr, "[%s t3] malloc fail for fwBatch or CpBatch!\n", __func__); return FINUFFT_ERR_ALLOC; } @@ -935,8 +915,7 @@ int finufft_setpts_t(FINUFFT_PLAN_T *p, BIGINT nj, TF *xj, TF *yj, TF *zj, B // set up prephase array... std::complex imasign = (p->fftSign >= 0) ? std::complex(0, 1) : std::complex(0, -1); // +-i - if (p->prephase) free(p->prephase); - p->prephase = (std::complex *)malloc(sizeof(std::complex) * nj); + p->prephase.resize(nj); if (p->t3P.D1 != 0.0 || p->t3P.D2 != 0.0 || p->t3P.D3 != 0.0) { #pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) for (BIGINT j = 0; j < nj; ++j) { // ... loop over src NU locs @@ -963,17 +942,16 @@ int finufft_setpts_t(FINUFFT_PLAN_T *p, BIGINT nj, TF *xj, TF *yj, TF *zj, B } // (old STEP 3a) Compute deconvolution post-factors array (per targ pt)... 
// (exploits that FT separates because kernel is prod of 1D funcs) - if (p->deconv) free(p->deconv); - p->deconv = (std::complex *)malloc(sizeof(std::complex) * nk); - TF *phiHatk1 = (TF *)malloc(sizeof(TF) * nk); // don't confuse w/ p->phiHat + p->deconv.resize(nk); + std::vector phiHatk1(nk); // don't confuse w/ p->phiHat onedim_nuft_kernel(nk, p->Sp, phiHatk1, p->spopts); // fill phiHat1 - TF *phiHatk2 = NULL, *phiHatk3 = NULL; + std::vector phiHatk2, phiHatk3; if (d > 1) { - phiHatk2 = (TF *)malloc(sizeof(TF) * nk); + phiHatk2.resize(nk); onedim_nuft_kernel(nk, p->Tp, phiHatk2, p->spopts); // fill phiHat2 } if (d > 2) { - phiHatk3 = (TF *)malloc(sizeof(TF) * nk); + phiHatk3.resize(nk); onedim_nuft_kernel(nk, p->Up, phiHatk3, p->spopts); // fill phiHat3 } int Cfinite = @@ -995,23 +973,12 @@ int finufft_setpts_t(FINUFFT_PLAN_T *p, BIGINT nj, TF *xj, TF *yj, TF *zj, B p->deconv[k] *= cos(phase) + imasign * sin(phase); // Euler e^{+-i.phase} } } - free(phiHatk1); - free(phiHatk2); - free(phiHatk3); // done w/ deconv fill if (p->opts.debug) printf("[%s t3] phase & deconv factors:\t%.3g s\n", __func__, timer.elapsedsec()); // Set up sort for spreading Cp (from primed NU src pts X, Y, Z) to fw... timer.restart(); - // Free sortIndices if it has been allocated before in case of repeated setpts - // calls causing memory leak. We don't know it is the same size as before, so we - // have to malloc each time. - if (p->sortIndices) free(p->sortIndices); - p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT) * p->nj); - if (!p->sortIndices) { - fprintf(stderr, "[%s t3] failed to allocate sortIndices!\n", __func__); - return FINUFFT_ERR_SPREAD_ALLOC; - } + p->sortIndices.resize(p->nj); p->didSort = indexSort(p->sortIndices, p->nf1, p->nf2, p->nf3, p->nj, p->X, p->Y, p->Z, p->spopts); if (p->opts.debug) @@ -1167,8 +1134,8 @@ int finufft_execute_t(FINUFFT_PLAN_T *p, std::complex *cj, std::complex< // STEP 1: spread c'_j batch (x'_j NU pts) into fw batch grid... timer.restart(); - p->spopts.spread_direction = 1; // spread - spreadinterpSortedBatch(thisBatchSize, p, p->CpBatch); // p->X are primed + p->spopts.spread_direction = 1; // spread + spreadinterpSortedBatch(thisBatchSize, p, p->CpBatch.data()); // p->X are primed t_spr += timer.elapsedsec(); // STEP 2: type 2 NUFFT from fw batch to user output fk array batch... @@ -1213,23 +1180,16 @@ template FINUFFT_PLAN_T::~FINUFFT_PLAN_T() { // Thus either each thing free'd here is guaranteed to be NULL or correctly // allocated. 
if (fftPlan) fftPlan->free(fwBatch); // free the big FFTW (or t3 spread) working array - free(sortIndices); if (type == 1 || type == 2) { - free(phiHat1); - free(phiHat2); - free(phiHat3); - } else { // free the stuff alloc for type 3 only + } else { // free the stuff alloc for type 3 only delete innerT2plan; - innerT2plan = nullptr; // if NULL, ignore its error code - free(CpBatch); + innerT2plan = nullptr; // if NULL, ignore its error code free(Sp); free(Tp); free(Up); free(X); free(Y); free(Z); - free(prephase); - free(deconv); } } template FINUFFT_PLAN_T::~FINUFFT_PLAN_T(); diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index a7a9db467..7c7309de2 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -1401,9 +1401,10 @@ static void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3, } template -static void bin_sort_singlethread( - BIGINT *ret, UBIGINT M, const T *kx, const T *ky, const T *kz, UBIGINT N1, UBIGINT N2, - UBIGINT N3, double bin_size_x, double bin_size_y, double bin_size_z, int debug) +static void bin_sort_singlethread(std::vector &ret, UBIGINT M, const T *kx, + const T *ky, const T *kz, UBIGINT N1, UBIGINT N2, + UBIGINT N3, double bin_size_x, double bin_size_y, + double bin_size_z, int debug) /* Returns permutation of all nonuniform points with good RAM access, * ie less cache misses for spreading, in 1D, 2D, or 3D. Single-threaded version * @@ -1475,9 +1476,10 @@ static void bin_sort_singlethread( } template -static void bin_sort_multithread( - BIGINT *ret, UBIGINT M, T *kx, T *ky, T *kz, UBIGINT N1, UBIGINT N2, UBIGINT N3, - double bin_size_x, double bin_size_y, double bin_size_z, int debug, int nthr) +static void bin_sort_multithread(std::vector &ret, UBIGINT M, T *kx, T *ky, T *kz, + UBIGINT N1, UBIGINT N2, UBIGINT N3, double bin_size_x, + double bin_size_y, double bin_size_z, int debug, + int nthr) /* Mostly-OpenMP'ed version of bin_sort. For documentation see: bin_sort_singlethread. Caution: when M (# NU pts) << N (# U pts), is SLOWER than single-thread. @@ -1690,15 +1692,10 @@ FINUFFT_EXPORT int FINUFFT_CDECL spreadinterp( { int ier = spreadcheck(N1, N2, N3, M, kx, ky, kz, opts); if (ier) return ier; - BIGINT *sort_indices = (BIGINT *)malloc(sizeof(BIGINT) * M); - if (!sort_indices) { - fprintf(stderr, "%s failed to allocate sort_indices!\n", __func__); - return FINUFFT_ERR_SPREAD_ALLOC; - } + std::vector sort_indices(M); int did_sort = indexSort(sort_indices, N1, N2, N3, M, kx, ky, kz, opts); spreadinterpSorted(sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, data_nonuniform, opts, did_sort); - free(sort_indices); return 0; } @@ -1749,8 +1746,8 @@ template int spreadcheck(UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT M, const finufft_spread_opts &opts); template -int indexSort(BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT M, T *kx, - T *ky, T *kz, const finufft_spread_opts &opts) +int indexSort(std::vector &sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, + UBIGINT M, T *kx, T *ky, T *kz, const finufft_spread_opts &opts) /* This makes a decision whether or not to sort the NU pts (influenced by opts.sort), and if yes, calls either single- or multi-threaded bin sort, writing reordered index list to sort_indices. 
If decided not to sort, the @@ -1825,17 +1822,17 @@ int indexSort(BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT } return did_sort; } -template int indexSort(BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, - UBIGINT M, float *kx, float *ky, float *kz, +template int indexSort(std::vector &sort_indices, UBIGINT N1, UBIGINT N2, + UBIGINT N3, UBIGINT M, float *kx, float *ky, float *kz, const finufft_spread_opts &opts); -template int indexSort(BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, - UBIGINT M, double *kx, double *ky, double *kz, +template int indexSort(std::vector &sort_indices, UBIGINT N1, UBIGINT N2, + UBIGINT N3, UBIGINT M, double *kx, double *ky, double *kz, const finufft_spread_opts &opts); // -------------------------------------------------------------------------- template -static int spreadSorted(const BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT N3, - T *FINUFFT_RESTRICT data_uniform, UBIGINT M, +static int spreadSorted(const std::vector &sort_indices, UBIGINT N1, UBIGINT N2, + UBIGINT N3, T *FINUFFT_RESTRICT data_uniform, UBIGINT M, T *FINUFFT_RESTRICT kx, T *FINUFFT_RESTRICT ky, T *FINUFFT_RESTRICT kz, const T *data_nonuniform, const finufft_spread_opts &opts, int did_sort) @@ -1961,8 +1958,8 @@ static int spreadSorted(const BIGINT *sort_indices, UBIGINT N1, UBIGINT N2, UBIG // -------------------------------------------------------------------------- template FINUFFT_NEVER_INLINE static int interpSorted_kernel( - const BIGINT *sort_indices, const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, - const T *data_uniform, const UBIGINT M, T *FINUFFT_RESTRICT kx, + const std::vector &sort_indices, const UBIGINT N1, const UBIGINT N2, + const UBIGINT N3, const T *data_uniform, const UBIGINT M, T *FINUFFT_RESTRICT kx, T *FINUFFT_RESTRICT ky, T *FINUFFT_RESTRICT kz, T *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts) // Interpolate to NU pts in sorted order from a uniform grid. 
@@ -2069,10 +2066,10 @@ FINUFFT_NEVER_INLINE static int interpSorted_kernel( template static int interpSorted_dispatch( - const BIGINT *sort_indices, const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, - T *FINUFFT_RESTRICT data_uniform, const UBIGINT M, T *FINUFFT_RESTRICT kx, - T *FINUFFT_RESTRICT ky, T *FINUFFT_RESTRICT kz, T *FINUFFT_RESTRICT data_nonuniform, - const finufft_spread_opts &opts) { + const std::vector &sort_indices, const UBIGINT N1, const UBIGINT N2, + const UBIGINT N3, T *FINUFFT_RESTRICT data_uniform, const UBIGINT M, + T *FINUFFT_RESTRICT kx, T *FINUFFT_RESTRICT ky, T *FINUFFT_RESTRICT kz, + T *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts) { static_assert(MIN_NSPREAD <= NS && NS <= MAX_NSPREAD, "NS must be in the range (MIN_NSPREAD, MAX_NSPREAD)"); if constexpr (NS == MIN_NSPREAD) { // Base case @@ -2100,19 +2097,19 @@ static int interpSorted_dispatch( } template -static int interpSorted(const BIGINT *sort_indices, const UBIGINT N1, const UBIGINT N2, - const UBIGINT N3, T *FINUFFT_RESTRICT data_uniform, - const UBIGINT M, T *FINUFFT_RESTRICT kx, T *FINUFFT_RESTRICT ky, - T *FINUFFT_RESTRICT kz, T *FINUFFT_RESTRICT data_nonuniform, - const finufft_spread_opts &opts) { +static int interpSorted( + const std::vector &sort_indices, const UBIGINT N1, const UBIGINT N2, + const UBIGINT N3, T *FINUFFT_RESTRICT data_uniform, const UBIGINT M, + T *FINUFFT_RESTRICT kx, T *FINUFFT_RESTRICT ky, T *FINUFFT_RESTRICT kz, + T *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts) { return interpSorted_dispatch(sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, data_nonuniform, opts); } template -int spreadinterpSorted(const BIGINT *sort_indices, const UBIGINT N1, const UBIGINT N2, - const UBIGINT N3, T *data_uniform, const UBIGINT M, - T *FINUFFT_RESTRICT kx, T *FINUFFT_RESTRICT ky, +int spreadinterpSorted(const std::vector &sort_indices, const UBIGINT N1, + const UBIGINT N2, const UBIGINT N3, T *data_uniform, + const UBIGINT M, T *FINUFFT_RESTRICT kx, T *FINUFFT_RESTRICT ky, T *FINUFFT_RESTRICT kz, T *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts, int did_sort) /* Logic to select the main spreading (dir=1) vs interpolation (dir=2) routine. 
@@ -2132,14 +2129,14 @@ int spreadinterpSorted(const BIGINT *sort_indices, const UBIGINT N1, const UBIGI return 0; } template int spreadinterpSorted( - const BIGINT *sort_indices, const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, - float *data_uniform, const UBIGINT M, float *FINUFFT_RESTRICT kx, + const std::vector &sort_indices, const UBIGINT N1, const UBIGINT N2, + const UBIGINT N3, float *data_uniform, const UBIGINT M, float *FINUFFT_RESTRICT kx, float *FINUFFT_RESTRICT ky, float *FINUFFT_RESTRICT kz, float *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts, int did_sort); template int spreadinterpSorted( - const BIGINT *sort_indices, const UBIGINT N1, const UBIGINT N2, const UBIGINT N3, - double *data_uniform, const UBIGINT M, double *FINUFFT_RESTRICT kx, + const std::vector &sort_indices, const UBIGINT N1, const UBIGINT N2, + const UBIGINT N3, double *data_uniform, const UBIGINT M, double *FINUFFT_RESTRICT kx, double *FINUFFT_RESTRICT ky, double *FINUFFT_RESTRICT kz, double *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts, int did_sort); From ff8da04fb1cbf9dea429048b0e82bdf10e7821cb Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Thu, 26 Sep 2024 08:38:35 +0200 Subject: [PATCH 19/20] NULL -> nullptr, more vectors --- include/finufft/finufft_core.h | 10 +++--- src/finufft.cpp | 4 +-- src/finufft_core.cpp | 46 +++++++++++-------------- src/simpleinterfaces.cpp | 62 ++++++++++++++++++---------------- 4 files changed, 59 insertions(+), 63 deletions(-) diff --git a/include/finufft/finufft_core.h b/include/finufft/finufft_core.h index 4f81728dd..038d079ad 100644 --- a/include/finufft/finufft_core.h +++ b/include/finufft/finufft_core.h @@ -184,11 +184,11 @@ template struct FINUFFT_PLAN_T { // the main plan object, fully C++ TF *S = nullptr, *T = nullptr, *U = nullptr; // pointers to user's target NU pts arrays // (no new allocs) std::vector prephase; // pre-phase, for all input NU pts - std::vector deconv; // reciprocal of kernel FT, phase, all output NU pts - std::vector CpBatch; // working array of prephased strengths - TF *Sp = nullptr, *Tp = nullptr, *Up = nullptr; // internal primed targs (s'_k, etc), - // allocated - type3params t3P; // groups together type 3 shift, scale, phase, parameters + std::vector deconv; // reciprocal of kernel FT, phase, all output NU pts + std::vector CpBatch; // working array of prephased strengths + std::vector Sp, Tp, Up; // internal primed targs (s'_k, etc), + // allocated + type3params t3P; // groups together type 3 shift, scale, phase, parameters FINUFFT_PLAN_T *innerT2plan = nullptr; // ptr used for type 2 in step 2 of type 3 // other internal structs diff --git a/src/finufft.cpp b/src/finufft.cpp index fddb3fb6e..758fcb723 100644 --- a/src/finufft.cpp +++ b/src/finufft.cpp @@ -26,10 +26,10 @@ int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX *cj, CPX *fk) { int FINUFFT_DESTROY(FINUFFT_PLAN p) // Free everything we allocated inside of finufft_plan pointed to by p. // Also must not crash if called immediately after finufft_makeplan. -// Thus either each thing free'd here is guaranteed to be NULL or correctly +// Thus either each thing free'd here is guaranteed to be nullptr or correctly // allocated. 
{ - if (!p) // NULL ptr, so not a ptr to a plan, report error + if (!p) // nullptr, so not a ptr to a plan, report error return 1; delete reinterpret_cast *>(p); diff --git a/src/finufft_core.cpp b/src/finufft_core.cpp index 8987a37a7..e7e368581 100644 --- a/src/finufft_core.cpp +++ b/src/finufft_core.cpp @@ -232,7 +232,7 @@ static void onedim_fseries_kernel(BIGINT nf, std::vector &fwkerhalf, } template -static void onedim_nuft_kernel(BIGINT nk, T *k, std::vector &phihat, +static void onedim_nuft_kernel(BIGINT nk, const std::vector &k, std::vector &phihat, finufft_spread_opts opts) /* Approximates exact 1D Fourier transform of cnufftspread's real symmetric @@ -543,7 +543,7 @@ template int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, int iflag, int ntrans, TF tol, FINUFFT_PLAN_T **pp, finufft_opts *opts) // Populates the fields of finufft_plan which is pointed to by "pp". -// opts is ptr to a finufft_opts to set options, or NULL to use defaults. +// opts is ptr to a finufft_opts to set options, or nullptr to use defaults. // For some of the fields (if "auto" selected) here choose the actual setting. // For types 1,2 allocates memory for internal working arrays, // evaluates spreading kernel coefficients, and instantiates the fftw_plan @@ -552,7 +552,7 @@ int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, int iflag, int p = new FINUFFT_PLAN_T; // allocate fresh plan struct *pp = p; // pass out plan as ptr to plan struct - if (opts == NULL) // use default opts + if (!opts) // use default opts finufft_default_opts_t(&(p->opts)); else // or read from what's passed in p->opts = *opts; // keep a deep copy; changing *opts now has no effect @@ -654,9 +654,9 @@ int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, int iflag, int return ier; // set others as defaults (or unallocated for arrays)... - p->X = NULL; - p->Y = NULL; - p->Z = NULL; + p->X = nullptr; + p->Y = nullptr; + p->Z = nullptr; p->nf1 = 1; p->nf2 = 1; p->nf3 = 1; // crucial to leave as 1 for unused dims @@ -752,11 +752,8 @@ int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, int iflag, int if (p->opts.debug) printf("[%s] %dd%d: ntrans=%d\n", __func__, dim, type, ntrans); // in case destroy occurs before setpts, need safe dummy ptrs/plans... - p->fwBatch = NULL; - p->Sp = NULL; - p->Tp = NULL; - p->Up = NULL; - p->innerT2plan = NULL; + p->fwBatch = nullptr; + p->innerT2plan = nullptr; // Type 3 will call finufft_makeplan for type 2; no need to init FFTW // Note we don't even know nj or nk yet, so can't do anything else! } @@ -884,20 +881,17 @@ int finufft_setpts_t(FINUFFT_PLAN_T *p, BIGINT nj, TF *xj, TF *yj, TF *zj, B // alloc rescaled NU src pts x'_j (in X etc), rescaled NU targ pts s'_k ... // FIXME: should use realloc if (p->X) free(p->X); - if (p->Sp) free(p->Sp); - p->X = (TF *)malloc(sizeof(TF) * nj); - p->Sp = (TF *)malloc(sizeof(TF) * nk); + p->X = (TF *)malloc(sizeof(TF) * nj); + p->Sp.resize(nk); if (d > 1) { if (p->Y) free(p->Y); - if (p->Tp) free(p->Tp); - p->Y = (TF *)malloc(sizeof(TF) * nj); - p->Tp = (TF *)malloc(sizeof(TF) * nk); + p->Y = (TF *)malloc(sizeof(TF) * nj); + p->Tp.resize(nk); } if (d > 2) { if (p->Z) free(p->Z); - if (p->Up) free(p->Up); - p->Z = (TF *)malloc(sizeof(TF) * nj); - p->Up = (TF *)malloc(sizeof(TF) * nk); + p->Z = (TF *)malloc(sizeof(TF) * nj); + p->Up.resize(nk); } // always shift as use gam to rescale x_j to x'_j, etc (twist iii)... 
@@ -1005,8 +999,9 @@ int finufft_setpts_t(FINUFFT_PLAN_T *p, BIGINT nj, TF *xj, TF *yj, TF *zj, B __func__, ier); return ier; } - ier = finufft_setpts_t(p->innerT2plan, nk, p->Sp, p->Tp, p->Up, 0, NULL, NULL, - NULL); // note nk = # output points (not nj) + ier = finufft_setpts_t(p->innerT2plan, nk, p->Sp.data(), p->Tp.data(), + p->Up.data(), 0, nullptr, nullptr, + nullptr); // note nk = # output points (not nj) if (ier > 1) { fprintf(stderr, "[%s t3]: inner type 2 setpts failed, ier=%d!\n", __func__, ier); return ier; @@ -1177,16 +1172,13 @@ template int finufft_execute_t( template FINUFFT_PLAN_T::~FINUFFT_PLAN_T() { // Free everything we allocated inside of finufft_plan pointed to by p. // Also must not crash if called immediately after finufft_makeplan. - // Thus either each thing free'd here is guaranteed to be NULL or correctly + // Thus either each thing free'd here is guaranteed to be nullptr or correctly // allocated. if (fftPlan) fftPlan->free(fwBatch); // free the big FFTW (or t3 spread) working array if (type == 1 || type == 2) { } else { // free the stuff alloc for type 3 only delete innerT2plan; - innerT2plan = nullptr; // if NULL, ignore its error code - free(Sp); - free(Tp); - free(Up); + innerT2plan = nullptr; free(X); free(Y); free(Z); diff --git a/src/simpleinterfaces.cpp b/src/simpleinterfaces.cpp index 43d9806aa..4b3630d93 100644 --- a/src/simpleinterfaces.cpp +++ b/src/simpleinterfaces.cpp @@ -56,10 +56,10 @@ int finufftf_execute(finufftf_plan p, std::complex *cj, std::complex *>(p); @@ -69,10 +69,10 @@ int finufft_destroy(finufft_plan p) int finufftf_destroy(finufftf_plan p) // Free everything we allocated inside of finufft_plan pointed to by p. // Also must not crash if called immediately after finufft_makeplan. -// Thus either each thing free'd here is guaranteed to be NULL or correctly +// Thus either each thing free'd here is guaranteed to be nullptr or correctly // allocated. { - if (!p) // NULL ptr, so not a ptr to a plan, report error + if (!p) // nullptr ptr, so not a ptr to a plan, report error return 1; delete reinterpret_cast *>(p); @@ -95,7 +95,7 @@ static int invokeGuruInterface(int n_dims, int type, int n_transf, BIGINT nj, T FINUFFT_PLAN_T *plan = nullptr; int ier = finufft_makeplan_t(type, n_dims, n_modes.data(), iflag, n_transf, eps, &plan, - popts); // popts (ptr to opts) can be NULL + popts); // popts (ptr to opts) can be nullptr if (ier > 1) { // since 1 (a warning) still allows proceeding... fprintf(stderr, "FINUFFT invokeGuru: plan error (ier=%d)!\n", ier); delete plan; @@ -164,16 +164,18 @@ int finufft1d2many(int n_transf, BIGINT nj, double *xj, std::complex *cj finufft_opts *opts) // Type-2 1D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - return invokeGuruInterface(1, 2, n_transf, nj, xj, NULL, NULL, cj, iflag, eps, - {ms, 1, 1}, 0, NULL, NULL, NULL, fk, opts); + return invokeGuruInterface(1, 2, n_transf, nj, xj, nullptr, nullptr, cj, iflag, + eps, {ms, 1, 1}, 0, nullptr, nullptr, nullptr, fk, + opts); } int finufftf1d2many(int n_transf, BIGINT nj, float *xj, std::complex *cj, int iflag, float eps, BIGINT ms, std::complex *fk, finufft_opts *opts) // Type-2 1D complex nonuniform FFT, many vectors. 
See ../docs/usage.rst { - return invokeGuruInterface(1, 2, n_transf, nj, xj, NULL, NULL, cj, iflag, eps, - {ms, 1, 1}, 0, NULL, NULL, NULL, fk, opts); + return invokeGuruInterface(1, 2, n_transf, nj, xj, nullptr, nullptr, cj, iflag, + eps, {ms, 1, 1}, 0, nullptr, nullptr, nullptr, fk, + opts); } int finufft1d2(BIGINT nj, double *xj, std::complex *cj, int iflag, double eps, @@ -194,16 +196,16 @@ int finufft1d3many(int n_transf, BIGINT nj, double *xj, std::complex *cj finufft_opts *opts) // Type-3 1D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - return invokeGuruInterface(1, 3, n_transf, nj, xj, NULL, NULL, cj, iflag, eps, - {0, 0, 0}, nk, s, NULL, NULL, fk, opts); + return invokeGuruInterface(1, 3, n_transf, nj, xj, nullptr, nullptr, cj, iflag, + eps, {0, 0, 0}, nk, s, nullptr, nullptr, fk, opts); } int finufftf1d3many(int n_transf, BIGINT nj, float *xj, std::complex *cj, int iflag, float eps, BIGINT nk, float *s, std::complex *fk, finufft_opts *opts) // Type-3 1D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - return invokeGuruInterface(1, 3, n_transf, nj, xj, NULL, NULL, cj, iflag, eps, - {0, 0, 0}, nk, s, NULL, NULL, fk, opts); + return invokeGuruInterface(1, 3, n_transf, nj, xj, nullptr, nullptr, cj, iflag, + eps, {0, 0, 0}, nk, s, nullptr, nullptr, fk, opts); } int finufft1d3(BIGINT nj, double *xj, std::complex *cj, int iflag, double eps, BIGINT nk, double *s, std::complex *fk, finufft_opts *opts) @@ -225,16 +227,16 @@ int finufft2d1many(int n_transf, BIGINT nj, double *xj, double *yj, std::complex *fk, finufft_opts *opts) // Type-1 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - return invokeGuruInterface(2, 1, n_transf, nj, xj, yj, NULL, c, iflag, eps, - {ms, mt, 1}, 0, NULL, NULL, NULL, fk, opts); + return invokeGuruInterface(2, 1, n_transf, nj, xj, yj, nullptr, c, iflag, eps, + {ms, mt, 1}, 0, nullptr, nullptr, nullptr, fk, opts); } int finufftf2d1many(int n_transf, BIGINT nj, float *xj, float *yj, std::complex *c, int iflag, float eps, BIGINT ms, BIGINT mt, std::complex *fk, finufft_opts *opts) // Type-1 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - return invokeGuruInterface(2, 1, n_transf, nj, xj, yj, NULL, c, iflag, eps, - {ms, mt, 1}, 0, NULL, NULL, NULL, fk, opts); + return invokeGuruInterface(2, 1, n_transf, nj, xj, yj, nullptr, c, iflag, eps, + {ms, mt, 1}, 0, nullptr, nullptr, nullptr, fk, opts); } int finufft2d1(BIGINT nj, double *xj, double *yj, std::complex *cj, int iflag, double eps, BIGINT ms, BIGINT mt, std::complex *fk, @@ -256,16 +258,16 @@ int finufft2d2many(int n_transf, BIGINT nj, double *xj, double *yj, std::complex *fk, finufft_opts *opts) // Type-2 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - return invokeGuruInterface(2, 2, n_transf, nj, xj, yj, NULL, c, iflag, eps, - {ms, mt, 1}, 0, NULL, NULL, NULL, fk, opts); + return invokeGuruInterface(2, 2, n_transf, nj, xj, yj, nullptr, c, iflag, eps, + {ms, mt, 1}, 0, nullptr, nullptr, nullptr, fk, opts); } int finufftf2d2many(int n_transf, BIGINT nj, float *xj, float *yj, std::complex *c, int iflag, float eps, BIGINT ms, BIGINT mt, std::complex *fk, finufft_opts *opts) // Type-2 2D complex nonuniform FFT, many vectors. 
See ../docs/usage.rst { - return invokeGuruInterface(2, 2, n_transf, nj, xj, yj, NULL, c, iflag, eps, - {ms, mt, 1}, 0, NULL, NULL, NULL, fk, opts); + return invokeGuruInterface(2, 2, n_transf, nj, xj, yj, nullptr, c, iflag, eps, + {ms, mt, 1}, 0, nullptr, nullptr, nullptr, fk, opts); } int finufft2d2(BIGINT nj, double *xj, double *yj, std::complex *cj, int iflag, double eps, BIGINT ms, BIGINT mt, std::complex *fk, @@ -287,16 +289,16 @@ int finufft2d3many(int n_transf, BIGINT nj, double *xj, double *yj, double *t, std::complex *fk, finufft_opts *opts) // Type-3 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - return invokeGuruInterface(2, 3, n_transf, nj, xj, yj, NULL, cj, iflag, eps, - {0, 0, 0}, nk, s, t, NULL, fk, opts); + return invokeGuruInterface(2, 3, n_transf, nj, xj, yj, nullptr, cj, iflag, eps, + {0, 0, 0}, nk, s, t, nullptr, fk, opts); } int finufftf2d3many(int n_transf, BIGINT nj, float *xj, float *yj, std::complex *cj, int iflag, float eps, BIGINT nk, float *s, float *t, std::complex *fk, finufft_opts *opts) // Type-3 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - return invokeGuruInterface(2, 3, n_transf, nj, xj, yj, NULL, cj, iflag, eps, - {0, 0, 0}, nk, s, t, NULL, fk, opts); + return invokeGuruInterface(2, 3, n_transf, nj, xj, yj, nullptr, cj, iflag, eps, + {0, 0, 0}, nk, s, t, nullptr, fk, opts); } int finufft2d3(BIGINT nj, double *xj, double *yj, std::complex *cj, int iflag, double eps, BIGINT nk, double *s, double *t, std::complex *fk, @@ -321,7 +323,8 @@ int finufft3d1many(int n_transf, BIGINT nj, double *xj, double *yj, double *zj, // Type-1 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst { return invokeGuruInterface(3, 1, n_transf, nj, xj, yj, zj, cj, iflag, eps, - {ms, mt, mu}, 0, NULL, NULL, NULL, fk, opts); + {ms, mt, mu}, 0, nullptr, nullptr, nullptr, fk, + opts); } int finufftf3d1many(int n_transf, BIGINT nj, float *xj, float *yj, float *zj, std::complex *cj, int iflag, float eps, BIGINT ms, BIGINT mt, @@ -329,7 +332,7 @@ int finufftf3d1many(int n_transf, BIGINT nj, float *xj, float *yj, float *zj, // Type-1 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst { return invokeGuruInterface(3, 1, n_transf, nj, xj, yj, zj, cj, iflag, eps, - {ms, mt, mu}, 0, NULL, NULL, NULL, fk, opts); + {ms, mt, mu}, 0, nullptr, nullptr, nullptr, fk, opts); } int finufft3d1(BIGINT nj, double *xj, double *yj, double *zj, std::complex *cj, int iflag, double eps, BIGINT ms, BIGINT mt, BIGINT mu, @@ -352,7 +355,8 @@ int finufft3d2many(int n_transf, BIGINT nj, double *xj, double *yj, double *zj, // Type-2 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst { return invokeGuruInterface(3, 2, n_transf, nj, xj, yj, zj, cj, iflag, eps, - {ms, mt, mu}, 0, NULL, NULL, NULL, fk, opts); + {ms, mt, mu}, 0, nullptr, nullptr, nullptr, fk, + opts); } int finufftf3d2many(int n_transf, BIGINT nj, float *xj, float *yj, float *zj, std::complex *cj, int iflag, float eps, BIGINT ms, BIGINT mt, @@ -360,7 +364,7 @@ int finufftf3d2many(int n_transf, BIGINT nj, float *xj, float *yj, float *zj, // Type-2 3D complex nonuniform FFT, many vectors. 
See ../docs/usage.rst { return invokeGuruInterface(3, 2, n_transf, nj, xj, yj, zj, cj, iflag, eps, - {ms, mt, mu}, 0, NULL, NULL, NULL, fk, opts); + {ms, mt, mu}, 0, nullptr, nullptr, nullptr, fk, opts); } int finufft3d2(BIGINT nj, double *xj, double *yj, double *zj, std::complex *cj, int iflag, double eps, BIGINT ms, BIGINT mt, BIGINT mu, From f411b15da31b446f989b43091655799e48663fdc Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Thu, 26 Sep 2024 09:41:00 +0200 Subject: [PATCH 20/20] more OOP and some warning fixes --- fortran/finufftfort.cpp | 4 +- include/finufft/finufft_core.h | 3 + src/finufft_core.cpp | 364 +++++++++++++++++---------------- src/utils.cpp | 10 +- 4 files changed, 200 insertions(+), 181 deletions(-) diff --git a/fortran/finufftfort.cpp b/fortran/finufftfort.cpp index c95230c50..400ff0985 100644 --- a/fortran/finufftfort.cpp +++ b/fortran/finufftfort.cpp @@ -46,7 +46,7 @@ void finufft_setpts_(finufft_plan *plan, BIGINT *M, double *xj, double *yj, doub return; } int nk_safe = 0; // catches the case where user passes NULL in - if (nk) nk_safe = *nk; + if (nk) nk_safe = int(*nk); *ier = finufft_setpts(*plan, *M, xj, yj, zj, nk_safe, s, t, u); } @@ -213,7 +213,7 @@ void finufftf_setpts_(finufftf_plan *plan, BIGINT *M, float *xj, float *yj, floa return; } int nk_safe = 0; // catches the case where user passes NULL in - if (nk) nk_safe = *nk; + if (nk) nk_safe = int(*nk); *ier = finufftf_setpts(*plan, *M, xj, yj, zj, nk_safe, s, t, u); } diff --git a/include/finufft/finufft_core.h b/include/finufft/finufft_core.h index 038d079ad..de2f2dab9 100644 --- a/include/finufft/finufft_core.h +++ b/include/finufft/finufft_core.h @@ -195,6 +195,9 @@ template struct FINUFFT_PLAN_T { // the main plan object, fully C++ std::unique_ptr> fftPlan; finufft_opts opts; // this and spopts could be made ptrs finufft_spread_opts spopts; + + int setpts(BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, TF *s, TF *t, TF *u); + int execute(std::complex *cj, std::complex *fk); }; void finufft_default_opts_t(finufft_opts *o); diff --git a/src/finufft_core.cpp b/src/finufft_core.cpp index e7e368581..834420bbe 100644 --- a/src/finufft_core.cpp +++ b/src/finufft_core.cpp @@ -768,18 +768,12 @@ template int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes // SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS template -int finufft_setpts_t(FINUFFT_PLAN_T *p, BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, - TF *s, TF *t, TF *u) -/* For type 1,2: just checks and (possibly) sorts the NU xyz points, in prep for - spreading. (The last 4 arguments are ignored.) - For type 3: allocates internal working arrays, scales/centers the NU points - and NU target freqs (stu), evaluates spreading kernel FT at all target freqs. 
-*/ -{ - int d = p->dim; // abbrev for spatial dim +int FINUFFT_PLAN_T::setpts(BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, TF *s, TF *t, + TF *u) { + int d = dim; // abbrev for spatial dim CNTime timer; timer.start(); - p->nj = nj; // the user only now chooses how many NU (x,y,z) pts + this->nj = nj; // the user only now chooses how many NU (x,y,z) pts if (nj < 0) { fprintf(stderr, "[%s] nj (%lld) cannot be negative!\n", __func__, (long long)nj); return FINUFFT_ERR_NUM_NU_PTS_INVALID; @@ -788,23 +782,22 @@ int finufft_setpts_t(FINUFFT_PLAN_T *p, BIGINT nj, TF *xj, TF *yj, TF *zj, B return FINUFFT_ERR_NUM_NU_PTS_INVALID; } - if (p->type != 3) { // ------------------ TYPE 1,2 SETPTS ------------------- - // (all we can do is check and maybe bin-sort the NU pts) - p->X = xj; // plan must keep pointers to user's fixed NU pts - p->Y = yj; - p->Z = zj; - int ier = spreadcheck(p->nf1, p->nf2, p->nf3, p->nj, xj, yj, zj, p->spopts); - if (p->opts.debug > 1) - printf("[%s] spreadcheck (%d):\t%.3g s\n", __func__, p->spopts.chkbnds, + if (type != 3) { // ------------------ TYPE 1,2 SETPTS ------------------- + // (all we can do is check and maybe bin-sort the NU pts) + X = xj; // plan must keep pointers to user's fixed NU pts + Y = yj; + Z = zj; + int ier = spreadcheck(nf1, nf2, nf3, nj, xj, yj, zj, spopts); + if (opts.debug > 1) + printf("[%s] spreadcheck (%d):\t%.3g s\n", __func__, spopts.chkbnds, timer.elapsedsec()); if (ier) // no warnings allowed here return ier; timer.restart(); - p->sortIndices.resize(p->nj); - p->didSort = - indexSort(p->sortIndices, p->nf1, p->nf2, p->nf3, p->nj, xj, yj, zj, p->spopts); - if (p->opts.debug) - printf("[%s] sort (didSort=%d):\t\t%.3g s\n", __func__, p->didSort, + sortIndices.resize(nj); + didSort = indexSort(sortIndices, nf1, nf2, nf3, nj, xj, yj, zj, spopts); + if (opts.debug) + printf("[%s] sort (didSort=%d):\t\t%.3g s\n", __func__, didSort, timer.elapsedsec()); } else { // ------------------------- TYPE 3 SETPTS ----------------------- @@ -817,200 +810,208 @@ int finufft_setpts_t(FINUFFT_PLAN_T *p, BIGINT nj, TF *xj, TF *yj, TF *zj, B fprintf(stderr, "[%s] nk (%lld) exceeds MAX_NU_PTS\n", __func__, (long long)nk); return FINUFFT_ERR_NUM_NU_PTS_INVALID; } - p->nk = nk; // user set # targ freq pts - p->S = s; // keep pointers to user's input target pts - p->T = t; - p->U = u; + this->nk = nk; // user set # targ freq pts + S = s; // keep pointers to user's input target pts + T = t; + U = u; // pick x, s intervals & shifts & # fine grid pts (nf) in each dim... TF S1, S2, S3; // get half-width X, center C, which contains {x_j}... 
- arraywidcen(nj, xj, &(p->t3P.X1), &(p->t3P.C1)); - arraywidcen(nk, s, &S1, &(p->t3P.D1)); // same D, S, but for {s_k} - set_nhg_type3(S1, p->t3P.X1, p->opts, p->spopts, &(p->nf1), &(p->t3P.h1), - &(p->t3P.gam1)); // applies twist i) - p->t3P.C2 = 0.0; // their defaults if dim 2 unused, etc - p->t3P.D2 = 0.0; + arraywidcen(nj, xj, &(t3P.X1), &(t3P.C1)); + arraywidcen(nk, s, &S1, &(t3P.D1)); // same D, S, but for {s_k} + set_nhg_type3(S1, t3P.X1, opts, spopts, &(nf1), &(t3P.h1), + &(t3P.gam1)); // applies twist i) + t3P.C2 = 0.0; // their defaults if dim 2 unused, etc + t3P.D2 = 0.0; if (d > 1) { - arraywidcen(nj, yj, &(p->t3P.X2), &(p->t3P.C2)); // {y_j} - arraywidcen(nk, t, &S2, &(p->t3P.D2)); // {t_k} - set_nhg_type3(S2, p->t3P.X2, p->opts, p->spopts, &(p->nf2), &(p->t3P.h2), - &(p->t3P.gam2)); + arraywidcen(nj, yj, &(t3P.X2), &(t3P.C2)); // {y_j} + arraywidcen(nk, t, &S2, &(t3P.D2)); // {t_k} + set_nhg_type3(S2, t3P.X2, opts, spopts, &(nf2), &(t3P.h2), &(t3P.gam2)); } - p->t3P.C3 = 0.0; - p->t3P.D3 = 0.0; + t3P.C3 = 0.0; + t3P.D3 = 0.0; if (d > 2) { - arraywidcen(nj, zj, &(p->t3P.X3), &(p->t3P.C3)); // {z_j} - arraywidcen(nk, u, &S3, &(p->t3P.D3)); // {u_k} - set_nhg_type3(S3, p->t3P.X3, p->opts, p->spopts, &(p->nf3), &(p->t3P.h3), - &(p->t3P.gam3)); + arraywidcen(nj, zj, &(t3P.X3), &(t3P.C3)); // {z_j} + arraywidcen(nk, u, &S3, &(t3P.D3)); // {u_k} + set_nhg_type3(S3, t3P.X3, opts, spopts, &(nf3), &(t3P.h3), &(t3P.gam3)); } - if (p->opts.debug) { // report on choices of shifts, centers, etc... + if (opts.debug) { // report on choices of shifts, centers, etc... printf("\tM=%lld N=%lld\n", (long long)nj, (long long)nk); - printf("\tX1=%.3g C1=%.3g S1=%.3g D1=%.3g gam1=%g nf1=%lld h1=%.3g\t\n", p->t3P.X1, - p->t3P.C1, S1, p->t3P.D1, p->t3P.gam1, (long long)p->nf1, p->t3P.h1); + printf("\tX1=%.3g C1=%.3g S1=%.3g D1=%.3g gam1=%g nf1=%lld h1=%.3g\t\n", t3P.X1, + t3P.C1, S1, t3P.D1, t3P.gam1, (long long)nf1, t3P.h1); if (d > 1) - printf("\tX2=%.3g C2=%.3g S2=%.3g D2=%.3g gam2=%g nf2=%lld h2=%.3g\n", p->t3P.X2, - p->t3P.C2, S2, p->t3P.D2, p->t3P.gam2, (long long)p->nf2, p->t3P.h2); + printf("\tX2=%.3g C2=%.3g S2=%.3g D2=%.3g gam2=%g nf2=%lld h2=%.3g\n", t3P.X2, + t3P.C2, S2, t3P.D2, t3P.gam2, (long long)nf2, t3P.h2); if (d > 2) - printf("\tX3=%.3g C3=%.3g S3=%.3g D3=%.3g gam3=%g nf3=%lld h3=%.3g\n", p->t3P.X3, - p->t3P.C3, S3, p->t3P.D3, p->t3P.gam3, (long long)p->nf3, p->t3P.h3); + printf("\tX3=%.3g C3=%.3g S3=%.3g D3=%.3g gam3=%g nf3=%lld h3=%.3g\n", t3P.X3, + t3P.C3, S3, t3P.D3, t3P.gam3, (long long)nf3, t3P.h3); } - p->nf = p->nf1 * p->nf2 * p->nf3; // fine grid total number of points - if (p->nf * p->batchSize > MAX_NF) { + nf = nf1 * nf2 * nf3; // fine grid total number of points + if (nf * batchSize > MAX_NF) { fprintf(stderr, "[%s t3] fwBatch would be bigger than MAX_NF, not attempting malloc!\n", __func__); return FINUFFT_ERR_MAXNALLOC; } - p->fftPlan->free(p->fwBatch); - p->fwBatch = p->fftPlan->alloc_complex(p->nf * p->batchSize); // maybe big workspace + fftPlan->free(fwBatch); + fwBatch = fftPlan->alloc_complex(nf * batchSize); // maybe big workspace - p->CpBatch.resize(nj * p->batchSize); // batch c' work + CpBatch.resize(nj * batchSize); // batch c' work - if (p->opts.debug) + if (opts.debug) printf("[%s t3] widcen, batch %.2fGB alloc:\t%.3g s\n", __func__, - (double)1E-09 * sizeof(std::complex) * (p->nf + nj) * p->batchSize, + (double)1E-09 * sizeof(std::complex) * (nf + nj) * batchSize, timer.elapsedsec()); - if (!p->fwBatch) { + if (!fwBatch) { fprintf(stderr, "[%s t3] 
malloc fail for fwBatch or CpBatch!\n", __func__); return FINUFFT_ERR_ALLOC; } - // printf("fwbatch, cpbatch ptrs: %llx %llx\n",p->fwBatch,p->CpBatch); + // printf("fwbatch, cpbatch ptrs: %llx %llx\n",fwBatch,CpBatch); // alloc rescaled NU src pts x'_j (in X etc), rescaled NU targ pts s'_k ... // FIXME: should use realloc - if (p->X) free(p->X); - p->X = (TF *)malloc(sizeof(TF) * nj); - p->Sp.resize(nk); + if (X) free(X); + X = (TF *)malloc(sizeof(TF) * nj); + Sp.resize(nk); if (d > 1) { - if (p->Y) free(p->Y); - p->Y = (TF *)malloc(sizeof(TF) * nj); - p->Tp.resize(nk); + if (Y) free(Y); + Y = (TF *)malloc(sizeof(TF) * nj); + Tp.resize(nk); } if (d > 2) { - if (p->Z) free(p->Z); - p->Z = (TF *)malloc(sizeof(TF) * nj); - p->Up.resize(nk); + if (Z) free(Z); + Z = (TF *)malloc(sizeof(TF) * nj); + Up.resize(nk); } // always shift as use gam to rescale x_j to x'_j, etc (twist iii)... - TF ig1 = 1.0 / p->t3P.gam1, ig2 = 0.0, ig3 = 0.0; // "reciprocal-math" optim - if (d > 1) ig2 = 1.0 / p->t3P.gam2; - if (d > 2) ig3 = 1.0 / p->t3P.gam3; -#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) + TF ig1 = 1.0 / t3P.gam1, ig2 = 0.0, ig3 = 0.0; // "reciprocal-math" optim + if (d > 1) ig2 = 1.0 / t3P.gam2; + if (d > 2) ig3 = 1.0 / t3P.gam3; +#pragma omp parallel for num_threads(opts.nthreads) schedule(static) for (BIGINT j = 0; j < nj; ++j) { - p->X[j] = (xj[j] - p->t3P.C1) * ig1; // rescale x_j - if (d > 1) // (ok to do inside loop because of branch predict) - p->Y[j] = (yj[j] - p->t3P.C2) * ig2; // rescale y_j - if (d > 2) p->Z[j] = (zj[j] - p->t3P.C3) * ig3; // rescale z_j + X[j] = (xj[j] - t3P.C1) * ig1; // rescale x_j + if (d > 1) // (ok to do inside loop because of branch predict) + Y[j] = (yj[j] - t3P.C2) * ig2; // rescale y_j + if (d > 2) Z[j] = (zj[j] - t3P.C3) * ig3; // rescale z_j } // set up prephase array... std::complex imasign = - (p->fftSign >= 0) ? std::complex(0, 1) : std::complex(0, -1); // +-i - p->prephase.resize(nj); - if (p->t3P.D1 != 0.0 || p->t3P.D2 != 0.0 || p->t3P.D3 != 0.0) { -#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) + (fftSign >= 0) ? std::complex(0, 1) : std::complex(0, -1); // +-i + prephase.resize(nj); + if (t3P.D1 != 0.0 || t3P.D2 != 0.0 || t3P.D3 != 0.0) { +#pragma omp parallel for num_threads(opts.nthreads) schedule(static) for (BIGINT j = 0; j < nj; ++j) { // ... loop over src NU locs - TF phase = p->t3P.D1 * xj[j]; - if (d > 1) phase += p->t3P.D2 * yj[j]; - if (d > 2) phase += p->t3P.D3 * zj[j]; - p->prephase[j] = cos(phase) + imasign * sin(phase); // Euler - // e^{+-i.phase} + TF phase = t3P.D1 * xj[j]; + if (d > 1) phase += t3P.D2 * yj[j]; + if (d > 2) phase += t3P.D3 * zj[j]; + prephase[j] = cos(phase) + imasign * sin(phase); // Euler + // e^{+-i.phase} } } else for (BIGINT j = 0; j < nj; ++j) - p->prephase[j] = (std::complex)1.0; // *** or keep flag so no mult in exec?? + prephase[j] = (std::complex)1.0; // *** or keep flag so no mult in exec?? - // rescale the target s_k etc to s'_k etc... -#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) + // rescale the target s_k etc to s'_k etc... 
+#pragma omp parallel for num_threads(opts.nthreads) schedule(static) for (BIGINT k = 0; k < nk; ++k) { - p->Sp[k] = p->t3P.h1 * p->t3P.gam1 * (s[k] - p->t3P.D1); // so |s'_k| < pi/R + Sp[k] = t3P.h1 * t3P.gam1 * (s[k] - t3P.D1); // so |s'_k| < pi/R if (d > 1) - p->Tp[k] = p->t3P.h2 * p->t3P.gam2 * (t[k] - p->t3P.D2); // so |t'_k| < - // pi/R + Tp[k] = t3P.h2 * t3P.gam2 * (t[k] - t3P.D2); // so |t'_k| < + // pi/R if (d > 2) - p->Up[k] = p->t3P.h3 * p->t3P.gam3 * (u[k] - p->t3P.D3); // so |u'_k| < - // pi/R + Up[k] = t3P.h3 * t3P.gam3 * (u[k] - t3P.D3); // so |u'_k| < + // pi/R } // (old STEP 3a) Compute deconvolution post-factors array (per targ pt)... // (exploits that FT separates because kernel is prod of 1D funcs) - p->deconv.resize(nk); - std::vector phiHatk1(nk); // don't confuse w/ p->phiHat - onedim_nuft_kernel(nk, p->Sp, phiHatk1, p->spopts); // fill phiHat1 + deconv.resize(nk); + std::vector phiHatk1(nk); // don't confuse w/ phiHat + onedim_nuft_kernel(nk, Sp, phiHatk1, spopts); // fill phiHat1 std::vector phiHatk2, phiHatk3; if (d > 1) { phiHatk2.resize(nk); - onedim_nuft_kernel(nk, p->Tp, phiHatk2, p->spopts); // fill phiHat2 + onedim_nuft_kernel(nk, Tp, phiHatk2, spopts); // fill phiHat2 } if (d > 2) { phiHatk3.resize(nk); - onedim_nuft_kernel(nk, p->Up, phiHatk3, p->spopts); // fill phiHat3 + onedim_nuft_kernel(nk, Up, phiHatk3, spopts); // fill phiHat3 } - int Cfinite = - isfinite(p->t3P.C1) && isfinite(p->t3P.C2) && isfinite(p->t3P.C3); // C can be nan - // or inf if - // M=0, no - // input NU pts - int Cnonzero = p->t3P.C1 != 0.0 || p->t3P.C2 != 0.0 || p->t3P.C3 != 0.0; // cen -#pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) + int Cfinite = isfinite(t3P.C1) && isfinite(t3P.C2) && isfinite(t3P.C3); // C can be + // nan or inf + // if M=0, no + // input NU + // pts + int Cnonzero = t3P.C1 != 0.0 || t3P.C2 != 0.0 || t3P.C3 != 0.0; // cen +#pragma omp parallel for num_threads(opts.nthreads) schedule(static) for (BIGINT k = 0; k < nk; ++k) { // .... loop over NU targ freqs TF phiHat = phiHatk1[k]; if (d > 1) phiHat *= phiHatk2[k]; if (d > 2) phiHat *= phiHatk3[k]; - p->deconv[k] = (std::complex)(1.0 / phiHat); + deconv[k] = (std::complex)(1.0 / phiHat); if (Cfinite && Cnonzero) { - TF phase = (s[k] - p->t3P.D1) * p->t3P.C1; - if (d > 1) phase += (t[k] - p->t3P.D2) * p->t3P.C2; - if (d > 2) phase += (u[k] - p->t3P.D3) * p->t3P.C3; - p->deconv[k] *= cos(phase) + imasign * sin(phase); // Euler e^{+-i.phase} + TF phase = (s[k] - t3P.D1) * t3P.C1; + if (d > 1) phase += (t[k] - t3P.D2) * t3P.C2; + if (d > 2) phase += (u[k] - t3P.D3) * t3P.C3; + deconv[k] *= cos(phase) + imasign * sin(phase); // Euler e^{+-i.phase} } } - if (p->opts.debug) + if (opts.debug) printf("[%s t3] phase & deconv factors:\t%.3g s\n", __func__, timer.elapsedsec()); // Set up sort for spreading Cp (from primed NU src pts X, Y, Z) to fw... timer.restart(); - p->sortIndices.resize(p->nj); - p->didSort = indexSort(p->sortIndices, p->nf1, p->nf2, p->nf3, p->nj, p->X, p->Y, - p->Z, p->spopts); - if (p->opts.debug) - printf("[%s t3] sort (didSort=%d):\t\t%.3g s\n", __func__, p->didSort, + sortIndices.resize(nj); + didSort = indexSort(sortIndices, nf1, nf2, nf3, nj, X, Y, Z, spopts); + if (opts.debug) + printf("[%s t3] sort (didSort=%d):\t\t%.3g s\n", __func__, didSort, timer.elapsedsec()); // Plan and setpts once, for the (repeated) inner type 2 finufft call... 
     timer.restart();
-    BIGINT t2nmodes[] = {p->nf1, p->nf2, p->nf3}; // t2 input is actually fw
-    finufft_opts t2opts = p->opts;                // deep copy, since not ptrs
-    t2opts.modeord = 0;                           // needed for correct t3!
-    t2opts.debug = max(0, p->opts.debug - 1);     // don't print as much detail
-    t2opts.spread_debug = max(0, p->opts.spread_debug - 1);
-    t2opts.showwarn = 0;                          // so don't see warnings 2x
+    BIGINT t2nmodes[] = {nf1, nf2, nf3};      // t2 input is actually fw
+    finufft_opts t2opts = opts;               // deep copy, since not ptrs
+    t2opts.modeord = 0;                       // needed for correct t3!
+    t2opts.debug = max(0, opts.debug - 1);    // don't print as much detail
+    t2opts.spread_debug = max(0, opts.spread_debug - 1);
+    t2opts.showwarn = 0;                      // so don't see warnings 2x
     // (...could vary other t2opts here?)
-    if (p->innerT2plan) {
-      delete p->innerT2plan;
-      p->innerT2plan = nullptr;
+    if (innerT2plan) {
+      delete innerT2plan;
+      innerT2plan = nullptr;
     }
-    int ier = finufft_makeplan_t(2, d, t2nmodes, p->fftSign, p->batchSize, p->tol,
-                                 &p->innerT2plan, &t2opts);
+    int ier = finufft_makeplan_t(2, d, t2nmodes, fftSign, batchSize, tol,
+                                 &innerT2plan, &t2opts);
     if (ier > 1) { // if merely warning, still proceed
       fprintf(stderr, "[%s t3]: inner type 2 plan creation failed with ier=%d!\n",
               __func__, ier);
       return ier;
     }
-    ier = finufft_setpts_t(p->innerT2plan, nk, p->Sp.data(), p->Tp.data(),
-                           p->Up.data(), 0, nullptr, nullptr,
+    ier = finufft_setpts_t(innerT2plan, nk, Sp.data(), Tp.data(), Up.data(), 0,
+                           nullptr, nullptr,
                            nullptr); // note nk = # output points (not nj)
     if (ier > 1) {
       fprintf(stderr, "[%s t3]: inner type 2 setpts failed, ier=%d!\n", __func__, ier);
       return ier;
     }
-    if (p->opts.debug)
+    if (opts.debug)
       printf("[%s t3] inner t2 plan & setpts: \t%.3g s\n", __func__,
              timer.elapsedsec());
   }
   return 0;
 }
+template<typename TF>
+int finufft_setpts_t(FINUFFT_PLAN_T<TF> *p, BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk,
+                     TF *s, TF *t, TF *u)
+/* For type 1,2: just checks and (possibly) sorts the NU xyz points, in prep for
+   spreading. (The last 4 arguments are ignored.)
+   For type 3: allocates internal working arrays, scales/centers the NU points
+   and NU target freqs (stu), evaluates spreading kernel FT at all target freqs.
+*/
+{
+  return p->setpts(nj, xj, yj, zj, nk, s, t, u);
+}
 template int finufft_setpts_t(FINUFFT_PLAN_T<float> *p, BIGINT nj, float *xj, float *yj,
                               float *zj, BIGINT nk, float *s, float *t, float *u);
@@ -1022,7 +1023,7 @@ template int finufft_setpts_t(FINUFFT_PLAN_T<double> *p, BIGINT nj, doub
 
 // EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE
 template<typename TF>
-int finufft_execute_t(FINUFFT_PLAN_T<TF> *p, std::complex<TF> *cj, std::complex<TF> *fk) {
+int FINUFFT_PLAN_T<TF>::execute(std::complex<TF> *cj, std::complex<TF> *fk) {
   /* See ../docs/cguru.doc for current documentation.
 
      For given (stack of) weights cj or coefficients fk, performs NUFFTs with
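The hunks above and below apply the same refactor in both directions: the body of each
guru-API entry point moves into a FINUFFT_PLAN_T member function, while the old free
function survives as a one-line forwarder so existing callers compile unchanged. A
minimal self-contained sketch of that pattern (hypothetical names, not FINUFFT's API):

    #include <cstdio>

    template<typename T> struct PlanSketch {
      T tol;
      // the logic now lives in a member function
      int execute() { std::printf("execute, tol=%g\n", double(tol)); return 0; }
    };

    // legacy free function kept as a thin wrapper; callers need no changes
    template<typename T> int plan_execute(PlanSketch<T> *p) { return p->execute(); }

    int main() {
      PlanSketch<float> p{1e-6f};
      return plan_execute(&p); // forwards to the member function
    }
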
@@ -1038,52 +1039,52 @@ int finufft_execute_t(FINUFFT_PLAN_T<TF> *p, std::complex<TF> *cj, std::complex<
   CNTime timer;
   timer.start();
 
-  if (p->type != 3) { // --------------------- TYPE 1,2 EXEC ------------------
+  if (type != 3) { // --------------------- TYPE 1,2 EXEC ------------------
 
     double t_sprint = 0.0, t_fft = 0.0, t_deconv = 0.0; // accumulated timing
-    if (p->opts.debug)
-      printf("[%s] start ntrans=%d (%d batches, bsize=%d)...\n", __func__, p->ntrans,
-             p->nbatch, p->batchSize);
+    if (opts.debug)
+      printf("[%s] start ntrans=%d (%d batches, bsize=%d)...\n", __func__, ntrans, nbatch,
+             batchSize);
 
-    for (int b = 0; b * p->batchSize < p->ntrans; b++) { // .....loop b over batches
+    for (int b = 0; b * batchSize < ntrans; b++) { // .....loop b over batches
 
       // current batch is either batchSize, or possibly truncated if last one
-      int thisBatchSize = min(p->ntrans - b * p->batchSize, p->batchSize);
-      int bB = b * p->batchSize;               // index of vector, since batchsizes same
-      std::complex<TF> *cjb = cj + bB * p->nj; // point to batch of weights
-      std::complex<TF> *fkb = fk + bB * p->N;  // point to batch of mode coeffs
-      if (p->opts.debug > 1)
+      int thisBatchSize = min(ntrans - b * batchSize, batchSize);
+      int bB = b * batchSize;               // index of vector, since batchsizes same
+      std::complex<TF> *cjb = cj + bB * nj; // point to batch of weights
+      std::complex<TF> *fkb = fk + bB * N;  // point to batch of mode coeffs
+      if (opts.debug > 1)
         printf("[%s] start batch %d (size %d):\n", __func__, b, thisBatchSize);
 
       // STEP 1: (varies by type)
       timer.restart();
-      if (p->type == 1) { // type 1: spread NU pts p->X, weights cj, to fw grid
-        spreadinterpSortedBatch(thisBatchSize, p, cjb);
+      if (type == 1) { // type 1: spread NU pts X, weights cj, to fw grid
+        spreadinterpSortedBatch(thisBatchSize, this, cjb);
         t_sprint += timer.elapsedsec();
       } else { //  type 2: amplify Fourier coeffs fk into 0-padded fw
-        deconvolveBatch(thisBatchSize, p, fkb);
+        deconvolveBatch(thisBatchSize, this, fkb);
         t_deconv += timer.elapsedsec();
       }
 
       // STEP 2: call the FFT on this batch
       timer.restart();
-      do_fft(p);
+      do_fft(this);
       t_fft += timer.elapsedsec();
-      if (p->opts.debug > 1) printf("\tFFT exec:\t\t%.3g s\n", timer.elapsedsec());
+      if (opts.debug > 1) printf("\tFFT exec:\t\t%.3g s\n", timer.elapsedsec());
 
       // STEP 3: (varies by type)
       timer.restart();
-      if (p->type == 1) { // type 1: deconvolve (amplify) fw and shuffle to fk
-        deconvolveBatch(thisBatchSize, p, fkb);
+      if (type == 1) { // type 1: deconvolve (amplify) fw and shuffle to fk
+        deconvolveBatch(thisBatchSize, this, fkb);
         t_deconv += timer.elapsedsec();
       } else { // type 2: interpolate unif fw grid to NU target pts
-        spreadinterpSortedBatch(thisBatchSize, p, cjb);
+        spreadinterpSortedBatch(thisBatchSize, this, cjb);
         t_sprint += timer.elapsedsec();
       }
     } // ........end b loop
 
-    if (p->opts.debug) { // report total times in their natural order...
-      if (p->type == 1) {
+    if (opts.debug) { // report total times in their natural order...
+      if (type == 1) {
         printf("[%s] done. tot spread:\t\t%.3g s\n", __func__, t_sprint);
         printf("   tot FFT:\t\t\t\t%.3g s\n", t_fft);
         printf("   tot deconvolve:\t\t\t%.3g s\n", t_deconv);
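Before the type-3 branch below, the batch bookkeeping used in the loop above is worth
seeing in isolation: every batch holds batchSize vectors except possibly the last, which
is truncated. A toy standalone sketch (sizes invented for illustration):

    #include <algorithm>
    #include <cstdio>

    int main() {
      const int ntrans = 10, batchSize = 4; // invented values
      for (int b = 0; b * batchSize < ntrans; b++) {
        int thisBatchSize = std::min(ntrans - b * batchSize, batchSize);
        int bB = b * batchSize; // index of the first vector in this batch
        std::printf("batch %d covers vectors [%d,%d)\n", b, bB, bB + thisBatchSize);
      }
      return 0; // prints batches of sizes 4, 4, 2
    }
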
@@ -1102,56 +1103,56 @@ int finufft_execute_t(FINUFFT_PLAN_T<TF> *p, std::complex<TF> *cj, std::complex<
 
     double t_pre = 0.0, t_spr = 0.0, t_t2 = 0.0, t_deconv = 0.0; // accumulated timings
-    if (p->opts.debug)
-      printf("[%s t3] start ntrans=%d (%d batches, bsize=%d)...\n", __func__, p->ntrans,
-             p->nbatch, p->batchSize);
+    if (opts.debug)
+      printf("[%s t3] start ntrans=%d (%d batches, bsize=%d)...\n", __func__, ntrans,
+             nbatch, batchSize);
 
-    for (int b = 0; b * p->batchSize < p->ntrans; b++) { // .....loop b over batches
+    for (int b = 0; b * batchSize < ntrans; b++) { // .....loop b over batches
 
       // batching and pointers to this batch, identical to t1,2 above...
-      int thisBatchSize = min(p->ntrans - b * p->batchSize, p->batchSize);
-      int bB = b * p->batchSize;
-      std::complex<TF> *cjb = cj + bB * p->nj; // batch of input strengths
-      std::complex<TF> *fkb = fk + bB * p->nk; // batch of output strengths
-      if (p->opts.debug > 1)
+      int thisBatchSize = min(ntrans - b * batchSize, batchSize);
+      int bB = b * batchSize;
+      std::complex<TF> *cjb = cj + bB * nj; // batch of input strengths
+      std::complex<TF> *fkb = fk + bB * nk; // batch of output strengths
+      if (opts.debug > 1)
         printf("[%s t3] start batch %d (size %d):\n", __func__, b, thisBatchSize);
 
       // STEP 0: pre-phase (possibly) the c_j input strengths into c'_j batch...
       timer.restart();
-#pragma omp parallel for num_threads(p->opts.nthreads) // or p->batchSize?
+#pragma omp parallel for num_threads(opts.nthreads) // or batchSize?
       for (int i = 0; i < thisBatchSize; i++) {
-        BIGINT ioff = i * p->nj;
-        for (BIGINT j = 0; j < p->nj; ++j) {
-          p->CpBatch[ioff + j] = p->prephase[j] * cjb[ioff + j];
+        BIGINT ioff = i * nj;
+        for (BIGINT j = 0; j < nj; ++j) {
+          CpBatch[ioff + j] = prephase[j] * cjb[ioff + j];
         }
       }
       t_pre += timer.elapsedsec();
 
       // STEP 1: spread c'_j batch (x'_j NU pts) into fw batch grid...
       timer.restart();
-      p->spopts.spread_direction = 1; // spread
-      spreadinterpSortedBatch(thisBatchSize, p, p->CpBatch.data()); // p->X are primed
+      spopts.spread_direction = 1; // spread
+      spreadinterpSortedBatch(thisBatchSize, this, CpBatch.data()); // X are primed
       t_spr += timer.elapsedsec();
 
       // STEP 2: type 2 NUFFT from fw batch to user output fk array batch...
       timer.restart();
       // illegal possible shrink of ntrans *after* plan for smaller last batch:
-      p->innerT2plan->ntrans = thisBatchSize; // do not try this at home!
+      innerT2plan->ntrans = thisBatchSize; // do not try this at home!
       /* (alarming that FFT not shrunk, but safe, because t2's fwBatch array still the
          same size, as Andrea explained; just wastes a few flops) */
-      finufft_execute_t(p->innerT2plan, fkb, p->fwBatch);
+      finufft_execute_t(innerT2plan, fkb, fwBatch);
       t_t2 += timer.elapsedsec();
 
       // STEP 3: apply deconvolve (precomputed 1/phiHat(targ_k), phasing too)...
       timer.restart();
-#pragma omp parallel for num_threads(p->opts.nthreads)
+#pragma omp parallel for num_threads(opts.nthreads)
       for (int i = 0; i < thisBatchSize; i++) {
-        BIGINT ioff = i * p->nk;
-        for (BIGINT k = 0; k < p->nk; ++k) fkb[ioff + k] *= p->deconv[k];
+        BIGINT ioff = i * nk;
+        for (BIGINT k = 0; k < nk; ++k) fkb[ioff + k] *= deconv[k];
       }
       t_deconv += timer.elapsedsec();
     } // ........end b loop
 
-    if (p->opts.debug) { // report total times in their natural order...
+    if (opts.debug) { // report total times in their natural order...
       printf("[%s t3] done. tot prephase:\t\t%.3g s\n", __func__, t_pre);
       printf("   tot spread:\t\t\t%.3g s\n", t_spr);
       printf("   tot type 2:\t\t\t%.3g s\n", t_t2);
       printf("   tot deconvolve:\t\t%.3g s\n", t_deconv);
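STEP 0 and STEP 3 above both evaluate e^{+-i.phase} via Euler's formula,
cos(phase) + imasign*sin(phase). For reference, std::polar produces the same
unit-modulus value, as this standalone check sketches:

    #include <cassert>
    #include <cmath>
    #include <complex>

    int main() {
      const double phase = 0.7;           // arbitrary test phase
      std::complex<double> imasign(0, 1); // +i, i.e. the fftSign >= 0 case
      std::complex<double> euler = std::cos(phase) + imasign * std::sin(phase);
      std::complex<double> polar = std::polar(1.0, phase); // same point on unit circle
      assert(std::abs(euler - polar) < 1e-15);
      return 0;
    }
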
@@ -1163,6 +1164,22 @@ int finufft_execute_t(FINUFFT_PLAN_T<TF> *p, std::complex<TF> *cj, std::complex<
 
   return 0;
 }
+template<typename TF>
+int finufft_execute_t(FINUFFT_PLAN_T<TF> *p, std::complex<TF> *cj, std::complex<TF> *fk) {
+  /* See ../docs/cguru.doc for current documentation.
+
+     For given (stack of) weights cj or coefficients fk, performs NUFFTs with
+     existing (sorted) NU pts and existing plan.
+     For type 1 and 3: cj is input, fk is output.
+     For type 2: fk is input, cj is output.
+     Performs spread/interp, pre/post deconvolve, and FFT as appropriate
+     for each of the 3 types.
+     For cases of ntrans>1, performs work in blocks of size up to batchSize.
+     Return value 0 (no error diagnosis yet).
+     Barnett 5/20/20, based on Malleo 2019.
+*/
+  return p->execute(cj, fk);
+}
 template int finufft_execute_t(FINUFFT_PLAN_T<float> *p, std::complex<float> *cj,
                                std::complex<float> *fk);
 template int finufft_execute_t(
@@ -1175,8 +1192,7 @@ template<typename TF> FINUFFT_PLAN_T<TF>::~FINUFFT_PLAN_T() {
   // Thus either each thing free'd here is guaranteed to be nullptr or correctly
   // allocated.
   if (fftPlan) fftPlan->free(fwBatch); // free the big FFTW (or t3 spread) working array
-  if (type == 1 || type == 2) {
-  } else { // free the stuff alloc for type 3 only
+  if (type == 3) {
     delete innerT2plan;
     innerT2plan = nullptr;
     free(X);
diff --git a/src/utils.cpp b/src/utils.cpp
index 488792f78..f64009132 100644
--- a/src/utils.cpp
+++ b/src/utils.cpp
@@ -33,9 +33,9 @@ BIGINT next235even(BIGINT n)
 // ----------------------- helpers for timing (always stay double prec) ------
 
 void CNTime::start() {
-  initial = std::chrono::duration_cast<std::chrono::microseconds>(
-                std::chrono::steady_clock::now().time_since_epoch())
-                .count() *
+  initial = double(std::chrono::duration_cast<std::chrono::microseconds>(
+                       std::chrono::steady_clock::now().time_since_epoch())
+                       .count()) *
             1e-6;
 }
@@ -53,7 +53,7 @@ double CNTime::elapsedsec()
   std::uint64_t now = std::chrono::duration_cast<std::chrono::microseconds>(
                           std::chrono::steady_clock::now().time_since_epoch())
                           .count();
-  const double nowsec = now * 1e-6;
+  const double nowsec = double(now) * 1e-6;
   return nowsec - initial;
 }
@@ -75,7 +75,7 @@ int get_num_threads_parallel_block()
 // ---------- thread-safe rand number generator for Windows platform ---------
 // (note this is used by macros in defs.h, and supplied in linux/macosx)
 #ifdef _WIN32
-int rand_r(unsigned int *seedp)
+int rand_r(unsigned int * /*seedp*/)
 // Libin Lu, 6/18/20
 {
   std::random_device rd;
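For context on the rand_r change above: a thread-safe shim in this style can be
assembled from <random> roughly as follows. This is a sketch under assumed behavior
(name rand_r_sketch and engine choice are assumptions), not the patch's exact code:

    #include <cstdlib> // for RAND_MAX
    #include <random>

    int rand_r_sketch(unsigned int * /*seedp*/) { // seed ignored, as in the shim above
      std::random_device rd;                      // nondeterministic seed source
      std::default_random_engine gen(rd());
      std::uniform_int_distribution<int> dist(0, RAND_MAX);
      return dist(gen);
    }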