diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 0140e54f9..305261f66 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -50,7 +50,8 @@ FINUFFT_NEVER_INLINE void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset3, BIGINT padded_size1, BIGINT size1, BIGINT size2, BIGINT size3, BIGINT M0); -FINUFFT_ALWAYS_INLINE xsimd::batch fold_rescale_vec(xsimd::batch x, BIGINT N); +FINUFFT_ALWAYS_INLINE xsimd::batch fold_rescale_vec(xsimd::batch x, FLT N); +FINUFFT_ALWAYS_INLINE xsimd::batch fold(xsimd::batch x); } // namespace // declarations of purely internal functions... (thus need not be in .h) template::size; const auto regular_part = M & (-avx_width); @@ -1611,22 +1618,25 @@ void bin_sort_singlethread( return xsimd::batch_cast>(bins); }; - const auto compute_bins = [=](auto... args) constexpr noexcept { + const auto compute_bins = [=](const auto... args) constexpr noexcept { const std::array k_arr = {args...}; // - auto bins = - to_uint(fold_rescale_vec(xsimd::load_unaligned(k_arr[0]), N1) * inv_bin_size_x); + auto bins0 = to_uint(fold(xsimd::load_unaligned(k_arr[0])) * rescale1); + decltype(bins0) bins1{0u}; + decltype(bins0) bins2{0u}; if constexpr (sizeof...(args) > 1) { - const auto i2 = - to_uint(fold_rescale_vec(xsimd::load_unaligned(k_arr[1]), N2) * inv_bin_size_y); - bins = xsimd::fma(decltype(bins)(nbins1), i2, bins); + const auto i2 = to_uint(fold(xsimd::load_unaligned(k_arr[1])) * rescale2); + bins1 = nbins1 * i2; + } else { + return bins0; } if constexpr (sizeof...(args) > 2) { - const auto i3 = - to_uint(fold_rescale_vec(xsimd::load_unaligned(k_arr[2]), N3) * inv_bin_size_z); - bins = xsimd::fma(decltype(bins)(nbins12), i3, bins); + const auto i3 = to_uint(fold(xsimd::load_unaligned(k_arr[2])) * rescale3); + bins2 = nbins12 * i3; + } else { + return bins0 + bins1; } - return bins; + return bins0 + bins1 + bins2; }; const auto increment_bins = [&counts](const auto bins) constexpr noexcept { @@ -2044,12 +2054,19 @@ void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset // hence moved them here FINUFFT_ALWAYS_INLINE xsimd::batch fold_rescale_vec(const xsimd::batch x, - const BIGINT N) { + const FLT N) { + const xsimd::batch x2pi{FLT(M_1_2PI)}; + const xsimd::batch half{FLT(0.5)}; + auto result = xsimd::fma(x, x2pi, half); + result -= xsimd::floor(result); + return result * N; +} +FINUFFT_ALWAYS_INLINE xsimd::batch fold(xsimd::batch x) { const xsimd::batch x2pi{FLT(M_1_2PI)}; const xsimd::batch half{FLT(0.5)}; auto result = xsimd::fma(x, x2pi, half); result -= xsimd::floor(result); - return result * FLT(N); + return result; } } // namespace } // namespace finufft::spreadinterp