From c8f389a48df7afef4f863b9756c5e10b86c6048d Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Tue, 11 Jun 2024 15:04:50 -0400 Subject: [PATCH] Vectorized single thread binsort --- src/spreadinterp.cpp | 219 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 194 insertions(+), 25 deletions(-) diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 0ee348d3f..93847d9bb 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -50,6 +50,7 @@ FINUFFT_NEVER_INLINE void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset3, BIGINT padded_size1, BIGINT size1, BIGINT size2, BIGINT size3, BIGINT M0); +FINUFFT_ALWAYS_INLINE xsimd::batch fold_rescale_vec(xsimd::batch x, BIGINT N); } // namespace // declarations of purely internal functions... (thus need not be in .h) template 1), iskz = (N3 > 1); // ky,kz avail? (cannot access if not) +// // here the +1 is needed to allow round-off error causing i1=N1/bin_size_x, +// // for kx near +pi, ie foldrescale gives N1 (exact arith would be 0 to N1-1). +// // Note that round-off near kx=-pi stably rounds negative to i1=0. +// const auto nbins1 = BIGINT(FLT(N1) / bin_size_x + 1); +// const auto nbins2 = isky ? BIGINT(FLT(N2) / bin_size_y + 1) : 1; +// const auto nbins3 = iskz ? BIGINT(FLT(N3) / bin_size_z + 1) : 1; +// const auto nbins = nbins1 * nbins2 * nbins3; +// const auto inv_bin_size_x = FLT(1.0 / bin_size_x); +// const auto inv_bin_size_y = FLT(1.0 / bin_size_y); +// const auto inv_bin_size_z = FLT(1.0 / bin_size_z); +// // count how many pts in each bin +// std::vector counts(nbins, 0); +// +// for (auto i = 0; i < M; i++) { +// // find the bin index in however many dims are needed +// const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); +// const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; +// const auto i3 = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; +// const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); +// ++counts[bin]; +// } +// +// // compute the offsets directly in the counts array (no offset array) +// BIGINT current_offset = 0; +// for (BIGINT i = 0; i < nbins; i++) { +// BIGINT tmp = counts[i]; +// counts[i] = current_offset; // Reinecke's cute replacement of counts[i] +// current_offset += tmp; +// } // (counts now contains the index offsets for each bin) +// +// for (auto i = 0; i < M; i++) { +// // find the bin index (again! but better than using RAM) +// const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); +// const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; +// const auto i3 = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; +// const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); +// ret[counts[bin]] = BIGINT(i); // fill the inverse map on the fly +// ++counts[bin]; // update the offsets +// } +//} + +void bin_sort_singlethread( + BIGINT *ret, const BIGINT M, const FLT *kx, const FLT *ky, const FLT *kz, + const BIGINT N1, const BIGINT N2, const BIGINT N3, const double bin_size_x, + const double bin_size_y, const double bin_size_z, const int debug) /* Returns permutation of all nonuniform points with good RAM access, * ie less cache misses for spreading, in 1D, 2D, or 3D. Single-threaded version * @@ -1503,45 +1580,125 @@ void bin_sort_singlethread(BIGINT *ret, BIGINT M, FLT *kx, FLT *ky, FLT *kz, BIG * Simplified by Martin Reinecke, 6/19/23 (no apparent effect on speed). */ { + static constexpr auto alignment = xsimd::batch::arch_type::alignment(); const auto isky = (N2 > 1), iskz = (N3 > 1); // ky,kz avail? (cannot access if not) // here the +1 is needed to allow round-off error causing i1=N1/bin_size_x, // for kx near +pi, ie foldrescale gives N1 (exact arith would be 0 to N1-1). // Note that round-off near kx=-pi stably rounds negative to i1=0. - const auto nbins1 = BIGINT(FLT(N1) / bin_size_x + 1); - const auto nbins2 = isky ? BIGINT(FLT(N2) / bin_size_y + 1) : 1; - const auto nbins3 = iskz ? BIGINT(FLT(N3) / bin_size_z + 1) : 1; + const auto nbins1 = UBIGINT(FLT(N1) / bin_size_x + 1); + const auto nbins2 = isky ? UBIGINT(FLT(N2) / bin_size_y + 1) : 1; + const auto nbins3 = iskz ? UBIGINT(FLT(N3) / bin_size_z + 1) : 1; + const auto nbins12 = nbins1 * nbins2; const auto nbins = nbins1 * nbins2 * nbins3; const auto inv_bin_size_x = FLT(1.0 / bin_size_x); const auto inv_bin_size_y = FLT(1.0 / bin_size_y); const auto inv_bin_size_z = FLT(1.0 / bin_size_z); + + static constexpr auto avx_width = xsimd::batch::size; + const auto regular_part = M & (-avx_width); + // count how many pts in each bin - std::vector counts(nbins, 0); + std::vector counts(nbins, 0); + + static constexpr auto to_array = [](const auto bins) noexcept { + using contained_t = typename decltype(bins)::value_type; + alignas(alignment) std::array result{}; + bins.store_aligned(result.data()); + return result; + }; + + static constexpr auto to_uint = [](const xsimd::batch bins) noexcept { + return xsimd::batch_cast>(bins); + }; + + const auto compute_bins = [=](auto... args) constexpr noexcept { + std::array k_arr = {args...}; + auto bins = xsimd::floor( + fold_rescale_vec(xsimd::load_unaligned(k_arr[0]), N1) * inv_bin_size_x); + if constexpr (sizeof...(args) > 1) { + const auto i2 = xsimd::floor( + fold_rescale_vec(xsimd::load_unaligned(k_arr[1]), N2) * inv_bin_size_y); + bins = xsimd::fma(decltype(bins)(nbins1), i2, bins); + } + if constexpr (sizeof...(args) > 2) { + const auto i3 = xsimd::floor( + fold_rescale_vec(xsimd::load_unaligned(k_arr[2]), N3) * inv_bin_size_z); + bins = xsimd::fma(decltype(bins)(nbins12), i3, bins); + } + return to_uint(bins); + }; + + const auto increment_bins = [&counts](const auto bins) constexpr noexcept { + const auto bin_array = to_array(bins); + for (const auto bin : bin_array) { + ++counts[bin]; + } + }; + + const auto accumulate_bins = [&counts, &ret](const auto bins, + const auto i) constexpr noexcept { + const auto bin_array = to_array(bins); + for (uint8_t j{0}; j < avx_width; ++j) { + const auto bin = bin_array[j]; + // fill the inverse map on the fly, careful of indexes errors + ret[counts[bin]] = i + j; + ++counts[bin]; + } + }; - for (auto i = 0; i < M; i++) { + UBIGINT i{0}; + if (iskz) { + for (; i < regular_part; i += avx_width) { + increment_bins(compute_bins(kx + i, ky + i, kz + i)); + } + } else if (isky) { + for (; i < regular_part; i += avx_width) { + increment_bins(compute_bins(kx + i, ky + i)); + } + } else { + for (; i < regular_part; i += avx_width) { + increment_bins(compute_bins(kx + i)); + } + } + + for (; i < M; ++i) { // find the bin index in however many dims are needed - const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); - const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; - const auto i3 = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; + const auto i1 = UBIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); + const auto i2 = isky ? UBIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; + const auto i3 = iskz ? UBIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); ++counts[bin]; } // compute the offsets directly in the counts array (no offset array) - BIGINT current_offset = 0; - for (BIGINT i = 0; i < nbins; i++) { - BIGINT tmp = counts[i]; - counts[i] = current_offset; // Reinecke's cute replacement of counts[i] - current_offset += tmp; + UBIGINT current_offset{0}; // Reinecke's cute replacement of counts[i] + for (i = 0; i < nbins; ++i) { + counts[i] = std::exchange(current_offset, current_offset + counts[i]); } // (counts now contains the index offsets for each bin) - for (auto i = 0; i < M; i++) { + i = 0; // we need to redo the loop so variable should be zeroed here + if (iskz) { + for (; i < regular_part; i += avx_width) { + accumulate_bins(compute_bins(kx + i, ky + i, kz + i), i); + } + } else if (isky) { + for (; i < regular_part; i += avx_width) { + accumulate_bins(compute_bins(kx + i, ky + i), i); + } + } else { + for (; i < regular_part; i += avx_width) { + accumulate_bins(compute_bins(kx + i), i); + } + } + + for (; i < M; ++i) { // find the bin index (again! but better than using RAM) - const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); - const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; - const auto i3 = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; + const auto i1 = UBIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); + const auto i2 = isky ? UBIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; + const auto i3 = iskz ? UBIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); - ret[counts[bin]] = BIGINT(i); // fill the inverse map on the fly - ++counts[bin]; // update the offsets + ret[counts[bin]] = i; // fill the inverse map on the fly + ++counts[bin]; // update the offsets } } @@ -1881,5 +2038,17 @@ void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset break; } } + +// since xsimd is not constexpr this slows down the loop due to the guard variables +// hence moved them here + +FINUFFT_ALWAYS_INLINE xsimd::batch fold_rescale_vec(const xsimd::batch x, + const BIGINT N) { + const xsimd::batch x2pi{FLT(M_1_2PI)}; + const xsimd::batch half{FLT(0.5)}; + auto result = xsimd::fma(x, x2pi, half); + result -= xsimd::floor(result); + return result * FLT(N); +} } // namespace } // namespace finufft::spreadinterp