From c8f389a48df7afef4f863b9756c5e10b86c6048d Mon Sep 17 00:00:00 2001
From: Marco Barbone <mbarbone@flatironinstitute.org>
Date: Tue, 11 Jun 2024 15:04:50 -0400
Subject: [PATCH] Vectorized single thread binsort

---
 src/spreadinterp.cpp | 219 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 194 insertions(+), 25 deletions(-)
diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp
index 0ee348d3f..93847d9bb 100644
--- a/src/spreadinterp.cpp
+++ b/src/spreadinterp.cpp
@@ -50,6 +50,7 @@ FINUFFT_NEVER_INLINE
 void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset3,
                         BIGINT padded_size1, BIGINT size1, BIGINT size2, BIGINT size3,
                         BIGINT M0);
+FINUFFT_ALWAYS_INLINE xsimd::batch<FLT> fold_rescale_vec(xsimd::batch<FLT> x, BIGINT N);
 } // namespace
 // declarations of purely internal functions... (thus need not be in .h)
 template<uint8_t ns, uint8_t kerevalmeth, class T,
@@ -88,9 +89,10 @@ static void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3,
                                 BIGINT padded_size1, BIGINT size1, BIGINT size2,
                                 BIGINT size3, BIGINT N1, BIGINT N2, BIGINT N3,
                                 FLT *FINUFFT_RESTRICT data_uniform, const FLT *du0);
-static void bin_sort_singlethread(BIGINT *ret, BIGINT M, FLT *kx, FLT *ky, FLT *kz,
-                                  BIGINT N1, BIGINT N2, BIGINT N3, double bin_size_x,
-                                  double bin_size_y, double bin_size_z, int debug);
+static void bin_sort_singlethread(BIGINT *ret, BIGINT M, const FLT *kx, const FLT *ky,
+                                  const FLT *kz, BIGINT N1, BIGINT N2, BIGINT N3,
+                                  double bin_size_x, double bin_size_y, double bin_size_z,
+                                  int debug);
 void bin_sort_multithread(BIGINT *ret, BIGINT M, FLT *kx, FLT *ky, FLT *kz, BIGINT N1,
                           BIGINT N2, BIGINT N3, double bin_size_x, double bin_size_y,
                           double bin_size_z, int debug, int nthr);
@@ -1472,9 +1474,84 @@ void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3,
   }
 }
 
-void bin_sort_singlethread(BIGINT *ret, BIGINT M, FLT *kx, FLT *ky, FLT *kz, BIGINT N1,
-                           BIGINT N2, BIGINT N3, double bin_size_x, double bin_size_y,
-                           double bin_size_z, int debug)
+// void bin_sort_singlethread(
+//     BIGINT *ret, const BIGINT M, const FLT *kx, const FLT *ky, const FLT *kz,
+//     const BIGINT N1, const BIGINT N2, const BIGINT N3, const double bin_size_x,
+//     const double bin_size_y, const double bin_size_z, const int debug)
+///* Returns permutation of all nonuniform points with good RAM access,
+// * ie less cache misses for spreading, in 1D, 2D, or 3D. Single-threaded version
+// *
+// * This is achieved by binning into cuboids (of given bin_size within the
+// * overall box domain), then reading out the indices within
+// * these bins in a Cartesian cuboid ordering (x fastest, y med, z slowest).
+// * Finally the permutation is inverted, so that the good ordering is: the
+// * NU pt of index ret[0], the NU pt of index ret[1],..., NU pt of index ret[M-1]
+// *
+// * Inputs: M - number of input NU points.
+// *         kx,ky,kz - length-M arrays of real coords of NU pts in [-pi, pi).
+// *                    Points outside this range are folded into it.
+// *         N1,N2,N3 - integer sizes of overall box (N2=N3=1 for 1D, N3=1 for 2D)
+// *         bin_size_x,y,z - what binning box size to use in each dimension
+// *                    (in rescaled coords where ranges are [0,Ni] ).
+// *                    For 1D, only bin_size_x is used; for 2D, it & bin_size_y.
+// * Output:
+// *         writes to ret a vector list of indices, each in the range 0,..,M-1.
+// *         Thus, ret must have been preallocated for M BIGINTs.
+// *
+// * Notes: I compared RAM usage against declaring an internal vector and passing
+// * back; the latter used more RAM and was slower.
+// * Avoided the bins array, as in JFM's spreader of 2016,
+// * tidied up, early 2017, Barnett.
+// * Timings (2017): 3s for M=1e8 NU pts on 1 core of i7; 5s on 1 core of xeon.
+// * Simplified by Martin Reinecke, 6/19/23 (no apparent effect on speed).
+// */
+//{
+//  const auto isky = (N2 > 1), iskz = (N3 > 1); // ky,kz avail? (cannot access if not)
+//  // here the +1 is needed to allow round-off error causing i1=N1/bin_size_x,
+//  // for kx near +pi, ie foldrescale gives N1 (exact arith would be 0 to N1-1).
+//  // Note that round-off near kx=-pi stably rounds negative to i1=0.
+//  const auto nbins1         = BIGINT(FLT(N1) / bin_size_x + 1);
+//  const auto nbins2         = isky ? BIGINT(FLT(N2) / bin_size_y + 1) : 1;
+//  const auto nbins3         = iskz ? BIGINT(FLT(N3) / bin_size_z + 1) : 1;
+//  const auto nbins          = nbins1 * nbins2 * nbins3;
+//  const auto inv_bin_size_x = FLT(1.0 / bin_size_x);
+//  const auto inv_bin_size_y = FLT(1.0 / bin_size_y);
+//  const auto inv_bin_size_z = FLT(1.0 / bin_size_z);
+//  // count how many pts in each bin
+//  std::vector<BIGINT> counts(nbins, 0);
+//
+//  for (auto i = 0; i < M; i++) {
+//    // find the bin index in however many dims are needed
+//    const auto i1  = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x);
+//    const auto i2  = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0;
+//    const auto i3  = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0;
+//    const auto bin = i1 + nbins1 * (i2 + nbins2 * i3);
+//    ++counts[bin];
+//  }
+//
+//  // compute the offsets directly in the counts array (no offset array)
+//  BIGINT current_offset = 0;
+//  for (BIGINT i = 0; i < nbins; i++) {
+//    BIGINT tmp = counts[i];
+//    counts[i]  = current_offset; // Reinecke's cute replacement of counts[i]
+//    current_offset += tmp;
+//  } // (counts now contains the index offsets for each bin)
+//
+//  for (auto i = 0; i < M; i++) {
+//    // find the bin index (again! but better than using RAM)
+//    const auto i1    = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x);
+//    const auto i2    = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0;
+//    const auto i3    = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0;
+//    const auto bin   = i1 + nbins1 * (i2 + nbins2 * i3);
+//    ret[counts[bin]] = BIGINT(i); // fill the inverse map on the fly
+//    ++counts[bin];                // update the offsets
+//  }
+//}
+
+void bin_sort_singlethread(
+    BIGINT *ret, const BIGINT M, const FLT *kx, const FLT *ky, const FLT *kz,
+    const BIGINT N1, const BIGINT N2, const BIGINT N3, const double bin_size_x,
+    const double bin_size_y, const double bin_size_z, const int debug)
 /* Returns permutation of all nonuniform points with good RAM access,
  * ie less cache misses for spreading, in 1D, 2D, or 3D. Single-threaded version
  *
@@ -1503,45 +1580,125 @@ void bin_sort_singlethread(BIGINT *ret, BIGINT M, FLT *kx, FLT *ky, FLT *kz, BIG
  * Simplified by Martin Reinecke, 6/19/23 (no apparent effect on speed).
  */
 {
+  static constexpr auto alignment = xsimd::batch<FLT>::arch_type::alignment();
   const auto isky = (N2 > 1), iskz = (N3 > 1); // ky,kz avail? (cannot access if not)
   // here the +1 is needed to allow round-off error causing i1=N1/bin_size_x,
   // for kx near +pi, ie foldrescale gives N1 (exact arith would be 0 to N1-1).
   // Note that round-off near kx=-pi stably rounds negative to i1=0.
-  const auto nbins1         = BIGINT(FLT(N1) / bin_size_x + 1);
-  const auto nbins2         = isky ? BIGINT(FLT(N2) / bin_size_y + 1) : 1;
-  const auto nbins3         = iskz ? BIGINT(FLT(N3) / bin_size_z + 1) : 1;
+  const auto nbins1         = UBIGINT(FLT(N1) / bin_size_x + 1);
+  const auto nbins2         = isky ? UBIGINT(FLT(N2) / bin_size_y + 1) : 1;
+  const auto nbins3         = iskz ? UBIGINT(FLT(N3) / bin_size_z + 1) : 1;
+  const auto nbins12        = nbins1 * nbins2;
   const auto nbins          = nbins1 * nbins2 * nbins3;
   const auto inv_bin_size_x = FLT(1.0 / bin_size_x);
   const auto inv_bin_size_y = FLT(1.0 / bin_size_y);
   const auto inv_bin_size_z = FLT(1.0 / bin_size_z);
+
+  static constexpr auto avx_width = xsimd::batch<FLT>::size;
+  const auto regular_part         = M & (-avx_width);
+
   // count how many pts in each bin
-  std::vector<BIGINT> counts(nbins, 0);
+  std::vector<UBIGINT> counts(nbins, 0);
+
+  static constexpr auto to_array = [](const auto bins) noexcept {
+    using contained_t = typename decltype(bins)::value_type;
+    alignas(alignment) std::array<contained_t, decltype(bins)::size> result{};
+    bins.store_aligned(result.data());
+    return result;
+  };
+
+  static constexpr auto to_uint = [](const xsimd::batch<FLT> bins) noexcept {
+    return xsimd::batch_cast<xsimd::as_unsigned_integer_t<FLT>>(bins);
+  };
+
+  const auto compute_bins = [=](auto... args) constexpr noexcept {
+    std::array<const FLT *, sizeof...(args)> k_arr = {args...};
+    auto bins                                      = xsimd::floor(
+        fold_rescale_vec(xsimd::load_unaligned(k_arr[0]), N1) * inv_bin_size_x);
+    if constexpr (sizeof...(args) > 1) {
+      const auto i2 = xsimd::floor(
+          fold_rescale_vec(xsimd::load_unaligned(k_arr[1]), N2) * inv_bin_size_y);
+      bins = xsimd::fma(decltype(bins)(nbins1), i2, bins);
+    }
+    if constexpr (sizeof...(args) > 2) {
+      const auto i3 = xsimd::floor(
+          fold_rescale_vec(xsimd::load_unaligned(k_arr[2]), N3) * inv_bin_size_z);
+      bins = xsimd::fma(decltype(bins)(nbins12), i3, bins);
+    }
+    return to_uint(bins);
+  };
+
+  const auto increment_bins = [&counts](const auto bins) constexpr noexcept {
+    const auto bin_array = to_array(bins);
+    for (const auto bin : bin_array) {
+      ++counts[bin];
+    }
+  };
+
+  const auto accumulate_bins = [&counts, &ret](const auto bins,
+                                               const auto i) constexpr noexcept {
+    const auto bin_array = to_array(bins);
+    for (uint8_t j{0}; j < avx_width; ++j) {
+      const auto bin = bin_array[j];
+      // fill the inverse map on the fly, careful of indexes errors
+      ret[counts[bin]] = i + j;
+      ++counts[bin];
+    }
+  };
 
-  for (auto i = 0; i < M; i++) {
+  UBIGINT i{0};
+  if (iskz) {
+    for (; i < regular_part; i += avx_width) {
+      increment_bins(compute_bins(kx + i, ky + i, kz + i));
+    }
+  } else if (isky) {
+    for (; i < regular_part; i += avx_width) {
+      increment_bins(compute_bins(kx + i, ky + i));
+    }
+  } else {
+    for (; i < regular_part; i += avx_width) {
+      increment_bins(compute_bins(kx + i));
+    }
+  }
+
+  for (; i < M; ++i) {
     // find the bin index in however many dims are needed
-    const auto i1  = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x);
-    const auto i2  = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0;
-    const auto i3  = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0;
+    const auto i1  = UBIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x);
+    const auto i2  = isky ? UBIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0;
+    const auto i3  = iskz ? UBIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0;
     const auto bin = i1 + nbins1 * (i2 + nbins2 * i3);
     ++counts[bin];
   }
 
   // compute the offsets directly in the counts array (no offset array)
-  BIGINT current_offset = 0;
-  for (BIGINT i = 0; i < nbins; i++) {
-    BIGINT tmp = counts[i];
-    counts[i]  = current_offset; // Reinecke's cute replacement of counts[i]
-    current_offset += tmp;
+  UBIGINT current_offset{0}; // Reinecke's cute replacement of counts[i]
+  for (i = 0; i < nbins; ++i) {
+    counts[i] = std::exchange(current_offset, current_offset + counts[i]);
   } // (counts now contains the index offsets for each bin)
 
-  for (auto i = 0; i < M; i++) {
+  i = 0; // we need to redo the loop so variable should be zeroed here
+  if (iskz) {
+    for (; i < regular_part; i += avx_width) {
+      accumulate_bins(compute_bins(kx + i, ky + i, kz + i), i);
+    }
+  } else if (isky) {
+    for (; i < regular_part; i += avx_width) {
+      accumulate_bins(compute_bins(kx + i, ky + i), i);
+    }
+  } else {
+    for (; i < regular_part; i += avx_width) {
+      accumulate_bins(compute_bins(kx + i), i);
+    }
+  }
+
+  for (; i < M; ++i) {
     // find the bin index (again! but better than using RAM)
-    const auto i1    = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x);
-    const auto i2    = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0;
-    const auto i3    = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0;
+    const auto i1    = UBIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x);
+    const auto i2    = isky ? UBIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0;
+    const auto i3    = iskz ? UBIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0;
     const auto bin   = i1 + nbins1 * (i2 + nbins2 * i3);
-    ret[counts[bin]] = BIGINT(i); // fill the inverse map on the fly
-    ++counts[bin];                // update the offsets
+    ret[counts[bin]] = i; // fill the inverse map on the fly
+    ++counts[bin];        // update the offsets
   }
 }
 
@@ -1881,5 +2038,17 @@ void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset
     break;
   }
 }
+
+// since xsimd is not constexpr this slows down the loop due to the guard variables
+// hence moved them here
+
+FINUFFT_ALWAYS_INLINE xsimd::batch<FLT> fold_rescale_vec(const xsimd::batch<FLT> x,
+                                                         const BIGINT N) {
+  const xsimd::batch<FLT> x2pi{FLT(M_1_2PI)};
+  const xsimd::batch<FLT> half{FLT(0.5)};
+  auto result = xsimd::fma(x, x2pi, half);
+  result -= xsimd::floor(result);
+  return result * FLT(N);
+}
 } // namespace
 } // namespace finufft::spreadinterp