Improve manual partition loop unrolling #28

Merged · 3 commits · Nov 19, 2023
src/quicksort.rs · 86 additions, 48 deletions
@@ -193,6 +193,64 @@ impl<T> StablePartitionTypeImpl for T {
 }
 }
 
+struct PartitionState<T> {
+    // The current element being looked at, scans left to right through the slice.
+    scan: *const T,
+    // Counts the number of elements that compared less-than, also works around:
+    // https://github.com/rust-lang/rust/issues/117128
+    num_lt: usize,
+    // Reverse scratch output pointer.
+    scratch_rev: *mut T,
+}
+
+/// This construct works around a couple of issues with auto unrolling as well as manual unrolling.
+/// Auto unrolling, as tested with rustc 1.75, is somewhat run-time and binary-size inefficient,
+/// because it performs additional math to calculate the loop end, which we can avoid by
+/// precomputing the loop end. Auto unrolling also only happens on x86 but not on Arm, where doing
+/// so for the Firestorm micro-architecture yields a 15+% performance improvement. Manual unrolling
+/// via repeated code has a large negative impact on debug compile-times, and unrolling via
+/// `for _ in 0..UNROLL_LEN` has a 10-20% perf penalty when compiling with `opt-level=s`, which is
+/// deemed unacceptable for such a crucial component of the sort implementation.
+trait UnrollHelper: Sized {
+    const UNROLL_LEN: usize;
+
+    unsafe fn unrolled_loop_body<F: FnMut(&mut PartitionState<Self>)>(
+        loop_body: F,
+        state: &mut PartitionState<Self>,
+    );
+}
+
+impl<T> UnrollHelper for T {
+    default const UNROLL_LEN: usize = 2;
+
+    #[inline(always)]
+    default unsafe fn unrolled_loop_body<F: FnMut(&mut PartitionState<T>)>(
+        mut loop_body: F,
+        state: &mut PartitionState<T>,
+    ) {
+        loop_body(state);
+        loop_body(state);
+    }
+}
+
+impl<T> UnrollHelper for T
+where
+    (): crate::IsTrue<{ mem::size_of::<T>() <= 8 }>,
+{
+    const UNROLL_LEN: usize = 4;
+
+    #[inline(always)]
+    unsafe fn unrolled_loop_body<F: FnMut(&mut PartitionState<T>)>(
+        mut loop_body: F,
+        state: &mut PartitionState<T>,
+    ) {
+        loop_body(state);
+        loop_body(state);
+        loop_body(state);
+        loop_body(state);
+    }
+}
+
 /// Specialization for small types, through traits to not invoke compile time
 /// penalties for loop unrolling when not used.
 ///
@@ -202,8 +260,8 @@ impl<T> StablePartitionTypeImpl for T {
 /// store. And explicit loop unrolling is also often very beneficial.
 impl<T> StablePartitionTypeImpl for T
 where
-    T: Copy,
-    (): crate::IsTrue<{ mem::size_of::<T>() <= (mem::size_of::<u64>() * 2) }>,
+    T: Copy + crate::Freeze,
+    (): crate::IsTrue<{ mem::size_of::<T>() <= 16 }>,
 {
     /// See [`StablePartitionTypeImpl::partition_fill_scratch`].
     fn partition_fill_scratch<F: FnMut(&T, &T) -> bool>(
@@ -229,57 +287,37 @@ where
             // naive loop unrolling where the exact same loop body is just
             // repeated.
             let pivot = v_base.add(pivot_pos);
-            let mut pivot_in_scratch = ptr::null_mut();
-            let mut num_lt = 0;
-            let mut scratch_rev = scratch_base.add(len);
-            macro_rules! loop_body {
-                ($i:expr) => {
-                    let scan = v_base.add($i);
-                    scratch_rev = scratch_rev.sub(1);
-
-                    let is_less_than_pivot = is_less(&*scan, &*pivot);
-                    ptr::copy_nonoverlapping(scan, scratch_base.add(num_lt), 1);
-                    ptr::copy_nonoverlapping(scan, scratch_rev.add(num_lt), 1);
-
-                    // Save pivot location in scratch for later.
-                    if const { crate::has_direct_interior_mutability::<T>() }
-                        && intrinsics::unlikely(scan as *const T == pivot)
-                    {
-                        pivot_in_scratch = if is_less_than_pivot {
-                            scratch_base.add(num_lt) // i + num_lt
-                        } else {
-                            scratch_rev.add(num_lt) // len - (i + 1) + num_lt = len - 1 - num_ge
-                        };
-                    }
-
-                    num_lt += is_less_than_pivot as usize;
-                };
-            }
-
-            /// To ensure good performance across platforms we explicitly unroll using a
-            /// fixed-size inner loop. We do not simply call the loop body multiple times as
-            /// this increases compile times significantly more, and the compiler unrolls
-            /// a fixed loop just as well, if it is sensible.
-            const UNROLL_LEN: usize = 4;
-            let mut offset = 0;
-            for _ in 0..(len / UNROLL_LEN) {
-                for unroll_i in 0..UNROLL_LEN {
-                    loop_body!(offset + unroll_i);
-                }
-                offset += UNROLL_LEN;
-            }
+            let mut loop_body = |state: &mut PartitionState<T>| {
+                state.scratch_rev = state.scratch_rev.sub(1);
+
+                let is_less_than_pivot = is_less(&*state.scan, &*pivot);
+                ptr::copy_nonoverlapping(state.scan, scratch_base.add(state.num_lt), 1);
+                ptr::copy_nonoverlapping(state.scan, state.scratch_rev.add(state.num_lt), 1);
+
+                state.num_lt += is_less_than_pivot as usize;
+                state.scan = state.scan.add(1);
+            };
+
+            let mut state = PartitionState {
+                scan: v_base,
+                num_lt: 0,
+                scratch_rev: scratch_base.add(len),
+            };
+
+            // We do not simply call the loop body multiple times as this increases compile
+            // times significantly more, and the compiler unrolls a fixed loop just as well, if it
+            // is sensible.
+            let unroll_end = v_base.add(len - (T::UNROLL_LEN - 1));
+            while state.scan < unroll_end {
+                T::unrolled_loop_body(&mut loop_body, &mut state);
+            }
 
-            for i in 0..(len % UNROLL_LEN) {
-                loop_body!(offset + i);
+            while state.scan < v_base.add(len) {
+                loop_body(&mut state);
             }
 
-            // SAFETY: if T has interior mutability our copy in scratch can be
-            // outdated, update it.
-            if const { crate::has_direct_interior_mutability::<T>() } {
-                ptr::copy_nonoverlapping(pivot, pivot_in_scratch, 1);
-            }
-
-            num_lt
+            state.num_lt
         }
     }
 }
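
A note on the dispatch introduced above: `UNROLL_LEN` and `unrolled_loop_body` are selected per type at compile time via the nightly `specialization` feature (the `default` items) combined with `generic_const_exprs` (the `IsTrue` size bound). Below is a minimal, self-contained sketch of the same pattern, with the pointer-based state and the `unsafe` dropped for brevity; the `IsTrue` definition is assumed to match the crate's helper, which is not shown in this diff.

```rust
#![feature(specialization, generic_const_exprs)]
#![allow(incomplete_features)]

use std::mem;

// Assumed shape of the crate's `IsTrue` helper: the bound
// `(): IsTrue<{ cond }>` is only satisfiable when `cond` is `true`.
trait IsTrue<const B: bool> {}
impl IsTrue<true> for () {}

trait UnrollHelper: Sized {
    const UNROLL_LEN: usize;
    fn unrolled_loop_body<F: FnMut()>(loop_body: F);
}

// Fallback for larger types: unroll twice.
impl<T> UnrollHelper for T {
    default const UNROLL_LEN: usize = 2;

    #[inline(always)]
    default fn unrolled_loop_body<F: FnMut()>(mut loop_body: F) {
        loop_body();
        loop_body();
    }
}

// Types of at most 8 bytes get a 4x unroll.
impl<T> UnrollHelper for T
where
    (): IsTrue<{ mem::size_of::<T>() <= 8 }>,
{
    const UNROLL_LEN: usize = 4;

    #[inline(always)]
    fn unrolled_loop_body<F: FnMut()>(mut loop_body: F) {
        loop_body();
        loop_body();
        loop_body();
        loop_body();
    }
}

fn main() {
    assert_eq!(<u64 as UnrollHelper>::UNROLL_LEN, 4);
    assert_eq!(<[u64; 4] as UnrollHelper>::UNROLL_LEN, 2);

    let mut calls = 0;
    <u64 as UnrollHelper>::unrolled_loop_body(|| calls += 1);
    assert_eq!(calls, 4);
}
```

Because the unroll factor is a monomorphized constant, the hot loop needs no per-iteration size checks, and `unroll_end` can be precomputed as in the diff.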
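
As for what the partition loop computes: each scanned element is unconditionally copied to two candidate positions in the scratch buffer, one filling the less-than region from the front in stable order, the other filling the greater-or-equal region from the back in reversed scan order, and only the `num_lt` counter depends on the comparison result, which keeps the loop branchless. The tightened `T: Copy + crate::Freeze` bound is what lets this impl drop the old `pivot_in_scratch` bookkeeping: without interior mutability the comparison cannot mutate elements, so the copies in scratch cannot go stale. Here is a safe-Rust sketch of the index arithmetic, using a hypothetical helper name; the real implementation does the same through raw pointers.

```rust
// Hypothetical stand-in for `partition_fill_scratch`: fills `scratch` so that
// `scratch[..num_lt]` holds the elements less than `pivot` in their original
// order and `scratch[num_lt..len]` holds the rest in reversed scan order.
fn partition_fill_scratch_sketch<T: Copy + Ord>(v: &[T], pivot: T, scratch: &mut [T]) -> usize {
    let len = v.len();
    assert!(scratch.len() >= len);

    let mut num_lt = 0; // elements seen so far that compared less-than
    for (i, &elem) in v.iter().enumerate() {
        // Front candidate: next free slot of the less-than region.
        scratch[num_lt] = elem;
        // Back candidate: mirrors `scratch_rev.add(num_lt)` in the diff,
        // i.e. scratch_base + len - 1 - i + num_lt.
        scratch[len - 1 - i + num_lt] = elem;
        // Branchless advance: the comparison feeds only this counter.
        num_lt += (elem < pivot) as usize;
    }
    num_lt
}

fn main() {
    let v = [5, 1, 9, 3, 7];
    let mut scratch = [0; 5];
    let num_lt = partition_fill_scratch_sketch(&v, 5, &mut scratch);
    assert_eq!(num_lt, 2);
    assert_eq!(scratch, [1, 3, 7, 9, 5]); // [1, 3] in order, then [7, 9, 5] reversed
}
```

Exactly one of the two writes per element survives in the final state, so no branch is needed to pick a destination, which is the point of the scheme.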