From 59d110e19b89ec1fd4aff443e8e9e8c154c56f62 Mon Sep 17 00:00:00 2001
From: Lukas Bergdoll
Date: Sun, 19 Nov 2023 14:28:29 +0100
Subject: [PATCH 1/2] Perform single copy + pointer select instead of double
 write for small types

A previous optimization is no longer deemed worth it: new test results
and understanding suggest that a single call to ptr::copy_nonoverlapping
per element is more efficient for large inputs that don't fit into the
last-level cache, as well as for types like f128.
---
 src/quicksort.rs | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/quicksort.rs b/src/quicksort.rs
index 2ce8d4e..0214008 100644
--- a/src/quicksort.rs
+++ b/src/quicksort.rs
@@ -165,11 +165,12 @@ impl<T> StablePartitionTypeImpl for T {
             scratch_rev = scratch_rev.sub(1);
 
             let is_less_than_pivot = is_less(&*scan, &*pivot);
-            let dst = if is_less_than_pivot {
-                scratch_base.add(num_lt) // i + num_lt
+            let dst_base = if is_less_than_pivot {
+                scratch_base // i + num_lt
             } else {
-                scratch_rev.add(num_lt) // len - (i + 1) + num_lt = len - 1 - num_ge
+                scratch_rev // len - (i + 1) + num_lt = len - 1 - num_ge
             };
+            let dst = dst_base.add(num_lt);
 
             // Save pivot location in scratch for later.
             if const { crate::has_direct_interior_mutability::<T>() }
@@ -292,8 +293,12 @@ where
             state.scratch_rev = state.scratch_rev.sub(1);
 
             let is_less_than_pivot = is_less(&*state.scan, &*pivot);
-            ptr::copy_nonoverlapping(state.scan, scratch_base.add(state.num_lt), 1);
-            ptr::copy_nonoverlapping(state.scan, state.scratch_rev.add(state.num_lt), 1);
+            let dst_base = if is_less_than_pivot {
+                scratch_base // i + num_lt
+            } else {
+                state.scratch_rev // len - (i + 1) + num_lt = len - 1 - num_ge
+            };
+            ptr::copy_nonoverlapping(state.scan, dst_base.add(state.num_lt), 1);
 
             state.num_lt += is_less_than_pivot as usize;
             state.scan = state.scan.add(1);

From c48d058712677bf117c28cb6aa9d14b3abdd8cdc Mon Sep 17 00:00:00 2001
From: Lukas Bergdoll
Date: Tue, 21 Nov 2023 21:31:03 +0100
Subject: [PATCH 2/2] Update comments

---
 src/quicksort.rs | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/src/quicksort.rs b/src/quicksort.rs
index 0214008..63f96bb 100644
--- a/src/quicksort.rs
+++ b/src/quicksort.rs
@@ -254,11 +254,6 @@ where
 
 /// Specialization for small types, through traits to not invoke compile time
 /// penalties for loop unrolling when not used.
-///
-/// Benchmarks show that for small types simply storing to *both* potential
-/// destinations is more efficient than a conditional store. It is also less at
-/// risk of having the compiler generating a branch instead of conditional
-/// store. And explicit loop unrolling is also often very beneficial.
 impl<T> StablePartitionTypeImpl for T
 where
     T: Copy + crate::Freeze,
@@ -280,13 +275,8 @@ where
         }
 
         unsafe {
-            // SAFETY: exactly the same invariants and logic as the
-            // non-specialized impl. The conditional store being replaced by a
-            // double copy changes nothing, on all but the final iteration the
-            // bad write will simply be overwritten by a later iteration, and on
-            // the final iteration we write to the same index twice. And we do
-            // naive loop unrolling where the exact same loop body is just
-            // repeated.
+            // SAFETY: exactly the same invariants and logic as the non-specialized impl. And we do
+            // naive loop unrolling where the exact same loop body is just repeated.
             let pivot = v_base.add(pivot_pos);
 
             let mut loop_body = |state: &mut PartitionState<T>| {
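
As an illustration of the scheme the first patch switches to, below is a
minimal, self-contained sketch: one ptr::copy_nonoverlapping per element,
with the destination chosen by a pointer select. The function
stable_partition_sketch and its signature are invented for this note; the
real partition in src/quicksort.rs additionally handles panic safety,
interior mutability, the pivot copy and explicit loop unrolling, all
omitted here.

use core::ptr;

/// Copies `v` into `scratch` so that elements less than `pivot` land at the
/// front (in scan order) and the rest fill the back from the last index
/// downwards, returning how many elements compared less than `pivot`.
fn stable_partition_sketch<T: Copy>(
    v: &[T],
    scratch: &mut [T],
    pivot: &T,
    is_less: impl Fn(&T, &T) -> bool,
) -> usize {
    assert!(scratch.len() >= v.len());

    let scratch_base = scratch.as_mut_ptr();
    let mut num_lt = 0;

    // SAFETY: every write lands inside `scratch`. On iteration i the front
    // cursor writes index num_lt <= i and the back cursor writes index
    // len - 1 - num_ge >= len - 1 - i, and the two never collide because
    // num_lt + num_ge == i.
    unsafe {
        // On iteration i, `scratch_rev` points at scratch_base.add(len - (i + 1)).
        let mut scratch_rev = scratch_base.add(v.len());

        for elem in v {
            scratch_rev = scratch_rev.sub(1);
            let is_less_than_pivot = is_less(elem, pivot);

            // Pointer select: both arms are side-effect free, so this tends
            // to compile to a conditional move rather than a branch, and only
            // a single copy is issued per element.
            let dst_base = if is_less_than_pivot {
                scratch_base // dst index: num_lt
            } else {
                scratch_rev // dst index: len - 1 - num_ge
            };
            ptr::copy_nonoverlapping(elem, dst_base.add(num_lt), 1);

            num_lt += is_less_than_pivot as usize;
        }
    }

    num_lt
}

For example, with v = [5, 1, 8, 2] and pivot 4, scratch becomes [1, 2, 8, 5]
and the function returns 2: the less-than elements keep their scan order at
the front, while the rest end up in reverse scan order at the back, which the
real implementation compensates for when copying back to preserve stability.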