From 59d110e19b89ec1fd4aff443e8e9e8c154c56f62 Mon Sep 17 00:00:00 2001
From: Lukas Bergdoll
Date: Sun, 19 Nov 2023 14:28:29 +0100
Subject: [PATCH 1/2] Perform single copy + pointer select instead of double
 write for small types

A previous optimization is no longer deemed worth it: new test results
and understanding suggest that a single call to ptr::copy_nonoverlapping
per element is more efficient for large inputs that don't fit into the
last-level cache, as well as for types like f128.
---
 src/quicksort.rs | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/quicksort.rs b/src/quicksort.rs
index 2ce8d4e..0214008 100644
--- a/src/quicksort.rs
+++ b/src/quicksort.rs
@@ -165,11 +165,12 @@ impl<T> StablePartitionTypeImpl for T {
             scratch_rev = scratch_rev.sub(1);
 
             let is_less_than_pivot = is_less(&*scan, &*pivot);
-            let dst = if is_less_than_pivot {
-                scratch_base.add(num_lt) // i + num_lt
+            let dst_base = if is_less_than_pivot {
+                scratch_base // i + num_lt
             } else {
-                scratch_rev.add(num_lt) // len - (i + 1) + num_lt = len - 1 - num_ge
+                scratch_rev // len - (i + 1) + num_lt = len - 1 - num_ge
             };
+            let dst = dst_base.add(num_lt);
 
             // Save pivot location in scratch for later.
             if const { crate::has_direct_interior_mutability::<T>() }
@@ -292,8 +293,12 @@ where
             state.scratch_rev = state.scratch_rev.sub(1);
 
             let is_less_than_pivot = is_less(&*state.scan, &*pivot);
-            ptr::copy_nonoverlapping(state.scan, scratch_base.add(state.num_lt), 1);
-            ptr::copy_nonoverlapping(state.scan, state.scratch_rev.add(state.num_lt), 1);
+            let dst_base = if is_less_than_pivot {
+                scratch_base // i + num_lt
+            } else {
+                state.scratch_rev // len - (i + 1) + num_lt = len - 1 - num_ge
+            };
+            ptr::copy_nonoverlapping(state.scan, dst_base.add(state.num_lt), 1);
 
             state.num_lt += is_less_than_pivot as usize;
             state.scan = state.scan.add(1);

From c48d058712677bf117c28cb6aa9d14b3abdd8cdc Mon Sep 17 00:00:00 2001
From: Lukas Bergdoll
Date: Tue, 21 Nov 2023 21:31:03 +0100
Subject: [PATCH 2/2] Update comments

---
 src/quicksort.rs | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/src/quicksort.rs b/src/quicksort.rs
index 0214008..63f96bb 100644
--- a/src/quicksort.rs
+++ b/src/quicksort.rs
@@ -254,11 +254,6 @@ where
 
 /// Specialization for small types, through traits to not invoke compile time
 /// penalties for loop unrolling when not used.
-///
-/// Benchmarks show that for small types simply storing to *both* potential
-/// destinations is more efficient than a conditional store. It is also less at
-/// risk of having the compiler generating a branch instead of conditional
-/// store. And explicit loop unrolling is also often very beneficial.
 impl<T> StablePartitionTypeImpl for T
 where
     T: Copy + crate::Freeze,
@@ -280,13 +275,8 @@ where
         }
 
         unsafe {
-            // SAFETY: exactly the same invariants and logic as the
-            // non-specialized impl. The conditional store being replaced by a
-            // double copy changes nothing, on all but the final iteration the
-            // bad write will simply be overwritten by a later iteration, and on
-            // the final iteration we write to the same index twice. And we do
-            // naive loop unrolling where the exact same loop body is just
-            // repeated.
+            // SAFETY: exactly the same invariants and logic as the non-specialized impl. And we do
+            // naive loop unrolling where the exact same loop body is just repeated.
             let pivot = v_base.add(pivot_pos);
 
             let mut loop_body = |state: &mut PartitionState<T>| {
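
As an illustration of the scheme the first patch switches to, below is a
minimal, self-contained sketch: one ptr::copy_nonoverlapping per element,
with the destination chosen by a pointer select. The function
stable_partition_sketch and its signature are invented for this note; the
real partition in src/quicksort.rs additionally handles panic safety,
interior mutability, the pivot copy and explicit loop unrolling, all
omitted here.

use core::ptr;

/// Copies `v` into `scratch` so that elements less than `pivot` land at the
/// front (in scan order) and the rest fill the back from the last index
/// downwards, returning how many elements compared less than `pivot`.
fn stable_partition_sketch<T: Copy>(
    v: &[T],
    scratch: &mut [T],
    pivot: &T,
    is_less: impl Fn(&T, &T) -> bool,
) -> usize {
    assert!(scratch.len() >= v.len());

    let scratch_base = scratch.as_mut_ptr();
    let mut num_lt = 0;

    // SAFETY: every write lands inside `scratch`. On iteration i the front
    // cursor writes index num_lt <= i and the back cursor writes index
    // len - 1 - num_ge >= len - 1 - i, and the two never collide because
    // num_lt + num_ge == i.
    unsafe {
        // On iteration i, `scratch_rev` points at scratch_base.add(len - (i + 1)).
        let mut scratch_rev = scratch_base.add(v.len());

        for elem in v {
            scratch_rev = scratch_rev.sub(1);
            let is_less_than_pivot = is_less(elem, pivot);

            // Pointer select: both arms are side-effect free, so this tends
            // to compile to a conditional move rather than a branch, and only
            // a single copy is issued per element.
            let dst_base = if is_less_than_pivot {
                scratch_base // dst index: num_lt
            } else {
                scratch_rev // dst index: len - 1 - num_ge
            };
            ptr::copy_nonoverlapping(elem, dst_base.add(num_lt), 1);

            num_lt += is_less_than_pivot as usize;
        }
    }

    num_lt
}

For example, with v = [5, 1, 8, 2] and pivot 4, scratch becomes [1, 2, 8, 5]
and the function returns 2: the less-than elements keep their scan order at
the front, while the rest end up in reverse scan order at the back, which the
real implementation compensates for when copying back to preserve stability.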