diff --git a/src/quicksort.rs b/src/quicksort.rs
index ee0cab3..2ce8d4e 100644
--- a/src/quicksort.rs
+++ b/src/quicksort.rs
@@ -193,6 +193,64 @@ impl<T> StablePartitionTypeImpl for T {
     }
 }
 
+struct PartitionState<T> {
+    // The current element that is being looked at, scans left to right through slice.
+    scan: *const T,
+    // Counts the number of elements that compared less-than, also works around:
+    // https://github.com/rust-lang/rust/issues/117128
+    num_lt: usize,
+    // Reverse scratch output pointer.
+    scratch_rev: *mut T,
+}
+
+/// This construct works around a couple of issues with auto unrolling as well as manual unrolling.
+/// Auto unrolling as tested with rustc 1.75 is somewhat run-time and binary-size inefficient,
+/// because it performs additional math to calculate the loop end, which we can avoid by
+/// precomputing the loop end. Also auto unrolling only happens on x86 but not on Arm where doing so
+/// for the Firestorm micro-architecture yields a 15+% performance improvement. Manual unrolling via
+/// repeated code has a large negative impact on debug compile-times, and unrolling via `for _ in
+/// 0..UNROLL_LEN` has a 10-20% perf penalty when compiling with `opt-level=s` which is deemed
+/// unacceptable for such a crucial component of the sort implementation.
+trait UnrollHelper: Sized {
+    const UNROLL_LEN: usize;
+
+    unsafe fn unrolled_loop_body<F: FnMut(&mut PartitionState<Self>)>(
+        loop_body: F,
+        state: &mut PartitionState<Self>,
+    );
+}
+
+impl<T> UnrollHelper for T {
+    default const UNROLL_LEN: usize = 2;
+
+    #[inline(always)]
+    default unsafe fn unrolled_loop_body<F: FnMut(&mut PartitionState<T>)>(
+        mut loop_body: F,
+        state: &mut PartitionState<T>,
+    ) {
+        loop_body(state);
+        loop_body(state);
+    }
+}
+
+impl<T> UnrollHelper for T
+where
+    (): crate::IsTrue<{ mem::size_of::<T>() <= 8 }>,
+{
+    const UNROLL_LEN: usize = 4;
+
+    #[inline(always)]
+    unsafe fn unrolled_loop_body<F: FnMut(&mut PartitionState<T>)>(
+        mut loop_body: F,
+        state: &mut PartitionState<T>,
+    ) {
+        loop_body(state);
+        loop_body(state);
+        loop_body(state);
+        loop_body(state);
+    }
+}
+
 /// Specialization for small types, through traits to not invoke compile time
 /// penalties for loop unrolling when not used.
 ///
@@ -202,8 +260,8 @@ impl<T> StablePartitionTypeImpl for T {
 /// store. And explicit loop unrolling is also often very beneficial.
 impl<T> StablePartitionTypeImpl for T
 where
-    T: Copy,
-    (): crate::IsTrue<{ mem::size_of::<T>() <= (mem::size_of::<usize>() * 2) }>,
+    T: Copy + crate::Freeze,
+    (): crate::IsTrue<{ mem::size_of::<T>() <= 16 }>,
 {
     /// See [`StablePartitionTypeImpl::partition_fill_scratch`].
     fn partition_fill_scratch<F: FnMut(&T, &T) -> bool>(
@@ -229,57 +287,37 @@ where
             // naive loop unrolling where the exact same loop body is just
             // repeated.
             let pivot = v_base.add(pivot_pos);
-            let mut pivot_in_scratch = ptr::null_mut();
-            let mut num_lt = 0;
-            let mut scratch_rev = scratch_base.add(len);
-
-            macro_rules! loop_body {
-                ($i:expr) => {
-                    let scan = v_base.add($i);
-                    scratch_rev = scratch_rev.sub(1);
-
-                    let is_less_than_pivot = is_less(&*scan, &*pivot);
-                    ptr::copy_nonoverlapping(scan, scratch_base.add(num_lt), 1);
-                    ptr::copy_nonoverlapping(scan, scratch_rev.add(num_lt), 1);
-
-                    // Save pivot location in scratch for later.
-                    if const { crate::has_direct_interior_mutability::<T>() }
-                        && intrinsics::unlikely(scan as *const T == pivot)
-                    {
-                        pivot_in_scratch = if is_less_than_pivot {
-                            scratch_base.add(num_lt) // i + num_lt
-                        } else {
-                            scratch_rev.add(num_lt) // len - (i + 1) + num_lt = len - 1 - num_ge
-                        };
-                    }
-
-                    num_lt += is_less_than_pivot as usize;
-                };
-            }
-
-            /// To ensure good performance across platforms we explicitly unroll using a
-            /// fixed-size inner loop. We do not simply call the loop body multiple times as
-            /// this increases compile times significantly more, and the compiler unrolls
-            /// a fixed loop just as well, if it is sensible.
-            const UNROLL_LEN: usize = 4;
-            let mut offset = 0;
-            for _ in 0..(len / UNROLL_LEN) {
-                for unroll_i in 0..UNROLL_LEN {
-                    loop_body!(offset + unroll_i);
-                }
-                offset += UNROLL_LEN;
+            let mut loop_body = |state: &mut PartitionState<T>| {
+                state.scratch_rev = state.scratch_rev.sub(1);
+
+                let is_less_than_pivot = is_less(&*state.scan, &*pivot);
+                ptr::copy_nonoverlapping(state.scan, scratch_base.add(state.num_lt), 1);
+                ptr::copy_nonoverlapping(state.scan, state.scratch_rev.add(state.num_lt), 1);
+
+                state.num_lt += is_less_than_pivot as usize;
+                state.scan = state.scan.add(1);
+            };
+
+            let mut state = PartitionState {
+                scan: v_base,
+                num_lt: 0,
+                scratch_rev: scratch_base.add(len),
+            };
+
+            // We do not simply call the loop body multiple times as this increases compile
+            // times significantly more, and the compiler unrolls a fixed loop just as well, if it
+            // is sensible.
+            let unroll_end = v_base.add(len - (T::UNROLL_LEN - 1));
+            while state.scan < unroll_end {
+                T::unrolled_loop_body(&mut loop_body, &mut state);
             }
 
-            for i in 0..(len % UNROLL_LEN) {
-                loop_body!(offset + i);
+            while state.scan < v_base.add(len) {
+                loop_body(&mut state);
             }
 
-            // SAFETY: if T has interior mutability our copy in scratch can be
-            // outdated, update it.
-            if const { crate::has_direct_interior_mutability::<T>() } {
-                ptr::copy_nonoverlapping(pivot, pivot_in_scratch, 1);
-            }
-
-            num_lt
+            state.num_lt
         }
     }
 }
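
Note on the scheme: the new closure copies every scanned element unconditionally to both the next free slot at the front of the scratch buffer and the next free slot counted from its back, and only `num_lt` decides which copy survives, so no store in the partition bookkeeping depends on the comparison result. The added `crate::Freeze` bound excludes types with interior mutability, which is what allows the removed `pivot_in_scratch` fixup to be dropped. Below is a minimal safe sketch of that dual-write partition, using indices and a plain `Vec` instead of raw pointers and uninitialized scratch memory; the function name and the pre-filled scratch buffer are illustrative assumptions, not part of the patch.

fn partition_fill_scratch_sketch<T: Copy, F: FnMut(&T, &T) -> bool>(
    v: &[T],
    pivot: &T,
    is_less: &mut F,
) -> (Vec<T>, usize) {
    let len = v.len();
    if len == 0 {
        return (Vec::new(), 0);
    }

    // Pre-filled so safe indexing can be used; the real implementation writes
    // into uninitialized scratch memory instead.
    let mut scratch = vec![v[0]; len];

    let mut num_lt = 0; // elements seen so far that compared less-than
    let mut scratch_rev = len; // reverse output cursor, moves right to left

    for &elem in v {
        scratch_rev -= 1;

        let is_less_than_pivot = is_less(&elem, pivot);
        // Write unconditionally to both candidate slots; exactly one survives.
        scratch[num_lt] = elem;
        scratch[scratch_rev + num_lt] = elem;

        // Only the less-than cursor advances; nothing branches on the result.
        num_lt += is_less_than_pivot as usize;
    }

    // scratch[..num_lt]: less-than elements in scan order.
    // scratch[num_lt..]: remaining elements in reverse scan order.
    (scratch, num_lt)
}

For example, `partition_fill_scratch_sketch(&[3, 1, 4, 1], &2, &mut |a, b| a < b)` returns `([1, 1, 4, 3], 2)`: the two less-than elements sit in scan order at the front, and the remaining elements follow in reverse scan order.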