Improve manual partition loop unrolling #28

Merged · 3 commits · Nov 19, 2023
src/quicksort.rs · 86 additions, 48 deletions
@@ -193,6 +193,64 @@ impl<T> StablePartitionTypeImpl for T {
 }
 }
 
+struct PartitionState<T> {
+    // The current element being looked at, scans left to right through the slice.
+    scan: *const T,
+    // Counts the number of elements that compared less-than, also works around:
+    // https://github.com/rust-lang/rust/issues/117128
+    num_lt: usize,
+    // Reverse scratch output pointer.
+    scratch_rev: *mut T,
+}
+
+/// This construct works around a couple of issues with auto unrolling as well as manual unrolling.
+/// Auto unrolling, as tested with rustc 1.75, is somewhat run-time and binary-size inefficient,
+/// because it performs additional math to calculate the loop end, which we can avoid by
+/// precomputing the loop end. Auto unrolling also only happens on x86 but not on Arm, where doing
+/// so for the Firestorm micro-architecture yields a 15+% performance improvement. Manual unrolling
+/// via repeated code has a large negative impact on debug compile-times, and unrolling via
+/// `for _ in 0..UNROLL_LEN` has a 10-20% perf penalty when compiling with `opt-level=s`, which is
+/// deemed unacceptable for such a crucial component of the sort implementation.
+trait UnrollHelper: Sized {
+    const UNROLL_LEN: usize;
+
+    unsafe fn unrolled_loop_body<F: FnMut(&mut PartitionState<Self>)>(
+        loop_body: F,
+        state: &mut PartitionState<Self>,
+    );
+}
+
+impl<T> UnrollHelper for T {
+    default const UNROLL_LEN: usize = 2;
+
+    #[inline(always)]
+    default unsafe fn unrolled_loop_body<F: FnMut(&mut PartitionState<T>)>(
+        mut loop_body: F,
+        state: &mut PartitionState<T>,
+    ) {
+        loop_body(state);
+        loop_body(state);
+    }
+}
+
+impl<T> UnrollHelper for T
+where
+    (): crate::IsTrue<{ mem::size_of::<T>() <= 8 }>,
+{
+    const UNROLL_LEN: usize = 4;
+
+    #[inline(always)]
+    unsafe fn unrolled_loop_body<F: FnMut(&mut PartitionState<T>)>(
+        mut loop_body: F,
+        state: &mut PartitionState<T>,
+    ) {
+        loop_body(state);
+        loop_body(state);
+        loop_body(state);
+        loop_body(state);
+    }
+}
+
 /// Specialization for small types, through traits to not invoke compile time
 /// penalties for loop unrolling when not used.
 ///
@@ -202,8 +260,8 @@ impl<T> StablePartitionTypeImpl for T {
 /// store. And explicit loop unrolling is also often very beneficial.
 impl<T> StablePartitionTypeImpl for T
 where
-    T: Copy,
-    (): crate::IsTrue<{ mem::size_of::<T>() <= (mem::size_of::<u64>() * 2) }>,
+    T: Copy + crate::Freeze,
+    (): crate::IsTrue<{ mem::size_of::<T>() <= 16 }>,
 {
     /// See [`StablePartitionTypeImpl::partition_fill_scratch`].
     fn partition_fill_scratch<F: FnMut(&T, &T) -> bool>(
@@ -229,57 +287,37 @@ where
             // naive loop unrolling where the exact same loop body is just
             // repeated.
             let pivot = v_base.add(pivot_pos);
-            let mut pivot_in_scratch = ptr::null_mut();
-            let mut num_lt = 0;
-            let mut scratch_rev = scratch_base.add(len);
-            macro_rules! loop_body {
-                ($i:expr) => {
-                    let scan = v_base.add($i);
-                    scratch_rev = scratch_rev.sub(1);
-
-                    let is_less_than_pivot = is_less(&*scan, &*pivot);
-                    ptr::copy_nonoverlapping(scan, scratch_base.add(num_lt), 1);
-                    ptr::copy_nonoverlapping(scan, scratch_rev.add(num_lt), 1);
-
-                    // Save pivot location in scratch for later.
-                    if const { crate::has_direct_interior_mutability::<T>() }
-                        && intrinsics::unlikely(scan as *const T == pivot)
-                    {
-                        pivot_in_scratch = if is_less_than_pivot {
-                            scratch_base.add(num_lt) // i + num_lt
-                        } else {
-                            scratch_rev.add(num_lt) // len - (i + 1) + num_lt = len - 1 - num_ge
-                        };
-                    }
-
-                    num_lt += is_less_than_pivot as usize;
-                };
-            }
-
-            /// To ensure good performance across platforms we explicitly unroll using a
-            /// fixed-size inner loop. We do not simply call the loop body multiple times as
-            /// this increases compile times significantly more, and the compiler unrolls
-            /// a fixed loop just as well, if it is sensible.
-            const UNROLL_LEN: usize = 4;
-            let mut offset = 0;
-            for _ in 0..(len / UNROLL_LEN) {
-                for unroll_i in 0..UNROLL_LEN {
-                    loop_body!(offset + unroll_i);
-                }
-                offset += UNROLL_LEN;
-            }
+            let mut loop_body = |state: &mut PartitionState<T>| {
+                state.scratch_rev = state.scratch_rev.sub(1);
+
+                let is_less_than_pivot = is_less(&*state.scan, &*pivot);
+                ptr::copy_nonoverlapping(state.scan, scratch_base.add(state.num_lt), 1);
+                ptr::copy_nonoverlapping(state.scan, state.scratch_rev.add(state.num_lt), 1);
+
+                state.num_lt += is_less_than_pivot as usize;
+                state.scan = state.scan.add(1);
+            };
+
+            let mut state = PartitionState {
+                scan: v_base,
+                num_lt: 0,
+                scratch_rev: scratch_base.add(len),
+            };
+
+            // We do not simply call the loop body multiple times as this increases compile
+            // times significantly more, and the compiler unrolls a fixed loop just as well, if it
+            // is sensible.
+            let unroll_end = v_base.add(len - (T::UNROLL_LEN - 1));
+            while state.scan < unroll_end {
+                T::unrolled_loop_body(&mut loop_body, &mut state);
+            }
 
-            for i in 0..(len % UNROLL_LEN) {
-                loop_body!(offset + i);
+            while state.scan < v_base.add(len) {
+                loop_body(&mut state);
             }
 
-            // SAFETY: if T has interior mutability our copy in scratch can be
-            // outdated, update it.
-            if const { crate::has_direct_interior_mutability::<T>() } {
-                ptr::copy_nonoverlapping(pivot, pivot_in_scratch, 1);
-            }
-
-            num_lt
+            state.num_lt
         }
     }
 }
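
A note on the dispatch introduced above: `UNROLL_LEN` and `unrolled_loop_body` are selected per type at compile time via the nightly `specialization` feature (the `default` items) combined with `generic_const_exprs` (the `IsTrue` size bound). Below is a minimal, self-contained sketch of the same pattern, with the pointer-based state and the `unsafe` dropped for brevity; the `IsTrue` definition is assumed to match the crate's helper, which is not shown in this diff.

```rust
#![feature(specialization, generic_const_exprs)]
#![allow(incomplete_features)]

use std::mem;

// Assumed shape of the crate's `IsTrue` helper: the bound
// `(): IsTrue<{ cond }>` is only satisfiable when `cond` is `true`.
trait IsTrue<const B: bool> {}
impl IsTrue<true> for () {}

trait UnrollHelper: Sized {
    const UNROLL_LEN: usize;
    fn unrolled_loop_body<F: FnMut()>(loop_body: F);
}

// Fallback for larger types: unroll twice.
impl<T> UnrollHelper for T {
    default const UNROLL_LEN: usize = 2;

    #[inline(always)]
    default fn unrolled_loop_body<F: FnMut()>(mut loop_body: F) {
        loop_body();
        loop_body();
    }
}

// Types of at most 8 bytes get a 4x unroll.
impl<T> UnrollHelper for T
where
    (): IsTrue<{ mem::size_of::<T>() <= 8 }>,
{
    const UNROLL_LEN: usize = 4;

    #[inline(always)]
    fn unrolled_loop_body<F: FnMut()>(mut loop_body: F) {
        loop_body();
        loop_body();
        loop_body();
        loop_body();
    }
}

fn main() {
    assert_eq!(<u64 as UnrollHelper>::UNROLL_LEN, 4);
    assert_eq!(<[u64; 4] as UnrollHelper>::UNROLL_LEN, 2);

    let mut calls = 0;
    <u64 as UnrollHelper>::unrolled_loop_body(|| calls += 1);
    assert_eq!(calls, 4);
}
```

Because the unroll factor is a monomorphized constant, the hot loop needs no per-iteration size checks, and `unroll_end` can be precomputed as in the diff.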
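
As for what the partition loop computes: each scanned element is unconditionally copied to two candidate positions in the scratch buffer, one filling the less-than region from the front in stable order, the other filling the greater-or-equal region from the back in reversed scan order, and only the `num_lt` counter depends on the comparison result, which keeps the loop branchless. The tightened `T: Copy + crate::Freeze` bound is what lets this impl drop the old `pivot_in_scratch` bookkeeping: without interior mutability the comparison cannot mutate elements, so the copies in scratch cannot go stale. Here is a safe-Rust sketch of the index arithmetic, using a hypothetical helper name; the real implementation does the same through raw pointers.

```rust
// Hypothetical stand-in for `partition_fill_scratch`: fills `scratch` so that
// `scratch[..num_lt]` holds the elements less than `pivot` in their original
// order and `scratch[num_lt..len]` holds the rest in reversed scan order.
fn partition_fill_scratch_sketch<T: Copy + Ord>(v: &[T], pivot: T, scratch: &mut [T]) -> usize {
    let len = v.len();
    assert!(scratch.len() >= len);

    let mut num_lt = 0; // elements seen so far that compared less-than
    for (i, &elem) in v.iter().enumerate() {
        // Front candidate: next free slot of the less-than region.
        scratch[num_lt] = elem;
        // Back candidate: mirrors `scratch_rev.add(num_lt)` in the diff,
        // i.e. scratch_base + len - 1 - i + num_lt.
        scratch[len - 1 - i + num_lt] = elem;
        // Branchless advance: the comparison feeds only this counter.
        num_lt += (elem < pivot) as usize;
    }
    num_lt
}

fn main() {
    let v = [5, 1, 9, 3, 7];
    let mut scratch = [0; 5];
    let num_lt = partition_fill_scratch_sketch(&v, 5, &mut scratch);
    assert_eq!(num_lt, 2);
    assert_eq!(scratch, [1, 3, 7, 9, 5]); // [1, 3] in order, then [7, 9, 5] reversed
}
```

Exactly one of the two writes per element survives in the final state, so no branch is needed to pick a destination, which is the point of the scheme.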