Avoid large allocations for transition range after insertion sort
Also includes misc cleanup that improves code-quality and compile-times.
Voultapher committed Mar 6, 2024
1 parent d86d21a commit 8f6a50e
Showing 3 changed files with 85 additions and 71 deletions.
90 changes: 57 additions & 33 deletions src/drift.rs
@@ -3,9 +3,10 @@ use core::intrinsics;
use core::mem::MaybeUninit;

use crate::merge::merge;
use crate::smallsort::SmallSortTypeImpl;
use crate::smallsort::{
insertion_sort_shift_left, SmallSortTypeImpl, MAX_LEN_ALWAYS_INSERTION_SORT,
};
use crate::stable_quicksort;
use crate::DriftsortRun;

// Lazy logical runs as in Glidesort.
#[inline(always)]
@@ -119,15 +120,13 @@ pub fn sort<T, F: FnMut(&T, &T) -> bool>(
// It's important to have a relatively high entry barrier for pre-sorted runs, as the presence
// of a single such run will force on average several merge operations and shrink the maximum
// quicksort size a lot. For that reason we use sqrt(len) as our pre-sorted run threshold, with
// SMALL_SORT_THRESHOLD as the lower limit. When eagerly sorting we also use
// SMALL_SORT_THRESHOLD as our threshold, as we will call small_sort on any runs smaller than
// this.
let min_good_run_len =
if eager_sort || len <= (T::SMALL_SORT_THRESHOLD * T::SMALL_SORT_THRESHOLD) {
T::SMALL_SORT_THRESHOLD
} else {
sqrt_approx(len)
};
// SMALL_SORT_THRESHOLD as the lower limit. When eagerly sorting we ignore this threshold as we
// will call insertion sort directly to create runs.
let min_good_run_len = if len <= (T::SMALL_SORT_THRESHOLD * T::SMALL_SORT_THRESHOLD) {
T::SMALL_SORT_THRESHOLD
} else {
sqrt_approx(len)
};

// (stack_len, runs, desired_depths) together form a stack maintaining run
// information for the powersort heuristic. desired_depths[i] is the desired
@@ -147,13 +146,7 @@ pub fn sort<T, F: FnMut(&T, &T) -> bool>(
// with root-level desired depth to fully collapse the merge tree.
let (next_run, desired_depth);
if scan_idx < len {
next_run = create_run(
&mut v[scan_idx..],
scratch,
min_good_run_len,
eager_sort,
is_less,
);
next_run = create_run(&mut v[scan_idx..], min_good_run_len, eager_sort, is_less);
desired_depth = merge_tree_depth(
scan_idx - prev_run.len(),
scan_idx,
@@ -212,13 +205,12 @@ pub fn sort<T, F: FnMut(&T, &T) -> bool>(

/// Creates a new logical run.
///
/// A logical run can either be sorted or unsorted. If there is a pre-existing
/// run of length min_good_run_len (or longer) starting at v[0] we find and
/// return it, otherwise we return a run of length min_good_run_len that is
/// eagerly sorted when eager_sort is true, and left unsorted otherwise.
/// A logical run can either be sorted or unsorted. If there is a pre-existing run of length
/// min_good_run_len or longer starting at v[0] we find and return it, otherwise we return a run of
/// length MAX_LEN_ALWAYS_INSERTION_SORT or longer that is eagerly sorted when eager_sort is true,
/// and left unsorted otherwise.
fn create_run<T, F: FnMut(&T, &T) -> bool>(
v: &mut [T],
scratch: &mut [MaybeUninit<T>],
min_good_run_len: usize,
eager_sort: bool,
is_less: &mut F,
@@ -230,23 +222,28 @@ fn create_run<T, F: FnMut(&T, &T) -> bool>(
intrinsics::assume(run_len <= v.len());
}

if run_len >= min_good_run_len {
let full_sorted = run_len >= min_good_run_len;

if full_sorted || eager_sort {
if was_reversed {
v[..run_len].reverse();
}
}

if full_sorted {
DriftsortRun::new_sorted(run_len)
} else {
let new_run_len = cmp::min(min_good_run_len, v.len());

if eager_sort {
// When eager sorting min_good_run_len = T::SMALL_SORT_THRESHOLD,
// which will make stable_quicksort immediately call smallsort. By
// not calling the smallsort directly here it can always be inlined
// into the quicksort itself, making the recursive base case faster.
crate::quicksort::stable_quicksort(&mut v[..new_run_len], scratch, 0, None, is_less);
DriftsortRun::new_sorted(new_run_len)
let mut eager_run_len = run_len;
if run_len < MAX_LEN_ALWAYS_INSERTION_SORT {
let new_run_len = cmp::min(MAX_LEN_ALWAYS_INSERTION_SORT, v.len());
insertion_sort_shift_left(&mut v[..new_run_len], run_len, is_less);
eager_run_len = new_run_len;
}

DriftsortRun::new_sorted(eager_run_len)
} else {
DriftsortRun::new_unsorted(new_run_len)
DriftsortRun::new_unsorted(cmp::min(min_good_run_len, v.len()))
}
}
}
@@ -280,3 +277,30 @@ fn find_existing_run<T, F: FnMut(&T, &T) -> bool>(v: &[T], is_less: &mut F) -> (
(run_len, strictly_descending)
}
}

/// Compactly stores the length of a run, and whether or not it is sorted. This
/// can always fit in a usize because the maximum slice length is isize::MAX.
#[derive(Copy, Clone)]
struct DriftsortRun(usize);

impl DriftsortRun {
#[inline(always)]
fn new_sorted(length: usize) -> Self {
Self((length << 1) | 1)
}

#[inline(always)]
fn new_unsorted(length: usize) -> Self {
Self((length << 1) | 0)
}

#[inline(always)]
fn sorted(self) -> bool {
self.0 & 1 == 1
}

#[inline(always)]
fn len(self) -> usize {
self.0 >> 1
}
}
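For reference, a minimal standalone sketch of the bit-packing used by the relocated DriftsortRun above; it mirrors the struct shown in the diff rather than reproducing the crate verbatim, and the name PackedRun is purely illustrative:

// Bit 0 stores the sorted flag, the remaining bits store the run length.
// A length always fits because slice lengths are at most isize::MAX.
#[derive(Copy, Clone)]
struct PackedRun(usize);

impl PackedRun {
    fn new_sorted(len: usize) -> Self {
        Self((len << 1) | 1)
    }

    fn new_unsorted(len: usize) -> Self {
        Self(len << 1)
    }

    fn sorted(self) -> bool {
        self.0 & 1 == 1
    }

    fn len(self) -> usize {
        self.0 >> 1
    }
}

fn main() {
    let sorted_run = PackedRun::new_sorted(20);
    assert!(sorted_run.sorted());
    assert_eq!(sorted_run.len(), 20);

    let unsorted_run = PackedRun::new_unsorted(100);
    assert!(!unsorted_run.sorted());
    assert_eq!(unsorted_run.len(), 100);
}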
61 changes: 24 additions & 37 deletions src/lib.rs
@@ -23,32 +23,7 @@ mod pivot;
mod quicksort;
mod smallsort;

/// Compactly stores the length of a run, and whether or not it is sorted. This
/// can always fit in a usize because the maximum slice length is isize::MAX.
#[derive(Copy, Clone)]
struct DriftsortRun(usize);

impl DriftsortRun {
#[inline(always)]
pub fn new_sorted(length: usize) -> Self {
Self((length << 1) | 1)
}

#[inline(always)]
pub fn new_unsorted(length: usize) -> Self {
Self((length << 1) | 0)
}

#[inline(always)]
pub fn sorted(self) -> bool {
self.0 & 1 == 1
}

#[inline(always)]
pub fn len(self) -> usize {
self.0 >> 1
}
}
use crate::smallsort::MAX_LEN_ALWAYS_INSERTION_SORT;

#[inline(always)]
pub fn sort<T: Ord>(v: &mut [T]) {
@@ -85,7 +60,7 @@ fn driftsort<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_les
// modern processors is very valuable, and for a single sort call in general
// purpose code any gains from an advanced method are cancelled by icache
// misses during the sort, and thrashing the icache for surrounding code.
const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;

if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
smallsort::insertion_sort_shift_left(v, 1, is_less);
return;
@@ -105,23 +80,35 @@ fn driftsort_main<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], i
// sized inputs while scaling down to len / 2 for larger inputs. We need at
// least len / 2 for our stable merging routine.
const MAX_FULL_ALLOC_BYTES: usize = 8_000_000;

let len = v.len();
let full_alloc_size = cmp::min(len, MAX_FULL_ALLOC_BYTES / mem::size_of::<T>());

let alloc_size = cmp::max(
cmp::max(len / 2, full_alloc_size),
crate::smallsort::MIN_SMALL_SORT_SCRATCH_LEN,
);
// Using the hybrid quick + merge-sort has performance issues with the transition from insertion
// sort to the main loop. A more gradual and smoother transition can be had by using an always
// eager merge approach, as long as it can be served by at most 3 merges. Another factor that
// affects transition performance is the allocation size. The small-sort has a lower scratch limit
// of `MIN_SMALL_SORT_SCRATCH_LEN`, e.g. 48, which is much larger than what, for example, an input
// of length 21 needs, where a scratch buffer of length 10 suffices. Especially fully ascending
// and descending inputs in the range `20..100` whose elements are a bit larger than a `u64`,
// e.g. `String`, spend most of their time doing the allocation. An alternative would be lazy
// scratch allocation, but that comes with its own set of complex tradeoffs and makes the code
// substantially more complex.
let eager_sort = len <= MAX_LEN_ALWAYS_INSERTION_SORT * 4;
let len_div_2 = len / 2;

let alloc_size = if eager_sort {
len_div_2
} else {
// There are local checks that ensure that it would abort if the thresholds were tweaked in a
// way that makes this value ever smaller than `MIN_SMALL_SORT_SCRATCH_LEN`.
let full_alloc_size = cmp::min(len, MAX_FULL_ALLOC_BYTES / mem::size_of::<T>());

cmp::max(len_div_2, full_alloc_size)
};

let mut buf = BufT::with_capacity(alloc_size);
let scratch_slice =
unsafe { slice::from_raw_parts_mut(buf.mut_ptr() as *mut MaybeUninit<T>, buf.capacity()) };

// Using the hybrid quick + merge-sort has performance issues with the transition from insertion
// sort to main loop. A more gradual and smoother transition can be had by using an always eager
// merge approach as long as it can be served by a single merge.
use crate::smallsort::SmallSortTypeImpl;
let eager_sort = len <= T::SMALL_SORT_THRESHOLD * 2;
drift::sort(v, scratch_slice, eager_sort, is_less);
}
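To make the sizing above concrete, here is a rough standalone sketch of the new scratch-length computation; the constants are taken from the diff, the helper name scratch_len is illustrative, and the real code additionally routes the buffer through BufT:

use std::cmp;
use std::mem;

const MAX_FULL_ALLOC_BYTES: usize = 8_000_000;
const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;

// Mirrors the sizing logic above: inputs in the eager-merge transition range only
// need len / 2 scratch elements, while larger inputs get a full-length allocation
// that is scaled back towards len / 2 once it would exceed MAX_FULL_ALLOC_BYTES.
fn scratch_len<T>(len: usize) -> usize {
    let eager_sort = len <= MAX_LEN_ALWAYS_INSERTION_SORT * 4;
    let len_div_2 = len / 2;

    if eager_sort {
        len_div_2
    } else {
        let full_alloc_size = cmp::min(len, MAX_FULL_ALLOC_BYTES / mem::size_of::<T>());
        cmp::max(len_div_2, full_alloc_size)
    }
}

fn main() {
    // A 40-element `String` input now allocates 20 scratch slots, instead of being
    // rounded up to the previous lower bound of MIN_SMALL_SORT_SCRATCH_LEN (e.g. 48).
    assert_eq!(scratch_len::<String>(40), 20);

    // Large inputs still get a full-size scratch buffer while it fits the byte cap.
    assert_eq!(scratch_len::<u64>(100_000), 100_000);
}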

5 changes: 4 additions & 1 deletion src/smallsort.rs
@@ -37,7 +37,7 @@ impl<T> SmallSortTypeImpl for T {
}
}

pub const MIN_SMALL_SORT_SCRATCH_LEN: usize = i32::SMALL_SORT_THRESHOLD + 16;
// const MIN_SMALL_SORT_SCRATCH_LEN: usize = i32::SMALL_SORT_THRESHOLD + 16;

impl<T: crate::Freeze> SmallSortTypeImpl for T {
// From a comparison perspective 20, would be ~2% more efficient for fully random input, but a
@@ -198,6 +198,9 @@ unsafe fn insert_tail<T, F: FnMut(&T, &T) -> bool>(begin: *mut T, tail: *mut T,
}
}

// Going beyond this is algorithmically inefficient.
pub(crate) const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;

/// Sort `v` assuming `v[..offset]` is already sorted.
pub fn insertion_sort_shift_left<T, F: FnMut(&T, &T) -> bool>(
v: &mut [T],
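As a rough illustration of why run creation caps eager insertion sorting at MAX_LEN_ALWAYS_INSERTION_SORT, here is a simplified, safe sketch of an insertion sort with a pre-sorted prefix; the crate's insertion_sort_shift_left is a more optimized, unsafe implementation, and this sketch only conveys the idea:

// Simplified, safe sketch: v[..offset] is assumed sorted, and the remaining elements
// are inserted one at a time. Worst-case cost is O(n^2) comparisons, which is why
// eager run creation caps it at MAX_LEN_ALWAYS_INSERTION_SORT = 20 elements.
fn insertion_sort_shift_left_sketch<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    offset: usize,
    is_less: &mut F,
) {
    for i in offset..v.len() {
        let mut j = i;
        // Shift v[i] left until the prefix v[..=i] is sorted again.
        while j > 0 && is_less(&v[j], &v[j - 1]) {
            v.swap(j, j - 1);
            j -= 1;
        }
    }
}

fn main() {
    let mut v = vec![1, 3, 7, 2, 9, 4];
    // The first three elements are already sorted.
    insertion_sort_shift_left_sketch(&mut v, 3, &mut |a, b| a < b);
    assert_eq!(v, [1, 2, 3, 4, 7, 9]);
}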
