From 8f6a50e8a418c94cb2a947598ccaa04443d34d88 Mon Sep 17 00:00:00 2001
From: Lukas Bergdoll
Date: Wed, 6 Mar 2024 18:03:22 +0100
Subject: [PATCH] Avoid large allocations for transition range after insertion
 sort

Also includes misc cleanup that improves code quality and compile times.
---
 src/drift.rs     | 90 ++++++++++++++++++++++++++++++------------------
 src/lib.rs       | 61 +++++++++++++------------------
 src/smallsort.rs |  5 ++-
 3 files changed, 85 insertions(+), 71 deletions(-)

diff --git a/src/drift.rs b/src/drift.rs
index af816da..135c5ac 100644
--- a/src/drift.rs
+++ b/src/drift.rs
@@ -3,9 +3,10 @@ use core::intrinsics;
 use core::mem::MaybeUninit;
 
 use crate::merge::merge;
-use crate::smallsort::SmallSortTypeImpl;
+use crate::smallsort::{
+    insertion_sort_shift_left, SmallSortTypeImpl, MAX_LEN_ALWAYS_INSERTION_SORT,
+};
 use crate::stable_quicksort;
-use crate::DriftsortRun;
 
 // Lazy logical runs as in Glidesort.
 #[inline(always)]
@@ -119,15 +120,13 @@ pub fn sort<T, F: FnMut(&T, &T) -> bool>(
     // It's important to have a relatively high entry barrier for pre-sorted runs, as the presence
     // of a single such run will force on average several merge operations and shrink the maximum
     // quicksort size a lot. For that reason we use sqrt(len) as our pre-sorted run threshold, with
-    // SMALL_SORT_THRESHOLD as the lower limit. When eagerly sorting we also use
-    // SMALL_SORT_THRESHOLD as our threshold, as we will call small_sort on any runs smaller than
-    // this.
-    let min_good_run_len =
-        if eager_sort || len <= (T::SMALL_SORT_THRESHOLD * T::SMALL_SORT_THRESHOLD) {
-            T::SMALL_SORT_THRESHOLD
-        } else {
-            sqrt_approx(len)
-        };
+    // SMALL_SORT_THRESHOLD as the lower limit. When eagerly sorting we no longer lower this
+    // threshold, as we will call insertion sort directly to create runs.
+    let min_good_run_len = if len <= (T::SMALL_SORT_THRESHOLD * T::SMALL_SORT_THRESHOLD) {
+        T::SMALL_SORT_THRESHOLD
+    } else {
+        sqrt_approx(len)
+    };
 
     // (stack_len, runs, desired_depths) together form a stack maintaining run
     // information for the powersort heuristic. desired_depths[i] is the desired
@@ -147,13 +146,7 @@ pub fn sort<T, F: FnMut(&T, &T) -> bool>(
         // with root-level desired depth to fully collapse the merge tree.
         let (next_run, desired_depth);
         if scan_idx < len {
-            next_run = create_run(
-                &mut v[scan_idx..],
-                scratch,
-                min_good_run_len,
-                eager_sort,
-                is_less,
-            );
+            next_run = create_run(&mut v[scan_idx..], min_good_run_len, eager_sort, is_less);
             desired_depth = merge_tree_depth(
                 scan_idx - prev_run.len(),
                 scan_idx,
@@ -212,13 +205,12 @@ pub fn sort<T, F: FnMut(&T, &T) -> bool>(
 
 /// Creates a new logical run.
 ///
-/// A logical run can either be sorted or unsorted. If there is a pre-existing
-/// run of length min_good_run_len (or longer) starting at v[0] we find and
-/// return it, otherwise we return a run of length min_good_run_len that is
-/// eagerly sorted when eager_sort is true, and left unsorted otherwise.
+/// A logical run can either be sorted or unsorted. If there is a pre-existing run of length
+/// min_good_run_len or longer starting at v[0] we find and return it. Otherwise, when eager_sort
+/// is true we return a run of length MAX_LEN_ALWAYS_INSERTION_SORT or longer that has been sorted
+/// with insertion sort, and when it is false we return an unsorted run of length min_good_run_len.
 fn create_run<T, F: FnMut(&T, &T) -> bool>(
     v: &mut [T],
-    scratch: &mut [MaybeUninit<T>],
     min_good_run_len: usize,
     eager_sort: bool,
     is_less: &mut F,
@@ -230,23 +222,28 @@ fn create_run<T, F: FnMut(&T, &T) -> bool>(
         intrinsics::assume(run_len <= v.len());
     }
 
-    if run_len >= min_good_run_len {
+    let full_sorted = run_len >= min_good_run_len;
+
+    if full_sorted || eager_sort {
         if was_reversed {
             v[..run_len].reverse();
         }
+    }
+
+    if full_sorted {
         DriftsortRun::new_sorted(run_len)
     } else {
-        let new_run_len = cmp::min(min_good_run_len, v.len());
         if eager_sort {
-            // When eager sorting min_good_run_len = T::SMALL_SORT_THRESHOLD,
-            // which will make stable_quicksort immediately call smallsort. By
-            // not calling the smallsort directly here it can always be inlined
-            // into the quicksort itself, making the recursive base case faster.
-            crate::quicksort::stable_quicksort(&mut v[..new_run_len], scratch, 0, None, is_less);
-            DriftsortRun::new_sorted(new_run_len)
+            let mut eager_run_len = run_len;
+            if run_len < MAX_LEN_ALWAYS_INSERTION_SORT {
+                let new_run_len = cmp::min(MAX_LEN_ALWAYS_INSERTION_SORT, v.len());
+                insertion_sort_shift_left(&mut v[..new_run_len], run_len, is_less);
+                eager_run_len = new_run_len;
+            }
+
+            DriftsortRun::new_sorted(eager_run_len)
         } else {
-            DriftsortRun::new_unsorted(new_run_len)
+            DriftsortRun::new_unsorted(cmp::min(min_good_run_len, v.len()))
        }
     }
 }
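For illustration, here is a standalone sketch of the eager-run logic the hunk above introduces: a short pre-existing sorted prefix is grown to MAX_LEN_ALWAYS_INSERTION_SORT elements with an insertion sort that keeps the prefix intact. The helper eager_run_len and the inlined swap-based insertion sort are hypothetical stand-ins for the crate's insertion_sort_shift_left; only the constant mirrors the patch.

// Illustrative only: mirrors the shape of the eager branch in create_run.
const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;

// Grow the already sorted prefix `v[..sorted_prefix_len]` into a sorted run of
// at least MAX_LEN_ALWAYS_INSERTION_SORT elements (or all of `v` if shorter),
// returning the resulting run length.
fn eager_run_len<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    sorted_prefix_len: usize,
    is_less: &mut F,
) -> usize {
    if sorted_prefix_len >= MAX_LEN_ALWAYS_INSERTION_SORT {
        return sorted_prefix_len;
    }

    let new_run_len = core::cmp::min(MAX_LEN_ALWAYS_INSERTION_SORT, v.len());
    // Simple safe stand-in for insertion_sort_shift_left: insert each element
    // after the sorted prefix by swapping it leftwards into position, which
    // preserves stability because only adjacent out-of-order pairs are swapped.
    for i in sorted_prefix_len.max(1)..new_run_len {
        let mut j = i;
        while j > 0 && is_less(&v[j], &v[j - 1]) {
            v.swap(j, j - 1);
            j -= 1;
        }
    }
    new_run_len
}

In the patch itself the run found by find_existing_run serves as the sorted prefix, so the insertion sort only has to place the handful of elements that follow it.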
@@ -280,3 +277,30 @@ fn find_existing_run<T, F: FnMut(&T, &T) -> bool>(v: &[T], is_less: &mut F) -> (
         (run_len, strictly_descending)
     }
 }
+
+/// Compactly stores the length of a run, and whether or not it is sorted. This
+/// can always fit in a usize because the maximum slice length is isize::MAX.
+#[derive(Copy, Clone)]
+struct DriftsortRun(usize);
+
+impl DriftsortRun {
+    #[inline(always)]
+    fn new_sorted(length: usize) -> Self {
+        Self((length << 1) | 1)
+    }
+
+    #[inline(always)]
+    fn new_unsorted(length: usize) -> Self {
+        Self((length << 1) | 0)
+    }
+
+    #[inline(always)]
+    fn sorted(self) -> bool {
+        self.0 & 1 == 1
+    }
+
+    #[inline(always)]
+    fn len(self) -> usize {
+        self.0 >> 1
+    }
+}
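The DriftsortRun helper moved above packs a run's length and its sortedness flag into a single usize: the length lives in the upper bits, the flag in bit 0, and the shift is lossless because slice lengths never exceed isize::MAX. A minimal round-trip check, with the methods condensed from the hunk above and an illustrative main harness of my own:

#[derive(Copy, Clone)]
struct DriftsortRun(usize);

impl DriftsortRun {
    fn new_sorted(length: usize) -> Self {
        Self((length << 1) | 1)
    }
    fn new_unsorted(length: usize) -> Self {
        Self(length << 1)
    }
    fn sorted(self) -> bool {
        self.0 & 1 == 1
    }
    fn len(self) -> usize {
        self.0 >> 1
    }
}

fn main() {
    // length <= isize::MAX leaves the top bit of usize free for the shift.
    let a = DriftsortRun::new_sorted(8);
    let b = DriftsortRun::new_unsorted(13);
    assert!(a.sorted() && a.len() == 8);
    assert!(!b.sorted() && b.len() == 13);
}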
diff --git a/src/lib.rs b/src/lib.rs
index 3a4d59b..8f1711c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -23,32 +23,7 @@ mod pivot;
 mod quicksort;
 mod smallsort;
 
-/// Compactly stores the length of a run, and whether or not it is sorted. This
-/// can always fit in a usize because the maximum slice length is isize::MAX.
-#[derive(Copy, Clone)]
-struct DriftsortRun(usize);
-
-impl DriftsortRun {
-    #[inline(always)]
-    pub fn new_sorted(length: usize) -> Self {
-        Self((length << 1) | 1)
-    }
-
-    #[inline(always)]
-    pub fn new_unsorted(length: usize) -> Self {
-        Self((length << 1) | 0)
-    }
-
-    #[inline(always)]
-    pub fn sorted(self) -> bool {
-        self.0 & 1 == 1
-    }
-
-    #[inline(always)]
-    pub fn len(self) -> usize {
-        self.0 >> 1
-    }
-}
+use crate::smallsort::MAX_LEN_ALWAYS_INSERTION_SORT;
 
 #[inline(always)]
 pub fn sort<T: Ord>(v: &mut [T]) {
@@ -85,7 +60,7 @@ fn driftsort<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_les
     // modern processors is very valuable, and for a single sort call in general
     // purpose code any gains from an advanced method are cancelled by icache
     // misses during the sort, and thrashing the icache for surrounding code.
-    const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
+
     if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
         smallsort::insertion_sort_shift_left(v, 1, is_less);
         return;
@@ -105,23 +80,35 @@ fn driftsort_main<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], i
     // sized inputs while scaling down to len / 2 for larger inputs. We need at
     // least len / 2 for our stable merging routine.
     const MAX_FULL_ALLOC_BYTES: usize = 8_000_000;
+    let len = v.len();
 
-    let full_alloc_size = cmp::min(len, MAX_FULL_ALLOC_BYTES / mem::size_of::<T>());
-    let alloc_size = cmp::max(
-        cmp::max(len / 2, full_alloc_size),
-        crate::smallsort::MIN_SMALL_SORT_SCRATCH_LEN,
-    );
+    // Using the hybrid quick + merge-sort has performance issues with the transition from
+    // insertion sort to the main loop. A more gradual and smoother transition can be had by using
+    // an always-eager merge approach, as long as it can be served by at most 3 merges. Another
+    // factor that affects transition performance is the allocation size. The small-sort has a
+    // scratch lower limit of `MIN_SMALL_SORT_SCRATCH_LEN`, e.g. 48, which is much larger than what
+    // an input of e.g. length 21 needs, where a scratch buffer of length 10 suffices. Fully
+    // ascending and descending inputs in the range `20..100` of types a bit larger than a `u64`,
+    // e.g. `String`, spend most of their time doing the allocation. An alternative would be lazy
+    // scratch allocation, but that comes with its own set of tradeoffs and makes the code
+    // substantially more complex.
+    let eager_sort = len <= MAX_LEN_ALWAYS_INSERTION_SORT * 4;
+    let len_div_2 = len / 2;
+
+    let alloc_size = if eager_sort {
+        len_div_2
+    } else {
+        // There are local checks that ensure we abort if the thresholds are tweaked in a way that
+        // makes this value ever smaller than `MIN_SMALL_SORT_SCRATCH_LEN`.
+        let full_alloc_size = cmp::min(len, MAX_FULL_ALLOC_BYTES / mem::size_of::<T>());
+
+        cmp::max(len_div_2, full_alloc_size)
+    };
 
     let mut buf = BufT::with_capacity(alloc_size);
     let scratch_slice = unsafe {
         slice::from_raw_parts_mut(buf.mut_ptr() as *mut MaybeUninit<T>, buf.capacity())
     };
 
-    // Using the hybrid quick + merge-sort has performance issues with the transition from insertion
-    // sort to main loop. A more gradual and smoother transition can be had by using an always eager
-    // merge approach as long as it can be served by a single merge.
-    use crate::smallsort::SmallSortTypeImpl;
-    let eager_sort = len <= T::SMALL_SORT_THRESHOLD * 2;
     drift::sort(v, scratch_slice, eager_sort, is_less);
 }
diff --git a/src/smallsort.rs b/src/smallsort.rs
index 8382378..342e254 100644
--- a/src/smallsort.rs
+++ b/src/smallsort.rs
@@ -37,7 +37,7 @@ impl SmallSortTypeImpl for T {
     }
 }
 
-pub const MIN_SMALL_SORT_SCRATCH_LEN: usize = i32::SMALL_SORT_THRESHOLD + 16;
+// const MIN_SMALL_SORT_SCRATCH_LEN: usize = i32::SMALL_SORT_THRESHOLD + 16;
 
 impl SmallSortTypeImpl for T {
     // From a comparison perspective 20, would be ~2% more efficient for fully random input, but a
@@ -198,6 +198,9 @@ unsafe fn insert_tail<T, F: FnMut(&T, &T) -> bool>(begin: *mut T, tail: *mut T,
     }
 }
 
+// Going beyond this is algorithmically inefficient.
+pub(crate) const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
+
 /// Sort `v` assuming `v[..offset]` is already sorted.
 pub fn insertion_sort_shift_left<T, F: FnMut(&T, &T) -> bool>(
     v: &mut [T],
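To make the new allocation policy concrete, here is a standalone sketch of how driftsort_main now sizes its scratch buffer. The two constants mirror the patch; scratch_len_for and the main harness are hypothetical. Inputs short enough for the eager merge path get only the len / 2 elements the stable merge needs, everything else gets up to a full-length buffer capped at roughly 8 MB.

use std::cmp;
use std::mem;

// Constants mirroring the patch; the sizing function itself is illustrative.
const MAX_FULL_ALLOC_BYTES: usize = 8_000_000;
const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;

fn scratch_len_for<T>(len: usize) -> usize {
    // Up to 4 insertion-sort-sized runs (at most 3 merges) are handled eagerly
    // and only need the half-length buffer required by the stable merge routine.
    let eager_sort = len <= MAX_LEN_ALWAYS_INSERTION_SORT * 4;
    let len_div_2 = len / 2;

    if eager_sort {
        len_div_2
    } else {
        // Prefer a full-length buffer, but cap the allocation at
        // MAX_FULL_ALLOC_BYTES and never go below len / 2.
        let full_alloc_size = cmp::min(len, MAX_FULL_ALLOC_BYTES / mem::size_of::<T>());
        cmp::max(len_div_2, full_alloc_size)
    }
}

fn main() {
    // 64 elements fall in the eager range: a 32-element buffer suffices.
    assert_eq!(scratch_len_for::<String>(64), 32);
    // 1_000_000 u64s fit under the 8 MB cap, so they get a full-length buffer.
    assert_eq!(scratch_len_for::<u64>(1_000_000), 1_000_000);
}

Under the old sizing, a 21-element input would still have allocated MIN_SMALL_SORT_SCRATCH_LEN (48) scratch elements; with the patch the eager path needs only 10, which is the allocation behavior the commit message refers to.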