From 8f6a50e8a418c94cb2a947598ccaa04443d34d88 Mon Sep 17 00:00:00 2001
From: Lukas Bergdoll
Date: Wed, 6 Mar 2024 18:03:22 +0100
Subject: [PATCH] Avoid large allocations for transition range after insertion
 sort

Also includes misc cleanup that improves code quality and compile times.
---
 src/drift.rs     | 90 ++++++++++++++++++++++++++++++------------------
 src/lib.rs       | 61 +++++++++++++------------------
 src/smallsort.rs |  5 ++-
 3 files changed, 85 insertions(+), 71 deletions(-)

diff --git a/src/drift.rs b/src/drift.rs
index af816da..135c5ac 100644
--- a/src/drift.rs
+++ b/src/drift.rs
@@ -3,9 +3,10 @@ use core::intrinsics;
 use core::mem::MaybeUninit;
 
 use crate::merge::merge;
-use crate::smallsort::SmallSortTypeImpl;
+use crate::smallsort::{
+    insertion_sort_shift_left, SmallSortTypeImpl, MAX_LEN_ALWAYS_INSERTION_SORT,
+};
 use crate::stable_quicksort;
-use crate::DriftsortRun;
 
 // Lazy logical runs as in Glidesort.
 #[inline(always)]
@@ -119,15 +120,13 @@ pub fn sort<T, F: FnMut(&T, &T) -> bool>(
     // It's important to have a relatively high entry barrier for pre-sorted runs, as the presence
     // of a single such run will force on average several merge operations and shrink the maximum
     // quicksort size a lot. For that reason we use sqrt(len) as our pre-sorted run threshold, with
-    // SMALL_SORT_THRESHOLD as the lower limit. When eagerly sorting we also use
-    // SMALL_SORT_THRESHOLD as our threshold, as we will call small_sort on any runs smaller than
-    // this.
-    let min_good_run_len =
-        if eager_sort || len <= (T::SMALL_SORT_THRESHOLD * T::SMALL_SORT_THRESHOLD) {
-            T::SMALL_SORT_THRESHOLD
-        } else {
-            sqrt_approx(len)
-        };
+    // SMALL_SORT_THRESHOLD as the lower limit. When eagerly sorting we no longer lower this
+    // threshold, as we will call insertion sort directly to create runs.
+    let min_good_run_len = if len <= (T::SMALL_SORT_THRESHOLD * T::SMALL_SORT_THRESHOLD) {
+        T::SMALL_SORT_THRESHOLD
+    } else {
+        sqrt_approx(len)
+    };
 
     // (stack_len, runs, desired_depths) together form a stack maintaining run
     // information for the powersort heuristic. desired_depths[i] is the desired
@@ -147,13 +146,7 @@ pub fn sort<T, F: FnMut(&T, &T) -> bool>(
         // with root-level desired depth to fully collapse the merge tree.
         let (next_run, desired_depth);
         if scan_idx < len {
-            next_run = create_run(
-                &mut v[scan_idx..],
-                scratch,
-                min_good_run_len,
-                eager_sort,
-                is_less,
-            );
+            next_run = create_run(&mut v[scan_idx..], min_good_run_len, eager_sort, is_less);
             desired_depth = merge_tree_depth(
                 scan_idx - prev_run.len(),
                 scan_idx,
@@ -212,13 +205,12 @@ pub fn sort<T, F: FnMut(&T, &T) -> bool>(
 
 /// Creates a new logical run.
 ///
-/// A logical run can either be sorted or unsorted. If there is a pre-existing
-/// run of length min_good_run_len (or longer) starting at v[0] we find and
-/// return it, otherwise we return a run of length min_good_run_len that is
-/// eagerly sorted when eager_sort is true, and left unsorted otherwise.
+/// A logical run can either be sorted or unsorted. If there is a pre-existing run of length
+/// min_good_run_len or longer starting at v[0] we find and return it. Otherwise, when eager_sort
+/// is true we return a run of length MAX_LEN_ALWAYS_INSERTION_SORT or longer that has been sorted
+/// with insertion sort, and when it is false we return an unsorted run of length min_good_run_len.
 fn create_run<T, F: FnMut(&T, &T) -> bool>(
     v: &mut [T],
-    scratch: &mut [MaybeUninit<T>],
     min_good_run_len: usize,
     eager_sort: bool,
     is_less: &mut F,
@@ -230,23 +222,28 @@ fn create_run<T, F: FnMut(&T, &T) -> bool>(
         intrinsics::assume(run_len <= v.len());
     }
 
-    if run_len >= min_good_run_len {
+    let full_sorted = run_len >= min_good_run_len;
+
+    if full_sorted || eager_sort {
         if was_reversed {
             v[..run_len].reverse();
         }
+    }
+
+    if full_sorted {
         DriftsortRun::new_sorted(run_len)
     } else {
-        let new_run_len = cmp::min(min_good_run_len, v.len());
         if eager_sort {
-            // When eager sorting min_good_run_len = T::SMALL_SORT_THRESHOLD,
-            // which will make stable_quicksort immediately call smallsort. By
-            // not calling the smallsort directly here it can always be inlined
-            // into the quicksort itself, making the recursive base case faster.
-            crate::quicksort::stable_quicksort(&mut v[..new_run_len], scratch, 0, None, is_less);
-            DriftsortRun::new_sorted(new_run_len)
+            let mut eager_run_len = run_len;
+            if run_len < MAX_LEN_ALWAYS_INSERTION_SORT {
+                let new_run_len = cmp::min(MAX_LEN_ALWAYS_INSERTION_SORT, v.len());
+                insertion_sort_shift_left(&mut v[..new_run_len], run_len, is_less);
+                eager_run_len = new_run_len;
+            }
+
+            DriftsortRun::new_sorted(eager_run_len)
         } else {
-            DriftsortRun::new_unsorted(new_run_len)
+            DriftsortRun::new_unsorted(cmp::min(min_good_run_len, v.len()))
        }
     }
 }
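For illustration, here is a standalone sketch of the eager-run logic the hunk above introduces: a short pre-existing sorted prefix is grown to MAX_LEN_ALWAYS_INSERTION_SORT elements with an insertion sort that keeps the prefix intact. The helper eager_run_len and the inlined swap-based insertion sort are hypothetical stand-ins for the crate's insertion_sort_shift_left; only the constant mirrors the patch.

// Illustrative only: mirrors the shape of the eager branch in create_run.
const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;

// Grow the already sorted prefix `v[..sorted_prefix_len]` into a sorted run of
// at least MAX_LEN_ALWAYS_INSERTION_SORT elements (or all of `v` if shorter),
// returning the resulting run length.
fn eager_run_len<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    sorted_prefix_len: usize,
    is_less: &mut F,
) -> usize {
    if sorted_prefix_len >= MAX_LEN_ALWAYS_INSERTION_SORT {
        return sorted_prefix_len;
    }

    let new_run_len = core::cmp::min(MAX_LEN_ALWAYS_INSERTION_SORT, v.len());
    // Simple safe stand-in for insertion_sort_shift_left: insert each element
    // after the sorted prefix by swapping it leftwards into position, which
    // preserves stability because only adjacent out-of-order pairs are swapped.
    for i in sorted_prefix_len.max(1)..new_run_len {
        let mut j = i;
        while j > 0 && is_less(&v[j], &v[j - 1]) {
            v.swap(j, j - 1);
            j -= 1;
        }
    }
    new_run_len
}

In the patch itself the run found by find_existing_run serves as the sorted prefix, so the insertion sort only has to place the handful of elements that follow it.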
@@ -280,3 +277,30 @@ fn find_existing_run<T, F: FnMut(&T, &T) -> bool>(v: &[T], is_less: &mut F) -> (
         (run_len, strictly_descending)
     }
 }
+
+/// Compactly stores the length of a run, and whether or not it is sorted. This
+/// can always fit in a usize because the maximum slice length is isize::MAX.
+#[derive(Copy, Clone)]
+struct DriftsortRun(usize);
+
+impl DriftsortRun {
+    #[inline(always)]
+    fn new_sorted(length: usize) -> Self {
+        Self((length << 1) | 1)
+    }
+
+    #[inline(always)]
+    fn new_unsorted(length: usize) -> Self {
+        Self((length << 1) | 0)
+    }
+
+    #[inline(always)]
+    fn sorted(self) -> bool {
+        self.0 & 1 == 1
+    }
+
+    #[inline(always)]
+    fn len(self) -> usize {
+        self.0 >> 1
+    }
+}
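The DriftsortRun helper moved above packs a run's length and its sortedness flag into a single usize: the length lives in the upper bits, the flag in bit 0, and the shift is lossless because slice lengths never exceed isize::MAX. A minimal round-trip check, with the methods condensed from the hunk above and an illustrative main harness of my own:

#[derive(Copy, Clone)]
struct DriftsortRun(usize);

impl DriftsortRun {
    fn new_sorted(length: usize) -> Self {
        Self((length << 1) | 1)
    }
    fn new_unsorted(length: usize) -> Self {
        Self(length << 1)
    }
    fn sorted(self) -> bool {
        self.0 & 1 == 1
    }
    fn len(self) -> usize {
        self.0 >> 1
    }
}

fn main() {
    // length <= isize::MAX leaves the top bit of usize free for the shift.
    let a = DriftsortRun::new_sorted(8);
    let b = DriftsortRun::new_unsorted(13);
    assert!(a.sorted() && a.len() == 8);
    assert!(!b.sorted() && b.len() == 13);
}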
diff --git a/src/lib.rs b/src/lib.rs
index 3a4d59b..8f1711c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -23,32 +23,7 @@ mod pivot;
 mod quicksort;
 mod smallsort;
 
-/// Compactly stores the length of a run, and whether or not it is sorted. This
-/// can always fit in a usize because the maximum slice length is isize::MAX.
-#[derive(Copy, Clone)]
-struct DriftsortRun(usize);
-
-impl DriftsortRun {
-    #[inline(always)]
-    pub fn new_sorted(length: usize) -> Self {
-        Self((length << 1) | 1)
-    }
-
-    #[inline(always)]
-    pub fn new_unsorted(length: usize) -> Self {
-        Self((length << 1) | 0)
-    }
-
-    #[inline(always)]
-    pub fn sorted(self) -> bool {
-        self.0 & 1 == 1
-    }
-
-    #[inline(always)]
-    pub fn len(self) -> usize {
-        self.0 >> 1
-    }
-}
+use crate::smallsort::MAX_LEN_ALWAYS_INSERTION_SORT;
 
 #[inline(always)]
 pub fn sort<T: Ord>(v: &mut [T]) {
@@ -85,7 +60,7 @@ fn driftsort<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_les
     // modern processors is very valuable, and for a single sort call in general
     // purpose code any gains from an advanced method are cancelled by icache
     // misses during the sort, and thrashing the icache for surrounding code.
-    const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
+
     if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
         smallsort::insertion_sort_shift_left(v, 1, is_less);
         return;
@@ -105,23 +80,35 @@ fn driftsort_main<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], i
     // sized inputs while scaling down to len / 2 for larger inputs. We need at
     // least len / 2 for our stable merging routine.
     const MAX_FULL_ALLOC_BYTES: usize = 8_000_000;
+    let len = v.len();
 
-    let full_alloc_size = cmp::min(len, MAX_FULL_ALLOC_BYTES / mem::size_of::<T>());
-    let alloc_size = cmp::max(
-        cmp::max(len / 2, full_alloc_size),
-        crate::smallsort::MIN_SMALL_SORT_SCRATCH_LEN,
-    );
+    // Using the hybrid quick + merge-sort has performance issues with the transition from
+    // insertion sort to the main loop. A more gradual and smoother transition can be had by using
+    // an always-eager merge approach, as long as it can be served by at most 3 merges. Another
+    // factor that affects transition performance is the allocation size. The small-sort has a
+    // scratch lower limit of `MIN_SMALL_SORT_SCRATCH_LEN`, e.g. 48, which is much larger than what
+    // an input of e.g. length 21 needs, where a scratch buffer of length 10 suffices. Fully
+    // ascending and descending inputs in the range `20..100` of types a bit larger than a `u64`,
+    // e.g. `String`, spend most of their time doing the allocation. An alternative would be lazy
+    // scratch allocation, but that comes with its own set of tradeoffs and makes the code
+    // substantially more complex.
+    let eager_sort = len <= MAX_LEN_ALWAYS_INSERTION_SORT * 4;
+    let len_div_2 = len / 2;
+
+    let alloc_size = if eager_sort {
+        len_div_2
+    } else {
+        // There are local checks that ensure we abort if the thresholds are tweaked in a way that
+        // makes this value ever smaller than `MIN_SMALL_SORT_SCRATCH_LEN`.
+        let full_alloc_size = cmp::min(len, MAX_FULL_ALLOC_BYTES / mem::size_of::<T>());
+
+        cmp::max(len_div_2, full_alloc_size)
+    };
 
     let mut buf = BufT::with_capacity(alloc_size);
     let scratch_slice = unsafe {
         slice::from_raw_parts_mut(buf.mut_ptr() as *mut MaybeUninit<T>, buf.capacity())
     };
 
-    // Using the hybrid quick + merge-sort has performance issues with the transition from insertion
-    // sort to main loop. A more gradual and smoother transition can be had by using an always eager
-    // merge approach as long as it can be served by a single merge.
-    use crate::smallsort::SmallSortTypeImpl;
-    let eager_sort = len <= T::SMALL_SORT_THRESHOLD * 2;
     drift::sort(v, scratch_slice, eager_sort, is_less);
 }
diff --git a/src/smallsort.rs b/src/smallsort.rs
index 8382378..342e254 100644
--- a/src/smallsort.rs
+++ b/src/smallsort.rs
@@ -37,7 +37,7 @@ impl SmallSortTypeImpl for T {
     }
 }
 
-pub const MIN_SMALL_SORT_SCRATCH_LEN: usize = i32::SMALL_SORT_THRESHOLD + 16;
+// const MIN_SMALL_SORT_SCRATCH_LEN: usize = i32::SMALL_SORT_THRESHOLD + 16;
 
 impl SmallSortTypeImpl for T {
     // From a comparison perspective 20, would be ~2% more efficient for fully random input, but a
@@ -198,6 +198,9 @@ unsafe fn insert_tail<T, F: FnMut(&T, &T) -> bool>(begin: *mut T, tail: *mut T,
     }
 }
 
+// Going beyond this is algorithmically inefficient.
+pub(crate) const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
+
 /// Sort `v` assuming `v[..offset]` is already sorted.
 pub fn insertion_sort_shift_left<T, F: FnMut(&T, &T) -> bool>(
     v: &mut [T],
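To make the new allocation policy concrete, here is a standalone sketch of how driftsort_main now sizes its scratch buffer. The two constants mirror the patch; scratch_len_for and the main harness are hypothetical. Inputs short enough for the eager merge path get only the len / 2 elements the stable merge needs, everything else gets up to a full-length buffer capped at roughly 8 MB.

use std::cmp;
use std::mem;

// Constants mirroring the patch; the sizing function itself is illustrative.
const MAX_FULL_ALLOC_BYTES: usize = 8_000_000;
const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;

fn scratch_len_for<T>(len: usize) -> usize {
    // Up to 4 insertion-sort-sized runs (at most 3 merges) are handled eagerly
    // and only need the half-length buffer required by the stable merge routine.
    let eager_sort = len <= MAX_LEN_ALWAYS_INSERTION_SORT * 4;
    let len_div_2 = len / 2;

    if eager_sort {
        len_div_2
    } else {
        // Prefer a full-length buffer, but cap the allocation at
        // MAX_FULL_ALLOC_BYTES and never go below len / 2.
        let full_alloc_size = cmp::min(len, MAX_FULL_ALLOC_BYTES / mem::size_of::<T>());
        cmp::max(len_div_2, full_alloc_size)
    }
}

fn main() {
    // 64 elements fall in the eager range: a 32-element buffer suffices.
    assert_eq!(scratch_len_for::<String>(64), 32);
    // 1_000_000 u64s fit under the 8 MB cap, so they get a full-length buffer.
    assert_eq!(scratch_len_for::<u64>(1_000_000), 1_000_000);
}

Under the old sizing, a 21-element input would still have allocated MIN_SMALL_SORT_SCRATCH_LEN (48) scratch elements; with the patch the eager path needs only 10, which is the allocation behavior the commit message refers to.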