From b31025487d27324ac75e17636ad9ed2eaa6ad2b9 Mon Sep 17 00:00:00 2001
From: Ben Haller <github@sticksoftware.com>
Date: Sat, 12 Aug 2023 10:28:22 -0400
Subject: [PATCH] parallelizing sorting, pass 1

---
 EidosScribe/EidosTextView.mm     |   1 +
 SLiM.xcodeproj/project.pbxproj   |  20 +++
 VERSIONS                         |   1 +
 core/genome.cpp                  |   1 +
 eidos/eidos_functions_stats.cpp  |   1 +
 eidos/eidos_functions_values.cpp |   1 +
 eidos/eidos_globals.cpp          | 147 ----------------------
 eidos/eidos_globals.h            |  71 -----------
 eidos/eidos_openmp.h             |   3 +
 eidos/eidos_sorting.cpp          | 208 +++++++++++++++++++++++++++++++
 eidos/eidos_sorting.h            | 148 ++++++++++++++++++++++
 eidos/eidos_test.cpp             |  98 +++++++++++++++
 eidos/eidos_value.cpp            |   1 +
 13 files changed, 483 insertions(+), 218 deletions(-)
 create mode 100644 eidos/eidos_sorting.cpp
 create mode 100644 eidos/eidos_sorting.h
diff --git a/EidosScribe/EidosTextView.mm b/EidosScribe/EidosTextView.mm
index 5c15a3352..6b10c37b6 100644
--- a/EidosScribe/EidosTextView.mm
+++ b/EidosScribe/EidosTextView.mm
@@ -29,6 +29,7 @@
 #include "eidos_property_signature.h"
 #include "eidos_type_table.h"
 #include "eidos_type_interpreter.h"
+#include "eidos_sorting.h"
 
 #include <stdexcept>
 #include <memory>
diff --git a/SLiM.xcodeproj/project.pbxproj b/SLiM.xcodeproj/project.pbxproj
index 7153e806e..8cd0debce 100644
--- a/SLiM.xcodeproj/project.pbxproj
+++ b/SLiM.xcodeproj/project.pbxproj
@@ -350,6 +350,14 @@
 		986D73E81B07E89E007FBB70 /* eidos_call_signature.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 986D73E61B07E89E007FBB70 /* eidos_call_signature.cpp */; };
 		986D73E91B07E89E007FBB70 /* eidos_call_signature.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 986D73E61B07E89E007FBB70 /* eidos_call_signature.cpp */; };
 		986D73EA1B07E89E007FBB70 /* eidos_call_signature.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 986D73E61B07E89E007FBB70 /* eidos_call_signature.cpp */; };
+		98729ACF2A87AAB700E81662 /* eidos_sorting.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 98729ACE2A87AAB700E81662 /* eidos_sorting.cpp */; };
+		98729AD02A87AAB700E81662 /* eidos_sorting.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 98729ACE2A87AAB700E81662 /* eidos_sorting.cpp */; };
+		98729AD12A87AAB700E81662 /* eidos_sorting.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 98729ACE2A87AAB700E81662 /* eidos_sorting.cpp */; };
+		98729AD22A87AAB700E81662 /* eidos_sorting.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 98729ACE2A87AAB700E81662 /* eidos_sorting.cpp */; };
+		98729AD32A87AAB700E81662 /* eidos_sorting.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 98729ACE2A87AAB700E81662 /* eidos_sorting.cpp */; };
+		98729AD42A87AAB700E81662 /* eidos_sorting.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 98729ACE2A87AAB700E81662 /* eidos_sorting.cpp */; };
+		98729AD52A87AAB700E81662 /* eidos_sorting.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 98729ACE2A87AAB700E81662 /* eidos_sorting.cpp */; };
+		98729AD62A87AAB700E81662 /* eidos_sorting.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 98729ACE2A87AAB700E81662 /* eidos_sorting.cpp */; };
 		98760EDD28CE5E7600CEBC40 /* libomp.dylib in Frameworks */ = {isa = PBXBuildFile; fileRef = 98760EDC28CE5E7600CEBC40 /* libomp.dylib */; };
 		98760EDE28CE5E8200CEBC40 /* libomp.dylib in Frameworks */ = {isa = PBXBuildFile; fileRef = 98760EDC28CE5E7600CEBC40 /* libomp.dylib */; };
 		9876E5F51ED5572C00FF9762 /* tdist.c in Sources */ = {isa = PBXBuildFile; fileRef = 9876E5F41ED5572C00FF9762 /* tdist.c */; };
@@ -1618,6 +1626,8 @@
 		986926EA1AA6B7480000E138 /* GraphView_PopulationVisualization.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = GraphView_PopulationVisualization.mm; sourceTree = "<group>"; };
 		986D73E61B07E89E007FBB70 /* eidos_call_signature.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = eidos_call_signature.cpp; sourceTree = "<group>"; };
 		986D73E71B07E89E007FBB70 /* eidos_call_signature.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = eidos_call_signature.h; sourceTree = "<group>"; };
+		98729ACD2A87A93500E81662 /* eidos_sorting.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = eidos_sorting.h; sourceTree = "<group>"; };
+		98729ACE2A87AAB700E81662 /* eidos_sorting.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = eidos_sorting.cpp; sourceTree = "<group>"; };
 		9873B12328CE26CD00582D83 /* eidos_openmp.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = eidos_openmp.h; sourceTree = "<group>"; };
 		98760EDC28CE5E7600CEBC40 /* libomp.dylib */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.dylib"; name = libomp.dylib; path = /usr/local/lib/libomp.dylib; sourceTree = "<group>"; };
 		9876E5F31ED5572C00FF9762 /* gsl_cdf.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = gsl_cdf.h; path = cdf/gsl_cdf.h; sourceTree = "<group>"; };
@@ -2009,6 +2019,8 @@
 				9857E33A1BB58DAE00F1C8A9 /* eidos_intrusive_ptr.h */,
 				985A11861BBA07CB009EE1FF /* eidos_object_pool.h */,
 				9809F8BD24F32B3E00D312E4 /* eidos_tinycolormap.h */,
+				98729ACD2A87A93500E81662 /* eidos_sorting.h */,
+				98729ACE2A87AAB700E81662 /* eidos_sorting.cpp */,
 				9893C7DC1CDA2D650029AC94 /* eidos_beep.h */,
 				9893C7DB1CDA2D650029AC94 /* eidos_beep.cpp */,
 				98B8AF8725A2BE7200C95D66 /* json_fwd.hpp */,
@@ -3300,6 +3312,7 @@
 				982556B81BA8EF8C0054CB3F /* eidos_property_signature.cpp in Sources */,
 				98C8217F1C7A983800548839 /* inline.c in Sources */,
 				985F3F0E24BA31D900E712E0 /* eidos_test_functions_statistics.cpp in Sources */,
+				98729AD42A87AAB700E81662 /* eidos_sorting.cpp in Sources */,
 				98C821AC1C7A9B1600548839 /* discrete.c in Sources */,
 				9807663924493E0B00F6CBB4 /* gzwrite.c in Sources */,
 				98C8218C1C7A983800548839 /* poisson.c in Sources */,
@@ -3371,6 +3384,7 @@
 				98332ADC1FDBC0D000274FF0 /* dgemv.c in Sources */,
 				98332B111FDBD09800274FF0 /* init.c in Sources */,
 				98C8212C1C7A980000548839 /* gamma.c in Sources */,
+				98729ACF2A87AAB700E81662 /* eidos_sorting.cpp in Sources */,
 				98C8212D1C7A980000548839 /* gauss.c in Sources */,
 				985F3F0124BA307200E712E0 /* eidos_test_operators_arithmetic.cpp in Sources */,
 				985F3F0B24BA31D900E712E0 /* eidos_test_functions_statistics.cpp in Sources */,
@@ -3613,6 +3627,7 @@
 				98332AEE1FDBC29500274FF0 /* rowcol.c in Sources */,
 				9890D1EF27136BB7001EAE98 /* eidos_class_DataFrame.cpp in Sources */,
 				987AD8871B2CC0C10035D6C8 /* EidosVariableBrowserController.mm in Sources */,
+				98729AD22A87AAB700E81662 /* eidos_sorting.cpp in Sources */,
 				9807662B24493E0B00F6CBB4 /* crc32.c in Sources */,
 				98C821661C7A983800548839 /* beta.c in Sources */,
 				985F3EFE24BA2F1500E712E0 /* eidos_test_functions_vector.cpp in Sources */,
@@ -3691,6 +3706,7 @@
 				98CF5215294A3FC900557BBA /* binomial_tpe.c in Sources */,
 				98CF5216294A3FC900557BBA /* tables.c in Sources */,
 				98CF5217294A3FC900557BBA /* species_eidos.cpp in Sources */,
+				98729AD12A87AAB700E81662 /* eidos_sorting.cpp in Sources */,
 				98CF5218294A3FC900557BBA /* slim_functions.cpp in Sources */,
 				98CF5219294A3FC900557BBA /* deflate.c in Sources */,
 				98CF521A294A3FC900557BBA /* FindRecipeController.mm in Sources */,
@@ -3918,6 +3934,7 @@
 				98CF53E8294A714200557BBA /* rowcol.c in Sources */,
 				98CF53E9294A714200557BBA /* eidos_class_DataFrame.cpp in Sources */,
 				98CF53EA294A714200557BBA /* EidosVariableBrowserController.mm in Sources */,
+				98729AD32A87AAB700E81662 /* eidos_sorting.cpp in Sources */,
 				98CF53EB294A714200557BBA /* crc32.c in Sources */,
 				98CF53EC294A714200557BBA /* beta.c in Sources */,
 				98CF53ED294A714200557BBA /* eidos_test_functions_vector.cpp in Sources */,
@@ -3996,6 +4013,7 @@
 				98C821491C7A983700548839 /* binomial_tpe.c in Sources */,
 				9850D8E9206309A0006BFD2E /* tables.c in Sources */,
 				98AC617B24BA34ED0001914C /* species_eidos.cpp in Sources */,
+				98729AD02A87AAB700E81662 /* eidos_sorting.cpp in Sources */,
 				98DDAED7221765480038C133 /* slim_functions.cpp in Sources */,
 				98076618244934A800F6CBB4 /* deflate.c in Sources */,
 				984252C3216FA9930019696A /* FindRecipeController.mm in Sources */,
@@ -4192,6 +4210,7 @@
 				98D7EBB328CE557C00DEAAC4 /* poisson.c in Sources */,
 				98D7EBB428CE557C00DEAAC4 /* eidos_type_table.cpp in Sources */,
 				98D7EBB528CE557C00DEAAC4 /* exp.c in Sources */,
+				98729AD52A87AAB700E81662 /* eidos_sorting.cpp in Sources */,
 				98D7EBB628CE557C00DEAAC4 /* eidos_test_functions_other.cpp in Sources */,
 				98D7EBB728CE557C00DEAAC4 /* eidos_script.cpp in Sources */,
 				981DC37B28E27300000ABE91 /* eidos_functions_distributions.cpp in Sources */,
@@ -4263,6 +4282,7 @@
 				98D7ECA428CE58FC00DEAAC4 /* dgemv.c in Sources */,
 				98D7ECA528CE58FC00DEAAC4 /* init.c in Sources */,
 				98D7ECA628CE58FC00DEAAC4 /* gamma.c in Sources */,
+				98729AD62A87AAB700E81662 /* eidos_sorting.cpp in Sources */,
 				98D7ECA728CE58FC00DEAAC4 /* gauss.c in Sources */,
 				98D7ECA828CE58FC00DEAAC4 /* eidos_test_operators_arithmetic.cpp in Sources */,
 				981DC38528E27301000ABE91 /* eidos_functions_files.cpp in Sources */,
diff --git a/VERSIONS b/VERSIONS
index 2040fa601..853e46b45 100644
--- a/VERSIONS
+++ b/VERSIONS
@@ -85,6 +85,7 @@ PARALLEL changes (now in the master branch):
 	add recipe 22.7, Parallelizing nonWF reproduction and spatial interactions
 	add internal EidosBenchmark facility for timing of internal loops, for benchmarking of parallelization scaling
 	add a command-line option, -perTaskThreads, to let the user control how many threads to use for different tasks (if not overridden)
+	parallelize sorting, where possible/efficient
 
 development head (in the master branch):
 	fix a minor bug with autofix when opening multiple .slim documents at once in SLiMgui
diff --git a/core/genome.cpp b/core/genome.cpp
index c5b92d304..5e34e5bee 100644
--- a/core/genome.cpp
+++ b/core/genome.cpp
@@ -26,6 +26,7 @@
 #include "species.h"
 #include "polymorphism.h"
 #include "subpopulation.h"
+#include "eidos_sorting.h"
 
 #include <algorithm>
 #include <string>
diff --git a/eidos/eidos_functions_stats.cpp b/eidos/eidos_functions_stats.cpp
index b90b6fbe4..5341a3692 100644
--- a/eidos/eidos_functions_stats.cpp
+++ b/eidos/eidos_functions_stats.cpp
@@ -19,6 +19,7 @@
 
 
 #include "eidos_functions.h"
+#include "eidos_sorting.h"
 
 #include <utility>
 #include <limits>
diff --git a/eidos/eidos_functions_values.cpp b/eidos/eidos_functions_values.cpp
index 218a607e4..1be3f53b1 100644
--- a/eidos/eidos_functions_values.cpp
+++ b/eidos/eidos_functions_values.cpp
@@ -21,6 +21,7 @@
 #include "eidos_functions.h"
 #include "eidos_interpreter.h"
 #include "eidos_rng.h"
+#include "eidos_sorting.h"
 
 #include <unordered_map>
 #include <vector>
diff --git a/eidos/eidos_globals.cpp b/eidos/eidos_globals.cpp
index 9afadeec5..15250982d 100644
--- a/eidos/eidos_globals.cpp
+++ b/eidos/eidos_globals.cpp
@@ -3331,153 +3331,6 @@ bool Eidos_RegexWorks(void)
 	return regex_works;
 }
 
-// Here are some early explorations into parallelizing sorting.  The speedups
-// here are not particularly impressive.  Parallel sorting is a very deep and
-// complex rabbit hole to go down; see, e.g., Victor Duvanenko's work at
-// https://github.com/DragonSpit/ParallelAlgorithms.  But those algorithms
-// use TBB instead of OpenMP, and require C++17, so they're not easily usable.
-// Wikipedia at https://en.wikipedia.org/wiki/Merge_sort#Parallel_merge_sort
-// also has some very interesting commentary about parallelization of sorting.
-// The code here is also very primitive - integer only, no templates, no
-// client-suppliable comparator, etc. – so it would be hard to integrate into
-// all the ways Eidos presently uses std::sort.  Improving this looks like a
-// good project for somebody in CS.
-
-// This parallel quicksort code is thanks to Ruud van der Pas, modified from
-// https://www.openmp.org/wp-content/uploads/sc16-openmp-booth-tasking-ruud.pdf
-#ifdef _OPENMP
-static void _Eidos_ParallelQuicksort_I(int64_t *values, int64_t lo, int64_t hi)
-{
-	if (lo >= hi)
-		return;
-	
-	if (hi - lo + 1 <= 1000) {
-		// fall over to using std::sort when below a threshold interval size
-		// the larger the threshold, the less time we spend thrashing tasks on small
-		// intervals, which is good; but it also sets a limit on how many threads we
-		// we bring to bear on relatively small sorts, which is bad; 1000 seems fine
-		std::sort(values + lo, values + hi + 1);
-	} else {
-		// choose the middle of three pivots, in an attempt to avoid really bad pivots
-		int64_t pivot1 = *(values + lo);
-		int64_t pivot2 = *(values + hi);
-		int64_t pivot3 = *(values + ((lo + hi) >> 1));
-		int64_t pivot;
-		
-		if (pivot1 > pivot2)
-		{
-			if (pivot2 > pivot3)		pivot = pivot2;
-			else if (pivot1 > pivot3)	pivot = pivot3;
-			else						pivot = pivot1;
-		}
-		else
-		{
-			if (pivot1 > pivot3)		pivot = pivot1;
-			else if (pivot2 > pivot3)	pivot = pivot3;
-			else						pivot = pivot2;
-		}
-		
-		// note that std::partition is not guaranteed to leave the pivot value in position
-		// we do a second partition to exclude all duplicate pivot values, which seems to be one standard strategy
-		// this works particularly well when duplicate values are very common; it helps avoid O(n^2) performance
-		// note the partition is not parallelized; that is apparently a difficult problem for parallel quicksort
-		int64_t *middle1 = std::partition(values + lo, values + hi + 1, [pivot](const int64_t& em) { return em < pivot; });
-		int64_t *middle2 = std::partition(middle1, values + hi + 1, [pivot](const int64_t& em) { return !(pivot < em); });
-		int64_t mid1 = middle1 - values;
-		int64_t mid2 = middle2 - values;
-		#pragma omp task default(none) firstprivate(values, lo, mid1)
-		{ _Eidos_ParallelQuicksort_I(values, lo, mid1 - 1); }	// Left branch
-		#pragma omp task default(none) firstprivate(values, hi, mid2)
-		{ _Eidos_ParallelQuicksort_I(values, mid2, hi); }		// Right branch
-	}
-}
-#endif
-
-void Eidos_ParallelQuicksort_I(int64_t *values, int64_t nelements)
-{
-#ifdef _OPENMP
-	if (nelements > 1000) {
-		#pragma omp parallel default(none) shared(values, nelements)
-		{
-			#pragma omp single nowait
-			{
-				_Eidos_ParallelQuicksort_I(values, 0, nelements - 1);
-			}
-		} // End of parallel region
-	}
-	else
-	{
-		// Use std::sort for small vectors
-		std::sort(values, values + nelements);
-	}
-#else
-	// Use std::sort when not running parallel
-	std::sort(values, values + nelements);
-#endif
-}
-
-// This parallel mergesort code is thanks to Libor Bukata and Jan Dvořák, modified from
-// https://cw.fel.cvut.cz/old/_media/courses/b4m35pag/lab6_slides_advanced_openmp.pdf
-#ifdef _OPENMP
-static void _Eidos_ParallelMergesort_I(int64_t *values, int64_t left, int64_t right)
-{
-	if (left >= right)
-		return;
-	
-	if (right - left <= 1000)
-	{
-		// fall over to using std::sort when below a threshold interval size
-		// the larger the threshold, the less time we spend thrashing tasks on small
-		// intervals, which is good; but it also sets a limit on how many threads we
-		// we bring to bear on relatively small sorts, which is bad; 1000 seems fine
-		std::sort(values + left, values + right + 1);
-	}
-	else
-	{
-		int64_t mid = (left + right) / 2;
-		#pragma omp taskgroup
-		{
-			// the original code had if() limits on task subdivision here, but that
-			// doesn't make sense to me, because we also have the threshold above,
-			// which serves the same purpose but avoids using std::sort on subdivided
-			// regions and then merging them with inplace_merge; if we assume that
-			// std::sort is faster than mergesort when running on one thread, then
-			// merging subdivided std::sort calls only seems like a good strategy
-			// when the std::sort calls happen on separate threads
-			#pragma omp task default(none) firstprivate(values, left, mid) untied
-			_Eidos_ParallelMergesort_I(values, left, mid);
-			#pragma omp task default(none) firstprivate(values, mid, right) untied
-			_Eidos_ParallelMergesort_I(values, mid + 1, right);
-			#pragma omp taskyield
-		}
-		std::inplace_merge(values + left, values + mid + 1, values + right + 1);
-	}
-}
-#endif
-
-void Eidos_ParallelMergesort_I(int64_t *values, int64_t nelements)
-{ 
-#ifdef _OPENMP
-	if (nelements > 1000) {
-		#pragma omp parallel default(none) shared(values, nelements)
-		{
-			#pragma omp single
-			{
-				_Eidos_ParallelMergesort_I(values, 0, nelements - 1);
-			}
-		} // End of parallel region
-	}
-	else
-	{
-		// Use std::sort for small vectors
-		std::sort(values, values + nelements);
-	}
-#else
-	// Use std::sort when not running parallel
-	std::sort(values, values + nelements);
-#endif
-}
-
 
 // *******************************************************************************************************************
 //
diff --git a/eidos/eidos_globals.h b/eidos/eidos_globals.h
index 8f12923bb..15f8e8119 100644
--- a/eidos/eidos_globals.h
+++ b/eidos/eidos_globals.h
@@ -590,73 +590,6 @@ std::string Eidos_string_escaped_CSV(const std::string &unescapedString);
 // BCH 13 December 2017: no longer used, commenting this out
 //std::string Eidos_Exec(const char *p_cmd);
 
-// Get indexes that would result in sorted ordering of a vector.  This rather nice code is adapted from http://stackoverflow.com/a/12399290/2752221
-template <typename T>
-std::vector<int64_t> EidosSortIndexes(const std::vector<T> &p_v, bool p_ascending = true)
-{
-	// initialize original index locations
-	std::vector<int64_t> idx(p_v.size());
-	std::iota(idx.begin(), idx.end(), 0);
-	
-	// sort indexes based on comparing values in v
-	if (p_ascending)
-		std::sort(idx.begin(), idx.end(), [&p_v](int64_t i1, int64_t i2) {return p_v[i1] < p_v[i2];});
-	else
-		std::sort(idx.begin(), idx.end(), [&p_v](int64_t i1, int64_t i2) {return p_v[i1] > p_v[i2];});
-	
-	return idx;
-}
-
-template <>
-inline std::vector<int64_t> EidosSortIndexes<double>(const std::vector<double> &p_v, bool p_ascending)
-{
-	// initialize original index locations
-	std::vector<int64_t> idx(p_v.size());
-	std::iota(idx.begin(), idx.end(), 0);
-	
-	// sort indexes based on comparing values in v
-	// this specialization for type double sorts NaNs to the end
-	if (p_ascending)
-		std::sort(idx.begin(), idx.end(), [&p_v](int64_t i1, int64_t i2) {return std::isnan(p_v[i2]) || (p_v[i1] < p_v[i2]);});
-	else
-		std::sort(idx.begin(), idx.end(), [&p_v](int64_t i1, int64_t i2) {return std::isnan(p_v[i2]) || (p_v[i1] > p_v[i2]);});
-	
-	return idx;
-}
-
-template <typename T>
-std::vector<int64_t> EidosSortIndexes(const T *p_v, size_t p_size, bool p_ascending = true)
-{
-	// initialize original index locations
-	std::vector<int64_t> idx(p_size);
-	std::iota(idx.begin(), idx.end(), 0);
-	
-	// sort indexes based on comparing values in v
-	if (p_ascending)
-		std::sort(idx.begin(), idx.end(), [p_v](int64_t i1, int64_t i2) {return p_v[i1] < p_v[i2];});
-	else
-		std::sort(idx.begin(), idx.end(), [p_v](int64_t i1, int64_t i2) {return p_v[i1] > p_v[i2];});
-	
-	return idx;
-}
-
-template <>
-inline std::vector<int64_t> EidosSortIndexes<double>(const double *p_v, size_t p_size, bool p_ascending)
-{
-	// initialize original index locations
-	std::vector<int64_t> idx(p_size);
-	std::iota(idx.begin(), idx.end(), 0);
-	
-	// sort indexes based on comparing values in v
-	// this specialization for type double sorts NaNs to the end
-	if (p_ascending)
-		std::sort(idx.begin(), idx.end(), [p_v](int64_t i1, int64_t i2) {return std::isnan(p_v[i2]) || (p_v[i1] < p_v[i2]);});
-	else
-		std::sort(idx.begin(), idx.end(), [p_v](int64_t i1, int64_t i2) {return std::isnan(p_v[i2]) || (p_v[i1] > p_v[i2]);});
-	
-	return idx;
-}
-
 extern int gEidosFloatOutputPrecision;		// precision used for output of float values in Eidos; not user-visible at present
 
 std::string EidosStringForFloat(double p_value);
@@ -689,10 +622,6 @@ BidiIter Eidos_random_unique(BidiIter begin, BidiIter end, size_t num_random)
 // The <regex> library does not work on Ubuntu 18.04, annoyingly; probably a very old compiler or something.  So we have to check.
 bool Eidos_RegexWorks(void);
 
-// Parallel sorting; these use std::sort when we are not running parallel, or for small jobs
-void Eidos_ParallelQuicksort_I(int64_t *values, int64_t nelements);
-void Eidos_ParallelMergesort_I(int64_t *values, int64_t nelements);
-
 
 // *******************************************************************************************************************
 //
diff --git a/eidos/eidos_openmp.h b/eidos/eidos_openmp.h
index 722565581..1c7511900 100644
--- a/eidos/eidos_openmp.h
+++ b/eidos/eidos_openmp.h
@@ -59,6 +59,9 @@
 
 #include <signal.h>
 #include <limits.h>
+#include <errno.h>
+#include <string.h>
+#include <iostream>
 
 
 // This is the largest number of threads we allow the user to set.  There is no hard limit in the code;
diff --git a/eidos/eidos_sorting.cpp b/eidos/eidos_sorting.cpp
new file mode 100644
index 000000000..4bd6c8bd9
--- /dev/null
+++ b/eidos/eidos_sorting.cpp
@@ -0,0 +1,208 @@
+//
+//  eidos_sorting.cpp
+//  SLiM
+//
+//  Created by Ben Haller on 8/12/23.
+//  Copyright (c) 2023 Philipp Messer.  All rights reserved.
+//	A product of the Messer Lab, http://messerlab.org/slim/
+//
+
+//	This file is part of Eidos.
+//
+//	Eidos is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
+//
+//	Eidos is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License along with Eidos.  If not, see <http://www.gnu.org/licenses/>.
+
+#include "eidos_sorting.h"
+#include "eidos_openmp.h"
+
+
+// Here are some early explorations into parallelizing sorting.  The speedups
+// here are not particularly impressive.  Parallel sorting is a very deep and
+// complex rabbit hole to go down; see, e.g., Victor Duvanenko's work at
+// https://github.com/DragonSpit/ParallelAlgorithms.  But those algorithms
+// use TBB instead of OpenMP, and require C++17, so they're not easily usable.
+// Wikipedia at https://en.wikipedia.org/wiki/Merge_sort#Parallel_merge_sort
+// also has some very interesting commentary about parallelization of sorting.
+// The code here is also very primitive - integer only, no templates, no
+// client-suppliable comparator, etc. – so it would be hard to integrate into
+// all the ways Eidos presently uses std::sort.  Improving this looks like a
+// good project for somebody in CS.
+
+// This parallel quicksort code is thanks to Ruud van der Pas, modified from
+// https://www.openmp.org/wp-content/uploads/sc16-openmp-booth-tasking-ruud.pdf
+#ifdef _OPENMP
+static void _Eidos_ParallelQuicksort_I(int64_t *values, int64_t lo, int64_t hi)
+{
+	if (lo >= hi)
+		return;
+	
+	if (hi - lo + 1 <= 1000) {
+		// fall over to using std::sort when below a threshold interval size
+		// the larger the threshold, the less time we spend thrashing tasks on small
+		// intervals, which is good; but it also sets a limit on how many threads we
+		// we bring to bear on relatively small sorts, which is bad; 1000 seems fine
+		std::sort(values + lo, values + hi + 1);
+	} else {
+		// choose the middle of three pivots, in an attempt to avoid really bad pivots
+		int64_t pivot1 = *(values + lo);
+		int64_t pivot2 = *(values + hi);
+		int64_t pivot3 = *(values + ((lo + hi) >> 1));
+		int64_t pivot;
+		
+		if (pivot1 > pivot2)
+		{
+			if (pivot2 > pivot3)		pivot = pivot2;
+			else if (pivot1 > pivot3)	pivot = pivot3;
+			else						pivot = pivot1;
+		}
+		else
+		{
+			if (pivot1 > pivot3)		pivot = pivot1;
+			else if (pivot2 > pivot3)	pivot = pivot3;
+			else						pivot = pivot2;
+		}
+		
+		// note that std::partition is not guaranteed to leave the pivot value in position
+		// we do a second partition to exclude all duplicate pivot values, which seems to be one standard strategy
+		// this works particularly well when duplicate values are very common; it helps avoid O(n^2) performance
+		// note the partition is not parallelized; that is apparently a difficult problem for parallel quicksort
+		int64_t *middle1 = std::partition(values + lo, values + hi + 1, [pivot](const int64_t& em) { return em < pivot; });
+		int64_t *middle2 = std::partition(middle1, values + hi + 1, [pivot](const int64_t& em) { return !(pivot < em); });
+		int64_t mid1 = middle1 - values;
+		int64_t mid2 = middle2 - values;
+		#pragma omp task default(none) firstprivate(values, lo, mid1)
+		{ _Eidos_ParallelQuicksort_I(values, lo, mid1 - 1); }	// Left branch
+		#pragma omp task default(none) firstprivate(values, hi, mid2)
+		{ _Eidos_ParallelQuicksort_I(values, mid2, hi); }		// Right branch
+	}
+}
+#endif
+
+void Eidos_ParallelQuicksort_I(int64_t *values, int64_t nelements)
+{
+#ifdef _OPENMP
+	if (nelements > 1000) {
+		#pragma omp parallel default(none) shared(values, nelements)
+		{
+			#pragma omp single nowait
+			{
+				_Eidos_ParallelQuicksort_I(values, 0, nelements - 1);
+			}
+		} // End of parallel region
+	}
+	else
+	{
+		// Use std::sort for small vectors
+		std::sort(values, values + nelements);
+	}
+#else
+	// Use std::sort when not running parallel
+	std::sort(values, values + nelements);
+#endif
+}
+
+
+// This parallel mergesort code is thanks to Libor Bukata and Jan Dvořák, modified from
+// https://cw.fel.cvut.cz/old/_media/courses/b4m35pag/lab6_slides_advanced_openmp.pdf
+#ifdef _OPENMP
+static void _Eidos_ParallelMergesort_I(int64_t *values, int64_t left, int64_t right)
+{
+	if (left >= right)
+		return;
+	
+	if (right - left <= 1000)
+	{
+		// fall over to using std::sort when below a threshold interval size
+		// the larger the threshold, the less time we spend thrashing tasks on small
+		// intervals, which is good; but it also sets a limit on how many threads we
+		// we bring to bear on relatively small sorts, which is bad; 1000 seems fine
+		std::sort(values + left, values + right + 1);
+	}
+	else
+	{
+		int64_t mid = (left + right) / 2;
+		#pragma omp taskgroup
+		{
+			// the original code had if() limits on task subdivision here, but that
+			// doesn't make sense to me, because we also have the threshold above,
+			// which serves the same purpose but avoids using std::sort on subdivided
+			// regions and then merging them with inplace_merge; if we assume that
+			// std::sort is faster than mergesort when running on one thread, then
+			// merging subdivided std::sort calls only seems like a good strategy
+			// when the std::sort calls happen on separate threads
+			#pragma omp task default(none) firstprivate(values, left, mid) untied
+			_Eidos_ParallelMergesort_I(values, left, mid);
+			#pragma omp task default(none) firstprivate(values, mid, right) untied
+			_Eidos_ParallelMergesort_I(values, mid + 1, right);
+			#pragma omp taskyield
+		}
+		std::inplace_merge(values + left, values + mid + 1, values + right + 1);
+	}
+}
+#endif
+
+void Eidos_ParallelMergesort_I(int64_t *values, int64_t nelements)
+{ 
+#ifdef _OPENMP
+	if (nelements > 1000) {
+		#pragma omp parallel default(none) shared(values, nelements)
+		{
+			#pragma omp single
+			{
+				_Eidos_ParallelMergesort_I(values, 0, nelements - 1);
+			}
+		} // End of parallel region
+	}
+	else
+	{
+		// Use std::sort for small vectors
+		std::sort(values, values + nelements);
+	}
+#else
+	// Use std::sort when not running parallel
+	std::sort(values, values + nelements);
+#endif
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/eidos/eidos_sorting.h b/eidos/eidos_sorting.h
new file mode 100644
index 000000000..9fe8c8b6c
--- /dev/null
+++ b/eidos/eidos_sorting.h
@@ -0,0 +1,148 @@
+//
+//  eidos_sorting.h
+//  Eidos
+//
+//  Created by Ben Haller on 8/12/23.
+//  Copyright (c) 2023 Philipp Messer.  All rights reserved.
+//	A product of the Messer Lab, http://messerlab.org/slim/
+//
+
+//	This file is part of Eidos.
+//
+//	Eidos is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
+//
+//	Eidos is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License along with Eidos.  If not, see <http://www.gnu.org/licenses/>.
+
+/*
+ 
+ This file contains an assortment of sorting algorithms used for various purposes.
+ 
+ */
+
+
+#ifndef __Eidos__eidos_sorting_h
+#define __Eidos__eidos_sorting_h
+
+#include "eidos_openmp.h"
+
+#include <vector>
+#include <algorithm>
+#include <numeric>
+#include <cstdint>
+
+
+//
+//	Parallel sorting; these use std::sort when we are not running parallel, or for small jobs
+//
+
+void Eidos_ParallelQuicksort_I(int64_t *values, int64_t nelements);
+void Eidos_ParallelMergesort_I(int64_t *values, int64_t nelements);
+
+
+//
+//	Sriram's parallel sort
+//
+
+template <typename T, typename Compare>
+void Sriram_parallel_omp_sort(std::vector<T> &data, Compare comparator)
+{
+	int num_threads = omp_get_max_threads();
+	size_t size = data.size();
+	size_t chunk_size = size / num_threads;		// chunk size per thread
+	
+#pragma omp parallel
+	{
+		int tid = omp_get_thread_num();
+		// cout << omp_get_thread_num() << std::endl;
+		
+		size_t start = tid * chunk_size;
+		size_t end = (tid + 1) * chunk_size;
+		
+		if (tid == num_threads - 1)
+			end = size;
+		
+		// sort sub-array using comparator
+		std::sort(data.begin() + start, data.begin() + end, comparator);
+	}
+	
+	std::sort(data.begin(), data.end(), comparator);
+}
+
+
+//
+//	Get indexes that would result in sorted ordering of a vector.
+//	This rather nice code is adapted from http://stackoverflow.com/a/12399290/2752221
+//
+
+template <typename T>
+std::vector<int64_t> EidosSortIndexes(const std::vector<T> &p_v, bool p_ascending = true)
+{
+	// initialize original index locations
+	std::vector<int64_t> idx(p_v.size());
+	std::iota(idx.begin(), idx.end(), 0);
+	
+	// sort indexes based on comparing values in v
+	if (p_ascending)
+		std::sort(idx.begin(), idx.end(), [&p_v](int64_t i1, int64_t i2) {return p_v[i1] < p_v[i2];});
+	else
+		std::sort(idx.begin(), idx.end(), [&p_v](int64_t i1, int64_t i2) {return p_v[i1] > p_v[i2];});
+	
+	return idx;
+}
+
+template <>
+inline std::vector<int64_t> EidosSortIndexes<double>(const std::vector<double> &p_v, bool p_ascending)
+{
+	// initialize original index locations
+	std::vector<int64_t> idx(p_v.size());
+	std::iota(idx.begin(), idx.end(), 0);
+	
+	// sort indexes based on comparing values in v
+	// this specialization for type double sorts NaNs to the end
+	if (p_ascending)
+		std::sort(idx.begin(), idx.end(), [&p_v](int64_t i1, int64_t i2) {return std::isnan(p_v[i2]) || (p_v[i1] < p_v[i2]);});
+	else
+		std::sort(idx.begin(), idx.end(), [&p_v](int64_t i1, int64_t i2) {return std::isnan(p_v[i2]) || (p_v[i1] > p_v[i2]);});
+	
+	return idx;
+}
+
+template <typename T>
+std::vector<int64_t> EidosSortIndexes(const T *p_v, size_t p_size, bool p_ascending = true)
+{
+	// initialize original index locations
+	std::vector<int64_t> idx(p_size);
+	std::iota(idx.begin(), idx.end(), 0);
+	
+	// sort indexes based on comparing values in v
+	if (p_ascending)
+		std::sort(idx.begin(), idx.end(), [p_v](int64_t i1, int64_t i2) {return p_v[i1] < p_v[i2];});
+	else
+		std::sort(idx.begin(), idx.end(), [p_v](int64_t i1, int64_t i2) {return p_v[i1] > p_v[i2];});
+	
+	return idx;
+}
+
+template <>
+inline std::vector<int64_t> EidosSortIndexes<double>(const double *p_v, size_t p_size, bool p_ascending)
+{
+	// initialize original index locations
+	std::vector<int64_t> idx(p_size);
+	std::iota(idx.begin(), idx.end(), 0);
+	
+	// sort indexes based on comparing values in v
+	// this specialization for type double sorts NaNs to the end
+	if (p_ascending)
+		std::sort(idx.begin(), idx.end(), [p_v](int64_t i1, int64_t i2) {return std::isnan(p_v[i2]) || (p_v[i1] < p_v[i2]);});
+	else
+		std::sort(idx.begin(), idx.end(), [p_v](int64_t i1, int64_t i2) {return std::isnan(p_v[i2]) || (p_v[i1] > p_v[i2]);});
+	
+	return idx;
+}
+
+
+#endif /* __Eidos__eidos_sorting_h */
diff --git a/eidos/eidos_test.cpp b/eidos/eidos_test.cpp
index 89305722a..7da266286 100644
--- a/eidos/eidos_test.cpp
+++ b/eidos/eidos_test.cpp
@@ -24,6 +24,7 @@
 #include "eidos_interpreter.h"
 #include "eidos_globals.h"
 #include "eidos_rng.h"
+#include "eidos_sorting.h"
 
 #include <stdlib.h>
 #include <iostream>
@@ -1276,6 +1277,103 @@ int RunEidosTests(void)
 	}
 #endif
 	
+#if 1
+	// Speed and correctness tests of various parallel sorting algorithms
+	{
+		std::cout << std::endl << "SORTING TESTS:" << std::endl;
+		
+		gsl_rng *rng = EIDOS_GSL_RNG(omp_get_thread_num());		// the single-threaded RNG
+		typedef int64_t SORT_TYPE;
+		
+		{
+			auto comparator = [](SORT_TYPE a, SORT_TYPE b) {
+				return a < b;
+			};
+			const std::size_t test_size = 10000000;
+			const int reps = 5;
+			double time_sum;
+			std::vector<SORT_TYPE> data_original;
+			data_original.resize(test_size);
+			
+			for (std::size_t i = 0; i < test_size; ++i)
+				data_original[i] = Eidos_rng_uniform_int(rng, test_size);
+			
+			std::vector<SORT_TYPE> data_stdsort;
+			
+			std::cout << "time (std::sort): ";
+			time_sum = 0.0;
+			for (int rep = 0; rep < reps; ++rep)
+			{
+				data_stdsort = data_original;
+				eidos_profile_t begin = Eidos_BenchmarkTime();
+				
+				std::sort(data_stdsort.begin(), data_stdsort.end());
+				
+				eidos_profile_t end = Eidos_BenchmarkTime();
+				double time_spent = Eidos_ElapsedProfileTime(end - begin);
+				std::cout << time_spent << " ";
+				time_sum += time_spent;
+			}
+			std::cout << " : mean " << time_sum / reps << std::endl;
+			
+#ifdef _OPENMP
+			std::cout << "time (PQUICK): ";
+			time_sum = 0.0;
+			for (int rep = 0; rep < reps; ++rep)
+			{
+				std::vector<SORT_TYPE> data_PQUICK = data_original;
+				eidos_profile_t begin = Eidos_BenchmarkTime();
+				
+				Eidos_ParallelQuicksort_I(data_PQUICK.data(), data_PQUICK.size());
+				
+				eidos_profile_t end = Eidos_BenchmarkTime();
+				double time_spent = Eidos_ElapsedProfileTime(end - begin);
+				bool correct = (data_PQUICK == data_stdsort);
+				std::cout << time_spent << " " << (!correct ? "(INCORRECT) " : "");
+				time_sum += time_spent;
+			}
+			std::cout << " : mean " << time_sum / reps << std::endl;
+			
+			std::cout << "time (PMERGE): ";
+			time_sum = 0.0;
+			for (int rep = 0; rep < reps; ++rep)
+			{
+				std::vector<SORT_TYPE> data_PMERGE = data_original;
+				eidos_profile_t begin = Eidos_BenchmarkTime();
+				
+				Eidos_ParallelMergesort_I(data_PMERGE.data(), data_PMERGE.size());
+				
+				eidos_profile_t end = Eidos_BenchmarkTime();
+				double time_spent = Eidos_ElapsedProfileTime(end - begin);
+				bool correct = (data_PMERGE == data_stdsort);
+				std::cout << time_spent << " " << (!correct ? "(INCORRECT) " : "");
+				time_sum += time_spent;
+			}
+			std::cout << " : mean " << time_sum / reps << std::endl;
+			
+			std::cout << "time (PSRIRAM): ";
+			time_sum = 0.0;
+			for (int rep = 0; rep < reps; ++rep)
+			{
+				std::vector<SORT_TYPE> data_PSRIRAM = data_original;
+				eidos_profile_t begin = Eidos_BenchmarkTime();
+				
+				Sriram_parallel_omp_sort(data_PSRIRAM, comparator);
+				
+				eidos_profile_t end = Eidos_BenchmarkTime();
+				double time_spent = Eidos_ElapsedProfileTime(end - begin);
+				bool correct = (data_PSRIRAM == data_stdsort);
+				std::cout << time_spent << " " << (!correct ? "(INCORRECT) " : "");
+				time_sum += time_spent;
+			}
+			std::cout << " : mean " << time_sum / reps << std::endl;
+#endif
+		}
+		
+		std::cout << std::endl << std::endl;
+	}
+#endif
+	
 	// If we ran tests, the random number seed has been set; let's set it back to a good seed value
 	Eidos_SetRNGSeed(Eidos_GenerateRNGSeed());
 	
diff --git a/eidos/eidos_value.cpp b/eidos/eidos_value.cpp
index 3e1aaeffc..b570d1076 100644
--- a/eidos/eidos_value.cpp
+++ b/eidos/eidos_value.cpp
@@ -19,6 +19,7 @@
 
 #include "eidos_value.h"
 #include "eidos_functions.h"
+#include "eidos_sorting.h"
 #include "eidos_interpreter.h"
 #include "eidos_call_signature.h"
 #include "eidos_property_signature.h"