diff --git a/SLiM.xcodeproj/project.pbxproj b/SLiM.xcodeproj/project.pbxproj index 8cd0debc..126592ff 100644 --- a/SLiM.xcodeproj/project.pbxproj +++ b/SLiM.xcodeproj/project.pbxproj @@ -350,14 +350,14 @@ 986D73E81B07E89E007FBB70 /* eidos_call_signature.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 986D73E61B07E89E007FBB70 /* eidos_call_signature.cpp */; }; 986D73E91B07E89E007FBB70 /* eidos_call_signature.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 986D73E61B07E89E007FBB70 /* eidos_call_signature.cpp */; }; 986D73EA1B07E89E007FBB70 /* eidos_call_signature.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 986D73E61B07E89E007FBB70 /* eidos_call_signature.cpp */; }; - 98729ACF2A87AAB700E81662 /* eidos_sorting.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 98729ACE2A87AAB700E81662 /* eidos_sorting.cpp */; }; - 98729AD02A87AAB700E81662 /* eidos_sorting.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 98729ACE2A87AAB700E81662 /* eidos_sorting.cpp */; }; - 98729AD12A87AAB700E81662 /* eidos_sorting.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 98729ACE2A87AAB700E81662 /* eidos_sorting.cpp */; }; - 98729AD22A87AAB700E81662 /* eidos_sorting.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 98729ACE2A87AAB700E81662 /* eidos_sorting.cpp */; }; - 98729AD32A87AAB700E81662 /* eidos_sorting.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 98729ACE2A87AAB700E81662 /* eidos_sorting.cpp */; }; - 98729AD42A87AAB700E81662 /* eidos_sorting.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 98729ACE2A87AAB700E81662 /* eidos_sorting.cpp */; }; - 98729AD52A87AAB700E81662 /* eidos_sorting.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 98729ACE2A87AAB700E81662 /* eidos_sorting.cpp */; }; - 98729AD62A87AAB700E81662 /* eidos_sorting.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 98729ACE2A87AAB700E81662 /* eidos_sorting.cpp */; }; + 98729AD82A87DFBE00E81662 /* eidos_sorting.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 98729AD72A87DFBE00E81662 /* 
eidos_sorting.cpp */; }; + 98729AD92A87DFBE00E81662 /* eidos_sorting.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 98729AD72A87DFBE00E81662 /* eidos_sorting.cpp */; }; + 98729ADA2A87DFBE00E81662 /* eidos_sorting.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 98729AD72A87DFBE00E81662 /* eidos_sorting.cpp */; }; + 98729ADB2A87DFBE00E81662 /* eidos_sorting.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 98729AD72A87DFBE00E81662 /* eidos_sorting.cpp */; }; + 98729ADC2A87DFBE00E81662 /* eidos_sorting.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 98729AD72A87DFBE00E81662 /* eidos_sorting.cpp */; }; + 98729ADD2A87DFBE00E81662 /* eidos_sorting.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 98729AD72A87DFBE00E81662 /* eidos_sorting.cpp */; }; + 98729ADE2A87DFBE00E81662 /* eidos_sorting.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 98729AD72A87DFBE00E81662 /* eidos_sorting.cpp */; }; + 98729ADF2A87DFBE00E81662 /* eidos_sorting.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 98729AD72A87DFBE00E81662 /* eidos_sorting.cpp */; }; 98760EDD28CE5E7600CEBC40 /* libomp.dylib in Frameworks */ = {isa = PBXBuildFile; fileRef = 98760EDC28CE5E7600CEBC40 /* libomp.dylib */; }; 98760EDE28CE5E8200CEBC40 /* libomp.dylib in Frameworks */ = {isa = PBXBuildFile; fileRef = 98760EDC28CE5E7600CEBC40 /* libomp.dylib */; }; 9876E5F51ED5572C00FF9762 /* tdist.c in Sources */ = {isa = PBXBuildFile; fileRef = 9876E5F41ED5572C00FF9762 /* tdist.c */; }; @@ -1627,7 +1627,8 @@ 986D73E61B07E89E007FBB70 /* eidos_call_signature.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = eidos_call_signature.cpp; sourceTree = ""; }; 986D73E71B07E89E007FBB70 /* eidos_call_signature.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = eidos_call_signature.h; sourceTree = ""; }; 98729ACD2A87A93500E81662 /* eidos_sorting.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = 
eidos_sorting.h; sourceTree = ""; }; - 98729ACE2A87AAB700E81662 /* eidos_sorting.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = eidos_sorting.cpp; sourceTree = ""; }; + 98729ACE2A87AAB700E81662 /* eidos_sorting.inc */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.pascal; path = eidos_sorting.inc; sourceTree = ""; }; + 98729AD72A87DFBE00E81662 /* eidos_sorting.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = eidos_sorting.cpp; sourceTree = ""; }; 9873B12328CE26CD00582D83 /* eidos_openmp.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = eidos_openmp.h; sourceTree = ""; }; 98760EDC28CE5E7600CEBC40 /* libomp.dylib */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.dylib"; name = libomp.dylib; path = /usr/local/lib/libomp.dylib; sourceTree = ""; }; 9876E5F31ED5572C00FF9762 /* gsl_cdf.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = gsl_cdf.h; path = cdf/gsl_cdf.h; sourceTree = ""; }; @@ -2020,7 +2021,8 @@ 985A11861BBA07CB009EE1FF /* eidos_object_pool.h */, 9809F8BD24F32B3E00D312E4 /* eidos_tinycolormap.h */, 98729ACD2A87A93500E81662 /* eidos_sorting.h */, - 98729ACE2A87AAB700E81662 /* eidos_sorting.cpp */, + 98729AD72A87DFBE00E81662 /* eidos_sorting.cpp */, + 98729ACE2A87AAB700E81662 /* eidos_sorting.inc */, 9893C7DC1CDA2D650029AC94 /* eidos_beep.h */, 9893C7DB1CDA2D650029AC94 /* eidos_beep.cpp */, 98B8AF8725A2BE7200C95D66 /* json_fwd.hpp */, @@ -3312,7 +3314,7 @@ 982556B81BA8EF8C0054CB3F /* eidos_property_signature.cpp in Sources */, 98C8217F1C7A983800548839 /* inline.c in Sources */, 985F3F0E24BA31D900E712E0 /* eidos_test_functions_statistics.cpp in Sources */, - 98729AD42A87AAB700E81662 /* eidos_sorting.cpp in Sources */, + 98729ADD2A87DFBE00E81662 /* eidos_sorting.cpp in Sources */, 98C821AC1C7A9B1600548839 /* discrete.c in Sources */, 9807663924493E0B00F6CBB4 /* gzwrite.c in Sources */, 
98C8218C1C7A983800548839 /* poisson.c in Sources */, @@ -3384,7 +3386,7 @@ 98332ADC1FDBC0D000274FF0 /* dgemv.c in Sources */, 98332B111FDBD09800274FF0 /* init.c in Sources */, 98C8212C1C7A980000548839 /* gamma.c in Sources */, - 98729ACF2A87AAB700E81662 /* eidos_sorting.cpp in Sources */, + 98729AD82A87DFBE00E81662 /* eidos_sorting.cpp in Sources */, 98C8212D1C7A980000548839 /* gauss.c in Sources */, 985F3F0124BA307200E712E0 /* eidos_test_operators_arithmetic.cpp in Sources */, 985F3F0B24BA31D900E712E0 /* eidos_test_functions_statistics.cpp in Sources */, @@ -3627,7 +3629,7 @@ 98332AEE1FDBC29500274FF0 /* rowcol.c in Sources */, 9890D1EF27136BB7001EAE98 /* eidos_class_DataFrame.cpp in Sources */, 987AD8871B2CC0C10035D6C8 /* EidosVariableBrowserController.mm in Sources */, - 98729AD22A87AAB700E81662 /* eidos_sorting.cpp in Sources */, + 98729ADB2A87DFBE00E81662 /* eidos_sorting.cpp in Sources */, 9807662B24493E0B00F6CBB4 /* crc32.c in Sources */, 98C821661C7A983800548839 /* beta.c in Sources */, 985F3EFE24BA2F1500E712E0 /* eidos_test_functions_vector.cpp in Sources */, @@ -3706,7 +3708,7 @@ 98CF5215294A3FC900557BBA /* binomial_tpe.c in Sources */, 98CF5216294A3FC900557BBA /* tables.c in Sources */, 98CF5217294A3FC900557BBA /* species_eidos.cpp in Sources */, - 98729AD12A87AAB700E81662 /* eidos_sorting.cpp in Sources */, + 98729ADA2A87DFBE00E81662 /* eidos_sorting.cpp in Sources */, 98CF5218294A3FC900557BBA /* slim_functions.cpp in Sources */, 98CF5219294A3FC900557BBA /* deflate.c in Sources */, 98CF521A294A3FC900557BBA /* FindRecipeController.mm in Sources */, @@ -3934,7 +3936,7 @@ 98CF53E8294A714200557BBA /* rowcol.c in Sources */, 98CF53E9294A714200557BBA /* eidos_class_DataFrame.cpp in Sources */, 98CF53EA294A714200557BBA /* EidosVariableBrowserController.mm in Sources */, - 98729AD32A87AAB700E81662 /* eidos_sorting.cpp in Sources */, + 98729ADC2A87DFBE00E81662 /* eidos_sorting.cpp in Sources */, 98CF53EB294A714200557BBA /* crc32.c in Sources */, 
98CF53EC294A714200557BBA /* beta.c in Sources */, 98CF53ED294A714200557BBA /* eidos_test_functions_vector.cpp in Sources */, @@ -4013,7 +4015,7 @@ 98C821491C7A983700548839 /* binomial_tpe.c in Sources */, 9850D8E9206309A0006BFD2E /* tables.c in Sources */, 98AC617B24BA34ED0001914C /* species_eidos.cpp in Sources */, - 98729AD02A87AAB700E81662 /* eidos_sorting.cpp in Sources */, + 98729AD92A87DFBE00E81662 /* eidos_sorting.cpp in Sources */, 98DDAED7221765480038C133 /* slim_functions.cpp in Sources */, 98076618244934A800F6CBB4 /* deflate.c in Sources */, 984252C3216FA9930019696A /* FindRecipeController.mm in Sources */, @@ -4210,7 +4212,7 @@ 98D7EBB328CE557C00DEAAC4 /* poisson.c in Sources */, 98D7EBB428CE557C00DEAAC4 /* eidos_type_table.cpp in Sources */, 98D7EBB528CE557C00DEAAC4 /* exp.c in Sources */, - 98729AD52A87AAB700E81662 /* eidos_sorting.cpp in Sources */, + 98729ADE2A87DFBE00E81662 /* eidos_sorting.cpp in Sources */, 98D7EBB628CE557C00DEAAC4 /* eidos_test_functions_other.cpp in Sources */, 98D7EBB728CE557C00DEAAC4 /* eidos_script.cpp in Sources */, 981DC37B28E27300000ABE91 /* eidos_functions_distributions.cpp in Sources */, @@ -4282,7 +4284,7 @@ 98D7ECA428CE58FC00DEAAC4 /* dgemv.c in Sources */, 98D7ECA528CE58FC00DEAAC4 /* init.c in Sources */, 98D7ECA628CE58FC00DEAAC4 /* gamma.c in Sources */, - 98729AD62A87AAB700E81662 /* eidos_sorting.cpp in Sources */, + 98729ADF2A87DFBE00E81662 /* eidos_sorting.cpp in Sources */, 98D7ECA728CE58FC00DEAAC4 /* gauss.c in Sources */, 98D7ECA828CE58FC00DEAAC4 /* eidos_test_operators_arithmetic.cpp in Sources */, 981DC38528E27301000ABE91 /* eidos_functions_files.cpp in Sources */, diff --git a/eidos/eidos_functions_other.cpp b/eidos/eidos_functions_other.cpp index 7d896d84..e5187ad9 100644 --- a/eidos/eidos_functions_other.cpp +++ b/eidos/eidos_functions_other.cpp @@ -798,6 +798,10 @@ EidosValue_SP Eidos_ExecuteFunction_parallelGetTaskThreadCounts(__attribute__((u objectElement->SetKeyValue_StringKeys("RUNIF_2", 
EidosValue_SP(new (gEidosValuePool->AllocateChunk()) EidosValue_Int_singleton(gEidos_OMP_threads_RUNIF_2))); objectElement->SetKeyValue_StringKeys("RUNIF_3", EidosValue_SP(new (gEidosValuePool->AllocateChunk()) EidosValue_Int_singleton(gEidos_OMP_threads_RUNIF_3))); + objectElement->SetKeyValue_StringKeys("SORT_INT", EidosValue_SP(new (gEidosValuePool->AllocateChunk()) EidosValue_Int_singleton(gEidos_OMP_threads_SORT_INT))); + objectElement->SetKeyValue_StringKeys("SORT_FLOAT", EidosValue_SP(new (gEidosValuePool->AllocateChunk()) EidosValue_Int_singleton(gEidos_OMP_threads_SORT_FLOAT))); + objectElement->SetKeyValue_StringKeys("SORT_STRING", EidosValue_SP(new (gEidosValuePool->AllocateChunk()) EidosValue_Int_singleton(gEidos_OMP_threads_SORT_STRING))); + objectElement->SetKeyValue_StringKeys("POINT_IN_BOUNDS_1D", EidosValue_SP(new (gEidosValuePool->AllocateChunk()) EidosValue_Int_singleton(gEidos_OMP_threads_POINT_IN_BOUNDS_1D))); objectElement->SetKeyValue_StringKeys("POINT_IN_BOUNDS_2D", EidosValue_SP(new (gEidosValuePool->AllocateChunk()) EidosValue_Int_singleton(gEidos_OMP_threads_POINT_IN_BOUNDS_2D))); objectElement->SetKeyValue_StringKeys("POINT_IN_BOUNDS_3D", EidosValue_SP(new (gEidosValuePool->AllocateChunk()) EidosValue_Int_singleton(gEidos_OMP_threads_POINT_IN_BOUNDS_3D))); @@ -999,6 +1003,10 @@ EidosValue_SP Eidos_ExecuteFunction_parallelSetTaskThreadCounts(__attribute__((u else if (key == "RUNIF_2") gEidos_OMP_threads_RUNIF_2 = (int)value_int64; else if (key == "RUNIF_3") gEidos_OMP_threads_RUNIF_3 = (int)value_int64; + else if (key == "SORT_INT") gEidos_OMP_threads_SORT_INT = (int)value_int64; + else if (key == "SORT_FLOAT") gEidos_OMP_threads_SORT_FLOAT = (int)value_int64; + else if (key == "SORT_STRING") gEidos_OMP_threads_SORT_STRING = (int)value_int64; + else if (key == "POINT_IN_BOUNDS_1D") gEidos_OMP_threads_POINT_IN_BOUNDS_1D = (int)value_int64; else if (key == "POINT_IN_BOUNDS_2D") gEidos_OMP_threads_POINT_IN_BOUNDS_2D = (int)value_int64; else 
if (key == "POINT_IN_BOUNDS_3D") gEidos_OMP_threads_POINT_IN_BOUNDS_3D = (int)value_int64; diff --git a/eidos/eidos_globals.cpp b/eidos/eidos_globals.cpp index 15250982..d65917c3 100644 --- a/eidos/eidos_globals.cpp +++ b/eidos/eidos_globals.cpp @@ -302,6 +302,10 @@ int gEidos_OMP_threads_RUNIF_1 = EIDOS_OMP_MAX_THREADS; int gEidos_OMP_threads_RUNIF_2 = EIDOS_OMP_MAX_THREADS; int gEidos_OMP_threads_RUNIF_3 = EIDOS_OMP_MAX_THREADS; +int gEidos_OMP_threads_SORT_INT = EIDOS_OMP_MAX_THREADS; +int gEidos_OMP_threads_SORT_FLOAT = EIDOS_OMP_MAX_THREADS; +int gEidos_OMP_threads_SORT_STRING = EIDOS_OMP_MAX_THREADS; + int gEidos_OMP_threads_POINT_IN_BOUNDS_1D = EIDOS_OMP_MAX_THREADS; int gEidos_OMP_threads_POINT_IN_BOUNDS_2D = EIDOS_OMP_MAX_THREADS; int gEidos_OMP_threads_POINT_IN_BOUNDS_3D = EIDOS_OMP_MAX_THREADS; @@ -451,6 +455,10 @@ void _Eidos_SetOpenMPThreadCounts(EidosPerTaskThreadCounts per_task_thread_count gEidos_OMP_threads_RUNIF_2 = EIDOS_OMP_MAX_THREADS; gEidos_OMP_threads_RUNIF_3 = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_SORT_INT = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_SORT_FLOAT = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_SORT_STRING = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_POINT_IN_BOUNDS_1D = EIDOS_OMP_MAX_THREADS; gEidos_OMP_threads_POINT_IN_BOUNDS_2D = EIDOS_OMP_MAX_THREADS; gEidos_OMP_threads_POINT_IN_BOUNDS_3D = EIDOS_OMP_MAX_THREADS; @@ -581,6 +589,10 @@ void _Eidos_SetOpenMPThreadCounts(EidosPerTaskThreadCounts per_task_thread_count gEidos_OMP_threads_RUNIF_2 = 16; gEidos_OMP_threads_RUNIF_3 = 16; + gEidos_OMP_threads_SORT_INT = 16; // subject to revision + gEidos_OMP_threads_SORT_FLOAT = 16; // subject to revision + gEidos_OMP_threads_SORT_STRING = 16; // subject to revision + gEidos_OMP_threads_POINT_IN_BOUNDS_1D = 12; gEidos_OMP_threads_POINT_IN_BOUNDS_2D = 12; gEidos_OMP_threads_POINT_IN_BOUNDS_3D = 16; @@ -713,6 +725,10 @@ void _Eidos_SetOpenMPThreadCounts(EidosPerTaskThreadCounts per_task_thread_count gEidos_OMP_threads_RUNIF_2 = 
40; gEidos_OMP_threads_RUNIF_3 = 40; + gEidos_OMP_threads_SORT_INT = 40; // subject to revision + gEidos_OMP_threads_SORT_FLOAT = 40; // subject to revision + gEidos_OMP_threads_SORT_STRING = 40; // subject to revision + gEidos_OMP_threads_POINT_IN_BOUNDS_1D = 40; gEidos_OMP_threads_POINT_IN_BOUNDS_2D = 40; gEidos_OMP_threads_POINT_IN_BOUNDS_3D = 40; @@ -870,6 +886,10 @@ void _Eidos_ClipOpenMPThreadCounts(void) gEidos_OMP_threads_RUNIF_2 = std::min(gEidosMaxThreads, gEidos_OMP_threads_RUNIF_2); gEidos_OMP_threads_RUNIF_3 = std::min(gEidosMaxThreads, gEidos_OMP_threads_RUNIF_3); + gEidos_OMP_threads_SORT_INT = std::min(gEidosMaxThreads, gEidos_OMP_threads_SORT_INT); + gEidos_OMP_threads_SORT_FLOAT = std::min(gEidosMaxThreads, gEidos_OMP_threads_SORT_FLOAT); + gEidos_OMP_threads_SORT_STRING = std::min(gEidosMaxThreads, gEidos_OMP_threads_SORT_STRING); + gEidos_OMP_threads_POINT_IN_BOUNDS_1D = std::min(gEidosMaxThreads, gEidos_OMP_threads_POINT_IN_BOUNDS_1D); gEidos_OMP_threads_POINT_IN_BOUNDS_2D = std::min(gEidosMaxThreads, gEidos_OMP_threads_POINT_IN_BOUNDS_2D); gEidos_OMP_threads_POINT_IN_BOUNDS_3D = std::min(gEidosMaxThreads, gEidos_OMP_threads_POINT_IN_BOUNDS_3D); diff --git a/eidos/eidos_openmp.h b/eidos/eidos_openmp.h index 1c751190..1ce4093c 100644 --- a/eidos/eidos_openmp.h +++ b/eidos/eidos_openmp.h @@ -322,6 +322,11 @@ class EidosDebugLock #define EIDOS_OMPMIN_RUNIF_2 10000 #define EIDOS_OMPMIN_RUNIF_3 10000 +// Sorting & ordering +#define EIDOS_OMPMIN_SORT_INT 4000 +#define EIDOS_OMPMIN_SORT_FLOAT 4000 +#define EIDOS_OMPMIN_SORT_STRING 4000 + // Spatial point/map manipulation #define EIDOS_OMPMIN_POINT_IN_BOUNDS_1D 2000 #define EIDOS_OMPMIN_POINT_IN_BOUNDS_2D 2000 @@ -452,6 +457,11 @@ class EidosDebugLock #define EIDOS_OMPMIN_RUNIF_2 0 #define EIDOS_OMPMIN_RUNIF_3 0 +// Sorting & ordering +#define EIDOS_OMPMIN_SORT_INT 0 +#define EIDOS_OMPMIN_SORT_FLOAT 0 +#define EIDOS_OMPMIN_SORT_STRING 0 + // Spatial point/map manipulation #define 
EIDOS_OMPMIN_POINT_IN_BOUNDS_1D 0 #define EIDOS_OMPMIN_POINT_IN_BOUNDS_2D 0 @@ -583,6 +593,11 @@ extern int gEidos_OMP_threads_RUNIF_1; extern int gEidos_OMP_threads_RUNIF_2; extern int gEidos_OMP_threads_RUNIF_3; +// Sorting & ordering +extern int gEidos_OMP_threads_SORT_INT; +extern int gEidos_OMP_threads_SORT_FLOAT; +extern int gEidos_OMP_threads_SORT_STRING; + // Spatial point/map manipulation; benchmark section P extern int gEidos_OMP_threads_POINT_IN_BOUNDS_1D; extern int gEidos_OMP_threads_POINT_IN_BOUNDS_2D; diff --git a/eidos/eidos_sorting.cpp b/eidos/eidos_sorting.cpp index 4bd6c8bd..96a18d91 100644 --- a/eidos/eidos_sorting.cpp +++ b/eidos/eidos_sorting.cpp @@ -17,159 +17,31 @@ // // You should have received a copy of the GNU General Public License along with Eidos. If not, see . +#include +#include +#include +#include + #include "eidos_sorting.h" -#include "eidos_openmp.h" - - -// Here are some early explorations into parallelizing sorting. The speedups -// here are not particularly impressive. Parallel sorting is a very deep and -// complex rabbit hole to go down; see, e.g., Victor Duvanenko's work at -// https://github.com/DragonSpit/ParallelAlgorithms. But those algorithms -// use TBB instead of OpenMP, and require C++17, so they're not easily usable. -// Wikipedia at https://en.wikipedia.org/wiki/Merge_sort#Parallel_merge_sort -// also has some very interesting commentary about parallelization of sorting. -// The code here is also very primitive - integer only, no templates, no -// client-suppliable comparator, etc. – so it would be hard to integrate into -// all the ways Eidos presently uses std::sort. Improving this looks like a -// good project for somebody in CS. 
- -// This parallel quicksort code is thanks to Ruud van der Pas, modified from -// https://www.openmp.org/wp-content/uploads/sc16-openmp-booth-tasking-ruud.pdf -#ifdef _OPENMP -static void _Eidos_ParallelQuicksort_I(int64_t *values, int64_t lo, int64_t hi) -{ - if (lo >= hi) - return; - - if (hi - lo + 1 <= 1000) { - // fall over to using std::sort when below a threshold interval size - // the larger the threshold, the less time we spend thrashing tasks on small - // intervals, which is good; but it also sets a limit on how many threads we - // we bring to bear on relatively small sorts, which is bad; 1000 seems fine - std::sort(values + lo, values + hi + 1); - } else { - // choose the middle of three pivots, in an attempt to avoid really bad pivots - int64_t pivot1 = *(values + lo); - int64_t pivot2 = *(values + hi); - int64_t pivot3 = *(values + ((lo + hi) >> 1)); - int64_t pivot; - - if (pivot1 > pivot2) - { - if (pivot2 > pivot3) pivot = pivot2; - else if (pivot1 > pivot3) pivot = pivot3; - else pivot = pivot1; - } - else - { - if (pivot1 > pivot3) pivot = pivot1; - else if (pivot2 > pivot3) pivot = pivot3; - else pivot = pivot2; - } - - // note that std::partition is not guaranteed to leave the pivot value in position - // we do a second partition to exclude all duplicate pivot values, which seems to be one standard strategy - // this works particularly well when duplicate values are very common; it helps avoid O(n^2) performance - // note the partition is not parallelized; that is apparently a difficult problem for parallel quicksort - int64_t *middle1 = std::partition(values + lo, values + hi + 1, [pivot](const int64_t& em) { return em < pivot; }); - int64_t *middle2 = std::partition(middle1, values + hi + 1, [pivot](const int64_t& em) { return !(pivot < em); }); - int64_t mid1 = middle1 - values; - int64_t mid2 = middle2 - values; - #pragma omp task default(none) firstprivate(values, lo, mid1) - { _Eidos_ParallelQuicksort_I(values, lo, mid1 - 1); } // Left 
branch - #pragma omp task default(none) firstprivate(values, hi, mid2) - { _Eidos_ParallelQuicksort_I(values, mid2, hi); } // Right branch - } -} -#endif - -void Eidos_ParallelQuicksort_I(int64_t *values, int64_t nelements) -{ -#ifdef _OPENMP - if (nelements > 1000) { - #pragma omp parallel default(none) shared(values, nelements) - { - #pragma omp single nowait - { - _Eidos_ParallelQuicksort_I(values, 0, nelements - 1); - } - } // End of parallel region - } - else - { - // Use std::sort for small vectors - std::sort(values, values + nelements); - } -#else - // Use std::sort when not running parallel - std::sort(values, values + nelements); -#endif -} - - -// This parallel mergesort code is thanks to Libor Bukata and Jan Dvořák, modified from -// https://cw.fel.cvut.cz/old/_media/courses/b4m35pag/lab6_slides_advanced_openmp.pdf -#ifdef _OPENMP -static void _Eidos_ParallelMergesort_I(int64_t *values, int64_t left, int64_t right) -{ - if (left >= right) - return; - - if (right - left <= 1000) - { - // fall over to using std::sort when below a threshold interval size - // the larger the threshold, the less time we spend thrashing tasks on small - // intervals, which is good; but it also sets a limit on how many threads we - // we bring to bear on relatively small sorts, which is bad; 1000 seems fine - std::sort(values + left, values + right + 1); - } - else - { - int64_t mid = (left + right) / 2; - #pragma omp taskgroup - { - // the original code had if() limits on task subdivision here, but that - // doesn't make sense to me, because we also have the threshold above, - // which serves the same purpose but avoids using std::sort on subdivided - // regions and then merging them with inplace_merge; if we assume that - // std::sort is faster than mergesort when running on one thread, then - // merging subdivided std::sort calls only seems like a good strategy - // when the std::sort calls happen on separate threads - #pragma omp task default(none) firstprivate(values, 
left, mid) untied - _Eidos_ParallelMergesort_I(values, left, mid); - #pragma omp task default(none) firstprivate(values, mid, right) untied - _Eidos_ParallelMergesort_I(values, mid + 1, right); - #pragma omp taskyield - } - std::inplace_merge(values + left, values + mid + 1, values + right + 1); - } -} -#endif - -void Eidos_ParallelMergesort_I(int64_t *values, int64_t nelements) -{ -#ifdef _OPENMP - if (nelements > 1000) { - #pragma omp parallel default(none) shared(values, nelements) - { - #pragma omp single - { - _Eidos_ParallelMergesort_I(values, 0, nelements - 1); - } - } // End of parallel region - } - else - { - // Use std::sort for small vectors - std::sort(values, values + nelements); - } -#else - // Use std::sort when not running parallel - std::sort(values, values + nelements); -#endif -} +// Generate the code for our sorting variants + +#define SORTTYPE int64_t +#define EIDOS_SORT_CUTOFF EIDOS_OMPMIN_SORT_INT +#define EIDOS_SORT_THREADS gEidos_OMP_threads_SORT_INT +#include "eidos_sorting.inc" + +#define SORTTYPE double +#define EIDOS_SORT_CUTOFF EIDOS_OMPMIN_SORT_FLOAT +#define EIDOS_SORT_THREADS gEidos_OMP_threads_SORT_FLOAT +#include "eidos_sorting.inc" + +#define SORTTYPE std::string +#define EIDOS_SORT_CUTOFF EIDOS_OMPMIN_SORT_STRING +#define EIDOS_SORT_THREADS gEidos_OMP_threads_SORT_STRING +#include "eidos_sorting.inc" + diff --git a/eidos/eidos_sorting.h b/eidos/eidos_sorting.h index 9fe8c8b6..d49a19f7 100644 --- a/eidos/eidos_sorting.h +++ b/eidos/eidos_sorting.h @@ -33,14 +33,120 @@ #include #include #include +#include // // Parallel sorting; these use std::sort when we are not running parallel, or for small jobs // -void Eidos_ParallelQuicksort_I(int64_t *values, int64_t nelements); -void Eidos_ParallelMergesort_I(int64_t *values, int64_t nelements); +// Here are some early explorations into parallelizing sorting. The speedups +// here are not particularly impressive. 
Parallel sorting is a very deep and +// complex rabbit hole to go down; see, e.g., Victor Duvanenko's work at +// https://github.com/DragonSpit/ParallelAlgorithms. But those algorithms +// use TBB instead of OpenMP, and require C++17, so they're not easily usable. +// Wikipedia at https://en.wikipedia.org/wiki/Merge_sort#Parallel_merge_sort +// also has some very interesting commentary about parallelization of sorting. +// The code here is also very primitive - integer only, no templates, no +// client-suppliable comparator, etc. – so it would be hard to integrate into +// all the ways Eidos presently uses std::sort. Improving this looks like a +// good project for somebody in CS. + +void Eidos_ParallelSort(int64_t *values, int64_t nelements, bool p_ascending); +void Eidos_ParallelSort(float *values, int64_t nelements, bool p_ascending); +void Eidos_ParallelSort(double *values, int64_t nelements, bool p_ascending); +void Eidos_ParallelSort(std::string *values, int64_t nelements, bool p_ascending); + +// This constant basically determines how much a sorting task will get subdivided +// before the small chunks get turned over to std::sort() to single-threaded sort. +// The optimum (probably hardware-dependent) was determined by trial and error. +#define EIDOS_FALLTHROUGH_FACTOR 10 + +// This template-based version should be equivalent to the above include-based +// version, but unfortunately it runs a little slower; I can't figure out why. 
+#ifdef _OPENMP +template +static void _Eidos_ParallelQuicksort_Comparator(T *values, int64_t lo, int64_t hi, const Compare &comparator, int64_t fallthrough) +{ + if (lo >= hi) + return; + + if (hi - lo + 1 <= fallthrough) { + // fall through to sorting with std::sort() below our threshold size + std::sort(values + lo, values + hi + 1, comparator); + } else { + // choose the middle of three pivots, in an attempt to avoid really bad pivots + T &pivot1 = *(values + lo); + T &pivot2 = *(values + hi); + T &pivot3 = *(values + ((lo + hi) >> 1)); + T pivot; + + if (pivot1 > pivot2) + { + if (pivot2 > pivot3) pivot = pivot2; + else if (pivot1 > pivot3) pivot = pivot3; + else pivot = pivot1; + } + else + { + if (pivot1 > pivot3) pivot = pivot1; + else if (pivot2 > pivot3) pivot = pivot3; + else pivot = pivot2; + } + + // note that std::partition is not guaranteed to leave the pivot value in position + // we do a second partition to exclude all duplicate pivot values, which seems to be one standard strategy + // this works particularly well when duplicate values are very common; it helps avoid O(n^2) performance + // note the partition is not parallelized; that is apparently a difficult problem for parallel quicksort + T *middle1 = std::partition(values + lo, values + hi + 1, [&, pivot, comparator](const T& em) { return comparator(em, pivot); }); + T *middle2 = std::partition(middle1, values + hi + 1, [&, pivot, comparator](const T& em) { return !comparator(pivot, em); }); + int64_t mid1 = middle1 - values; + int64_t mid2 = middle2 - values; + #pragma omp task default(none) firstprivate(values, lo, mid1) shared(comparator, fallthrough) + { _Eidos_ParallelQuicksort_Comparator(values, lo, mid1 - 1, comparator, fallthrough); } // Left branch + #pragma omp task default(none) firstprivate(values, hi, mid2) shared(comparator, fallthrough) + { _Eidos_ParallelQuicksort_Comparator(values, mid2, hi, comparator, fallthrough); } // Right branch + } +} +#endif + +template +void 
Eidos_ParallelSort_Comparator(T *values, int64_t nelements, const Compare &comparator) +{ + // This wrapper just ensures that we use std::sort for small vectors, + // or when not parallel, andhandles ascending vs. descending +#ifdef _OPENMP + if (nelements >= EIDOS_OMPMIN_SORT_INT) { // we use the integer cutoff here; it's the common case + EIDOS_THREAD_COUNT(gEidos_OMP_threads_SORT_INT); // we use the integer cutoff here; it's the common case +#pragma omp parallel default(none) shared(values, nelements, comparator) num_threads(thread_count) + { + // We fall through to using std::sort when below a threshold interval size. + // The larger the threshold, the less time we spend thrashing tasks on small + // intervals, which is good; but it also sets a limit on how many threads we + // we bring to bear on relatively small sorts, which is bad. We try to + // calculate the optimal fall-through heuristically here; basically we want + // to subdivide with tasks enough that the workload is shared well, and then + // do the rest of the work with std::sort(). The more threads there are, + // the smaller we want to subdivide. + int64_t fallthrough = nelements / (EIDOS_FALLTHROUGH_FACTOR * omp_get_num_threads()); + + if (fallthrough < 1000) + fallthrough = 1000; + +#pragma omp single nowait + { + _Eidos_ParallelQuicksort_Comparator(values, 0, nelements - 1, comparator, fallthrough); + } + } // End of parallel region + } + else + { + std::sort(values, values + nelements, comparator); + } +#else + std::sort(values, values + nelements, comparator); +#endif +} // diff --git a/eidos/eidos_sorting.inc b/eidos/eidos_sorting.inc new file mode 100644 index 00000000..ece12eb5 --- /dev/null +++ b/eidos/eidos_sorting.inc @@ -0,0 +1,213 @@ +// +// eidos_sorting.inc +// SLiM +// +// Created by Ben Haller on 8/12/23. +// Copyright (c) 2023 Philipp Messer. All rights reserved. +// A product of the Messer Lab, http://messerlab.org/slim/ +// + +// This file is part of Eidos. 
+// +// Eidos is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or (at your option) any later version. +// +// Eidos is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along with Eidos. If not, see . + + +// This file is intended to be #included to generate code. This would be cleaner if we could use +// C++ templates, but that seems to kill performance, in my tests; the compiler does not seem to +// be smart enough, or (very possibly) I'm doing it wrong. Anyhow, define SORTTYPE and then include. + +#ifndef SORTTYPE +#error SORTTYPE must be defined before including this file +#endif + + +// This parallel quicksort code is thanks to Ruud van der Pas, modified from +// https://www.openmp.org/wp-content/uploads/sc16-openmp-booth-tasking-ruud.pdf +#ifdef _OPENMP +static void _Eidos_ParallelQuicksort_ASCENDING(SORTTYPE *values, int64_t lo, int64_t hi, int64_t fallthrough) +{ + if (lo >= hi) + return; + + if (hi - lo + 1 <= fallthrough) { + // fall through to sorting with std::sort() below our threshold size + std::sort(values + lo, values + hi + 1); + } else { + // choose the middle of three pivots, in an attempt to avoid really bad pivots + SORTTYPE &pivot1 = *(values + lo); + SORTTYPE &pivot2 = *(values + hi); + SORTTYPE &pivot3 = *(values + ((lo + hi) >> 1)); + SORTTYPE pivot; + + if (pivot1 > pivot2) + { + if (pivot2 > pivot3) pivot = pivot2; + else if (pivot1 > pivot3) pivot = pivot3; + else pivot = pivot1; + } + else + { + if (pivot1 > pivot3) pivot = pivot1; + else if (pivot2 > pivot3) pivot = pivot3; + else pivot = pivot2; + } + + // note that std::partition is not guaranteed to 
leave the pivot value in position + // we do a second partition to exclude all duplicate pivot values, which seems to be one standard strategy + // this works particularly well when duplicate values are very common; it helps avoid O(n^2) performance + // note the partition is not parallelized; that is apparently a difficult problem for parallel quicksort + SORTTYPE *middle1 = std::partition(values + lo, values + hi + 1, [pivot](const SORTTYPE& em) { return em < pivot; }); + SORTTYPE *middle2 = std::partition(middle1, values + hi + 1, [pivot](const SORTTYPE& em) { return !(pivot < em); }); + int64_t mid1 = middle1 - values; + int64_t mid2 = middle2 - values; + #pragma omp task default(none) firstprivate(values, lo, mid1, fallthrough) + { _Eidos_ParallelQuicksort_ASCENDING(values, lo, mid1 - 1, fallthrough); } // Left branch + #pragma omp task default(none) firstprivate(values, hi, mid2, fallthrough) + { _Eidos_ParallelQuicksort_ASCENDING(values, mid2, hi, fallthrough); } // Right branch + } +} + +static void _Eidos_ParallelQuicksort_DESCENDING(SORTTYPE *values, int64_t lo, int64_t hi, int64_t fallthrough) +{ + if (lo >= hi) + return; + + if (hi - lo + 1 <= fallthrough) { + // fall through to sorting with std::sort() below our threshold size + // note that descending sorts are slower because this STL usage pattern is slower! 
+ std::sort(values + lo, values + hi + 1, std::greater<SORTTYPE>()); + } else { + // choose the middle of three pivots, in an attempt to avoid really bad pivots + SORTTYPE &pivot1 = *(values + lo); + SORTTYPE &pivot2 = *(values + hi); + SORTTYPE &pivot3 = *(values + ((lo + hi) >> 1)); + SORTTYPE pivot; + + if (pivot1 > pivot2) + { + if (pivot2 > pivot3) pivot = pivot2; + else if (pivot1 > pivot3) pivot = pivot3; + else pivot = pivot1; + } + else + { + if (pivot1 > pivot3) pivot = pivot1; + else if (pivot2 > pivot3) pivot = pivot3; + else pivot = pivot2; + } + + // note that std::partition is not guaranteed to leave the pivot value in position + // we do a second partition to exclude all duplicate pivot values, which seems to be one standard strategy + // this works particularly well when duplicate values are very common; it helps avoid O(n^2) performance + // note the partition is not parallelized; that is apparently a difficult problem for parallel quicksort + SORTTYPE *middle1 = std::partition(values + lo, values + hi + 1, [pivot](const SORTTYPE& em) { return em > pivot; }); + SORTTYPE *middle2 = std::partition(middle1, values + hi + 1, [pivot](const SORTTYPE& em) { return !(pivot > em); }); + int64_t mid1 = middle1 - values; + int64_t mid2 = middle2 - values; + #pragma omp task default(none) firstprivate(values, lo, mid1, fallthrough) + { _Eidos_ParallelQuicksort_DESCENDING(values, lo, mid1 - 1, fallthrough); } // Left branch + #pragma omp task default(none) firstprivate(values, hi, mid2, fallthrough) + { _Eidos_ParallelQuicksort_DESCENDING(values, mid2, hi, fallthrough); } // Right branch + } +} +#endif + +void Eidos_ParallelSort(SORTTYPE *values, int64_t nelements, bool p_ascending) +{ + // This wrapper just ensures that we use std::sort for small vectors, + // or when not parallel, and handles ascending vs. 
descending +#ifdef _OPENMP + if (nelements >= EIDOS_SORT_CUTOFF) { + EIDOS_THREAD_COUNT(EIDOS_SORT_THREADS); +#pragma omp parallel default(none) shared(values, nelements, p_ascending) num_threads(thread_count) + { + // We fall through to using std::sort when below a threshold interval size. + // The larger the threshold, the less time we spend thrashing tasks on small + // intervals, which is good; but it also sets a limit on how many threads + // we bring to bear on relatively small sorts, which is bad. We try to + // calculate the optimal fall-through heuristically here; basically we want + // to subdivide with tasks enough that the workload is shared well, and then + // do the rest of the work with std::sort(). The more threads there are, + // the smaller we want to subdivide. + int64_t fallthrough = nelements / (EIDOS_FALLTHROUGH_FACTOR * omp_get_num_threads()); + + if (fallthrough < 1000) + fallthrough = 1000; + + if (p_ascending) + { +#pragma omp single nowait + { + _Eidos_ParallelQuicksort_ASCENDING(values, 0, nelements - 1, fallthrough); + } + } + else + { +#pragma omp single nowait + { + _Eidos_ParallelQuicksort_DESCENDING(values, 0, nelements - 1, fallthrough); + } + } + } // End of parallel region + } + else + { + if (p_ascending) + std::sort(values, values + nelements); + else + std::sort(values, values + nelements, std::greater<SORTTYPE>()); + } +#else + if (p_ascending) + std::sort(values, values + nelements); + else + std::sort(values, values + nelements, std::greater<SORTTYPE>()); +#endif +} + +// Leave the includer with SORTTYPE undefined, to avoid accidents +#undef SORTTYPE +#undef EIDOS_SORT_CUTOFF +#undef EIDOS_SORT_THREADS + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/eidos/eidos_test.cpp b/eidos/eidos_test.cpp index 7da26628..29005f4f 100644 --- a/eidos/eidos_test.cpp +++ b/eidos/eidos_test.cpp @@ -1277,18 +1277,21 @@ int RunEidosTests(void) } #endif -#if 1 +#if 0 // Speed and correctness tests of various parallel sorting 
algorithms { std::cout << std::endl << "SORTING TESTS:" << std::endl; gsl_rng *rng = EIDOS_GSL_RNG(omp_get_thread_num()); // the single-threaded RNG - typedef int64_t SORT_TYPE; + typedef std::string SORT_TYPE; { - auto comparator = [](SORT_TYPE a, SORT_TYPE b) { - return a < b; - }; + // use the appropriate comparator for the sort type, in the code below + // note that Eidos_ParallelSort_Comparator() and Sriram_parallel_omp_sort() require a comparator, + // whereas std::sort() defaults to ascending (op <) by default, and Eidos_ParallelSort() doesn't take one. +//auto comparator_scalar = [](SORT_TYPE a, SORT_TYPE b) { return a < b; }; + auto comparator_string = [](const std::string &a, const std::string &b) { return a < b; }; + //auto comparator_double = [](const double& a, const double& b) { return std::isnan(b) || (a < b); }; const std::size_t test_size = 10000000; const int reps = 5; double time_sum; data_original.resize(test_size); for (std::size_t i = 0; i < test_size; ++i) - data_original[i] = Eidos_rng_uniform_int(rng, test_size); + { + //data_original[i] = Eidos_rng_uniform_int(rng, test_size); + data_original[i] = std::to_string(Eidos_rng_uniform_int(rng, test_size)); + } std::vector<SORT_TYPE> data_stdsort; @@ -1324,7 +1330,7 @@ std::vector<SORT_TYPE> data_PQUICK = data_original; eidos_profile_t begin = Eidos_BenchmarkTime(); - Eidos_ParallelQuicksort_I(data_PQUICK.data(), data_PQUICK.size()); + Eidos_ParallelSort(data_PQUICK.data(), data_PQUICK.size(), true); eidos_profile_t end = Eidos_BenchmarkTime(); double time_spent = Eidos_ElapsedProfileTime(end - begin); @@ -1334,18 +1340,18 @@ } std::cout << " : mean " << time_sum / reps << std::endl; - std::cout << "time (PMERGE): "; + std::cout << "time (PQUICKCOMP): "; time_sum = 0.0; for (int rep = 0; rep < reps; ++rep) { - std::vector<SORT_TYPE> data_PMERGE = data_original; + std::vector<SORT_TYPE> data_PQUICKCOMP = data_original; eidos_profile_t begin = 
Eidos_BenchmarkTime(); - Eidos_ParallelMergesort_I(data_PMERGE.data(), data_PMERGE.size()); + Eidos_ParallelSort_Comparator(data_PQUICKCOMP.data(), data_PQUICKCOMP.size(), comparator_string); eidos_profile_t end = Eidos_BenchmarkTime(); double time_spent = Eidos_ElapsedProfileTime(end - begin); - bool correct = (data_PMERGE == data_stdsort); + bool correct = (data_PQUICKCOMP == data_stdsort); std::cout << time_spent << " " << (!correct ? "(INCORRECT) " : ""); time_sum += time_spent; } @@ -1358,7 +1364,7 @@ std::vector<SORT_TYPE> data_PSRIRAM = data_original; eidos_profile_t begin = Eidos_BenchmarkTime(); - Sriram_parallel_omp_sort(data_PSRIRAM, comparator); + Sriram_parallel_omp_sort(data_PSRIRAM, comparator_string); eidos_profile_t end = Eidos_BenchmarkTime(); double time_spent = Eidos_ElapsedProfileTime(end - begin); diff --git a/eidos/eidos_test_parallel.h b/eidos/eidos_test_parallel.h index e1fbd5ee..c32fb35f 100644 --- a/eidos/eidos_test_parallel.h +++ b/eidos/eidos_test_parallel.h @@ -435,6 +435,18 @@ if (!identical(y1, yN) | !identical(z1, zN)) stop('parallel sort(float x) failed // *********************************************************************************************** +// (string)sort(string x) + +x = asString(runif(1000000, -100, 100)); +yN = sort(x); +zN = sort(x, ascending=F); +parallelSetNumThreads(1); +y1 = sort(x); +z1 = sort(x, ascending=F); +if (!identical(y1, yN) | !identical(z1, zN)) stop('parallel sort(string x) failed test'); + +// *********************************************************************************************** + // (float)dnorm(float x, numeric$ mean, numeric$ sd) // EIDOS_OMPMIN_DNORM_1 x = runif(1000000, -100, 100); @@ -627,8 +639,8 @@ if (abs(sd(aN)-sd(a1)) > 0.02) stop('parallel rpois() failed sd test'); aN = runif(1000000, 0, 1); parallelSetNumThreads(1); a1 = runif(1000000, 0, 1); -if (abs(mean(aN)-mean(a1)) > 0.0015) stop('parallel rpois() failed mean test'); -if (abs(sd(aN)-sd(a1)) > 0.0006) 
stop('parallel rpois() failed sd test'); +if (abs(mean(aN)-mean(a1)) > 0.0015) stop('parallel runif() failed mean test'); +if (abs(sd(aN)-sd(a1)) > 0.0006) stop('parallel runif() failed sd test'); // *********************************************************************************************** @@ -637,8 +649,8 @@ if (abs(sd(aN)-sd(a1)) > 0.0006) stop('parallel rpois() failed sd test'); aN = runif(1000000, 3, 17); parallelSetNumThreads(1); a1 = runif(1000000, 3, 17); -if (abs(mean(aN)-mean(a1)) > 0.02) stop('parallel rpois() failed mean test'); -if (abs(sd(aN)-sd(a1)) > 0.01) stop('parallel rpois() failed sd test'); +if (abs(mean(aN)-mean(a1)) > 0.02) stop('parallel runif() failed mean test'); +if (abs(sd(aN)-sd(a1)) > 0.01) stop('parallel runif() failed sd test'); // *********************************************************************************************** @@ -649,8 +661,8 @@ max = rep(17.0, 1000000); aN = runif(1000000, 3, 17); parallelSetNumThreads(1); a1 = runif(1000000, 3, 17); -if (abs(mean(aN)-mean(a1)) > 0.02) stop('parallel rpois() failed mean test'); -if (abs(sd(aN)-sd(a1)) > 0.01) stop('parallel rpois() failed sd test'); +if (abs(mean(aN)-mean(a1)) > 0.02) stop('parallel runif() failed mean test'); +if (abs(sd(aN)-sd(a1)) > 0.01) stop('parallel runif() failed sd test'); )V0G0N" diff --git a/eidos/eidos_value.cpp b/eidos/eidos_value.cpp index b570d107..45541ed8 100644 --- a/eidos/eidos_value.cpp +++ b/eidos/eidos_value.cpp @@ -1661,19 +1661,8 @@ void EidosValue_Int_vector::PushValueFromIndexOfEidosValue(int p_idx, const Eido void EidosValue_Int_vector::Sort(bool p_ascending) { - if (p_ascending) - { - // For the ascending int64_t case specifically, we now have a parallel quicksort - // algorithm that gives a little speedup. This is kind of experimental, but has - // been tested and seems to be correct. 
I wrote a parallel mergesort algorithm - too; its performance is kind of comparable but quicksort gives a bit more - speed with a small number of threads (2-10 threads), so I chose it for now. - This is the only place that sorting has been parallelized so far. Note that - this function automatically falls back to std::sort when single-threaded. - Eidos_ParallelQuicksort_I(values_, count_); - } - else - std::sort(values_, values_ + count_, std::greater<int64_t>()); + // This will sort in parallel if the task is large enough (and we're running parallel) + Eidos_ParallelSort(values_, count_, p_ascending); } EidosValue_Int_vector *EidosValue_Int_vector::reserve(size_t p_reserved_size) @@ -1959,10 +1948,11 @@ void EidosValue_Float_vector::PushValueFromIndexOfEidosValue(int p_idx, const Ei void EidosValue_Float_vector::Sort(bool p_ascending) { + // Unfortunately a custom comparator is needed to make the sort order with NANs match that of R if (p_ascending) - std::sort(values_, values_ + count_, [](const double& a, const double& b) { return std::isnan(b) || (a < b); }); + Eidos_ParallelSort_Comparator(values_, count_, [](const double& a, const double& b) { return std::isnan(b) || (a < b); }); else - std::sort(values_, values_ + count_, [](const double& a, const double& b) { return std::isnan(b) || (a > b); }); + Eidos_ParallelSort_Comparator(values_, count_, [](const double& a, const double& b) { return std::isnan(b) || (a > b); }); } EidosValue_Float_vector *EidosValue_Float_vector::reserve(size_t p_reserved_size)