From b31025487d27324ac75e17636ad9ed2eaa6ad2b9 Mon Sep 17 00:00:00 2001 From: Ben Haller Date: Sat, 12 Aug 2023 10:28:22 -0400 Subject: [PATCH] parallelizing sorting, pass 1 --- EidosScribe/EidosTextView.mm | 1 + SLiM.xcodeproj/project.pbxproj | 20 +++ VERSIONS | 1 + core/genome.cpp | 1 + eidos/eidos_functions_stats.cpp | 1 + eidos/eidos_functions_values.cpp | 1 + eidos/eidos_globals.cpp | 147 ---------------------- eidos/eidos_globals.h | 71 ----------- eidos/eidos_openmp.h | 3 + eidos/eidos_sorting.cpp | 208 +++++++++++++++++++++++++++++++ eidos/eidos_sorting.h | 148 ++++++++++++++++++++++ eidos/eidos_test.cpp | 98 +++++++++++++++ eidos/eidos_value.cpp | 1 + 13 files changed, 483 insertions(+), 218 deletions(-) create mode 100644 eidos/eidos_sorting.cpp create mode 100644 eidos/eidos_sorting.h diff --git a/EidosScribe/EidosTextView.mm b/EidosScribe/EidosTextView.mm index 5c15a3352..6b10c37b6 100644 --- a/EidosScribe/EidosTextView.mm +++ b/EidosScribe/EidosTextView.mm @@ -29,6 +29,7 @@ #include "eidos_property_signature.h" #include "eidos_type_table.h" #include "eidos_type_interpreter.h" +#include "eidos_sorting.h" #include #include diff --git a/SLiM.xcodeproj/project.pbxproj b/SLiM.xcodeproj/project.pbxproj index 7153e806e..8cd0debce 100644 --- a/SLiM.xcodeproj/project.pbxproj +++ b/SLiM.xcodeproj/project.pbxproj @@ -350,6 +350,14 @@ 986D73E81B07E89E007FBB70 /* eidos_call_signature.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 986D73E61B07E89E007FBB70 /* eidos_call_signature.cpp */; }; 986D73E91B07E89E007FBB70 /* eidos_call_signature.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 986D73E61B07E89E007FBB70 /* eidos_call_signature.cpp */; }; 986D73EA1B07E89E007FBB70 /* eidos_call_signature.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 986D73E61B07E89E007FBB70 /* eidos_call_signature.cpp */; }; + 98729ACF2A87AAB700E81662 /* eidos_sorting.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 98729ACE2A87AAB700E81662 /* eidos_sorting.cpp */; }; + 98729AD02A87AAB700E81662 /* eidos_sorting.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 98729ACE2A87AAB700E81662 /* eidos_sorting.cpp */; }; + 98729AD12A87AAB700E81662 /* eidos_sorting.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 98729ACE2A87AAB700E81662 /* eidos_sorting.cpp */; }; + 98729AD22A87AAB700E81662 /* eidos_sorting.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 98729ACE2A87AAB700E81662 /* eidos_sorting.cpp */; }; + 98729AD32A87AAB700E81662 /* eidos_sorting.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 98729ACE2A87AAB700E81662 /* eidos_sorting.cpp */; }; + 98729AD42A87AAB700E81662 /* eidos_sorting.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 98729ACE2A87AAB700E81662 /* eidos_sorting.cpp */; }; + 98729AD52A87AAB700E81662 /* eidos_sorting.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 98729ACE2A87AAB700E81662 /* eidos_sorting.cpp */; }; + 98729AD62A87AAB700E81662 /* eidos_sorting.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 98729ACE2A87AAB700E81662 /* eidos_sorting.cpp */; }; 98760EDD28CE5E7600CEBC40 /* libomp.dylib in Frameworks */ = {isa = PBXBuildFile; fileRef = 98760EDC28CE5E7600CEBC40 /* libomp.dylib */; }; 98760EDE28CE5E8200CEBC40 /* libomp.dylib in Frameworks */ = {isa = PBXBuildFile; fileRef = 98760EDC28CE5E7600CEBC40 /* libomp.dylib */; }; 9876E5F51ED5572C00FF9762 /* tdist.c in Sources */ = {isa = PBXBuildFile; fileRef = 9876E5F41ED5572C00FF9762 /* tdist.c */; }; @@ -1618,6 +1626,8 @@ 986926EA1AA6B7480000E138 /* GraphView_PopulationVisualization.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = GraphView_PopulationVisualization.mm; sourceTree = ""; }; 986D73E61B07E89E007FBB70 /* eidos_call_signature.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = eidos_call_signature.cpp; sourceTree = ""; }; 986D73E71B07E89E007FBB70 /* eidos_call_signature.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = eidos_call_signature.h; sourceTree = ""; }; + 98729ACD2A87A93500E81662 /* eidos_sorting.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = eidos_sorting.h; sourceTree = ""; }; + 98729ACE2A87AAB700E81662 /* eidos_sorting.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = eidos_sorting.cpp; sourceTree = ""; }; 9873B12328CE26CD00582D83 /* eidos_openmp.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = eidos_openmp.h; sourceTree = ""; }; 98760EDC28CE5E7600CEBC40 /* libomp.dylib */ = {isa = PBXFileReference; lastKnownFileType = "compiled.mach-o.dylib"; name = libomp.dylib; path = /usr/local/lib/libomp.dylib; sourceTree = ""; }; 9876E5F31ED5572C00FF9762 /* gsl_cdf.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = gsl_cdf.h; path = cdf/gsl_cdf.h; sourceTree = ""; }; @@ -2009,6 +2019,8 @@ 9857E33A1BB58DAE00F1C8A9 /* eidos_intrusive_ptr.h */, 985A11861BBA07CB009EE1FF /* eidos_object_pool.h */, 9809F8BD24F32B3E00D312E4 /* eidos_tinycolormap.h */, + 98729ACD2A87A93500E81662 /* eidos_sorting.h */, + 98729ACE2A87AAB700E81662 /* eidos_sorting.cpp */, 9893C7DC1CDA2D650029AC94 /* eidos_beep.h */, 9893C7DB1CDA2D650029AC94 /* eidos_beep.cpp */, 98B8AF8725A2BE7200C95D66 /* json_fwd.hpp */, @@ -3300,6 +3312,7 @@ 982556B81BA8EF8C0054CB3F /* eidos_property_signature.cpp in Sources */, 98C8217F1C7A983800548839 /* inline.c in Sources */, 985F3F0E24BA31D900E712E0 /* eidos_test_functions_statistics.cpp in Sources */, + 98729AD42A87AAB700E81662 /* eidos_sorting.cpp in Sources */, 98C821AC1C7A9B1600548839 /* discrete.c in Sources */, 9807663924493E0B00F6CBB4 /* gzwrite.c in Sources */, 98C8218C1C7A983800548839 /* poisson.c in Sources */, @@ -3371,6 +3384,7 @@ 98332ADC1FDBC0D000274FF0 /* dgemv.c in Sources */, 98332B111FDBD09800274FF0 /* init.c in Sources */, 98C8212C1C7A980000548839 /* gamma.c in Sources */, + 98729ACF2A87AAB700E81662 /* eidos_sorting.cpp in Sources */, 98C8212D1C7A980000548839 /* gauss.c in Sources */, 985F3F0124BA307200E712E0 /* eidos_test_operators_arithmetic.cpp in Sources */, 985F3F0B24BA31D900E712E0 /* eidos_test_functions_statistics.cpp in Sources */, @@ -3613,6 +3627,7 @@ 98332AEE1FDBC29500274FF0 /* rowcol.c in Sources */, 9890D1EF27136BB7001EAE98 /* eidos_class_DataFrame.cpp in Sources */, 987AD8871B2CC0C10035D6C8 /* EidosVariableBrowserController.mm in Sources */, + 98729AD22A87AAB700E81662 /* eidos_sorting.cpp in Sources */, 9807662B24493E0B00F6CBB4 /* crc32.c in Sources */, 98C821661C7A983800548839 /* beta.c in Sources */, 985F3EFE24BA2F1500E712E0 /* eidos_test_functions_vector.cpp in Sources */, @@ -3691,6 +3706,7 @@ 98CF5215294A3FC900557BBA /* binomial_tpe.c in Sources */, 98CF5216294A3FC900557BBA /* tables.c in Sources */, 98CF5217294A3FC900557BBA /* species_eidos.cpp in Sources */, + 98729AD12A87AAB700E81662 /* eidos_sorting.cpp in Sources */, 98CF5218294A3FC900557BBA /* slim_functions.cpp in Sources */, 98CF5219294A3FC900557BBA /* deflate.c in Sources */, 98CF521A294A3FC900557BBA /* FindRecipeController.mm in Sources */, @@ -3918,6 +3934,7 @@ 98CF53E8294A714200557BBA /* rowcol.c in Sources */, 98CF53E9294A714200557BBA /* eidos_class_DataFrame.cpp in Sources */, 98CF53EA294A714200557BBA /* EidosVariableBrowserController.mm in Sources */, + 98729AD32A87AAB700E81662 /* eidos_sorting.cpp in Sources */, 98CF53EB294A714200557BBA /* crc32.c in Sources */, 98CF53EC294A714200557BBA /* beta.c in Sources */, 98CF53ED294A714200557BBA /* eidos_test_functions_vector.cpp in Sources */, @@ -3996,6 +4013,7 @@ 98C821491C7A983700548839 /* binomial_tpe.c in Sources */, 9850D8E9206309A0006BFD2E /* tables.c in Sources */, 98AC617B24BA34ED0001914C /* species_eidos.cpp in Sources */, + 98729AD02A87AAB700E81662 /* eidos_sorting.cpp in Sources */, 98DDAED7221765480038C133 /* slim_functions.cpp in Sources */, 98076618244934A800F6CBB4 /* deflate.c in Sources */, 984252C3216FA9930019696A /* FindRecipeController.mm in Sources */, @@ -4192,6 +4210,7 @@ 98D7EBB328CE557C00DEAAC4 /* poisson.c in Sources */, 98D7EBB428CE557C00DEAAC4 /* eidos_type_table.cpp in Sources */, 98D7EBB528CE557C00DEAAC4 /* exp.c in Sources */, + 98729AD52A87AAB700E81662 /* eidos_sorting.cpp in Sources */, 98D7EBB628CE557C00DEAAC4 /* eidos_test_functions_other.cpp in Sources */, 98D7EBB728CE557C00DEAAC4 /* eidos_script.cpp in Sources */, 981DC37B28E27300000ABE91 /* eidos_functions_distributions.cpp in Sources */, @@ -4263,6 +4282,7 @@ 98D7ECA428CE58FC00DEAAC4 /* dgemv.c in Sources */, 98D7ECA528CE58FC00DEAAC4 /* init.c in Sources */, 98D7ECA628CE58FC00DEAAC4 /* gamma.c in Sources */, + 98729AD62A87AAB700E81662 /* eidos_sorting.cpp in Sources */, 98D7ECA728CE58FC00DEAAC4 /* gauss.c in Sources */, 98D7ECA828CE58FC00DEAAC4 /* eidos_test_operators_arithmetic.cpp in Sources */, 981DC38528E27301000ABE91 /* eidos_functions_files.cpp in Sources */, diff --git a/VERSIONS b/VERSIONS index 2040fa601..853e46b45 100644 --- a/VERSIONS +++ b/VERSIONS @@ -85,6 +85,7 @@ PARALLEL changes (now in the master branch): add recipe 22.7, Parallelizing nonWF reproduction and spatial interactions add internal EidosBenchmark facility for timing of internal loops, for benchmarking of parallelization scaling add a command-line option, -perTaskThreads, to let the user control how many threads to use for different tasks (if not overridden) + parallelize sorting, where possible/efficient development head (in the master branch): fix a minor bug with autofix when opening multiple .slim documents at once in SLiMgui diff --git a/core/genome.cpp b/core/genome.cpp index c5b92d304..5e34e5bee 100644 --- a/core/genome.cpp +++ b/core/genome.cpp @@ -26,6 +26,7 @@ #include "species.h" #include "polymorphism.h" #include "subpopulation.h" +#include "eidos_sorting.h" #include #include diff --git a/eidos/eidos_functions_stats.cpp b/eidos/eidos_functions_stats.cpp index b90b6fbe4..5341a3692 100644 --- a/eidos/eidos_functions_stats.cpp +++ b/eidos/eidos_functions_stats.cpp @@ -19,6 +19,7 @@ #include "eidos_functions.h" +#include "eidos_sorting.h" #include #include diff --git a/eidos/eidos_functions_values.cpp b/eidos/eidos_functions_values.cpp index 218a607e4..1be3f53b1 100644 --- a/eidos/eidos_functions_values.cpp +++ b/eidos/eidos_functions_values.cpp @@ -21,6 +21,7 @@ #include "eidos_functions.h" #include "eidos_interpreter.h" #include "eidos_rng.h" +#include "eidos_sorting.h" #include #include diff --git a/eidos/eidos_globals.cpp b/eidos/eidos_globals.cpp index 9afadeec5..15250982d 100644 --- a/eidos/eidos_globals.cpp +++ b/eidos/eidos_globals.cpp @@ -3331,153 +3331,6 @@ bool Eidos_RegexWorks(void) return regex_works; } -// Here are some early explorations into parallelizing sorting. The speedups -// here are not particularly impressive. Parallel sorting is a very deep and -// complex rabbit hole to go down; see, e.g., Victor Duvanenko's work at -// https://github.com/DragonSpit/ParallelAlgorithms. But those algorithms -// use TBB instead of OpenMP, and require C++17, so they're not easily usable. -// Wikipedia at https://en.wikipedia.org/wiki/Merge_sort#Parallel_merge_sort -// also has some very interesting commentary about parallelization of sorting. -// The code here is also very primitive - integer only, no templates, no -// client-suppliable comparator, etc. – so it would be hard to integrate into -// all the ways Eidos presently uses std::sort. Improving this looks like a -// good project for somebody in CS. - -// This parallel quicksort code is thanks to Ruud van der Pas, modified from -// https://www.openmp.org/wp-content/uploads/sc16-openmp-booth-tasking-ruud.pdf -#ifdef _OPENMP -static void _Eidos_ParallelQuicksort_I(int64_t *values, int64_t lo, int64_t hi) -{ - if (lo >= hi) - return; - - if (hi - lo + 1 <= 1000) { - // fall over to using std::sort when below a threshold interval size - // the larger the threshold, the less time we spend thrashing tasks on small - // intervals, which is good; but it also sets a limit on how many threads we - // we bring to bear on relatively small sorts, which is bad; 1000 seems fine - std::sort(values + lo, values + hi + 1); - } else { - // choose the middle of three pivots, in an attempt to avoid really bad pivots - int64_t pivot1 = *(values + lo); - int64_t pivot2 = *(values + hi); - int64_t pivot3 = *(values + ((lo + hi) >> 1)); - int64_t pivot; - - if (pivot1 > pivot2) - { - if (pivot2 > pivot3) pivot = pivot2; - else if (pivot1 > pivot3) pivot = pivot3; - else pivot = pivot1; - } - else - { - if (pivot1 > pivot3) pivot = pivot1; - else if (pivot2 > pivot3) pivot = pivot3; - else pivot = pivot2; - } - - // note that std::partition is not guaranteed to leave the pivot value in position - // we do a second partition to exclude all duplicate pivot values, which seems to be one standard strategy - // this works particularly well when duplicate values are very common; it helps avoid O(n^2) performance - // note the partition is not parallelized; that is apparently a difficult problem for parallel quicksort - int64_t *middle1 = std::partition(values + lo, values + hi + 1, [pivot](const int64_t& em) { return em < pivot; }); - int64_t *middle2 = std::partition(middle1, values + hi + 1, [pivot](const int64_t& em) { return !(pivot < em); }); - int64_t mid1 = middle1 - values; - int64_t mid2 = middle2 - values; - #pragma omp task default(none) firstprivate(values, lo, mid1) - { _Eidos_ParallelQuicksort_I(values, lo, mid1 - 1); } // Left branch - #pragma omp task default(none) firstprivate(values, hi, mid2) - { _Eidos_ParallelQuicksort_I(values, mid2, hi); } // Right branch - } -} -#endif - -void Eidos_ParallelQuicksort_I(int64_t *values, int64_t nelements) -{ -#ifdef _OPENMP - if (nelements > 1000) { - #pragma omp parallel default(none) shared(values, nelements) - { - #pragma omp single nowait - { - _Eidos_ParallelQuicksort_I(values, 0, nelements - 1); - } - } // End of parallel region - } - else - { - // Use std::sort for small vectors - std::sort(values, values + nelements); - } -#else - // Use std::sort when not running parallel - std::sort(values, values + nelements); -#endif -} - -// This parallel mergesort code is thanks to Libor Bukata and Jan Dvořák, modified from -// https://cw.fel.cvut.cz/old/_media/courses/b4m35pag/lab6_slides_advanced_openmp.pdf -#ifdef _OPENMP -static void _Eidos_ParallelMergesort_I(int64_t *values, int64_t left, int64_t right) -{ - if (left >= right) - return; - - if (right - left <= 1000) - { - // fall over to using std::sort when below a threshold interval size - // the larger the threshold, the less time we spend thrashing tasks on small - // intervals, which is good; but it also sets a limit on how many threads we - // we bring to bear on relatively small sorts, which is bad; 1000 seems fine - std::sort(values + left, values + right + 1); - } - else - { - int64_t mid = (left + right) / 2; - #pragma omp taskgroup - { - // the original code had if() limits on task subdivision here, but that - // doesn't make sense to me, because we also have the threshold above, - // which serves the same purpose but avoids using std::sort on subdivided - // regions and then merging them with inplace_merge; if we assume that - // std::sort is faster than mergesort when running on one thread, then - // merging subdivided std::sort calls only seems like a good strategy - // when the std::sort calls happen on separate threads - #pragma omp task default(none) firstprivate(values, left, mid) untied - _Eidos_ParallelMergesort_I(values, left, mid); - #pragma omp task default(none) firstprivate(values, mid, right) untied - _Eidos_ParallelMergesort_I(values, mid + 1, right); - #pragma omp taskyield - } - std::inplace_merge(values + left, values + mid + 1, values + right + 1); - } -} -#endif - -void Eidos_ParallelMergesort_I(int64_t *values, int64_t nelements) -{ -#ifdef _OPENMP - if (nelements > 1000) { - #pragma omp parallel default(none) shared(values, nelements) - { - #pragma omp single - { - _Eidos_ParallelMergesort_I(values, 0, nelements - 1); - } - } // End of parallel region - } - else - { - // Use std::sort for small vectors - std::sort(values, values + nelements); - } -#else - // Use std::sort when not running parallel - std::sort(values, values + nelements); -#endif -} - // ******************************************************************************************************************* // diff --git a/eidos/eidos_globals.h b/eidos/eidos_globals.h index 8f12923bb..15f8e8119 100644 --- a/eidos/eidos_globals.h +++ b/eidos/eidos_globals.h @@ -590,73 +590,6 @@ std::string Eidos_string_escaped_CSV(const std::string &unescapedString); // BCH 13 December 2017: no longer used, commenting this out //std::string Eidos_Exec(const char *p_cmd); -// Get indexes that would result in sorted ordering of a vector. This rather nice code is adapted from http://stackoverflow.com/a/12399290/2752221 -template -std::vector EidosSortIndexes(const std::vector &p_v, bool p_ascending = true) -{ - // initialize original index locations - std::vector idx(p_v.size()); - std::iota(idx.begin(), idx.end(), 0); - - // sort indexes based on comparing values in v - if (p_ascending) - std::sort(idx.begin(), idx.end(), [&p_v](int64_t i1, int64_t i2) {return p_v[i1] < p_v[i2];}); - else - std::sort(idx.begin(), idx.end(), [&p_v](int64_t i1, int64_t i2) {return p_v[i1] > p_v[i2];}); - - return idx; -} - -template <> -inline std::vector EidosSortIndexes(const std::vector &p_v, bool p_ascending) -{ - // initialize original index locations - std::vector idx(p_v.size()); - std::iota(idx.begin(), idx.end(), 0); - - // sort indexes based on comparing values in v - // this specialization for type double sorts NaNs to the end - if (p_ascending) - std::sort(idx.begin(), idx.end(), [&p_v](int64_t i1, int64_t i2) {return std::isnan(p_v[i2]) || (p_v[i1] < p_v[i2]);}); - else - std::sort(idx.begin(), idx.end(), [&p_v](int64_t i1, int64_t i2) {return std::isnan(p_v[i2]) || (p_v[i1] > p_v[i2]);}); - - return idx; -} - -template -std::vector EidosSortIndexes(const T *p_v, size_t p_size, bool p_ascending = true) -{ - // initialize original index locations - std::vector idx(p_size); - std::iota(idx.begin(), idx.end(), 0); - - // sort indexes based on comparing values in v - if (p_ascending) - std::sort(idx.begin(), idx.end(), [p_v](int64_t i1, int64_t i2) {return p_v[i1] < p_v[i2];}); - else - std::sort(idx.begin(), idx.end(), [p_v](int64_t i1, int64_t i2) {return p_v[i1] > p_v[i2];}); - - return idx; -} - -template <> -inline std::vector EidosSortIndexes(const double *p_v, size_t p_size, bool p_ascending) -{ - // initialize original index locations - std::vector idx(p_size); - std::iota(idx.begin(), idx.end(), 0); - - // sort indexes based on comparing values in v - // this specialization for type double sorts NaNs to the end - if (p_ascending) - std::sort(idx.begin(), idx.end(), [p_v](int64_t i1, int64_t i2) {return std::isnan(p_v[i2]) || (p_v[i1] < p_v[i2]);}); - else - std::sort(idx.begin(), idx.end(), [p_v](int64_t i1, int64_t i2) {return std::isnan(p_v[i2]) || (p_v[i1] > p_v[i2]);}); - - return idx; -} - extern int gEidosFloatOutputPrecision; // precision used for output of float values in Eidos; not user-visible at present std::string EidosStringForFloat(double p_value); @@ -689,10 +622,6 @@ BidiIter Eidos_random_unique(BidiIter begin, BidiIter end, size_t num_random) // The library does not work on Ubuntu 18.04, annoyingly; probably a very old compiler or something. So we have to check. bool Eidos_RegexWorks(void); -// Parallel sorting; these use std::sort when we are not running parallel, or for small jobs -void Eidos_ParallelQuicksort_I(int64_t *values, int64_t nelements); -void Eidos_ParallelMergesort_I(int64_t *values, int64_t nelements); - // ******************************************************************************************************************* // diff --git a/eidos/eidos_openmp.h b/eidos/eidos_openmp.h index 722565581..1c7511900 100644 --- a/eidos/eidos_openmp.h +++ b/eidos/eidos_openmp.h @@ -59,6 +59,9 @@ #include #include +#include +#include +#include // This is the largest number of threads we allow the user to set. There is no hard limit in the code; diff --git a/eidos/eidos_sorting.cpp b/eidos/eidos_sorting.cpp new file mode 100644 index 000000000..4bd6c8bd9 --- /dev/null +++ b/eidos/eidos_sorting.cpp @@ -0,0 +1,208 @@ +// +// eidos_sorting.cpp +// SLiM +// +// Created by Ben Haller on 8/12/23. +// Copyright (c) 2023 Philipp Messer. All rights reserved. +// A product of the Messer Lab, http://messerlab.org/slim/ +// + +// This file is part of Eidos. +// +// Eidos is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or (at your option) any later version. +// +// Eidos is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along with Eidos. If not, see . + +#include "eidos_sorting.h" +#include "eidos_openmp.h" + + +// Here are some early explorations into parallelizing sorting. The speedups +// here are not particularly impressive. Parallel sorting is a very deep and +// complex rabbit hole to go down; see, e.g., Victor Duvanenko's work at +// https://github.com/DragonSpit/ParallelAlgorithms. But those algorithms +// use TBB instead of OpenMP, and require C++17, so they're not easily usable. +// Wikipedia at https://en.wikipedia.org/wiki/Merge_sort#Parallel_merge_sort +// also has some very interesting commentary about parallelization of sorting. +// The code here is also very primitive - integer only, no templates, no +// client-suppliable comparator, etc. – so it would be hard to integrate into +// all the ways Eidos presently uses std::sort. Improving this looks like a +// good project for somebody in CS. + +// This parallel quicksort code is thanks to Ruud van der Pas, modified from +// https://www.openmp.org/wp-content/uploads/sc16-openmp-booth-tasking-ruud.pdf +#ifdef _OPENMP +static void _Eidos_ParallelQuicksort_I(int64_t *values, int64_t lo, int64_t hi) +{ + if (lo >= hi) + return; + + if (hi - lo + 1 <= 1000) { + // fall over to using std::sort when below a threshold interval size + // the larger the threshold, the less time we spend thrashing tasks on small + // intervals, which is good; but it also sets a limit on how many threads we + // we bring to bear on relatively small sorts, which is bad; 1000 seems fine + std::sort(values + lo, values + hi + 1); + } else { + // choose the middle of three pivots, in an attempt to avoid really bad pivots + int64_t pivot1 = *(values + lo); + int64_t pivot2 = *(values + hi); + int64_t pivot3 = *(values + ((lo + hi) >> 1)); + int64_t pivot; + + if (pivot1 > pivot2) + { + if (pivot2 > pivot3) pivot = pivot2; + else if (pivot1 > pivot3) pivot = pivot3; + else pivot = pivot1; + } + else + { + if (pivot1 > pivot3) pivot = pivot1; + else if (pivot2 > pivot3) pivot = pivot3; + else pivot = pivot2; + } + + // note that std::partition is not guaranteed to leave the pivot value in position + // we do a second partition to exclude all duplicate pivot values, which seems to be one standard strategy + // this works particularly well when duplicate values are very common; it helps avoid O(n^2) performance + // note the partition is not parallelized; that is apparently a difficult problem for parallel quicksort + int64_t *middle1 = std::partition(values + lo, values + hi + 1, [pivot](const int64_t& em) { return em < pivot; }); + int64_t *middle2 = std::partition(middle1, values + hi + 1, [pivot](const int64_t& em) { return !(pivot < em); }); + int64_t mid1 = middle1 - values; + int64_t mid2 = middle2 - values; + #pragma omp task default(none) firstprivate(values, lo, mid1) + { _Eidos_ParallelQuicksort_I(values, lo, mid1 - 1); } // Left branch + #pragma omp task default(none) firstprivate(values, hi, mid2) + { _Eidos_ParallelQuicksort_I(values, mid2, hi); } // Right branch + } +} +#endif + +void Eidos_ParallelQuicksort_I(int64_t *values, int64_t nelements) +{ +#ifdef _OPENMP + if (nelements > 1000) { + #pragma omp parallel default(none) shared(values, nelements) + { + #pragma omp single nowait + { + _Eidos_ParallelQuicksort_I(values, 0, nelements - 1); + } + } // End of parallel region + } + else + { + // Use std::sort for small vectors + std::sort(values, values + nelements); + } +#else + // Use std::sort when not running parallel + std::sort(values, values + nelements); +#endif +} + + +// This parallel mergesort code is thanks to Libor Bukata and Jan Dvořák, modified from +// https://cw.fel.cvut.cz/old/_media/courses/b4m35pag/lab6_slides_advanced_openmp.pdf +#ifdef _OPENMP +static void _Eidos_ParallelMergesort_I(int64_t *values, int64_t left, int64_t right) +{ + if (left >= right) + return; + + if (right - left <= 1000) + { + // fall over to using std::sort when below a threshold interval size + // the larger the threshold, the less time we spend thrashing tasks on small + // intervals, which is good; but it also sets a limit on how many threads we + // we bring to bear on relatively small sorts, which is bad; 1000 seems fine + std::sort(values + left, values + right + 1); + } + else + { + int64_t mid = (left + right) / 2; + #pragma omp taskgroup + { + // the original code had if() limits on task subdivision here, but that + // doesn't make sense to me, because we also have the threshold above, + // which serves the same purpose but avoids using std::sort on subdivided + // regions and then merging them with inplace_merge; if we assume that + // std::sort is faster than mergesort when running on one thread, then + // merging subdivided std::sort calls only seems like a good strategy + // when the std::sort calls happen on separate threads + #pragma omp task default(none) firstprivate(values, left, mid) untied + _Eidos_ParallelMergesort_I(values, left, mid); + #pragma omp task default(none) firstprivate(values, mid, right) untied + _Eidos_ParallelMergesort_I(values, mid + 1, right); + #pragma omp taskyield + } + std::inplace_merge(values + left, values + mid + 1, values + right + 1); + } +} +#endif + +void Eidos_ParallelMergesort_I(int64_t *values, int64_t nelements) +{ +#ifdef _OPENMP + if (nelements > 1000) { + #pragma omp parallel default(none) shared(values, nelements) + { + #pragma omp single + { + _Eidos_ParallelMergesort_I(values, 0, nelements - 1); + } + } // End of parallel region + } + else + { + // Use std::sort for small vectors + std::sort(values, values + nelements); + } +#else + // Use std::sort when not running parallel + std::sort(values, values + nelements); +#endif +} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/eidos/eidos_sorting.h b/eidos/eidos_sorting.h new file mode 100644 index 000000000..9fe8c8b6c --- /dev/null +++ b/eidos/eidos_sorting.h @@ -0,0 +1,148 @@ +// +// eidos_sorting.h +// Eidos +// +// Created by Ben Haller on 8/12/23. +// Copyright (c) 2023 Philipp Messer. All rights reserved. +// A product of the Messer Lab, http://messerlab.org/slim/ +// + +// This file is part of Eidos. +// +// Eidos is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or (at your option) any later version. +// +// Eidos is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along with Eidos. If not, see . + +/* + + This file contains an assortment of sorting algorithms used for various purposes. + + */ + + +#ifndef __Eidos__eidos_sorting_h +#define __Eidos__eidos_sorting_h + +#include "eidos_openmp.h" + +#include +#include +#include +#include + + +// +// Parallel sorting; these use std::sort when we are not running parallel, or for small jobs +// + +void Eidos_ParallelQuicksort_I(int64_t *values, int64_t nelements); +void Eidos_ParallelMergesort_I(int64_t *values, int64_t nelements); + + +// +// Sriram's parallel sort +// + +template +void Sriram_parallel_omp_sort(std::vector &data, Compare comparator) +{ + int num_threads = omp_get_max_threads(); + size_t size = data.size(); + size_t chunk_size = size / num_threads; // chunk size per thread + +#pragma omp parallel + { + int tid = omp_get_thread_num(); + // cout << omp_get_thread_num() << std::endl; + + size_t start = tid * chunk_size; + size_t end = (tid + 1) * chunk_size; + + if (tid == num_threads - 1) + end = size; + + // sort sub-array using comparator + std::sort(data.begin() + start, data.begin() + end, comparator); + } + + std::sort(data.begin(), data.end(), comparator); +} + + +// +// Get indexes that would result in sorted ordering of a vector. +// This rather nice code is adapted from http://stackoverflow.com/a/12399290/2752221 +// + +template +std::vector EidosSortIndexes(const std::vector &p_v, bool p_ascending = true) +{ + // initialize original index locations + std::vector idx(p_v.size()); + std::iota(idx.begin(), idx.end(), 0); + + // sort indexes based on comparing values in v + if (p_ascending) + std::sort(idx.begin(), idx.end(), [&p_v](int64_t i1, int64_t i2) {return p_v[i1] < p_v[i2];}); + else + std::sort(idx.begin(), idx.end(), [&p_v](int64_t i1, int64_t i2) {return p_v[i1] > p_v[i2];}); + + return idx; +} + +template <> +inline std::vector EidosSortIndexes(const std::vector &p_v, bool p_ascending) +{ + // initialize original index locations + std::vector idx(p_v.size()); + std::iota(idx.begin(), idx.end(), 0); + + // sort indexes based on comparing values in v + // this specialization for type double sorts NaNs to the end + if (p_ascending) + std::sort(idx.begin(), idx.end(), [&p_v](int64_t i1, int64_t i2) {return std::isnan(p_v[i2]) || (p_v[i1] < p_v[i2]);}); + else + std::sort(idx.begin(), idx.end(), [&p_v](int64_t i1, int64_t i2) {return std::isnan(p_v[i2]) || (p_v[i1] > p_v[i2]);}); + + return idx; +} + +template +std::vector EidosSortIndexes(const T *p_v, size_t p_size, bool p_ascending = true) +{ + // initialize original index locations + std::vector idx(p_size); + std::iota(idx.begin(), idx.end(), 0); + + // sort indexes based on comparing values in v + if (p_ascending) + std::sort(idx.begin(), idx.end(), [p_v](int64_t i1, int64_t i2) {return p_v[i1] < p_v[i2];}); + else + std::sort(idx.begin(), idx.end(), [p_v](int64_t i1, int64_t i2) {return p_v[i1] > p_v[i2];}); + + return idx; +} + +template <> +inline std::vector EidosSortIndexes(const double *p_v, size_t p_size, bool p_ascending) +{ + // initialize original index locations + std::vector idx(p_size); + std::iota(idx.begin(), idx.end(), 0); + + // sort indexes based on comparing values in v + // this specialization for type double sorts NaNs to the end + if (p_ascending) + std::sort(idx.begin(), idx.end(), [p_v](int64_t i1, int64_t i2) {return std::isnan(p_v[i2]) || (p_v[i1] < p_v[i2]);}); + else + std::sort(idx.begin(), idx.end(), [p_v](int64_t i1, int64_t i2) {return std::isnan(p_v[i2]) || (p_v[i1] > p_v[i2]);}); + + return idx; +} + + +#endif /* __Eidos__eidos_sorting_h */ diff --git a/eidos/eidos_test.cpp b/eidos/eidos_test.cpp index 89305722a..7da266286 100644 --- a/eidos/eidos_test.cpp +++ b/eidos/eidos_test.cpp @@ -24,6 +24,7 @@ #include "eidos_interpreter.h" #include "eidos_globals.h" #include "eidos_rng.h" +#include "eidos_sorting.h" #include #include @@ -1276,6 +1277,103 @@ int RunEidosTests(void) } #endif +#if 1 + // Speed and correctness tests of various parallel sorting algorithms + { + std::cout << std::endl << "SORTING TESTS:" << std::endl; + + gsl_rng *rng = EIDOS_GSL_RNG(omp_get_thread_num()); // the single-threaded RNG + typedef int64_t SORT_TYPE; + + { + auto comparator = [](SORT_TYPE a, SORT_TYPE b) { + return a < b; + }; + const std::size_t test_size = 10000000; + const int reps = 5; + double time_sum; + std::vector data_original; + data_original.resize(test_size); + + for (std::size_t i = 0; i < test_size; ++i) + data_original[i] = Eidos_rng_uniform_int(rng, test_size); + + std::vector data_stdsort; + + std::cout << "time (std::sort): "; + time_sum = 0.0; + for (int rep = 0; rep < reps; ++rep) + { + data_stdsort = data_original; + eidos_profile_t begin = Eidos_BenchmarkTime(); + + std::sort(data_stdsort.begin(), data_stdsort.end()); + + eidos_profile_t end = Eidos_BenchmarkTime(); + double time_spent = Eidos_ElapsedProfileTime(end - begin); + std::cout << time_spent << " "; + time_sum += time_spent; + } + std::cout << " : mean " << time_sum / reps << std::endl; + +#ifdef _OPENMP + std::cout << "time (PQUICK): "; + time_sum = 0.0; + for (int rep = 0; rep < reps; ++rep) + { + std::vector data_PQUICK = data_original; + eidos_profile_t begin = Eidos_BenchmarkTime(); + + Eidos_ParallelQuicksort_I(data_PQUICK.data(), data_PQUICK.size()); + + eidos_profile_t end = Eidos_BenchmarkTime(); + double time_spent = Eidos_ElapsedProfileTime(end - begin); + bool correct = (data_PQUICK == data_stdsort); + std::cout << time_spent << " " << (!correct ? "(INCORRECT) " : ""); + time_sum += time_spent; + } + std::cout << " : mean " << time_sum / reps << std::endl; + + std::cout << "time (PMERGE): "; + time_sum = 0.0; + for (int rep = 0; rep < reps; ++rep) + { + std::vector data_PMERGE = data_original; + eidos_profile_t begin = Eidos_BenchmarkTime(); + + Eidos_ParallelMergesort_I(data_PMERGE.data(), data_PMERGE.size()); + + eidos_profile_t end = Eidos_BenchmarkTime(); + double time_spent = Eidos_ElapsedProfileTime(end - begin); + bool correct = (data_PMERGE == data_stdsort); + std::cout << time_spent << " " << (!correct ? "(INCORRECT) " : ""); + time_sum += time_spent; + } + std::cout << " : mean " << time_sum / reps << std::endl; + + std::cout << "time (PSRIRAM): "; + time_sum = 0.0; + for (int rep = 0; rep < reps; ++rep) + { + std::vector data_PSRIRAM = data_original; + eidos_profile_t begin = Eidos_BenchmarkTime(); + + Sriram_parallel_omp_sort(data_PSRIRAM, comparator); + + eidos_profile_t end = Eidos_BenchmarkTime(); + double time_spent = Eidos_ElapsedProfileTime(end - begin); + bool correct = (data_PSRIRAM == data_stdsort); + std::cout << time_spent << " " << (!correct ? "(INCORRECT) " : ""); + time_sum += time_spent; + } + std::cout << " : mean " << time_sum / reps << std::endl; +#endif + } + + std::cout << std::endl << std::endl; + } +#endif + // If we ran tests, the random number seed has been set; let's set it back to a good seed value Eidos_SetRNGSeed(Eidos_GenerateRNGSeed()); diff --git a/eidos/eidos_value.cpp b/eidos/eidos_value.cpp index 3e1aaeffc..b570d1076 100644 --- a/eidos/eidos_value.cpp +++ b/eidos/eidos_value.cpp @@ -19,6 +19,7 @@ #include "eidos_value.h" #include "eidos_functions.h" +#include "eidos_sorting.h" #include "eidos_interpreter.h" #include "eidos_call_signature.h" #include "eidos_property_signature.h"