From f04c3115c06c779d1b43a29ef5da6ce74f3342dc Mon Sep 17 00:00:00 2001 From: Ben Haller Date: Mon, 7 Aug 2023 21:25:48 -0400 Subject: [PATCH] add -perTaskThreads and benchmark-based counts --- EidosScribe/EidosAppDelegate.mm | 2 +- SLiMgui/AppDelegate.mm | 2 +- VERSIONS | 3 +- core/main.cpp | 37 +- eidos/eidos_functions_other.cpp | 8 +- eidos/eidos_globals.cpp | 596 +++++++++++++++++++++++++------- eidos/eidos_globals.h | 19 +- eidos/eidos_openmp.h | 2 +- eidostool/main.cpp | 30 +- 9 files changed, 539 insertions(+), 160 deletions(-) diff --git a/EidosScribe/EidosAppDelegate.mm b/EidosScribe/EidosAppDelegate.mm index cb73460b3..f6d202092 100644 --- a/EidosScribe/EidosAppDelegate.mm +++ b/EidosScribe/EidosAppDelegate.mm @@ -72,7 +72,7 @@ - (void)applicationWillFinishLaunching:(NSNotification *)aNotification // We always use 4 threads; we don't want to hog the whole machine, just run with a couple threads. // We pass false for active_threads to let the worker threads sleep, otherwise the CPU is pegged // the whole time EidosScribe is running, even when sitting idle. - Eidos_WarmUpOpenMP(&std::cout, true, 4, false); + Eidos_WarmUpOpenMP(&std::cout, true, 4, false, /* default per-task thread counts */ ""); #endif Eidos_WarmUp(); diff --git a/SLiMgui/AppDelegate.mm b/SLiMgui/AppDelegate.mm index 2639f40c9..0ac7ac2e6 100644 --- a/SLiMgui/AppDelegate.mm +++ b/SLiMgui/AppDelegate.mm @@ -224,7 +224,7 @@ - (void)applicationWillFinishLaunching:(NSNotification *)aNotification // We always use 4 threads; we don't want to hog the whole machine, just run with a couple threads. // We pass false for active_threads to let the worker threads sleep, otherwise the CPU is pegged // the whole time SLiMgui is running, even when sitting idle. 
- Eidos_WarmUpOpenMP(&std::cout, true, 4, false); + Eidos_WarmUpOpenMP(&std::cout, true, 4, false, /* default per-task thread counts */ ""); #endif Eidos_WarmUp(); diff --git a/VERSIONS b/VERSIONS index 78a49b57a..2040fa601 100644 --- a/VERSIONS +++ b/VERSIONS @@ -30,7 +30,7 @@ PARALLEL changes (now in the master branch): modify CMakeLists.txt to add /usr/local/include to all targets (through GSL_INCLUDES, which should maybe get cleaned up somehow) modify CMakeLists.txt to build executables eidos_multi and slim_multi when PARALLEL is ON, and to link in libomp disable parallel builds of EidosScribe, SLiMgui, and SLiMguiLegacy with #error directives; parallelization is for the command line only (active threads peg the CPU, passive threads are slower than single-threaded) - add a command-line option, -maxthreads , to control the maximum number of threads used in OpenMP + add a command-line option, -maxThreads , to control the maximum number of threads used in OpenMP add a header eidos_openmp.h that should be used instead of #include "omp.h", providing various useful definitions etc. 
add an initialization function for OpenMP, Eidos_WarmUpOpenMP(), that must be called when warming up; sets various options, logs diagnostic info set default values for OMP_WAIT_POLICY (active) and OMP_PROC_BIND (false) and OMP_DYNAMIC (false) for slim and eidos @@ -84,6 +84,7 @@ PARALLEL changes (now in the master branch): add parallelSetTaskThreadCounts() function to set per-task thread counts, and parallelGetTaskThreadCounts() to get them add recipe 22.7, Parallelizing nonWF reproduction and spatial interactions add internal EidosBenchmark facility for timing of internal loops, for benchmarking of parallelization scaling + add a command-line option, -perTaskThreads, to let the user control how many threads to use for different tasks (if not overridden) development head (in the master branch): fix a minor bug with autofix when opening multiple .slim documents at once in SLiMgui diff --git a/core/main.cpp b/core/main.cpp index 1630f15cc..d0a1e3037 100644 --- a/core/main.cpp +++ b/core/main.cpp @@ -104,8 +104,8 @@ static void PrintUsageAndDie(bool p_print_header, bool p_print_full_usage) SLIM_OUTSTREAM << " [-l[ong] []] [-s[eed] ] [-t[ime]] [-m[em]] [-M[emhist]] [-x]" << std::endl; SLIM_OUTSTREAM << " [-d[efine] ] "; #ifdef _OPENMP - // The -maxthreads flag is visible only for a parallel build - SLIM_OUTSTREAM << "[-maxthreads ] "; + // Some flags are visible only for a parallel build + SLIM_OUTSTREAM << "[-maxThreads ] [-perTaskThreads \"x\"] "; #endif #if (SLIMPROFILING == 1) // Some flags are visible only for a profile build @@ -130,8 +130,9 @@ static void PrintUsageAndDie(bool p_print_header, bool p_print_full_usage) SLIM_OUTSTREAM << " -x : disable SLiM's runtime safety/consistency checks" << std::endl; SLIM_OUTSTREAM << " -d[efine] : define an Eidos constant, such as \"mu=1e-7\"" << std::endl; #ifdef _OPENMP - // The -maxthreads flag is visible only for a parallel build - SLIM_OUTSTREAM << " -maxthreads : set the maximum number of threads used" << std::endl; + 
// Some flags are visible only for a parallel build + SLIM_OUTSTREAM << " -maxThreads : set the maximum number of threads used" << std::endl; + SLIM_OUTSTREAM << " -perTaskThreads \"x\": set per-task thread counts to named set \"x\"" << std::endl; #endif #if (SLIMPROFILING == 1) SLIM_OUTSTREAM << " " << std::endl; @@ -187,6 +188,7 @@ int main(int argc, char *argv[]) #ifdef _OPENMP long max_thread_count = omp_get_max_threads(); bool changed_max_thread_count = false; + std::string per_task_thread_count_set_name = ""; // default per-task thread counts #endif #if (SLIMPROFILING == 1) @@ -335,7 +337,7 @@ int main(int argc, char *argv[]) if (strcmp(arg, "--testEidos") == 0 || strcmp(arg, "-testEidos") == 0 || strcmp(arg, "-te") == 0) { #ifdef _OPENMP - Eidos_WarmUpOpenMP(&SLIM_ERRSTREAM, changed_max_thread_count, (int)max_thread_count, true); + Eidos_WarmUpOpenMP(&SLIM_ERRSTREAM, changed_max_thread_count, (int)max_thread_count, true, /* max per-task thread counts */ "maxThreads"); #endif Eidos_WarmUp(); @@ -351,7 +353,7 @@ int main(int argc, char *argv[]) if (strcmp(arg, "--testSLiM") == 0 || strcmp(arg, "-testSLiM") == 0 || strcmp(arg, "-ts") == 0) { #ifdef _OPENMP - Eidos_WarmUpOpenMP(&SLIM_ERRSTREAM, changed_max_thread_count, (int)max_thread_count, true); + Eidos_WarmUpOpenMP(&SLIM_ERRSTREAM, changed_max_thread_count, (int)max_thread_count, true, /* max per-task thread counts */ "maxThreads"); #endif Eidos_WarmUp(); SLiM_WarmUp(); @@ -383,8 +385,8 @@ int main(int argc, char *argv[]) continue; } - // -maxthreads : set the maximum number of OpenMP threads that will be used - if (strcmp(arg, "-maxthreads") == 0) + // -maxThreads : set the maximum number of OpenMP threads that will be used + if (strcmp(arg, "-maxThreads") == 0) { if (++arg_index == argc) PrintUsageAndDie(false, true); @@ -397,7 +399,7 @@ int main(int argc, char *argv[]) if ((max_thread_count < 1) || (max_thread_count > EIDOS_OMP_MAX_THREADS)) { - SLIM_OUTSTREAM << "The -maxthreads command-line option 
enforces a range of [1, " << EIDOS_OMP_MAX_THREADS << "]." << std::endl; + SLIM_OUTSTREAM << "The -maxThreads command-line option enforces a range of [1, " << EIDOS_OMP_MAX_THREADS << "]." << std::endl; exit(EXIT_FAILURE); } @@ -405,12 +407,25 @@ int main(int argc, char *argv[]) #else if (count != 1) { - SLIM_OUTSTREAM << "The -maxthreads command-line option only allows a value of 1 when not running a PARALLEL build." << std::endl; + SLIM_OUTSTREAM << "The -maxThreads command-line option only allows a value of 1 when not running a PARALLEL build." << std::endl; exit(EXIT_FAILURE); } #endif } + // -perTaskThreads "x": set the per-task thread counts to be used in OpenMP to a named set "x" + if (strcmp(arg, "-perTaskThreads") == 0) + { + if (++arg_index == argc) + PrintUsageAndDie(false, true); + +#ifdef _OPENMP + // We just take the name as given; testing against known values will be done later + // This command-line argument is ignored completely when not parallel + per_task_thread_count_set_name = std::string(argv[arg_index]); +#endif + } + #if (SLIMPROFILING == 1) if (strcmp(arg, "-profileStart") == 0) { @@ -500,7 +515,7 @@ int main(int argc, char *argv[]) #endif #ifdef _OPENMP - Eidos_WarmUpOpenMP((SLiM_verbosity_level >= 1) ? &SLIM_ERRSTREAM : nullptr, changed_max_thread_count, (int)max_thread_count, true); + Eidos_WarmUpOpenMP((SLiM_verbosity_level >= 1) ? 
&SLIM_ERRSTREAM : nullptr, changed_max_thread_count, (int)max_thread_count, true, per_task_thread_count_set_name); #endif if (SLiM_verbosity_level >= 2) diff --git a/eidos/eidos_functions_other.cpp b/eidos/eidos_functions_other.cpp index 8839cc4f5..7d896d84a 100644 --- a/eidos/eidos_functions_other.cpp +++ b/eidos/eidos_functions_other.cpp @@ -891,9 +891,9 @@ EidosValue_SP Eidos_ExecuteFunction_parallelSetTaskThreadCounts(__attribute__((u if (source_value->Type() == EidosValueType::kValueNULL) { - // A dict value of NULL means "reset to default settings", which we have a function for + // A dict value of NULL means "reset to the command-line default settings" #ifdef _OPENMP - _Eidos_SetDefaultOpenMPThreadCounts(); + _Eidos_SetOpenMPThreadCounts(gEidosDefaultPerTaskThreadCounts); #endif } else @@ -1048,6 +1048,10 @@ EidosValue_SP Eidos_ExecuteFunction_parallelSetTaskThreadCounts(__attribute__((u else if (key == "SURVIVAL") gEidos_OMP_threads_SURVIVAL = (int)value_int64; else EIDOS_TERMINATION << "ERROR (Eidos_ExecuteFunction_parallelSetTaskThreadCounts): parallelSetTaskThreadCounts() does not recognize the task name " << key << "." << EidosTerminate(nullptr); + + // This assumes that any thread count set might push the maximum per-task thread count higher, but not lower + gEidosPerTaskThreadCountsSetName = "UserDefined"; + gEidosPerTaskOriginalMaxThreadCount = std::max(gEidosPerTaskOriginalMaxThreadCount, (int)value_int64); #endif } else diff --git a/eidos/eidos_globals.cpp b/eidos/eidos_globals.cpp index 820360524..9afadeec5 100644 --- a/eidos/eidos_globals.cpp +++ b/eidos/eidos_globals.cpp @@ -230,7 +230,7 @@ EidosValue_SP Eidos_ValueForCommandLineExpression(std::string &p_value_expressio // Declarations for the number of threads we prefer to use for each parallel loop. // These default values are all EIDOS_OMP_MAX_THREADS, to use the maximum number // of threads in all cases. 
This is primarily useful for benchmarking; normally -// these default values get overwritten by _Eidos_SetDefaultOpenMPThreadCounts(). +// these default values get overwritten by _Eidos_SetOpenMPThreadCounts(). int gEidos_OMP_threads_ABS_FLOAT = EIDOS_OMP_MAX_THREADS; int gEidos_OMP_threads_CEIL = EIDOS_OMP_MAX_THREADS; int gEidos_OMP_threads_EXP_FLOAT = EIDOS_OMP_MAX_THREADS; @@ -350,137 +350,455 @@ int gEidos_OMP_threads_PARENTS_CLEAR = EIDOS_OMP_MAX_THREADS; int gEidos_OMP_threads_UNIQUE_MUTRUNS = EIDOS_OMP_MAX_THREADS; int gEidos_OMP_threads_SURVIVAL = EIDOS_OMP_MAX_THREADS; -void _Eidos_SetDefaultOpenMPThreadCounts(void) +EidosPerTaskThreadCounts gEidosDefaultPerTaskThreadCounts = EidosPerTaskThreadCounts::kDefault; +std::string gEidosPerTaskThreadCountsSetName = "DEFAULT"; // should get overwritten +int gEidosPerTaskOriginalMaxThreadCount = EIDOS_OMP_MAX_THREADS; +int gEidosPerTaskClippedMaxThreadCount = EIDOS_OMP_MAX_THREADS; + +void _Eidos_SetOpenMPThreadCounts(EidosPerTaskThreadCounts per_task_thread_counts) { - // These default values are determined empirically by a profile on a big machine; where the scaling curve - // tops out, that determines the default number of threads (since performance degrades beyond that point). - // Of course that will be hardware-specific, so these defaults are just guesses really. 
- gEidos_OMP_threads_ABS_FLOAT = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_CEIL = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_EXP_FLOAT = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_FLOOR = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_LOG_FLOAT = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_LOG10_FLOAT = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_LOG2_FLOAT = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_ROUND = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_SQRT_FLOAT = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_SUM_INTEGER = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_SUM_FLOAT = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_SUM_LOGICAL = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_TRUNC = EIDOS_OMP_MAX_THREADS; - - gEidos_OMP_threads_MAX_INT = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_MAX_FLOAT = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_MIN_INT = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_MIN_FLOAT = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_PMAX_INT_1 = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_PMAX_INT_2 = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_PMAX_FLOAT_1 = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_PMAX_FLOAT_2 = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_PMIN_INT_1 = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_PMIN_INT_2 = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_PMIN_FLOAT_1 = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_PMIN_FLOAT_2 = EIDOS_OMP_MAX_THREADS; - - gEidos_OMP_threads_MATCH_INT = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_MATCH_FLOAT = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_MATCH_STRING = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_MATCH_OBJECT = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_SAMPLE_INDEX = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_SAMPLE_R_INT = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_SAMPLE_R_FLOAT = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_SAMPLE_R_OBJECT = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_SAMPLE_WR_INT = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_SAMPLE_WR_FLOAT = 
EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_SAMPLE_WR_OBJECT = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_TABULATE_MAXBIN = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_TABULATE = EIDOS_OMP_MAX_THREADS; - - gEidos_OMP_threads_CONTAINS_MARKER_MUT = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_I_COUNT_OF_MUTS_OF_TYPE = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_G_COUNT_OF_MUTS_OF_TYPE = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_INDS_W_PEDIGREE_IDS = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_RELATEDNESS = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_SAMPLE_INDIVIDUALS_1 = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_SAMPLE_INDIVIDUALS_2 = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_SET_FITNESS_SCALE_1 = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_SET_FITNESS_SCALE_2 = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_SUM_OF_MUTS_OF_TYPE = EIDOS_OMP_MAX_THREADS; - - gEidos_OMP_threads_DNORM_1 = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_DNORM_2 = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_RBINOM_1 = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_RBINOM_2 = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_RBINOM_3 = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_RDUNIF_1 = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_RDUNIF_2 = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_RDUNIF_3 = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_REXP_1 = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_REXP_2 = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_RNORM_1 = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_RNORM_2 = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_RNORM_3 = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_RPOIS_1 = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_RPOIS_2 = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_RUNIF_1 = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_RUNIF_2 = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_RUNIF_3 = EIDOS_OMP_MAX_THREADS; - - gEidos_OMP_threads_POINT_IN_BOUNDS_1D = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_POINT_IN_BOUNDS_2D = EIDOS_OMP_MAX_THREADS; - 
gEidos_OMP_threads_POINT_IN_BOUNDS_3D = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_POINT_PERIODIC_1D = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_POINT_PERIODIC_2D = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_POINT_PERIODIC_3D = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_POINT_REFLECTED_1D = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_POINT_REFLECTED_2D = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_POINT_REFLECTED_3D = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_POINT_STOPPED_1D = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_POINT_STOPPED_2D = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_POINT_STOPPED_3D = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_POINT_UNIFORM_1D = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_POINT_UNIFORM_2D = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_POINT_UNIFORM_3D = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_SET_SPATIAL_POS_1_1D = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_SET_SPATIAL_POS_1_2D = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_SET_SPATIAL_POS_1_3D = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_SET_SPATIAL_POS_2_1D = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_SET_SPATIAL_POS_2_2D = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_SET_SPATIAL_POS_2_3D = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_SPATIAL_MAP_VALUE = EIDOS_OMP_MAX_THREADS; - - gEidos_OMP_threads_CLIPPEDINTEGRAL_1S = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_CLIPPEDINTEGRAL_2S = EIDOS_OMP_MAX_THREADS; - //gEidos_OMP_threads_CLIPPEDINTEGRAL_3S = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_DRAWBYSTRENGTH = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_INTNEIGHCOUNT = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_LOCALPOPDENSITY = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_NEARESTINTNEIGH = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_NEARESTNEIGH = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_NEIGHCOUNT = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_TOTNEIGHSTRENGTH = EIDOS_OMP_MAX_THREADS; - - gEidos_OMP_threads_AGE_INCR = EIDOS_OMP_MAX_THREADS; - 
gEidos_OMP_threads_DEFERRED_REPRO = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_WF_REPRO = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_FITNESS_ASEX_1 = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_FITNESS_ASEX_2 = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_FITNESS_ASEX_3 = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_FITNESS_SEX_1 = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_FITNESS_SEX_2 = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_FITNESS_SEX_3 = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_MIGRANT_CLEAR = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_PARENTS_CLEAR = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_UNIQUE_MUTRUNS = EIDOS_OMP_MAX_THREADS; - gEidos_OMP_threads_SURVIVAL = EIDOS_OMP_MAX_THREADS; + // This switches to a set of per-task thread counts. Ideally, these are determined using the + // SLiM-Benchmarks repo on GitHub, on the actual machine where production runs will be done. + // Where the scaling curve tops out for a given test, that determines the default number of + // threads that should be used (since performance degrades beyond that point). The values + // here come from tests on specific hardware that I use; they may or may not correspond to + // what provides good performance on the end user's hardware! + + // One question is what to put in when a task scales all the way up to the maximum number of + // threads that was tested. For example, if tests went to 16 threads and it scaled to 16, + // do you put 16, or do you put EIDOS_OMP_MAX_THREADS figuring that if someone uses those + // per-task maximum thread counts on a similar machine with even more cores, the task might + // well continue to scale? This is a guess; it's extrapolating beyond the data we have. + // But I have chosen, for that example, to use 16, not EIDOS_OMP_MAX_THREADS. The user can + // always fix this if they want to; better to err on the side of caution and not scale up + // to levels where performance might become atrocious. 
+ + if (per_task_thread_counts == EidosPerTaskThreadCounts::kMaxThreads) + { + // These are all EIDOS_OMP_MAX_THREADS, as a template for modification + gEidosPerTaskThreadCountsSetName = "maxThreads"; + gEidosPerTaskOriginalMaxThreadCount = EIDOS_OMP_MAX_THREADS; + gEidosPerTaskClippedMaxThreadCount = EIDOS_OMP_MAX_THREADS; + + gEidos_OMP_threads_ABS_FLOAT = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_CEIL = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_EXP_FLOAT = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_FLOOR = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_LOG_FLOAT = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_LOG10_FLOAT = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_LOG2_FLOAT = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_ROUND = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_SQRT_FLOAT = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_SUM_INTEGER = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_SUM_FLOAT = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_SUM_LOGICAL = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_TRUNC = EIDOS_OMP_MAX_THREADS; + + gEidos_OMP_threads_MAX_INT = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_MAX_FLOAT = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_MIN_INT = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_MIN_FLOAT = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_PMAX_INT_1 = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_PMAX_INT_2 = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_PMAX_FLOAT_1 = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_PMAX_FLOAT_2 = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_PMIN_INT_1 = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_PMIN_INT_2 = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_PMIN_FLOAT_1 = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_PMIN_FLOAT_2 = EIDOS_OMP_MAX_THREADS; + + gEidos_OMP_threads_MATCH_INT = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_MATCH_FLOAT = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_MATCH_STRING = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_MATCH_OBJECT = EIDOS_OMP_MAX_THREADS; + 
gEidos_OMP_threads_SAMPLE_INDEX = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_SAMPLE_R_INT = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_SAMPLE_R_FLOAT = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_SAMPLE_R_OBJECT = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_SAMPLE_WR_INT = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_SAMPLE_WR_FLOAT = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_SAMPLE_WR_OBJECT = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_TABULATE_MAXBIN = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_TABULATE = EIDOS_OMP_MAX_THREADS; + + gEidos_OMP_threads_CONTAINS_MARKER_MUT = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_I_COUNT_OF_MUTS_OF_TYPE = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_G_COUNT_OF_MUTS_OF_TYPE = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_INDS_W_PEDIGREE_IDS = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_RELATEDNESS = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_SAMPLE_INDIVIDUALS_1 = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_SAMPLE_INDIVIDUALS_2 = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_SET_FITNESS_SCALE_1 = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_SET_FITNESS_SCALE_2 = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_SUM_OF_MUTS_OF_TYPE = EIDOS_OMP_MAX_THREADS; + + gEidos_OMP_threads_DNORM_1 = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_DNORM_2 = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_RBINOM_1 = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_RBINOM_2 = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_RBINOM_3 = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_RDUNIF_1 = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_RDUNIF_2 = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_RDUNIF_3 = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_REXP_1 = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_REXP_2 = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_RNORM_1 = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_RNORM_2 = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_RNORM_3 = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_RPOIS_1 = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_RPOIS_2 = 
EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_RUNIF_1 = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_RUNIF_2 = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_RUNIF_3 = EIDOS_OMP_MAX_THREADS; + + gEidos_OMP_threads_POINT_IN_BOUNDS_1D = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_POINT_IN_BOUNDS_2D = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_POINT_IN_BOUNDS_3D = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_POINT_PERIODIC_1D = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_POINT_PERIODIC_2D = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_POINT_PERIODIC_3D = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_POINT_REFLECTED_1D = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_POINT_REFLECTED_2D = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_POINT_REFLECTED_3D = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_POINT_STOPPED_1D = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_POINT_STOPPED_2D = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_POINT_STOPPED_3D = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_POINT_UNIFORM_1D = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_POINT_UNIFORM_2D = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_POINT_UNIFORM_3D = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_SET_SPATIAL_POS_1_1D = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_SET_SPATIAL_POS_1_2D = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_SET_SPATIAL_POS_1_3D = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_SET_SPATIAL_POS_2_1D = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_SET_SPATIAL_POS_2_2D = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_SET_SPATIAL_POS_2_3D = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_SPATIAL_MAP_VALUE = EIDOS_OMP_MAX_THREADS; + + gEidos_OMP_threads_CLIPPEDINTEGRAL_1S = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_CLIPPEDINTEGRAL_2S = EIDOS_OMP_MAX_THREADS; + //gEidos_OMP_threads_CLIPPEDINTEGRAL_3S = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_DRAWBYSTRENGTH = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_INTNEIGHCOUNT = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_LOCALPOPDENSITY = EIDOS_OMP_MAX_THREADS; + 
gEidos_OMP_threads_NEARESTINTNEIGH = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_NEARESTNEIGH = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_NEIGHCOUNT = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_TOTNEIGHSTRENGTH = EIDOS_OMP_MAX_THREADS; + + gEidos_OMP_threads_AGE_INCR = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_DEFERRED_REPRO = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_WF_REPRO = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_FITNESS_ASEX_1 = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_FITNESS_ASEX_2 = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_FITNESS_ASEX_3 = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_FITNESS_SEX_1 = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_FITNESS_SEX_2 = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_FITNESS_SEX_3 = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_MIGRANT_CLEAR = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_PARENTS_CLEAR = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_UNIQUE_MUTRUNS = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_SURVIVAL = EIDOS_OMP_MAX_THREADS; + } + else if (per_task_thread_counts == EidosPerTaskThreadCounts::kMacStudio2022_16) + { + // These counts are from a Mac Studio 2022 (Mac13,2), 20-core M1 Ultra, 128 GB + // It has 20 cores: 16 performance cores and 4 efficiency cores + // An effort was made with OMP_PLACES and OMP_PROC_BIND to stay on the perf cores, + // but I don't know how to tell whether that effort was successful or not, so. 
+ // The raw data for these choices is presently in benchmarking/STUDIO 2023-08-07 + gEidosPerTaskThreadCountsSetName = "MacStudio2022_16"; + gEidosPerTaskOriginalMaxThreadCount = 16; + gEidosPerTaskClippedMaxThreadCount = 16; + + gEidos_OMP_threads_ABS_FLOAT = 8; + gEidos_OMP_threads_CEIL = 8; + gEidos_OMP_threads_EXP_FLOAT = 16; + gEidos_OMP_threads_FLOOR = 8; + gEidos_OMP_threads_LOG_FLOAT = 16; + gEidos_OMP_threads_LOG10_FLOAT = 16; + gEidos_OMP_threads_LOG2_FLOAT = 16; + gEidos_OMP_threads_ROUND = 8; + gEidos_OMP_threads_SQRT_FLOAT = 8; + gEidos_OMP_threads_SUM_INTEGER = 8; + gEidos_OMP_threads_SUM_FLOAT = 8; + gEidos_OMP_threads_SUM_LOGICAL = 8; + gEidos_OMP_threads_TRUNC = 8; + + gEidos_OMP_threads_MAX_INT = 8; + gEidos_OMP_threads_MAX_FLOAT = 16; + gEidos_OMP_threads_MIN_INT = 8; + gEidos_OMP_threads_MIN_FLOAT = 16; + gEidos_OMP_threads_PMAX_INT_1 = 8; + gEidos_OMP_threads_PMAX_INT_2 = 8; + gEidos_OMP_threads_PMAX_FLOAT_1 = 16; + gEidos_OMP_threads_PMAX_FLOAT_2 = 16; + gEidos_OMP_threads_PMIN_INT_1 = 8; + gEidos_OMP_threads_PMIN_INT_2 = 8; + gEidos_OMP_threads_PMIN_FLOAT_1 = 16; + gEidos_OMP_threads_PMIN_FLOAT_2 = 16; + + gEidos_OMP_threads_MATCH_INT = 16; + gEidos_OMP_threads_MATCH_FLOAT = 16; + gEidos_OMP_threads_MATCH_STRING = 16; + gEidos_OMP_threads_MATCH_OBJECT = 16; + gEidos_OMP_threads_SAMPLE_INDEX = 12; + gEidos_OMP_threads_SAMPLE_R_INT = 16; + gEidos_OMP_threads_SAMPLE_R_FLOAT = 16; + gEidos_OMP_threads_SAMPLE_R_OBJECT = 16; + gEidos_OMP_threads_SAMPLE_WR_INT = 12; + gEidos_OMP_threads_SAMPLE_WR_FLOAT = 8; + gEidos_OMP_threads_SAMPLE_WR_OBJECT = 16; + gEidos_OMP_threads_TABULATE_MAXBIN = 8; + gEidos_OMP_threads_TABULATE = 16; + + gEidos_OMP_threads_CONTAINS_MARKER_MUT = 16; + gEidos_OMP_threads_I_COUNT_OF_MUTS_OF_TYPE = 16; + gEidos_OMP_threads_G_COUNT_OF_MUTS_OF_TYPE = 16; + gEidos_OMP_threads_INDS_W_PEDIGREE_IDS = 8; + gEidos_OMP_threads_RELATEDNESS = 16; + gEidos_OMP_threads_SAMPLE_INDIVIDUALS_1 = 12; + gEidos_OMP_threads_SAMPLE_INDIVIDUALS_2 = 
12; + gEidos_OMP_threads_SET_FITNESS_SCALE_1 = 8; + gEidos_OMP_threads_SET_FITNESS_SCALE_2 = 8; + gEidos_OMP_threads_SUM_OF_MUTS_OF_TYPE = 16; + + gEidos_OMP_threads_DNORM_1 = 16; + gEidos_OMP_threads_DNORM_2 = 16; + gEidos_OMP_threads_RBINOM_1 = 16; + gEidos_OMP_threads_RBINOM_2 = 16; + gEidos_OMP_threads_RBINOM_3 = 16; + gEidos_OMP_threads_RDUNIF_1 = 16; + gEidos_OMP_threads_RDUNIF_2 = 16; + gEidos_OMP_threads_RDUNIF_3 = 16; + gEidos_OMP_threads_REXP_1 = 16; + gEidos_OMP_threads_REXP_2 = 16; + gEidos_OMP_threads_RNORM_1 = 16; + gEidos_OMP_threads_RNORM_2 = 16; + gEidos_OMP_threads_RNORM_3 = 16; + gEidos_OMP_threads_RPOIS_1 = 16; + gEidos_OMP_threads_RPOIS_2 = 16; + gEidos_OMP_threads_RUNIF_1 = 16; + gEidos_OMP_threads_RUNIF_2 = 16; + gEidos_OMP_threads_RUNIF_3 = 16; + + gEidos_OMP_threads_POINT_IN_BOUNDS_1D = 12; + gEidos_OMP_threads_POINT_IN_BOUNDS_2D = 12; + gEidos_OMP_threads_POINT_IN_BOUNDS_3D = 16; + gEidos_OMP_threads_POINT_PERIODIC_1D = 16; + gEidos_OMP_threads_POINT_PERIODIC_2D = 16; + gEidos_OMP_threads_POINT_PERIODIC_3D = 16; + gEidos_OMP_threads_POINT_REFLECTED_1D = 16; + gEidos_OMP_threads_POINT_REFLECTED_2D = 16; + gEidos_OMP_threads_POINT_REFLECTED_3D = 16; + gEidos_OMP_threads_POINT_STOPPED_1D = 16; + gEidos_OMP_threads_POINT_STOPPED_2D = 8; + gEidos_OMP_threads_POINT_STOPPED_3D = 8; + gEidos_OMP_threads_POINT_UNIFORM_1D = 16; + gEidos_OMP_threads_POINT_UNIFORM_2D = 16; + gEidos_OMP_threads_POINT_UNIFORM_3D = 16; + gEidos_OMP_threads_SET_SPATIAL_POS_1_1D = 4; + gEidos_OMP_threads_SET_SPATIAL_POS_1_2D = 4; + gEidos_OMP_threads_SET_SPATIAL_POS_1_3D = 4; + gEidos_OMP_threads_SET_SPATIAL_POS_2_1D = 4; + gEidos_OMP_threads_SET_SPATIAL_POS_2_2D = 4; + gEidos_OMP_threads_SET_SPATIAL_POS_2_3D = 4; + gEidos_OMP_threads_SPATIAL_MAP_VALUE = 16; + + gEidos_OMP_threads_CLIPPEDINTEGRAL_1S = 16; + gEidos_OMP_threads_CLIPPEDINTEGRAL_2S = 16; + //gEidos_OMP_threads_CLIPPEDINTEGRAL_3S = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_DRAWBYSTRENGTH = 16; + 
gEidos_OMP_threads_INTNEIGHCOUNT = 16; + gEidos_OMP_threads_LOCALPOPDENSITY = 16; + gEidos_OMP_threads_NEARESTINTNEIGH = 16; + gEidos_OMP_threads_NEARESTNEIGH = 16; + gEidos_OMP_threads_NEIGHCOUNT = 16; + gEidos_OMP_threads_TOTNEIGHSTRENGTH = 16; + + gEidos_OMP_threads_AGE_INCR = 4; + gEidos_OMP_threads_DEFERRED_REPRO = 4; + gEidos_OMP_threads_WF_REPRO = 4; + gEidos_OMP_threads_FITNESS_ASEX_1 = 8; + gEidos_OMP_threads_FITNESS_ASEX_2 = 8; + gEidos_OMP_threads_FITNESS_ASEX_3 = 2; + gEidos_OMP_threads_FITNESS_SEX_1 = 8; + gEidos_OMP_threads_FITNESS_SEX_2 = 8; + gEidos_OMP_threads_FITNESS_SEX_3 = 2; + gEidos_OMP_threads_MIGRANT_CLEAR = 4; + gEidos_OMP_threads_PARENTS_CLEAR = 16; + gEidos_OMP_threads_UNIQUE_MUTRUNS = 16; + gEidos_OMP_threads_SURVIVAL = 16; + } + else if (per_task_thread_counts == EidosPerTaskThreadCounts::kXeonGold2_40) + { + // These counts are from cbsulm21, a node in Cornell's BioHPC cluster + // It has two 20-core (40-hyperthreaded) Intel Xeon Gold 6148 2.4GHz + // That makes a total of 40 physical cores, 80 virtual cores + // These tests went up to 40 cores, avoiding hyperthreading + // The raw data for these choices is presently in benchmarking/BHPC 2023-08-07 + // These should be the defaults for production builds, on the + // assumption that users will be on similar big HPC nodes + gEidosPerTaskThreadCountsSetName = "XeonGold2_40"; + gEidosPerTaskOriginalMaxThreadCount = 40; + gEidosPerTaskClippedMaxThreadCount = 40; + + gEidos_OMP_threads_ABS_FLOAT = 40; + gEidos_OMP_threads_CEIL = 40; + gEidos_OMP_threads_EXP_FLOAT = 40; + gEidos_OMP_threads_FLOOR = 40; + gEidos_OMP_threads_LOG_FLOAT = 40; + gEidos_OMP_threads_LOG10_FLOAT = 40; + gEidos_OMP_threads_LOG2_FLOAT = 40; + gEidos_OMP_threads_ROUND = 40; + gEidos_OMP_threads_SQRT_FLOAT = 40; + gEidos_OMP_threads_SUM_INTEGER = 40; + gEidos_OMP_threads_SUM_FLOAT = 40; + gEidos_OMP_threads_SUM_LOGICAL = 40; + gEidos_OMP_threads_TRUNC = 40; + + gEidos_OMP_threads_MAX_INT = 40; + 
gEidos_OMP_threads_MAX_FLOAT = 40; + gEidos_OMP_threads_MIN_INT = 40; + gEidos_OMP_threads_MIN_FLOAT = 40; + gEidos_OMP_threads_PMAX_INT_1 = 40; + gEidos_OMP_threads_PMAX_INT_2 = 40; + gEidos_OMP_threads_PMAX_FLOAT_1 = 40; + gEidos_OMP_threads_PMAX_FLOAT_2 = 40; + gEidos_OMP_threads_PMIN_INT_1 = 40; + gEidos_OMP_threads_PMIN_INT_2 = 40; + gEidos_OMP_threads_PMIN_FLOAT_1 = 40; + gEidos_OMP_threads_PMIN_FLOAT_2 = 40; + + gEidos_OMP_threads_MATCH_INT = 40; + gEidos_OMP_threads_MATCH_FLOAT = 40; + gEidos_OMP_threads_MATCH_STRING = 40; + gEidos_OMP_threads_MATCH_OBJECT = 40; + gEidos_OMP_threads_SAMPLE_INDEX = 40; + gEidos_OMP_threads_SAMPLE_R_INT = 40; + gEidos_OMP_threads_SAMPLE_R_FLOAT = 40; + gEidos_OMP_threads_SAMPLE_R_OBJECT = 40; + gEidos_OMP_threads_SAMPLE_WR_INT = 40; + gEidos_OMP_threads_SAMPLE_WR_FLOAT = 40; + gEidos_OMP_threads_SAMPLE_WR_OBJECT = 40; + gEidos_OMP_threads_TABULATE_MAXBIN = 40; + gEidos_OMP_threads_TABULATE = 20; + + gEidos_OMP_threads_CONTAINS_MARKER_MUT = 40; + gEidos_OMP_threads_I_COUNT_OF_MUTS_OF_TYPE = 40; + gEidos_OMP_threads_G_COUNT_OF_MUTS_OF_TYPE = 40; + gEidos_OMP_threads_INDS_W_PEDIGREE_IDS = 5; + gEidos_OMP_threads_RELATEDNESS = 40; + gEidos_OMP_threads_SAMPLE_INDIVIDUALS_1 = 40; + gEidos_OMP_threads_SAMPLE_INDIVIDUALS_2 = 40; + gEidos_OMP_threads_SET_FITNESS_SCALE_1 = 40; + gEidos_OMP_threads_SET_FITNESS_SCALE_2 = 40; + gEidos_OMP_threads_SUM_OF_MUTS_OF_TYPE = 40; + + gEidos_OMP_threads_DNORM_1 = 40; + gEidos_OMP_threads_DNORM_2 = 40; + gEidos_OMP_threads_RBINOM_1 = 10; + gEidos_OMP_threads_RBINOM_2 = 40; + gEidos_OMP_threads_RBINOM_3 = 40; + gEidos_OMP_threads_RDUNIF_1 = 10; + gEidos_OMP_threads_RDUNIF_2 = 10; + gEidos_OMP_threads_RDUNIF_3 = 20; + gEidos_OMP_threads_REXP_1 = 40; + gEidos_OMP_threads_REXP_2 = 40; + gEidos_OMP_threads_RNORM_1 = 40; + gEidos_OMP_threads_RNORM_2 = 40; + gEidos_OMP_threads_RNORM_3 = 40; + gEidos_OMP_threads_RPOIS_1 = 40; + gEidos_OMP_threads_RPOIS_2 = 40; + gEidos_OMP_threads_RUNIF_1 = 40; + 
gEidos_OMP_threads_RUNIF_2 = 40; + gEidos_OMP_threads_RUNIF_3 = 40; + + gEidos_OMP_threads_POINT_IN_BOUNDS_1D = 40; + gEidos_OMP_threads_POINT_IN_BOUNDS_2D = 40; + gEidos_OMP_threads_POINT_IN_BOUNDS_3D = 40; + gEidos_OMP_threads_POINT_PERIODIC_1D = 40; + gEidos_OMP_threads_POINT_PERIODIC_2D = 40; + gEidos_OMP_threads_POINT_PERIODIC_3D = 40; + gEidos_OMP_threads_POINT_REFLECTED_1D = 40; + gEidos_OMP_threads_POINT_REFLECTED_2D = 40; + gEidos_OMP_threads_POINT_REFLECTED_3D = 40; + gEidos_OMP_threads_POINT_STOPPED_1D = 40; + gEidos_OMP_threads_POINT_STOPPED_2D = 40; + gEidos_OMP_threads_POINT_STOPPED_3D = 40; + gEidos_OMP_threads_POINT_UNIFORM_1D = 40; + gEidos_OMP_threads_POINT_UNIFORM_2D = 40; + gEidos_OMP_threads_POINT_UNIFORM_3D = 40; + gEidos_OMP_threads_SET_SPATIAL_POS_1_1D = 5; + gEidos_OMP_threads_SET_SPATIAL_POS_1_2D = 20; + gEidos_OMP_threads_SET_SPATIAL_POS_1_3D = 20; + gEidos_OMP_threads_SET_SPATIAL_POS_2_1D = 10; + gEidos_OMP_threads_SET_SPATIAL_POS_2_2D = 20; + gEidos_OMP_threads_SET_SPATIAL_POS_2_3D = 20; + gEidos_OMP_threads_SPATIAL_MAP_VALUE = 40; + + gEidos_OMP_threads_CLIPPEDINTEGRAL_1S = 40; + gEidos_OMP_threads_CLIPPEDINTEGRAL_2S = 40; + //gEidos_OMP_threads_CLIPPEDINTEGRAL_3S = EIDOS_OMP_MAX_THREADS; + gEidos_OMP_threads_DRAWBYSTRENGTH = 40; + gEidos_OMP_threads_INTNEIGHCOUNT = 40; + gEidos_OMP_threads_LOCALPOPDENSITY = 40; + gEidos_OMP_threads_NEARESTINTNEIGH = 10; + gEidos_OMP_threads_NEARESTNEIGH = 10; + gEidos_OMP_threads_NEIGHCOUNT = 40; + gEidos_OMP_threads_TOTNEIGHSTRENGTH = 40; + + gEidos_OMP_threads_AGE_INCR = 10; + gEidos_OMP_threads_DEFERRED_REPRO = 5; + gEidos_OMP_threads_WF_REPRO = 5; + gEidos_OMP_threads_FITNESS_ASEX_1 = 40; + gEidos_OMP_threads_FITNESS_ASEX_2 = 40; + gEidos_OMP_threads_FITNESS_ASEX_3 = 5; + gEidos_OMP_threads_FITNESS_SEX_1 = 40; + gEidos_OMP_threads_FITNESS_SEX_2 = 40; + gEidos_OMP_threads_FITNESS_SEX_3 = 5; + gEidos_OMP_threads_MIGRANT_CLEAR = 20; + gEidos_OMP_threads_PARENTS_CLEAR = 40; + 
gEidos_OMP_threads_UNIQUE_MUTRUNS = 40; + gEidos_OMP_threads_SURVIVAL = 40; + } + else + { + EIDOS_TERMINATION << "ERROR (_Eidos_SetOpenMPThreadCounts): (internal error) unrecognized EidosPerTaskThreadCounts value." << EidosTerminate(nullptr); + } // Always clip the above counts to gEidosMaxThreads _Eidos_ClipOpenMPThreadCounts(); } +void _Eidos_ChooseDefaultOpenMPThreadCounts() +{ +#if USE_OMP_LIMITS + + // If we are supposed to use our built-in default OMP limits, set them for our task thread counts + // Note that the default behavior here is nothing but a wild shot in the dark! +#ifdef __APPLE__ + // On macOS, we use the results from my Mac Studio 2022 by default; note it maxes out at 16 threads + gEidosDefaultPerTaskThreadCounts = EidosPerTaskThreadCounts::kMacStudio2022_16; +#else + // On other systems, we use the results from the Cornell BioHPC cluster machine I test on, with a max of 40 threads + gEidosDefaultPerTaskThreadCounts = EidosPerTaskThreadCounts::kXeonGold2_40; +#endif + +#else + + // Enforce gEidosMaxThreads for the thread count ivars that govern how many threads various loops will use + gEidosDefaultPerTaskThreadCounts = EidosPerTaskThreadCounts::kMaxThreads; + +#endif + + _Eidos_SetOpenMPThreadCounts(gEidosDefaultPerTaskThreadCounts); +} + void _Eidos_ClipOpenMPThreadCounts(void) { // This clips all thread-count ivars to gEidosMaxThreads, so they can be used at runtime without checking + gEidosPerTaskClippedMaxThreadCount = std::min(gEidosMaxThreads, gEidosPerTaskOriginalMaxThreadCount); + gEidos_OMP_threads_ABS_FLOAT = std::min(gEidosMaxThreads, gEidos_OMP_threads_ABS_FLOAT); gEidos_OMP_threads_CEIL = std::min(gEidosMaxThreads, gEidos_OMP_threads_CEIL); gEidos_OMP_threads_EXP_FLOAT = std::min(gEidosMaxThreads, gEidos_OMP_threads_EXP_FLOAT); @@ -601,7 +919,7 @@ void _Eidos_ClipOpenMPThreadCounts(void) gEidos_OMP_threads_SURVIVAL = std::min(gEidosMaxThreads, gEidos_OMP_threads_SURVIVAL); } -void Eidos_WarmUpOpenMP(std::ostream *outstream, bool 
changed_max_thread_count, int new_max_thread_count, bool active_threads) +void Eidos_WarmUpOpenMP(std::ostream *outstream, bool changed_max_thread_count, int new_max_thread_count, bool active_threads, std::string thread_count_set_name) { // When running under OpenMP, print a log, and also set values for the OpenMP ICV's that we want to guarantee // See http://www.archer.ac.uk/training/course-material/2018/09/openmp-imp/Slides/L10-TipsTricksGotchas.pdf @@ -641,20 +959,34 @@ void Eidos_WarmUpOpenMP(std::ostream *outstream, bool changed_max_thread_count, gEidosNumThreads = gEidosMaxThreads; gEidosNumThreadsOverride = false; -#if USE_OMP_LIMITS - // If we are supposed to use our built-in default OMP limits, set them for our task thread counts - _Eidos_SetDefaultOpenMPThreadCounts(); -#else - // Enforce gEidosMaxThreads for the thread count ivars that govern how many threads various loops will use - _Eidos_ClipOpenMPThreadCounts(); -#endif + // Set up per-task thread counts according to thread_count_set_name. If it is empty, we choose a + // default set heuristically, based upon the hardware platform. Otherwise, we look for a name we + // recognize, or error out. There are very few sets here now, so this is not terribly useful; + // but it does allow the benchmarking suite to turn off per-task limits with "maxThreads". + if (thread_count_set_name.length() == 0) + _Eidos_ChooseDefaultOpenMPThreadCounts(); + else if (thread_count_set_name == "maxThreads") + _Eidos_SetOpenMPThreadCounts(EidosPerTaskThreadCounts::kMaxThreads); + else if (thread_count_set_name == "MacStudio2022_16") + _Eidos_SetOpenMPThreadCounts(EidosPerTaskThreadCounts::kMacStudio2022_16); + else if (thread_count_set_name == "XeonGold2_40") + _Eidos_SetOpenMPThreadCounts(EidosPerTaskThreadCounts::kXeonGold2_40); + else + EIDOS_TERMINATION << "ERROR (Eidos_WarmUpOpenMP): unrecognized per-task thread count set name '" << thread_count_set_name << "'." 
<< EidosTerminate(nullptr); // Write some diagnostic output about our configuration. If the verbosity level is 0, outstream will be nullptr. if (outstream) { - (*outstream) << "// ********** Running multithreaded with OpenMP (max of " << gEidosMaxThreads << " threads)" << std::endl; + (*outstream) << "// ********** Running multithreaded with OpenMP (maxThreads == " << gEidosMaxThreads << ")" << std::endl; (*outstream) << "// ********** OMP_WAIT_POLICY == " << getenv("OMP_WAIT_POLICY") << ", OMP_PROC_BIND == " << getenv("OMP_PROC_BIND") << std::endl; +#if 1 + (*outstream) << "// ********** Per-task thread counts: '" << gEidosPerTaskThreadCountsSetName << "', max " << gEidosPerTaskOriginalMaxThreadCount; + if (gEidosPerTaskClippedMaxThreadCount < gEidosPerTaskOriginalMaxThreadCount) + (*outstream) << " (clipped to " << gEidosPerTaskClippedMaxThreadCount << ")"; + (*outstream) << std::endl; +#endif + #if 0 // BCH 5/19/2023: #if 0 for now, because this gives an error on some platforms; we don't support offloading anyway. // Look for devices (GPUs, accelerators) that we are able to offload to. 
diff --git a/eidos/eidos_globals.h b/eidos/eidos_globals.h index 4b17dc43e..8f12923bb 100644 --- a/eidos/eidos_globals.h +++ b/eidos/eidos_globals.h @@ -56,11 +56,24 @@ class EidosToken; #define EIDOS_VERSION_FLOAT (3.01) -// These should be called once at startup to give Eidos an opportunity to initialize static state #ifdef _OPENMP -void _Eidos_SetDefaultOpenMPThreadCounts(void); +typedef enum { + kDefault = 0, // indicates that one of the other values should be chosen heuristically + kMaxThreads, // use EIDOS_OMP_MAX_THREADS for everything + kMacStudio2022_16, // Mac Studio 2022 (Mac13,2), 20-core M1 Ultra (16 performance cores) + kXeonGold2_40, // two 20-core (40-hyperthreaded) Intel Xeon Gold 6148 2.4GHz (40 physical cores) +} EidosPerTaskThreadCounts; + +// Some state variables for user output regarding the OpenMP configuration +extern EidosPerTaskThreadCounts gEidosDefaultPerTaskThreadCounts; // the default set on the command line, or kDefault +extern std::string gEidosPerTaskThreadCountsSetName; +extern int gEidosPerTaskOriginalMaxThreadCount, gEidosPerTaskClippedMaxThreadCount; + +// Eidos_WarmUpOpenMP() should be called once at startup to give Eidos an opportunity to initialize static state +void _Eidos_SetOpenMPThreadCounts(EidosPerTaskThreadCounts per_task_thread_counts); +void _Eidos_ChooseDefaultOpenMPThreadCounts(void); void _Eidos_ClipOpenMPThreadCounts(void); -void Eidos_WarmUpOpenMP(std::ostream *outstream, bool changed_max_thread_count, int new_max_thread_count, bool active_threads); +void Eidos_WarmUpOpenMP(std::ostream *outstream, bool changed_max_thread_count, int new_max_thread_count, bool active_threads, std::string thread_count_set_name); #endif void Eidos_WarmUp(void); diff --git a/eidos/eidos_openmp.h b/eidos/eidos_openmp.h index b7f6f76ae..722565581 100644 --- a/eidos/eidos_openmp.h +++ b/eidos/eidos_openmp.h @@ -38,7 +38,7 @@ if building in Xcode, use the provided separate version of the project, SLiM_OpenMP.xcodeproj, and the separate 
targets eidos_multi and slim_multi with -DPARALLEL=ON, the built executables will be slim_multi and eidos_multi, to make it easier to distinguish them; but of course you may rename them as you see fit on macOS, you may (several times!) get a system alert that libomp was blocked for security; after that, go to System Preferences, Security & Privacy, tab General, click "Allow Anyway", and then click "Open" back in the system panel - use the -maxthreads command-line option to change the maximum number of threads from OpenMP's default + use the -maxThreads command-line option to change the maximum number of threads from OpenMP's default We allocate per-thread storage (for gEidosMaxThreads threads) at the global/species level for these facilities: diff --git a/eidostool/main.cpp b/eidostool/main.cpp index d8ea5c5e3..04823d77b 100644 --- a/eidostool/main.cpp +++ b/eidostool/main.cpp @@ -32,8 +32,8 @@ void PrintUsageAndDie() std::cout << "usage: eidos -version | -usage | -testEidos | [-time] [-mem]" << std::endl; std::cout << " "; #ifdef _OPENMP - // The -maxthreads flag is visible only for a parallel build - std::cout << "[-maxthreads ] "; + // Some flags are visible only for a parallel build + std::cout << "[-maxThreads ] [-perTaskThreads \"x\"] "; #endif std::cout << "