From 77392aad89406f02887e7f15bc848ba7ef602988 Mon Sep 17 00:00:00 2001 From: Jessie Yang Date: Thu, 25 Jan 2024 14:58:34 -0800 Subject: [PATCH] coll/tuned: Change the bcast default collective algorithm selection The default algorithm selections were out of date and not performing well. After gathering data using the ompi-collectives-tuning package, new default algorithm decisions are selected for bcast. Signed-off-by: Jessie Yang --- ompi/mca/coll/tuned/coll_tuned.h | 1 + .../coll/tuned/coll_tuned_decision_fixed.c | 76 +++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/ompi/mca/coll/tuned/coll_tuned.h b/ompi/mca/coll/tuned/coll_tuned.h index 8132f3fadd3..cb82b47e6be 100644 --- a/ompi/mca/coll/tuned/coll_tuned.h +++ b/ompi/mca/coll/tuned/coll_tuned.h @@ -136,6 +136,7 @@ int ompi_coll_tuned_barrier_intra_check_forced_init (coll_tuned_force_algorithm_ /* Bcast */ int ompi_coll_tuned_bcast_intra_dec_fixed(BCAST_ARGS); +int ompi_coll_tuned_bcast_intra_disjoint_dec_fixed(BCAST_ARGS); int ompi_coll_tuned_bcast_intra_dec_dynamic(BCAST_ARGS); int ompi_coll_tuned_bcast_intra_do_this(BCAST_ARGS, int algorithm, int faninout, int segsize); int ompi_coll_tuned_bcast_intra_check_forced_init (coll_tuned_force_algorithm_mca_param_indices_t *mca_param_indices); diff --git a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c index 1c445c0c889..1e8b70e8b3e 100644 --- a/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c +++ b/ompi/mca/coll/tuned/coll_tuned_decision_fixed.c @@ -514,6 +514,10 @@ int ompi_coll_tuned_bcast_intra_dec_fixed(void *buff, int count, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { + if (OMPI_COMM_IS_DISJOINT(comm)) { + return ompi_coll_tuned_bcast_intra_disjoint_dec_fixed(buff, count, datatype, root, comm, module); + } + size_t total_dsize, dsize; int communicator_size, alg; communicator_size = ompi_comm_size(comm); @@ -651,6 +655,78 @@ int ompi_coll_tuned_bcast_intra_dec_fixed(void 
*buff, int count, alg, 0, 0); } + +/* + * bcast_intra_disjoint_dec_fixed: bcast_intra_dec for inter node (disjoint) communicators + * + * Function: - selects broadcast algorithm to use from the communicator size and the total message size (datatype size times count), then runs it through ompi_coll_tuned_bcast_intra_do_this(); ompi_coll_tuned_bcast_intra_dec_fixed() forwards here when OMPI_COMM_IS_DISJOINT(comm) is true + * Accepts: - same arguments as MPI_Bcast() + * Returns: - MPI_SUCCESS or error code (passed from the bcast implementation) + */ +int ompi_coll_tuned_bcast_intra_disjoint_dec_fixed(void *buff, int count, + struct ompi_datatype_t *datatype, int root, + struct ompi_communicator_t *comm, + mca_coll_base_module_t *module) { + size_t total_dsize, dsize; + int communicator_size, alg; + communicator_size = ompi_comm_size(comm); + + ompi_datatype_type_size(datatype, &dsize); /* dsize is filled in by ompi_datatype_type_size() (presumably bytes per element - TODO confirm) */ + total_dsize = dsize * (unsigned long)count; /* total message size; compared against the thresholds below */ + + OPAL_OUTPUT((ompi_coll_tuned_stream, "ompi_coll_tuned_bcast_intra_disjoint_dec_fixed" + " root %d rank %d com_size %d", + root, ompi_comm_rank(comm), communicator_size)); + + /** Algorithms (only 1, 5 and 7 are selected by the decision tree below): + * {1, "basic_linear"}, + * {2, "chain"}, + * {3, "pipeline"}, + * {4, "split_binary_tree"}, + * {5, "binary_tree"}, + * {6, "binomial"}, + * {7, "knomial"}, + * {8, "scatter_allgather"}, + * {9, "scatter_allgather_ring"}, + */ + if (communicator_size < 4) { /* communicator size below 4: basic_linear regardless of message size */ + alg = 1; + } else if (communicator_size < 8) { + if (total_dsize < 1048576) { /* 1048576 = 2^20 */ + alg = 1; + } else { + alg = 5; + } + } else if (communicator_size < 16) { /* NOTE(review): makes the same choices as the < 8 branch above */ + if (total_dsize < 1048576) { + alg = 1; + } else { + alg = 5; + } + } else if (communicator_size < 32) { + if (total_dsize < 262144) { /* 262144 = 2^18 */ + alg = 1; + } else if (total_dsize < 1048576) { + alg = 7; + } else { + alg = 5; + } + } else { /* communicator size 32 or more */ + if (total_dsize < 65536) { /* 65536 = 2^16 */ + alg = 1; + } else if (total_dsize < 1048576) { + alg = 7; + } else { + alg = 5; + } + } + + return ompi_coll_tuned_bcast_intra_do_this (buff, count, datatype, root, + comm, module, + alg, 0, 0); /* the two zeros are the faninout and segsize arguments */ +} + + /* * reduce_intra_dec *