From 29efcef4a45379495a657f369e9cc42b3d3a01f7 Mon Sep 17 00:00:00 2001 From: Wenduo Wang Date: Sat, 13 Jan 2024 21:38:26 +0000 Subject: [PATCH] ofi: follow user specified include/exclude list to select providers This PR addresses #12233 Since 5.0.x, we introduced an optional FI_HMEM capability in ofi provider selection logic(both mtl and btl) in order to support accelerator memory. As described in the issue, this introduced a bug that can cause the wrong ofi provider to be selected, even if the user explicitly includes/excludes the provider name. This change refactors the selection logic to correctly handle the include/exclude list, and therefore fixes the bug. Signed-off-by: Wenduo Wang --- ompi/mca/mtl/ofi/mtl_ofi_component.c | 10 +++++- opal/mca/btl/ofi/btl_ofi_component.c | 47 ++++++++++++++++-------- opal/mca/common/ofi/common_ofi.c | 54 ++++++++++++++++++++++++---- opal/mca/common/ofi/common_ofi.h | 32 +++++++++++++++++ 4 files changed, 121 insertions(+), 22 deletions(-) diff --git a/ompi/mca/mtl/ofi/mtl_ofi_component.c b/ompi/mca/mtl/ofi/mtl_ofi_component.c index e4ac687edd5..4fb44ecdf42 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_component.c +++ b/ompi/mca/mtl/ofi/mtl_ofi_component.c @@ -764,12 +764,20 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, "%s:%d: fi_getinfo(): %s\n", __FILE__, __LINE__, fi_strerror(-ret)); - if (FI_ENODATA == -ret) { + if ((FI_ENODATA == -ret) + || (0 == ret && include_list + && 0 == opal_common_ofi_count_providers_in_list(providers, include_list)) + || (0 == ret && !include_list && exclude_list + && opal_common_ofi_providers_subset_of_list(providers, exclude_list))) { #if defined(FI_HMEM) /* Attempt selecting a provider without FI_HMEM hints */ if (hints->caps & FI_HMEM) { hints->caps &= ~FI_HMEM; hints->domain_attr->mr_mode &= ~FI_MR_HMEM; + if (providers) { + (void) fi_freeinfo(providers); + providers = NULL; + } goto no_hmem; } #endif diff --git a/opal/mca/btl/ofi/btl_ofi_component.c b/opal/mca/btl/ofi/btl_ofi_component.c index a9de2620ac4..3c3ff6da4e1 100644 --- a/opal/mca/btl/ofi/btl_ofi_component.c +++ b/opal/mca/btl/ofi/btl_ofi_component.c @@ -260,7 +260,7 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init(int *num_btl_modules, int rc; uint64_t progress_mode; unsigned resource_count = 0; - struct mca_btl_base_module_t **base_modules; + struct mca_btl_base_module_t **base_modules = NULL; char **include_list = NULL, **exclude_list = NULL; BTL_VERBOSE(("initializing ofi btl")); @@ -275,7 +275,7 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init(int *num_btl_modules, return NULL; } - struct fi_info *info, *info_list, *selected_info; + struct fi_info *info, *info_list = NULL, *selected_info = NULL; struct fi_info hints = {0}; struct fi_ep_attr ep_attr = {0}; struct fi_rx_attr rx_attr = {0}; @@ -366,22 +366,31 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init(int *num_btl_modules, * The earliest version the explictly allow provider to call CUDA API is 1.18 */ rc = fi_getinfo(FI_VERSION(1, 18), NULL, NULL, 0, &hints, &info_list); if (FI_ENOSYS == -rc) { - rc = fi_getinfo(FI_VERSION(1, 9), NULL, NULL, 0, &hints, &info_list); + rc = fi_getinfo(FI_VERSION(1, 9), NULL, NULL, 0, &hints, &info_list); } - if (0 != rc) { + + if ((FI_ENODATA == -rc) + || (0 == rc && include_list + && 0 == opal_common_ofi_count_providers_in_list(info_list, include_list)) + || (0 == rc && !include_list && exclude_list + && opal_common_ofi_providers_subset_of_list(info_list, exclude_list))) { #if defined(FI_HMEM) + /* Attempt selecting a provider without FI_HMEM hints */ if (hints.caps & FI_HMEM) { - /* Try again without FI_HMEM hints */ hints.caps &= ~FI_HMEM; hints.domain_attr->mr_mode &= ~FI_MR_HMEM; + if (info_list) { + (void) fi_freeinfo(info_list); + info_list = NULL; + } goto no_hmem; } #endif + /* It is not an error if no information is returned. */ + goto out; + } else if (0 != rc) { BTL_VERBOSE(("fi_getinfo failed with code %d: %s", rc, fi_strerror(-rc))); - if (NULL != include_list) { - opal_argv_free(include_list); - } - return NULL; + goto out; } #if defined(FI_HMEM) @@ -441,16 +450,15 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init(int *num_btl_modules, info = info->next; } - /* We are done with the returned info. */ - fi_freeinfo(info_list); - if (NULL != include_list) { - opal_argv_free(include_list); + if (NULL == info) { + BTL_VERBOSE(("No provider is selected")); + goto out; } /* pass module array back to caller */ base_modules = calloc(mca_btl_ofi_component.module_count, sizeof(*base_modules)); if (NULL == base_modules) { - return NULL; + goto out; } memcpy(base_modules, mca_btl_ofi_component.modules, @@ -461,6 +469,17 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init(int *num_btl_modules, *num_btl_modules = mca_btl_ofi_component.module_count; +out: + if (include_list) { + opal_argv_free(include_list); + } + if (exclude_list) { + opal_argv_free(exclude_list); + } + if (info_list) { + (void) fi_freeinfo(info_list); + } + return base_modules; } diff --git a/opal/mca/common/ofi/common_ofi.c b/opal/mca/common/ofi/common_ofi.c index f0da0f4a52c..0f6a37b9475 100644 --- a/opal/mca/common/ofi/common_ofi.c +++ b/opal/mca/common/ofi/common_ofi.c @@ -49,6 +49,15 @@ static int opal_common_ofi_init_ref_cnt = 0; static bool opal_common_ofi_installed_memory_monitor = false; #endif +/* Count providers returns the number of providers present in an fi_info list + * @param (IN) provider_list struct fi_info* list of providers available + * + * @param (OUT) int number of providers present in the list + * + * returns 0 if the list is NULL + */ +static int count_providers(struct fi_info *provider_list); + #ifdef HAVE_STRUCT_FI_OPS_MEM_MONITOR /* @@ -272,6 +281,44 @@ int opal_common_ofi_is_in_list(char **list, char *item) return 0; } +int opal_common_ofi_count_providers_in_list(struct fi_info *provider_list, char **list) +{ + int count = 0, matched = 0; + struct fi_info *prov = provider_list, *prev_prov = NULL; + char *name; + + while (prov) { + name = prov->fabric_attr->prov_name; + if (prev_prov && !strncasecmp(prev_prov->fabric_attr->prov_name, name, strlen(name))) { + /** + * Providers are usually sorted by name. We can reuse the previous matching result and + * avoid the potentially expensive list traversal. + */ + count += matched; + } else if (opal_common_ofi_is_in_list(list, prov->fabric_attr->prov_name)) { + matched = 1; + ++count; + } else { + matched = 0; + } + prev_prov = prov; + prov = prov->next; + } + + return count; +} + +int opal_common_ofi_providers_subset_of_list(struct fi_info *provider_list, char **list) +{ + int num_prov = count_providers(provider_list); + + if (!num_prov) { + return 1; + } + + return num_prov == opal_common_ofi_count_providers_in_list(provider_list, list); +} + int opal_common_ofi_mca_register(const mca_base_component_t *component) { static int include_index = -1; @@ -737,13 +784,6 @@ static struct fi_info *select_provider_round_robin(struct fi_info *provider_list return current_provider; } -/* Count providers returns the number of providers present in an fi_info list - * @param (IN) provider_list struct fi_info* list of providers available - * - * @param (OUT) int number of providers present in the list - * - * returns 0 if the list is NULL - */ static int count_providers(struct fi_info *provider_list) { struct fi_info *dev = provider_list; diff --git a/opal/mca/common/ofi/common_ofi.h b/opal/mca/common/ofi/common_ofi.h index 10992410019..0bf114f5907 100644 --- a/opal/mca/common/ofi/common_ofi.h +++ b/opal/mca/common/ofi/common_ofi.h @@ -100,6 +100,38 @@ OPAL_DECLSPEC int opal_common_ofi_export_memory_monitor(void); */ OPAL_DECLSPEC int opal_common_ofi_is_in_list(char **list, char *item); +/** + * Get the number of providers whose names are included in a list + * + * This function takes a list of providers and a list of name strings + * as inputs, and return the number of providers whose names are included + * in the name strings. + * + * @param provider_list (IN) List of providers + * @param list (IN) List of name string + * + * @return Number of matched providers + * + */ +OPAL_DECLSPEC int opal_common_ofi_count_providers_in_list(struct fi_info *provider_list, + char **list); + +/** + * Determine whether all providers are included in a list + * + * This function takes a list of providers and a list of name strings + * as inputs, and return whether all provider names are included in the name strings. + * + * @param provider_list (IN) List of providers + * @param list (IN) List of name string + * + * @return 0 At least one provider's name is not included in the name strings. + * @return 1 All provider names are included in the name strings. + * + */ +OPAL_DECLSPEC int opal_common_ofi_providers_subset_of_list(struct fi_info *provider_list, + char **list); + /** * Selects NIC (provider) based on hardware locality *