Skip to content

Commit

Permalink
opal/mca/ofi: select NIC closest to accelerator if requested
Browse files Browse the repository at this point in the history
This patch introduces a new MCA parameter
opal_common_ofi_gpu_aware_provider_selection(default false), to select
the closest NIC to the accelerator device. If the accelerator or NIC
PCI information is not available, fallback to select the NIC on the
closest package.

The distance between the acclerator device and NIC is the number of
hwloc objects inbetween, which includes the lowest common ancestor on
the hwloc topology tree.

Signed-off-by: Wenduo Wang <[email protected]>
  • Loading branch information
wenduwan committed Jul 5, 2024
1 parent c29f239 commit a8587c2
Showing 1 changed file with 223 additions and 5 deletions.
228 changes: 223 additions & 5 deletions opal/mca/common/ofi/common_ofi.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@

#include "common_ofi.h"
#include "opal/constants.h"
#include "opal/mca/accelerator/accelerator.h"
#include "opal/mca/base/mca_base_framework.h"
#include "opal/mca/base/mca_base_var.h"
#include "opal/mca/hwloc/base/base.h"
Expand All @@ -38,13 +39,15 @@
#include "opal/util/argv.h"
#include "opal/util/show_help.h"

extern opal_accelerator_base_module_t opal_accelerator;
opal_common_ofi_module_t opal_common_ofi = {.prov_include = NULL,
.prov_exclude = NULL,
.output = -1};
static const char default_prov_exclude_list[] = "shm,sockets,tcp,udp,rstream,usnic,net";
static opal_mutex_t opal_common_ofi_mutex = OPAL_MUTEX_STATIC_INIT;
static int opal_common_ofi_verbose_level = 0;
static int opal_common_ofi_init_ref_cnt = 0;
static bool opal_common_ofi_gpu_aware_provider_selection = false;
#ifdef HAVE_STRUCT_FI_OPS_MEM_MONITOR
static bool opal_common_ofi_installed_memory_monitor = false;
#endif
Expand Down Expand Up @@ -324,6 +327,7 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component)
static int include_index = -1;
static int exclude_index = -1;
static int verbose_index = -1;
static int gpu_aware_provider_selection = -1;
int ret;

if (fi_version() < FI_VERSION(1, 0)) {
Expand Down Expand Up @@ -389,6 +393,19 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component)
}
}

if (0 > gpu_aware_provider_selection) {
gpu_aware_provider_selection
= mca_base_var_register("opal", "opal_common", "ofi", "gpu_aware_provider_selection",
"Enable GPU-aware provider selection", MCA_BASE_VAR_TYPE_BOOL,
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_1,
MCA_BASE_VAR_SCOPE_LOCAL,
&opal_common_ofi_gpu_aware_provider_selection);
if (0 > gpu_aware_provider_selection) {
ret = gpu_aware_provider_selection;
goto err;
}
}

if (component) {
ret = mca_base_var_register_synonym(include_index,
component->mca_project_name,
Expand All @@ -414,6 +431,15 @@ int opal_common_ofi_mca_register(const mca_base_component_t *component)
if (0 > ret) {
goto err;
}

ret = mca_base_var_register_synonym(gpu_aware_provider_selection,
component->mca_project_name,
component->mca_type_name,
component->mca_component_name,
"gpu_aware_provider_selection", 0);
if (0 > ret) {
goto err;
}
}

/* The frameworks initialize their output streams during
Expand Down Expand Up @@ -915,18 +941,192 @@ static uint32_t get_package_rank(opal_process_info_t *process_info)
return (uint32_t) process_info->myprocid.rank;
}

static int get_parent_distance(hwloc_obj_t parent, hwloc_obj_t child, int *distance)
{
int dist = 0;

while (child != parent) {
if (!child) {
return OPAL_ERROR;
}
child = child->parent;
++dist;
}

*distance = dist;
return OPAL_SUCCESS;
}

#if OPAL_OFI_PCI_DATA_AVAILABLE
/**
* @brief Attempt to find a nearest provider from the accelerator.
* Check if opal_accelerator is initialized with a valid PCI device, and find a provider from the
* shortest distance.
* Special cases:
* 1. If not accelerator device is available, returns OPAL_ERR_NOT_AVAILABLE.
* 2. If the provider does not have PCI attributers, we do not attempt to make a selection, and
* return OPAL_ERR_NOT_AVAILABLE.
* 3. If there are more than 1 providers with the same equal distance, break the tie using a modulo
* i.e. (local rank on the same accelerator) % (number of nearest providers)
* @param[in] provider_list linked list of providers
* @param[in] num_providers number of providers
* @param[in] accl_id Accelerator id
* @param[in] device_rank local rank on the accelerator
* @param[out] provider pointer to the selected provider
* @return OPAL_SUCCESS if a provider is successfully selected
* OPAL_ERR_NOT_AVAILABLE if a provider cannot be decided deterministically
* OPAL_ERROR if a fatal error happened
*/
static int find_nearest_provider_from_accelerator(struct fi_info *provider_list,
size_t num_providers,
int accl_id,
uint32_t device_rank,
struct fi_info **provider)
{
hwloc_obj_t accl_dev = NULL, prov_dev = NULL, common_ancestor = NULL;
int ret = -1, accl_distance = -1, prov_distance = -1, min_distance = INT_MAX;
opal_accelerator_pci_attr_t accl_pci_attr = {0};
struct fi_info *current_provider = NULL;
struct fi_pci_attr pci = {0};
uint32_t distances[num_providers], *distance = distances;
uint32_t near_provider_count = 0, provider_rank = 0;

memset(distances, 0, sizeof(distances));

ret = opal_accelerator.get_device_pci_attr(accl_id, &accl_pci_attr);
if (OPAL_SUCCESS != ret) {
opal_output_verbose(1, opal_common_ofi.output,
"%s:%d:Accelerator PCI info is not available", __FILE__, __LINE__);
return OPAL_ERROR;
}

accl_dev = hwloc_get_pcidev_by_busid(opal_hwloc_topology, accl_pci_attr.domain_id,
accl_pci_attr.bus_id, accl_pci_attr.device_id,
accl_pci_attr.function_id);
if (NULL == accl_dev) {
opal_output_verbose(1, opal_common_ofi.output,
"%s:%d:Failed to find accelerator PCI device", __FILE__, __LINE__);
return OPAL_ERROR;
}

opal_output_verbose(1, opal_common_ofi.output,
"%s:%d:Found accelerator device %d: %04x:%02x:%02x.%x VID: %x DID: %x",
__FILE__, __LINE__, accl_id, accl_pci_attr.domain_id, accl_pci_attr.bus_id,
accl_pci_attr.device_id, accl_pci_attr.function_id,
accl_dev->attr->pcidev.vendor_id, accl_dev->attr->pcidev.device_id);

current_provider = provider_list;
while (NULL != current_provider) {
common_ancestor = NULL;
if (0 == check_provider_attr(provider_list, current_provider)
&& OPAL_SUCCESS == get_provider_nic_pci(current_provider, &pci)) {
prov_dev = hwloc_get_pcidev_by_busid(opal_hwloc_topology, pci.domain_id, pci.bus_id,
pci.device_id, pci.function_id);
if (NULL == prov_dev) {
opal_output_verbose(1, opal_common_ofi.output,
"%s:%d:Failed to find provider PCI device", __FILE__, __LINE__);
return OPAL_ERROR;
}

common_ancestor = hwloc_get_common_ancestor_obj(opal_hwloc_topology, accl_dev,
prov_dev);
if (!common_ancestor) {
opal_output_verbose(
1, opal_common_ofi.output,
"%s:%d:Failed to find common ancestor of accelerator and provider PCI device",
__FILE__, __LINE__);
/**
* Return error because any 2 PCI devices should share at least one common ancestor,
* i.e. root
*/
return OPAL_ERROR;
}

ret = get_parent_distance(common_ancestor, accl_dev, &accl_distance);
if (OPAL_SUCCESS != ret) {
opal_output_verbose(
1, opal_common_ofi.output,
"%s:%d:Failed to get distance between common ancestor and accelerator device",
__FILE__, __LINE__);
return OPAL_ERROR;
}

ret = get_parent_distance(common_ancestor, prov_dev, &prov_distance);
if (OPAL_SUCCESS != ret) {
opal_output_verbose(
1, opal_common_ofi.output,
"%s:%d:Failed to get distance between common ancestor and provider device",
__FILE__, __LINE__);
return OPAL_ERROR;
}

if (min_distance > accl_distance + prov_distance) {
min_distance = accl_distance + prov_distance;
near_provider_count = 1;
} else if (min_distance == accl_distance + prov_distance) {
++near_provider_count;
}
}

*(distance++) = !common_ancestor ? 0 : accl_distance + prov_distance;
current_provider = current_provider->next;
}

if (0 == near_provider_count) {
opal_output_verbose(1, opal_common_ofi.output, "%s:%d:Provider does not have PCI device",
__FILE__, __LINE__);
return OPAL_ERR_NOT_AVAILABLE;
}

provider_rank = device_rank % near_provider_count;

distance = distances;
current_provider = provider_list;
while (NULL != current_provider) {
if ((uint32_t) min_distance == *(distance++)
&& provider_rank == --near_provider_count) {
*provider = current_provider;
return OPAL_SUCCESS;
}

current_provider = current_provider->next;
}

assert(0 == near_provider_count);

return OPAL_ERROR;
}
#endif /* OPAL_OFI_PCI_DATA_AVAILABLE */


struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
opal_process_info_t *process_info)
{
int ret, num_providers = 0;
int ret, num_providers = 0, accel_id = -1;
struct fi_info *provider = NULL;
uint32_t package_rank = process_info->my_local_rank;
uint32_t package_rank;

/* Current process' local rank on the same package(socket) */
package_rank = process_info->proc_is_bound ? get_package_rank(process_info)
: process_info->my_local_rank;
num_providers = count_providers(provider_list);
if (!process_info->proc_is_bound || 2 > num_providers) {

#if OPAL_OFI_PCI_DATA_AVAILABLE
if (opal_common_ofi_gpu_aware_provider_selection) {
ret = opal_accelerator.get_device(&accel_id);
if (OPAL_SUCCESS != ret) {
opal_output_verbose(1, opal_common_ofi.output, "%s:%d:Accelerator is not available",
__FILE__, __LINE__);
accel_id = -1;
}
}
#endif /* OPAL_OFI_PCI_DATA_AVAILABLE */

if ((!process_info->proc_is_bound && 0 > accel_id) || 2 > num_providers) {
goto round_robin;
}

#if OPAL_OFI_PCI_DATA_AVAILABLE
/* Initialize opal_hwloc_topology if it is not already */
ret = opal_hwloc_base_get_topology();
if (0 > ret) {
Expand All @@ -935,9 +1135,27 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
__FILE__, __LINE__);
}

package_rank = get_package_rank(process_info);
if (0 <= accel_id) {
/**
* If accelerator is enabled, select the closest provider to the accelerator.
* Note: the function expects a local rank on the accelerator to break ties if there are
* multiple equidistant providers. package_rank is NOT an accurate measure, but a proxy.
*/
ret = find_nearest_provider_from_accelerator(provider_list, num_providers, accel_id,
package_rank, &provider);
if (OPAL_SUCCESS == ret) {
goto out;
}

opal_output_verbose(1, opal_common_ofi.output,
"%s:%d:Failed to find a provider close to the accelerator. Error: %d",
__FILE__, __LINE__, ret);

if (!process_info->proc_is_bound) {
goto round_robin;
}
}

#if OPAL_OFI_PCI_DATA_AVAILABLE
/**
* If provider PCI BDF information is available, we calculate its physical distance
* to the current process, and select the provider with the shortest distance.
Expand Down

0 comments on commit a8587c2

Please sign in to comment.