Fix the NUMA detection for the smcuda BTL. #12391

Merged 1 commit on Mar 6, 2024
118 changes: 61 additions & 57 deletions opal/mca/btl/smcuda/btl_smcuda.c
@@ -14,7 +14,7 @@
  * Copyright (c) 2009-2012 Cisco Systems, Inc. All rights reserved.
  * Copyright (c) 2010-2017 Los Alamos National Security, LLC. All rights
  *                         reserved.
- * Copyright (c) 2012-2023 NVIDIA Corporation. All rights reserved.
+ * Copyright (c) 2012-2024 NVIDIA Corporation. All rights reserved.
  * Copyright (c) 2012      Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2014-2017 Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
@@ -216,56 +216,52 @@ static int smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, int32_t my_smp_rank)
 {
     size_t length, length_payload;
     sm_fifo_t *my_fifos;
-    int my_mem_node, num_mem_nodes, i, rc;
     mca_common_sm_mpool_resources_t *res = NULL;
-    mca_btl_smcuda_component_t *m = &mca_btl_smcuda_component;
     char *loc, *mynuma;
     opal_process_name_t wildcard_rank;
+    int rc;
 
     /* Assume we don't have hwloc support and fill in dummy info */
-    mca_btl_smcuda_component.mem_node = my_mem_node = 0;
-    mca_btl_smcuda_component.num_mem_nodes = num_mem_nodes = 1;
+    mca_btl_smcuda_component.mem_node = -1;
+    mca_btl_smcuda_component.num_mem_nodes = 1;
 
     /* see if we were given a topology signature */
     wildcard_rank.jobid = OPAL_PROC_MY_NAME.jobid;
     wildcard_rank.vpid = OPAL_VPID_WILDCARD;
     OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_TOPOLOGY_SIGNATURE, &wildcard_rank, &loc, PMIX_STRING);
     if (OPAL_SUCCESS == rc) {
         /* the number of NUMA nodes is right at the front */
-        mca_btl_smcuda_component.num_mem_nodes = num_mem_nodes = strtoul(loc, NULL, 10);
+        mca_btl_smcuda_component.num_mem_nodes = strtoul(loc, NULL, 10);
         free(loc);
     } else {
         /* If we have hwloc support, then get accurate information */
         loc = NULL;
         if (OPAL_SUCCESS == opal_hwloc_base_get_topology()) {
-            i = opal_hwloc_base_get_nbobjs_by_type(opal_hwloc_topology, HWLOC_OBJ_NODE, 0,
-                                                   OPAL_HWLOC_AVAILABLE);
+            rc = opal_hwloc_base_get_nbobjs_by_type(opal_hwloc_topology, HWLOC_OBJ_NODE, 0,
+                                                    OPAL_HWLOC_AVAILABLE);
 
             /* JMS This tells me how many numa nodes are *available*,
                but it's not how many are being used *by this job*.
                Note that this is the value we've previously used (from
                the previous carto-based implementation), but it really
               should be improved to be how many NUMA nodes are being
               used *in this job*. */
-            mca_btl_smcuda_component.num_mem_nodes = num_mem_nodes = i;
+            mca_btl_smcuda_component.num_mem_nodes = rc;
         }
     }
     /* see if we were given our location */
     OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING, &OPAL_PROC_MY_NAME, &loc, PMIX_STRING);
     if (OPAL_SUCCESS == rc) {
-        if (NULL == loc) {
-            mca_btl_smcuda_component.mem_node = my_mem_node = -1;
-        } else {
+        if (NULL != loc) {
             /* get our NUMA location */
             mynuma = opal_hwloc_base_get_location(loc, HWLOC_OBJ_NODE, 0);
             if (NULL == mynuma || NULL != strchr(mynuma, ',') || NULL != strchr(mynuma, '-')) {
                 /* we either have no idea what NUMA we are on, or we
                  * are on multiple NUMA nodes */
-                mca_btl_smcuda_component.mem_node = my_mem_node = -1;
+                mca_btl_smcuda_component.mem_node = -1;
             } else {
                 /* we are bound to a single NUMA node */
-                my_mem_node = strtoul(mynuma, NULL, 10);
-                mca_btl_smcuda_component.mem_node = my_mem_node;
+                mca_btl_smcuda_component.mem_node = strtoul(mynuma, NULL, 10);
             }
             if (NULL != mynuma) {
                 free(mynuma);
@@ -274,14 +270,14 @@ static int smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, int32_t my_smp_rank)
         }
     } else {
         /* If we have hwloc support, then get accurate information */
-        if (OPAL_SUCCESS == opal_hwloc_base_get_topology() && num_mem_nodes > 0
+        if (OPAL_SUCCESS == opal_hwloc_base_get_topology() && mca_btl_smcuda_component.num_mem_nodes > 0
             && NULL != opal_process_info.cpuset) {
             int numa = 0, w;
             unsigned n_bound = 0;
             hwloc_obj_t obj;
 
             /* count the number of NUMA nodes to which we are bound */
-            for (w = 0; w < i; w++) {
+            for (w = 0; w < mca_btl_smcuda_component.num_mem_nodes; w++) {
                 if (NULL
                     == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_NODE,
                                                               0, w, OPAL_HWLOC_AVAILABLE))) {
@@ -297,27 +293,35 @@ static int smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, int32_t my_smp_rank)
              * a NUMA we are on, then not much we can do
              */
             if (1 == n_bound) {
-                mca_btl_smcuda_component.mem_node = my_mem_node = numa;
-            } else {
-                mca_btl_smcuda_component.mem_node = my_mem_node = -1;
+                mca_btl_smcuda_component.mem_node = numa;
             }
         }
     }
+    /* sanity check: do we have the NUMA node info ? */
+    if( mca_btl_smcuda_component.mem_node < 0 ||
+        mca_btl_smcuda_component.num_mem_nodes < 1) {
+        opal_output_verbose(10, opal_btl_base_framework.framework_output,
+                            "btl:smcuda: %s unable to find topological information mem_node=%d, num_mem_nodes=%d",
+                            OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
+                            mca_btl_smcuda_component.mem_node, mca_btl_smcuda_component.num_mem_nodes);
+        mca_btl_smcuda_component.mem_node = 0;
+        mca_btl_smcuda_component.num_mem_nodes = 1;
+    }
 
     if (NULL == (res = calloc(1, sizeof(*res)))) {
         return OPAL_ERR_OUT_OF_RESOURCE;
     }
 
     /* lookup shared memory pool */
     mca_btl_smcuda_component.sm_mpools = (mca_mpool_base_module_t **)
-        calloc(num_mem_nodes, sizeof(mca_mpool_base_module_t *));
+        calloc(mca_btl_smcuda_component.num_mem_nodes, sizeof(mca_mpool_base_module_t *));
 
     /* Disable memory binding, because each MPI process will claim pages in the
      * mpool for their local NUMA node */
     res->mem_node = -1;
     res->allocator = mca_btl_smcuda_component.allocator;
 
-    if (OPAL_SUCCESS != (rc = setup_mpool_base_resources(m, res))) {
+    if (OPAL_SUCCESS != (rc = setup_mpool_base_resources(&mca_btl_smcuda_component, res))) {
         free(res);
         return rc;
     }
@@ -344,7 +348,7 @@ static int smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, int32_t my_smp_rank)
 
     /* remember that node rank zero is already attached */
     if (0 != my_smp_rank) {
-        if (OPAL_SUCCESS != (rc = sm_segment_attach(m))) {
+        if (OPAL_SUCCESS != (rc = sm_segment_attach(&mca_btl_smcuda_component))) {
             free(res);
             return rc;
         }
@@ -357,7 +361,7 @@ static int smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, int32_t my_smp_rank)
                         "btl:smcuda: host_register address=%p, size=%d",
                         mca_btl_smcuda_component.sm_mpool_base, (int) res->size);
     if (0 != strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "null")) {
-        rc = opal_accelerator.host_register(MCA_ACCELERATOR_NO_DEVICE_ID, mca_btl_smcuda_component.sm_mpool_base, res->size);
+        rc = opal_accelerator.host_register(MCA_ACCELERATOR_NO_DEVICE_ID, mca_btl_smcuda_component.sm_mpool_base, res->size);
         if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
             /* If registering the memory fails, print a message and continue.
              * This is not a fatal error. */
@@ -394,7 +398,7 @@ static int smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, int32_t my_smp_rank)
     mca_btl_smcuda_component.shm_bases[mca_btl_smcuda_component.my_smp_rank]
         = (char *) mca_btl_smcuda_component.sm_mpool_base;
     mca_btl_smcuda_component.shm_mem_nodes[mca_btl_smcuda_component.my_smp_rank] = (uint16_t)
-        my_mem_node;
+        mca_btl_smcuda_component.mem_node;
 
     /* initialize the array of fifo's "owned" by this process */
     if (NULL == (my_fifos = (sm_fifo_t *) mpool_calloc(FIFO_MAP_NUM(n), sizeof(sm_fifo_t))))
@@ -420,45 +424,45 @@ static int smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, int32_t my_smp_rank)
     /* allocation will be for the fragment descriptor and payload buffer */
     length = sizeof(mca_btl_smcuda_frag1_t);
     length_payload = sizeof(mca_btl_smcuda_hdr_t) + mca_btl_smcuda_component.eager_limit;
-    i = opal_free_list_init(&mca_btl_smcuda_component.sm_frags_eager, length, opal_cache_line_size,
-                            OBJ_CLASS(mca_btl_smcuda_frag1_t), length_payload, opal_cache_line_size,
-                            mca_btl_smcuda_component.sm_free_list_num,
-                            mca_btl_smcuda_component.sm_free_list_max,
-                            mca_btl_smcuda_component.sm_free_list_inc,
-                            mca_btl_smcuda_component.sm_mpool, 0, NULL, NULL, NULL);
-    if (OPAL_SUCCESS != i)
-        return i;
+    rc = opal_free_list_init(&mca_btl_smcuda_component.sm_frags_eager, length, opal_cache_line_size,
+                             OBJ_CLASS(mca_btl_smcuda_frag1_t), length_payload, opal_cache_line_size,
+                             mca_btl_smcuda_component.sm_free_list_num,
+                             mca_btl_smcuda_component.sm_free_list_max,
+                             mca_btl_smcuda_component.sm_free_list_inc,
+                             mca_btl_smcuda_component.sm_mpool, 0, NULL, NULL, NULL);
+    if (OPAL_SUCCESS != rc)
+        return rc;
 
     length = sizeof(mca_btl_smcuda_frag2_t);
     length_payload = sizeof(mca_btl_smcuda_hdr_t) + mca_btl_smcuda_component.max_frag_size;
-    i = opal_free_list_init(&mca_btl_smcuda_component.sm_frags_max, length, opal_cache_line_size,
-                            OBJ_CLASS(mca_btl_smcuda_frag2_t), length_payload, opal_cache_line_size,
-                            mca_btl_smcuda_component.sm_free_list_num,
-                            mca_btl_smcuda_component.sm_free_list_max,
-                            mca_btl_smcuda_component.sm_free_list_inc,
-                            mca_btl_smcuda_component.sm_mpool, 0, NULL, NULL, NULL);
-    if (OPAL_SUCCESS != i)
-        return i;
-
-    i = opal_free_list_init(&mca_btl_smcuda_component.sm_frags_user, sizeof(mca_btl_smcuda_user_t),
-                            opal_cache_line_size, OBJ_CLASS(mca_btl_smcuda_user_t),
-                            sizeof(mca_btl_smcuda_hdr_t), opal_cache_line_size,
-                            mca_btl_smcuda_component.sm_free_list_num,
-                            mca_btl_smcuda_component.sm_free_list_max,
-                            mca_btl_smcuda_component.sm_free_list_inc,
-                            mca_btl_smcuda_component.sm_mpool, 0, NULL, NULL, NULL);
-    if (OPAL_SUCCESS != i)
-        return i;
+    rc = opal_free_list_init(&mca_btl_smcuda_component.sm_frags_max, length, opal_cache_line_size,
+                             OBJ_CLASS(mca_btl_smcuda_frag2_t), length_payload, opal_cache_line_size,
+                             mca_btl_smcuda_component.sm_free_list_num,
+                             mca_btl_smcuda_component.sm_free_list_max,
+                             mca_btl_smcuda_component.sm_free_list_inc,
+                             mca_btl_smcuda_component.sm_mpool, 0, NULL, NULL, NULL);
+    if (OPAL_SUCCESS != rc)
+        return rc;
+
+    rc = opal_free_list_init(&mca_btl_smcuda_component.sm_frags_user, sizeof(mca_btl_smcuda_user_t),
+                             opal_cache_line_size, OBJ_CLASS(mca_btl_smcuda_user_t),
+                             sizeof(mca_btl_smcuda_hdr_t), opal_cache_line_size,
+                             mca_btl_smcuda_component.sm_free_list_num,
+                             mca_btl_smcuda_component.sm_free_list_max,
+                             mca_btl_smcuda_component.sm_free_list_inc,
+                             mca_btl_smcuda_component.sm_mpool, 0, NULL, NULL, NULL);
+    if (OPAL_SUCCESS != rc)
+        return rc;
 
     mca_btl_smcuda_component.num_outstanding_frags = 0;
 
     mca_btl_smcuda_component.num_pending_sends = 0;
-    i = opal_free_list_init(&mca_btl_smcuda_component.pending_send_fl,
-                            sizeof(btl_smcuda_pending_send_item_t), 8,
-                            OBJ_CLASS(opal_free_list_item_t), 0, 0, 16, -1, 32, NULL, 0, NULL, NULL,
-                            NULL);
-    if (OPAL_SUCCESS != i)
-        return i;
+    rc = opal_free_list_init(&mca_btl_smcuda_component.pending_send_fl,
+                             sizeof(btl_smcuda_pending_send_item_t), 8,
+                             OBJ_CLASS(opal_free_list_item_t), 0, 0, 16, -1, 32, NULL, 0, NULL, NULL,
+                             NULL);
+    if (OPAL_SUCCESS != rc)
+        return rc;
 
     /* set flag indicating btl has been inited */
     smcuda_btl->btl_inited = true;
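
The patch keeps NUMA discovery as a three-step fallback: take the node count from the PMIx topology signature if one was published, otherwise count NUMA objects through hwloc, and finally (via the new sanity check) fall back to the dummy single-node values rather than carrying an unusable mem_node/num_mem_nodes pair into the mpool setup. It also keeps the existing rule that a locality string such as "3" means the process is bound to one NUMA node, while "0,1" or "0-3" means the binding spans several. Below is a minimal standalone sketch of both rules using plain hwloc; it is an illustration, not code from this PR (parse_numa_location is a hypothetical helper, and HWLOC_OBJ_NUMANODE is the hwloc 2.x spelling of the deprecated HWLOC_OBJ_NODE alias that the OPAL code uses):

#include <hwloc.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical helper mirroring the patch's locality test: a location
 * like "3" is a single NUMA node; "0,1" or "0-3" spans several nodes,
 * so return -1 just as the patch stores mem_node = -1 in that case. */
static int parse_numa_location(const char *mynuma)
{
    if (NULL == mynuma || NULL != strchr(mynuma, ',') || NULL != strchr(mynuma, '-')) {
        return -1;
    }
    return (int) strtoul(mynuma, NULL, 10);
}

int main(void)
{
    hwloc_topology_t topo;
    int num_numa;

    if (0 != hwloc_topology_init(&topo) || 0 != hwloc_topology_load(topo)) {
        fprintf(stderr, "failed to load hwloc topology\n");
        return 1;
    }

    /* count available NUMA nodes, as the hwloc branch of the patch does */
    num_numa = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_NUMANODE);

    /* same spirit as the new sanity check: fall back to a single dummy
     * memory node if the topology query gave us nothing usable */
    if (num_numa < 1) {
        num_numa = 1;
    }

    printf("num_mem_nodes=%d mem_node(\"2\")=%d mem_node(\"0-3\")=%d\n",
           num_numa, parse_numa_location("2"), parse_numa_location("0-3"));

    hwloc_topology_destroy(topo);
    return 0;
}

Compiled with something like cc numa_sketch.c -lhwloc (file name arbitrary), this prints the node count and the parsed locations, mirroring the assignments the patch makes to mca_btl_smcuda_component.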