Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

btl/smcuda: add delayed stream initialization #12354

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions opal/mca/btl/smcuda/btl_smcuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
* Copyright (c) 2010-2015 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2012-2023 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -204,6 +205,9 @@ struct mca_btl_smcuda_component_t {
int use_cuda_ipc;
int use_cuda_ipc_same_gpu;

int accelerator_delayed_ipc_init;
int accelerator_max_ipc_events;

unsigned long mpool_min_size;
char *allocator;
};
Expand Down
78 changes: 53 additions & 25 deletions opal/mca/btl/smcuda/btl_smcuda_accelerator.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
* Copyright (c) 2022 IBM Corporation. All rights reserved.
* Copyright (c) 2023 Triad National Security, LLC. All rights
* reserved.
* Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -33,62 +34,80 @@ static int accelerator_event_ipc_first_used;
static volatile int accelerator_event_ipc_num_used;

/* Size of the arrays holding the IPC events and their associated frags */
static int accelerator_event_max = 400;
/* NOTE(review): not referenced in the visible part of this file; presumably a
 * high-water mark of events in use -- confirm against the rest of the file. */
static int accelerator_event_ipc_most = 0;
/* Set to true by mca_btl_smcuda_accelerator_init() once its allocations
 * succeeded; reset to false by mca_btl_smcuda_accelerator_fini(). */
static bool smcuda_accelerator_initialized = false;

/* Forward declaration; the definition appears later in this file. */
void mca_btl_smcuda_accelerator_fini(void);

int mca_btl_smcuda_accelerator_init(void)
/**
 * Initialize the internal ipc stream and the events (s&e).
 *
 * The stream and every event are created on the device returned by
 * opal_accelerator.get_device() at the time of the call. The event array
 * (accelerator_event_ipc_array) is not allocated here; it has to provide
 * room for mca_btl_smcuda_component.accelerator_max_ipc_events entries
 * before this function runs.
 *
 * The function neither constructs nor takes btl_smcuda_accelerator_ipc_lock.
 *
 * If creating an event fails, the events created so far and the stream are
 * released again and ipc_stream is reset to NULL, so that a later
 * "NULL == ipc_stream" check triggers a fresh initialization attempt instead
 * of running with a partially populated event array.
 *
 * @return OPAL_SUCCESS if the stream and all events were created,
 *         OPAL_ERROR otherwise.
 */
static int mca_btl_smcuda_accelerator_ipc_init(void)
{
    int rc;
    int i;
    int device_id;

    rc = opal_accelerator.get_device(&device_id);
    if (OPAL_SUCCESS != rc) {
        opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output, "Failed to retrieve current device.");
        return OPAL_ERROR;
    }

    rc = opal_accelerator.create_stream(device_id, &ipc_stream);
    if (OPAL_SUCCESS != rc) {
        opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output, "Failed to create accelerator ipc_stream stream.");
        return OPAL_ERROR;
    }

    /* Create the events since they can be reused. */
    for (i = 0; i < mca_btl_smcuda_component.accelerator_max_ipc_events; i++) {
        rc = opal_accelerator.create_event(device_id, &accelerator_event_ipc_array[i], opal_accelerator_use_sync_memops ? false : true);
        if (OPAL_SUCCESS != rc) {
            opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output, "Accelerator create event failed.");
            goto release_partial;
        }
    }

    return OPAL_SUCCESS;

release_partial:
    /* Slot i is the one that failed; undo slots [0, i) in reverse order. */
    while (--i >= 0) {
        if (NULL != accelerator_event_ipc_array[i]) {
            OBJ_RELEASE(accelerator_event_ipc_array[i]);
            accelerator_event_ipc_array[i] = NULL;
        }
    }

    /* Drop the stream as well and mark it as "not created" for later checks. */
    OBJ_RELEASE(ipc_stream);
    ipc_stream = NULL;

    return OPAL_ERROR;
}

int mca_btl_smcuda_accelerator_init(void)
{
int rc = OPAL_SUCCESS;
int i;

OBJ_CONSTRUCT(&btl_smcuda_accelerator_ipc_lock, opal_mutex_t);

accelerator_event_ipc_num_used = 0;
accelerator_event_ipc_first_avail = 0;
accelerator_event_ipc_first_used = 0;

accelerator_event_ipc_array = calloc(accelerator_event_max, sizeof(opal_accelerator_event_t *));
accelerator_event_ipc_array = calloc(mca_btl_smcuda_component.accelerator_max_ipc_events, sizeof(opal_accelerator_event_t *));
if (NULL == accelerator_event_ipc_array) {
opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output, "No memory.");
rc = OPAL_ERROR;
goto cleanup_and_error;
}
/* Create the events since they can be reused. */
for (i = 0; i < accelerator_event_max; i++) {
rc = opal_accelerator.create_event(MCA_ACCELERATOR_NO_DEVICE_ID, &accelerator_event_ipc_array[i], opal_accelerator_use_sync_memops ? false : true);
if (OPAL_SUCCESS != rc) {
opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output, "Accelerator create event failed.");
rc = OPAL_ERROR;
goto cleanup_and_error;
}
}

/* The first available status index is 0. Make an empty frag
array. */

accelerator_event_ipc_frag_array = (struct mca_btl_base_descriptor_t **) malloc(sizeof(struct mca_btl_base_descriptor_t *) * accelerator_event_max);
accelerator_event_ipc_frag_array = (struct mca_btl_base_descriptor_t **) malloc(sizeof(struct mca_btl_base_descriptor_t *) *
mca_btl_smcuda_component.accelerator_max_ipc_events);
if (NULL == accelerator_event_ipc_frag_array) {
opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output, "No memory.");
rc = OPAL_ERROR;
goto cleanup_and_error;
}

if (!mca_btl_smcuda_component.accelerator_delayed_ipc_init) {
mca_btl_smcuda_accelerator_ipc_init();
}

smcuda_accelerator_initialized = true;

cleanup_and_error:
if (OPAL_SUCCESS != rc) {
if (NULL != accelerator_event_ipc_array) {
for (i = 0; i < accelerator_event_max; i++) {
for (i = 0; i < mca_btl_smcuda_component.accelerator_max_ipc_events; i++) {
if (NULL != accelerator_event_ipc_array[i]) {
OBJ_RELEASE(accelerator_event_ipc_array[i]);
}
Expand Down Expand Up @@ -117,7 +136,7 @@ void mca_btl_smcuda_accelerator_fini(void)
}

if (NULL != accelerator_event_ipc_array) {
for (i = 0; i < accelerator_event_max; i++) {
for (i = 0; i < mca_btl_smcuda_component.accelerator_max_ipc_events; i++) {
if (NULL != accelerator_event_ipc_array[i]) {
OBJ_RELEASE(accelerator_event_ipc_array[i]);
}
Expand All @@ -129,7 +148,9 @@ void mca_btl_smcuda_accelerator_fini(void)
free(accelerator_event_ipc_frag_array);
}

OBJ_RELEASE(ipc_stream);
if (NULL != ipc_stream) {
OBJ_RELEASE(ipc_stream);
}

OBJ_DESTRUCT(&btl_smcuda_accelerator_ipc_lock);
smcuda_accelerator_initialized = false;
Expand Down Expand Up @@ -175,7 +196,7 @@ int mca_btl_smcuda_progress_one_ipc_event(struct mca_btl_base_descriptor_t **fra
/* Bump counters, loop around the circular buffer if necessary */
--accelerator_event_ipc_num_used;
++accelerator_event_ipc_first_used;
if (accelerator_event_ipc_first_used >= accelerator_event_max) {
if (accelerator_event_ipc_first_used >= mca_btl_smcuda_component.accelerator_max_ipc_events) {
accelerator_event_ipc_first_used = 0;
}
/* A return value of 1 indicates an event completed and a frag was returned */
Expand All @@ -196,10 +217,17 @@ int mca_btl_smcuda_memcpy(void *dst, void *src, size_t amount, char *msg,
int result;
OPAL_THREAD_LOCK(&btl_smcuda_accelerator_ipc_lock);

if (NULL == ipc_stream) {
result = mca_btl_smcuda_accelerator_ipc_init();
if (OPAL_SUCCESS != result) {
return result;
}
}

/* First make sure there is room to store the event. If not, then
* return an error. The error message will tell the user to try and
* run again, but with a larger array for storing events. */
if (accelerator_event_ipc_num_used == accelerator_event_max) {
if (accelerator_event_ipc_num_used == mca_btl_smcuda_component.accelerator_max_ipc_events) {
opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output, "smcuda: Out of event handles");
OPAL_THREAD_UNLOCK(&btl_smcuda_accelerator_ipc_lock);
return OPAL_ERR_OUT_OF_RESOURCE;
Expand Down Expand Up @@ -237,7 +265,7 @@ int mca_btl_smcuda_memcpy(void *dst, void *src, size_t amount, char *msg,

/* Bump up the first available slot and number used by 1 */
accelerator_event_ipc_first_avail++;
if (accelerator_event_ipc_first_avail >= accelerator_event_max) {
if (accelerator_event_ipc_first_avail >= mca_btl_smcuda_component.accelerator_max_ipc_events) {
accelerator_event_ipc_first_avail = 0;
}
accelerator_event_ipc_num_used++;
Expand Down
17 changes: 17 additions & 0 deletions opal/mca/btl/smcuda/btl_smcuda_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
* Copyright (c) 2022 IBM Corporation. All rights reserved.
* Copyright (c) 2023 Triad National Security, LLC. All rights
* reserved.
* Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -169,6 +170,22 @@ static int smcuda_register(void)
mca_btl_smcuda_param_register_uint("fifo_lazy_free", 120, OPAL_INFO_LVL_5,
&mca_btl_smcuda_component.fifo_lazy_free);

/* Delay the creation of the IPC stream and events until they are first needed.
* This also handles the case where the user has not selected the accelerator
* device before MPI_Init and the stream/events internally keep a reference
* to the device that was current when they were created. */
mca_btl_smcuda_component.accelerator_delayed_ipc_init = 1;
(void) mca_base_component_var_register(&mca_btl_smcuda_component.super.btl_version, "delayed_stream_init",
"Delay the initialization of the ipc stream and internal events",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY, &mca_btl_smcuda_component.accelerator_delayed_ipc_init);

mca_btl_smcuda_component.accelerator_max_ipc_events = 400;
(void) mca_base_component_var_register(&mca_btl_smcuda_component.super.btl_version, "max_ipc_events",
"Number of events created by the smcuda components internally",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY, &mca_btl_smcuda_component.accelerator_max_ipc_events);

/* default number of extra procs to allow for future growth */
mca_btl_smcuda_param_register_int("sm_extra_procs", 0, OPAL_INFO_LVL_9,
&mca_btl_smcuda_component.sm_extra_procs);
Expand Down
Loading