From 835eef5ebd06e44b132670ee6dba8db3ccd8e668 Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Tue, 20 Feb 2024 13:07:28 -0800 Subject: [PATCH] btl/smcuda: add delayed stream initialization Introduce two new MCA parameters for the smcuda component: - allow for delayed initialization of the internal ipc stream and the array of events. This makes it possible to handle situations where the user code did not set the device before MPI_Init AND the internal stream and/or event structures have some dependence on the device id used during creation. - add a parameter to control how many events are created during initialization. Signed-off-by: Edgar Gabriel --- opal/mca/btl/smcuda/btl_smcuda.h | 4 + opal/mca/btl/smcuda/btl_smcuda_accelerator.c | 78 +++++++++++++------- opal/mca/btl/smcuda/btl_smcuda_component.c | 17 +++++ 3 files changed, 74 insertions(+), 25 deletions(-) diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index 962e9c268a0..812e04ea7a6 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -15,6 +15,7 @@ * Copyright (c) 2010-2015 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2012-2023 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -204,6 +205,9 @@ struct mca_btl_smcuda_component_t { int use_cuda_ipc; int use_cuda_ipc_same_gpu; + int accelerator_delayed_ipc_init; + int accelerator_max_ipc_events; + unsigned long mpool_min_size; char *allocator; }; diff --git a/opal/mca/btl/smcuda/btl_smcuda_accelerator.c b/opal/mca/btl/smcuda/btl_smcuda_accelerator.c index 81185f3fdb6..56245879dd8 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_accelerator.c +++ b/opal/mca/btl/smcuda/btl_smcuda_accelerator.c @@ -3,6 +3,7 @@ * Copyright (c) 2022 IBM Corporation. All rights reserved. * Copyright (c) 2023 Triad National Security, LLC. All rights * reserved. 
+ * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -33,62 +34,84 @@ static int accelerator_event_ipc_first_used; static volatile int accelerator_event_ipc_num_used; /* Size of array holding events */ -static int accelerator_event_max = 400; static int accelerator_event_ipc_most = 0; static bool smcuda_accelerator_initialized = false; void mca_btl_smcuda_accelerator_fini(void); -int mca_btl_smcuda_accelerator_init(void) +/* Create the internal ipc stream and fill the already allocated event array on the current device; returns OPAL_SUCCESS, or OPAL_ERROR if the device query, the stream creation or any event creation fails. */ +static int mca_btl_smcuda_accelerator_ipc_init(void) { int rc = OPAL_SUCCESS; int i; - OBJ_CONSTRUCT(&btl_smcuda_accelerator_ipc_lock, opal_mutex_t); - /* The first available status index is 0. Make an empty frag - array. */ + int device_id; + + rc = opal_accelerator.get_device(&device_id); + if (OPAL_SUCCESS != rc) { + opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output, "Failed to retrieve current device."); + return OPAL_ERROR; + } - rc = opal_accelerator.create_stream(MCA_ACCELERATOR_NO_DEVICE_ID, &ipc_stream); + rc = opal_accelerator.create_stream(device_id, &ipc_stream); if (OPAL_SUCCESS != rc) { opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output, "Failed to create accelerator ipc_stream stream."); - goto cleanup_and_error; + return OPAL_ERROR; } + /* Create the events since they can be reused. */ + for (i = 0; i < mca_btl_smcuda_component.accelerator_max_ipc_events; i++) { + rc = opal_accelerator.create_event(device_id, &accelerator_event_ipc_array[i], opal_accelerator_use_sync_memops ? 
false : true); + if (OPAL_SUCCESS != rc) { + opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output, "Accelerator create event failed."); + return OPAL_ERROR; + } + } + + return OPAL_SUCCESS; +} + +int mca_btl_smcuda_accelerator_init(void) +{ + int rc = OPAL_SUCCESS; + int i; + + OBJ_CONSTRUCT(&btl_smcuda_accelerator_ipc_lock, opal_mutex_t); + accelerator_event_ipc_num_used = 0; accelerator_event_ipc_first_avail = 0; accelerator_event_ipc_first_used = 0; - accelerator_event_ipc_array = calloc(accelerator_event_max, sizeof(opal_accelerator_event_t *)); + accelerator_event_ipc_array = calloc(mca_btl_smcuda_component.accelerator_max_ipc_events, sizeof(opal_accelerator_event_t *)); if (NULL == accelerator_event_ipc_array) { opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output, "No memory."); rc = OPAL_ERROR; goto cleanup_and_error; } - /* Create the events since they can be reused. */ - for (i = 0; i < accelerator_event_max; i++) { - rc = opal_accelerator.create_event(MCA_ACCELERATOR_NO_DEVICE_ID, &accelerator_event_ipc_array[i], opal_accelerator_use_sync_memops ? false : true); - if (OPAL_SUCCESS != rc) { - opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output, "Accelerator create event failed."); - rc = OPAL_ERROR; - goto cleanup_and_error; - } - } /* The first available status index is 0. Make an empty frag array. 
*/ - - accelerator_event_ipc_frag_array = (struct mca_btl_base_descriptor_t **) malloc(sizeof(struct mca_btl_base_descriptor_t *) * accelerator_event_max); + accelerator_event_ipc_frag_array = (struct mca_btl_base_descriptor_t **) malloc(sizeof(struct mca_btl_base_descriptor_t *) * + mca_btl_smcuda_component.accelerator_max_ipc_events); if (NULL == accelerator_event_ipc_frag_array) { opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output, "No memory."); rc = OPAL_ERROR; goto cleanup_and_error; } + if (!mca_btl_smcuda_component.accelerator_delayed_ipc_init) { + /* Eager mode: a failed stream/event setup must fail the init instead of being ignored and leaving NULL event slots behind. */ + rc = mca_btl_smcuda_accelerator_ipc_init(); + if (OPAL_SUCCESS != rc) { + goto cleanup_and_error; + } + } + smcuda_accelerator_initialized = true; cleanup_and_error: if (OPAL_SUCCESS != rc) { if (NULL != accelerator_event_ipc_array) { - for (i = 0; i < accelerator_event_max; i++) { + for (i = 0; i < mca_btl_smcuda_component.accelerator_max_ipc_events; i++) { if (NULL != accelerator_event_ipc_array[i]) { OBJ_RELEASE(accelerator_event_ipc_array[i]); } @@ -117,7 +140,7 @@ void mca_btl_smcuda_accelerator_fini(void) } if (NULL != accelerator_event_ipc_array) { - for (i = 0; i < accelerator_event_max; i++) { + for (i = 0; i < mca_btl_smcuda_component.accelerator_max_ipc_events; i++) { if (NULL != accelerator_event_ipc_array[i]) { OBJ_RELEASE(accelerator_event_ipc_array[i]); } @@ -129,7 +152,9 @@ void mca_btl_smcuda_accelerator_fini(void) free(accelerator_event_ipc_frag_array); } - OBJ_RELEASE(ipc_stream); + if (NULL != ipc_stream) { + OBJ_RELEASE(ipc_stream); + } OBJ_DESTRUCT(&btl_smcuda_accelerator_ipc_lock); smcuda_accelerator_initialized = false; @@ -175,7 +200,7 @@ int mca_btl_smcuda_progress_one_ipc_event(struct mca_btl_base_descriptor_t **fra /* Bump counters, loop around the circular buffer if necessary */ --accelerator_event_ipc_num_used; ++accelerator_event_ipc_first_used; - if (accelerator_event_ipc_first_used >= accelerator_event_max) { + if (accelerator_event_ipc_first_used >= mca_btl_smcuda_component.accelerator_max_ipc_events) { 
accelerator_event_ipc_first_used = 0; } /* A return value of 1 indicates an event completed and a frag was returned */ @@ -196,10 +217,20 @@ int mca_btl_smcuda_memcpy(void *dst, void *src, size_t amount, char *msg, int result; OPAL_THREAD_LOCK(&btl_smcuda_accelerator_ipc_lock); + if (NULL == ipc_stream) { + /* Stream not created yet (delayed initialization): set up the stream and events on first use. */ + result = mca_btl_smcuda_accelerator_ipc_init(); + if (OPAL_SUCCESS != result) { + /* Do not leave this function with the ipc lock still held. */ + OPAL_THREAD_UNLOCK(&btl_smcuda_accelerator_ipc_lock); + return result; + } + } + /* First make sure there is room to store the event. If not, then * return an error. The error message will tell the user to try and * run again, but with a larger array for storing events. */ - if (accelerator_event_ipc_num_used == accelerator_event_max) { + if (accelerator_event_ipc_num_used == mca_btl_smcuda_component.accelerator_max_ipc_events) { opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output, "smcuda: Out of event handles"); OPAL_THREAD_UNLOCK(&btl_smcuda_accelerator_ipc_lock); return OPAL_ERR_OUT_OF_RESOURCE; @@ -237,7 +268,7 @@ int mca_btl_smcuda_memcpy(void *dst, void *src, size_t amount, char *msg, /* Bump up the first available slot and number used by 1 */ accelerator_event_ipc_first_avail++; - if (accelerator_event_ipc_first_avail >= accelerator_event_max) { + if (accelerator_event_ipc_first_avail >= mca_btl_smcuda_component.accelerator_max_ipc_events) { accelerator_event_ipc_first_avail = 0; } accelerator_event_ipc_num_used++; diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index aa1f8d3b60d..78e06751222 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -20,6 +20,7 @@ * Copyright (c) 2022 IBM Corporation. All rights reserved. * Copyright (c) 2023 Triad National Security, LLC. All rights * reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -169,6 +170,22 @@ static int smcuda_register(void) mca_btl_smcuda_param_register_uint("fifo_lazy_free", 120, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.fifo_lazy_free); + /* Delay the creation of the IPC stream and events. This has the advantage of also + * working in scenarios where the user did not set the accelerator device + * before MPI_Init AND the stream/event has internally some reference to the device + * used at that time */ + mca_btl_smcuda_component.accelerator_delayed_ipc_init = 1; + (void) mca_base_component_var_register(&mca_btl_smcuda_component.super.btl_version, "delayed_stream_init", + "Delay the initialization of the ipc stream and internal events", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, &mca_btl_smcuda_component.accelerator_delayed_ipc_init); + + mca_btl_smcuda_component.accelerator_max_ipc_events = 400; + (void) mca_base_component_var_register(&mca_btl_smcuda_component.super.btl_version, "max_ipc_events", + "Number of events created by the smcuda components internally", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_5, + MCA_BASE_VAR_SCOPE_READONLY, &mca_btl_smcuda_component.accelerator_max_ipc_events); + /* default number of extra procs to allow for future growth */ mca_btl_smcuda_param_register_int("sm_extra_procs", 0, OPAL_INFO_LVL_9, &mca_btl_smcuda_component.sm_extra_procs);