From 835eef5ebd06e44b132670ee6dba8db3ccd8e668 Mon Sep 17 00:00:00 2001
From: Edgar Gabriel <Edgar.Gabriel@amd.com>
Date: Tue, 20 Feb 2024 13:07:28 -0800
Subject: [PATCH] btl/smcuda: add delayed stream initialization

introduce two new mca parameterse to the smcuda component:

- allow for delayed initialization of the internal ipc stream and the
  array of events. This allows to handle situations where the user
  code did not set the device before MPI_Init AND the internal stream
  and/or event structures have some dependence on the device id used
  during creation.
- add a parameter to control how many events are created during
  initialization.

Signed-off-by: Edgar Gabriel <Edgar.Gabriel@amd.com>
---
 opal/mca/btl/smcuda/btl_smcuda.h             |  4 +
 opal/mca/btl/smcuda/btl_smcuda_accelerator.c | 78 +++++++++++++-------
 opal/mca/btl/smcuda/btl_smcuda_component.c   | 17 +++++
 3 files changed, 74 insertions(+), 25 deletions(-)

diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h
index 962e9c268a0..812e04ea7a6 100644
--- a/opal/mca/btl/smcuda/btl_smcuda.h
+++ b/opal/mca/btl/smcuda/btl_smcuda.h
@@ -15,6 +15,7 @@
  * Copyright (c) 2010-2015 Los Alamos National Security, LLC.
  *                         All rights reserved.
  * Copyright (c) 2012-2023 NVIDIA Corporation.  All rights reserved.
+ * Copyright (c) 2024      Advanced Micro Devices, Inc. All Rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -204,6 +205,9 @@ struct mca_btl_smcuda_component_t {
     int use_cuda_ipc;
     int use_cuda_ipc_same_gpu;
 
+    int accelerator_delayed_ipc_init;
+    int accelerator_max_ipc_events;
+
     unsigned long mpool_min_size;
     char *allocator;
 };
diff --git a/opal/mca/btl/smcuda/btl_smcuda_accelerator.c b/opal/mca/btl/smcuda/btl_smcuda_accelerator.c
index 81185f3fdb6..56245879dd8 100644
--- a/opal/mca/btl/smcuda/btl_smcuda_accelerator.c
+++ b/opal/mca/btl/smcuda/btl_smcuda_accelerator.c
@@ -3,6 +3,7 @@
  * Copyright (c) 2022      IBM Corporation.  All rights reserved.
  * Copyright (c) 2023      Triad National Security, LLC. All rights
  *                         reserved.
+ * Copyright (c) 2024      Advanced Micro Devices, Inc. All Rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -33,62 +34,80 @@ static int accelerator_event_ipc_first_used;
 static volatile int accelerator_event_ipc_num_used;
 
 /* Size of array holding events */
-static int accelerator_event_max = 400;
 static int accelerator_event_ipc_most = 0;
 static bool smcuda_accelerator_initialized = false;
 
 void mca_btl_smcuda_accelerator_fini(void);
 
-int mca_btl_smcuda_accelerator_init(void)
+/* Initialize the internal ipc stream and the events (s&e) */
+static int mca_btl_smcuda_accelerator_ipc_init(void)
 {
     int rc = OPAL_SUCCESS;
     int i;
-    OBJ_CONSTRUCT(&btl_smcuda_accelerator_ipc_lock, opal_mutex_t);
-    /* The first available status index is 0.  Make an empty frag
-       array. */
+    int device_id;
+
+    rc = opal_accelerator.get_device(&device_id);
+    if (OPAL_SUCCESS != rc) {
+        opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output, "Failed to retrieve current device.");
+        return OPAL_ERROR;
+    }
 
-    rc = opal_accelerator.create_stream(MCA_ACCELERATOR_NO_DEVICE_ID, &ipc_stream);
+    rc = opal_accelerator.create_stream(device_id, &ipc_stream);
     if (OPAL_SUCCESS != rc) {
         opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output, "Failed to create accelerator ipc_stream stream.");
-        goto cleanup_and_error;
+        return OPAL_ERROR;
     }
 
+    /* Create the events since they can be reused. */
+    for (i = 0; i < mca_btl_smcuda_component.accelerator_max_ipc_events; i++) {
+        rc = opal_accelerator.create_event(device_id, &accelerator_event_ipc_array[i], opal_accelerator_use_sync_memops ? false : true);
+        if (OPAL_SUCCESS != rc) {
+            opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output, "Accelerator create event failed.");
+            return OPAL_ERROR;
+        }
+    }
+
+    return OPAL_SUCCESS;
+}
+
+int mca_btl_smcuda_accelerator_init(void)
+{
+    int rc = OPAL_SUCCESS;
+    int i;
+
+    OBJ_CONSTRUCT(&btl_smcuda_accelerator_ipc_lock, opal_mutex_t);
+
     accelerator_event_ipc_num_used = 0;
     accelerator_event_ipc_first_avail = 0;
     accelerator_event_ipc_first_used = 0;
 
-    accelerator_event_ipc_array = calloc(accelerator_event_max, sizeof(opal_accelerator_event_t *));
+    accelerator_event_ipc_array = calloc(mca_btl_smcuda_component.accelerator_max_ipc_events, sizeof(opal_accelerator_event_t *));
     if (NULL == accelerator_event_ipc_array) {
         opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output, "No memory.");
         rc = OPAL_ERROR;
         goto cleanup_and_error;
     }
-    /* Create the events since they can be reused. */
-    for (i = 0; i < accelerator_event_max; i++) {
-        rc = opal_accelerator.create_event(MCA_ACCELERATOR_NO_DEVICE_ID, &accelerator_event_ipc_array[i], opal_accelerator_use_sync_memops ? false : true);
-        if (OPAL_SUCCESS != rc) {
-            opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output, "Accelerator create event failed.");
-            rc = OPAL_ERROR;
-            goto cleanup_and_error;
-        }
-    }
 
     /* The first available status index is 0.  Make an empty frag
        array. */
-
-    accelerator_event_ipc_frag_array = (struct mca_btl_base_descriptor_t **) malloc(sizeof(struct mca_btl_base_descriptor_t *) * accelerator_event_max);
+    accelerator_event_ipc_frag_array = (struct mca_btl_base_descriptor_t **) malloc(sizeof(struct mca_btl_base_descriptor_t *) *
+                                                                                    mca_btl_smcuda_component.accelerator_max_ipc_events);
     if (NULL == accelerator_event_ipc_frag_array) {
         opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output, "No memory.");
         rc = OPAL_ERROR;
         goto cleanup_and_error;
     }
 
+    if (!mca_btl_smcuda_component.accelerator_delayed_ipc_init) {
+        mca_btl_smcuda_accelerator_ipc_init();
+    }
+
     smcuda_accelerator_initialized = true;
 
 cleanup_and_error:
     if (OPAL_SUCCESS != rc) {
         if (NULL != accelerator_event_ipc_array) {
-            for (i = 0; i < accelerator_event_max; i++) {
+            for (i = 0; i < mca_btl_smcuda_component.accelerator_max_ipc_events; i++) {
                 if (NULL != accelerator_event_ipc_array[i]) {
                     OBJ_RELEASE(accelerator_event_ipc_array[i]);
                 }
@@ -117,7 +136,7 @@ void mca_btl_smcuda_accelerator_fini(void)
     }
 
     if (NULL != accelerator_event_ipc_array) {
-        for (i = 0; i < accelerator_event_max; i++) {
+        for (i = 0; i < mca_btl_smcuda_component.accelerator_max_ipc_events; i++) {
             if (NULL != accelerator_event_ipc_array[i]) {
                 OBJ_RELEASE(accelerator_event_ipc_array[i]);
             }
@@ -129,7 +148,9 @@ void mca_btl_smcuda_accelerator_fini(void)
         free(accelerator_event_ipc_frag_array);
     }
 
-    OBJ_RELEASE(ipc_stream);
+    if (NULL != ipc_stream) {
+        OBJ_RELEASE(ipc_stream);
+    }
 
     OBJ_DESTRUCT(&btl_smcuda_accelerator_ipc_lock);
     smcuda_accelerator_initialized = false;
@@ -175,7 +196,7 @@ int mca_btl_smcuda_progress_one_ipc_event(struct mca_btl_base_descriptor_t **fra
         /* Bump counters, loop around the circular buffer if necessary */
         --accelerator_event_ipc_num_used;
         ++accelerator_event_ipc_first_used;
-        if (accelerator_event_ipc_first_used >= accelerator_event_max) {
+        if (accelerator_event_ipc_first_used >= mca_btl_smcuda_component.accelerator_max_ipc_events) {
             accelerator_event_ipc_first_used = 0;
         }
         /* A return value of 1 indicates an event completed and a frag was returned */
@@ -196,10 +217,17 @@ int mca_btl_smcuda_memcpy(void *dst, void *src, size_t amount, char *msg,
     int result;
     OPAL_THREAD_LOCK(&btl_smcuda_accelerator_ipc_lock);
 
+    if (NULL == ipc_stream) {
+        result = mca_btl_smcuda_accelerator_ipc_init();
+        if (OPAL_SUCCESS != result) {
+            return result;
+        }
+    }
+
     /* First make sure there is room to store the event.  If not, then
      * return an error.  The error message will tell the user to try and
      * run again, but with a larger array for storing events. */
-    if (accelerator_event_ipc_num_used == accelerator_event_max) {
+    if (accelerator_event_ipc_num_used == mca_btl_smcuda_component.accelerator_max_ipc_events) {
         opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output, "smcuda: Out of event handles");
         OPAL_THREAD_UNLOCK(&btl_smcuda_accelerator_ipc_lock);
         return OPAL_ERR_OUT_OF_RESOURCE;
@@ -237,7 +265,7 @@ int mca_btl_smcuda_memcpy(void *dst, void *src, size_t amount, char *msg,
 
     /* Bump up the first available slot and number used by 1 */
     accelerator_event_ipc_first_avail++;
-    if (accelerator_event_ipc_first_avail >= accelerator_event_max) {
+    if (accelerator_event_ipc_first_avail >= mca_btl_smcuda_component.accelerator_max_ipc_events) {
         accelerator_event_ipc_first_avail = 0;
     }
     accelerator_event_ipc_num_used++;
diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c
index aa1f8d3b60d..78e06751222 100644
--- a/opal/mca/btl/smcuda/btl_smcuda_component.c
+++ b/opal/mca/btl/smcuda/btl_smcuda_component.c
@@ -20,6 +20,7 @@
  * Copyright (c) 2022      IBM Corporation.  All rights reserved.
  * Copyright (c) 2023      Triad National Security, LLC. All rights
  *                         reserved.
+ * Copyright (c) 2024      Advanced Micro Devices, Inc. All Rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -169,6 +170,22 @@ static int smcuda_register(void)
     mca_btl_smcuda_param_register_uint("fifo_lazy_free", 120, OPAL_INFO_LVL_5,
                                        &mca_btl_smcuda_component.fifo_lazy_free);
 
+    /* Delay the creation of the IPC stream and events. This has the advantage of also
+     * working in scenarios where the user did not set the accelerator device
+     * before MPI_Init AND the stream/event has internally some reference to the device
+     * used at that time */
+    mca_btl_smcuda_component.accelerator_delayed_ipc_init = 1;
+    (void) mca_base_component_var_register(&mca_btl_smcuda_component.super.btl_version, "delayed_stream_init",
+                                           "Delay the initialization of the ipc stream and internal events",
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_5,
+                                           MCA_BASE_VAR_SCOPE_READONLY, &mca_btl_smcuda_component.accelerator_delayed_ipc_init);
+
+    mca_btl_smcuda_component.accelerator_max_ipc_events = 400;
+    (void) mca_base_component_var_register(&mca_btl_smcuda_component.super.btl_version, "max_ipc_events",
+                                           "Number of events created by the smcuda components internally",
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_5,
+                                           MCA_BASE_VAR_SCOPE_READONLY, &mca_btl_smcuda_component.accelerator_max_ipc_events);
+
     /* default number of extra procs to allow for future growth */
     mca_btl_smcuda_param_register_int("sm_extra_procs", 0, OPAL_INFO_LVL_9,
                                       &mca_btl_smcuda_component.sm_extra_procs);