diff --git a/opal/mca/accelerator/accelerator.h b/opal/mca/accelerator/accelerator.h
index 0d660725acc..caac15970dc 100644
--- a/opal/mca/accelerator/accelerator.h
+++ b/opal/mca/accelerator/accelerator.h
@@ -5,6 +5,9 @@
  * Copyright (c)           Amazon.com, Inc. or its affiliates.
  *                         All Rights reserved.
  * Copyright (c) 2023      Advanced Micro Devices, Inc. All Rights reserved.
+ * Copyright (c) 2024      The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
  *
  * $COPYRIGHT$
  *
@@ -184,6 +187,19 @@ typedef int (*opal_accelerator_base_module_check_addr_fn_t)(
 typedef int (*opal_accelerator_base_module_create_stream_fn_t)(
     int dev_id, opal_accelerator_stream_t **stream);
 
+
+/**
+ * Query the default stream.
+ *
+ * @param[IN] dev_id         Associated device for the stream or
+ *                           MCA_ACCELERATOR_NO_DEVICE_ID
+ * @param[OUT] stream        Set to the default stream.
+ *
+ * @return                   OPAL_SUCCESS or error status on failure
+ */
+typedef int (*opal_accelerator_base_get_default_stream_fn_t)(
+    int dev_id, opal_accelerator_stream_t **stream);
+
 /**
  * Creates an event. An event is a synchronization marker that can be
  * appended to a stream to monitor device progress or synchronize the
@@ -193,7 +209,7 @@ typedef int (*opal_accelerator_base_module_create_stream_fn_t)(
  * @param[IN]  dev_id        Associated device for the event or
  *                           MCA_ACCELERATOR_NO_DEVICE_ID
  * @param[OUT] event         Event to create
- * @param[IN]  enable_ipc    support inter-process tracking of the event 
+ * @param[IN]  enable_ipc    support inter-process tracking of the event
  *
  * @return                   OPAL_SUCCESS or error status on failure.
  */
@@ -310,6 +326,31 @@ typedef int (*opal_accelerator_base_module_memmove_fn_t)(
     int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size,
     opal_accelerator_transfer_type_t type);
 
+
+/**
+ * Copies memory asynchronously from src to dest. Memory of dest and src
+ * may overlap. Optionally can specify the transfer type to
+ * avoid pointer detection for performance. The operations will be enqueued
+ * into the provided stream but are not guaranteed to be complete upon return.
+ *
+ * @param[IN] dest_dev_id    Associated device to copy to or
+ *                           MCA_ACCELERATOR_NO_DEVICE_ID
+ * @param[IN] src_dev_id     Associated device to copy from or
+ *                           MCA_ACCELERATOR_NO_DEVICE_ID
+ * @param[IN] dest           Destination to copy memory to
+ * @param[IN] src            Source to copy memory from
+ * @param[IN] size           Size of memory to copy
+ * @param[IN] stream         Stream to perform asynchronous move on
+ * @param[IN] type           Transfer type field for performance
+ *                           Can be set to MCA_ACCELERATOR_TRANSFER_UNSPEC
+ *                           if caller is unsure of the transfer direction.
+ *
+ * @return                   OPAL_SUCCESS or error status on failure
+ */
+typedef int (*opal_accelerator_base_module_memmove_async_fn_t)(
+    int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size,
+    opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type);
+
 /**
  * Allocates size bytes memory from the device and sets ptr to the
  * pointer of the allocated memory. The memory is not initialized.
@@ -340,6 +381,46 @@ typedef int (*opal_accelerator_base_module_mem_alloc_fn_t)(
 typedef int (*opal_accelerator_base_module_mem_release_fn_t)(
     int dev_id, void *ptr);
 
+
+/**
+ * Allocates size bytes memory from the device and sets ptr to the
+ * pointer of the allocated memory. The memory is not initialized.
+ * The allocation request is placed into the stream object.
+ * Any use of the memory must succeed the completion of this
+ * operation on the stream.
+ *
+ * @param[IN] dev_id         Associated device for the allocation or
+ *                           MCA_ACCELERATOR_NO_DEVICE_ID
+ * @param[OUT] ptr           Returns pointer to allocated memory
+ * @param[IN] size           Size of memory to allocate
+ * @param[IN] stream         Stream into which to insert the allocation request
+ *
+ * @return                   OPAL_SUCCESS or error status on failure
+ */
+typedef int (*opal_accelerator_base_module_mem_alloc_stream_fn_t)(
+    int dev_id, void **ptr, size_t size, opal_accelerator_stream_t *stream);
+
+/**
+ * Frees the memory space pointed to by ptr which has been returned by
+ * a previous call to an opal_accelerator_base_module_mem_alloc_stream_fn_t().
+ * If the function is called on a ptr that has already been freed,
+ * undefined behavior occurs. If ptr is NULL, no operation is performed,
+ * and the function returns OPAL_SUCCESS.
+ * The release of the memory will be inserted into the stream and occurs after
+ * all previous operations have completed.
+ *
+ * @param[IN] dev_id         Associated device for the allocation or
+ *                           MCA_ACCELERATOR_NO_DEVICE_ID
+ * @param[IN] ptr            Pointer to free
+ * @param[IN] stream         Stream into which to insert the free operation
+ *
+ * @return                   OPAL_SUCCESS or error status on failure
+ */
+typedef int (*opal_accelerator_base_module_mem_release_stream_fn_t)(
+    int dev_id, void *ptr, opal_accelerator_stream_t *stream);
+
+
+
 /**
  * Retrieves the base address and/or size of a memory allocation of the
  * device.
@@ -557,11 +638,41 @@ typedef int (*opal_accelerator_base_module_device_can_access_peer_fn_t)(
 typedef int (*opal_accelerator_base_module_get_buffer_id_fn_t)(
     int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id);
 
+/**
+ * Wait for the completion of all operations inserted into the stream.
+ *
+ * @param[IN] stram          The stream to wait for.
+ *
+ * @return                   OPAL_SUCCESS or error status on failure
+ */
+typedef int (*opal_accelerator_base_module_wait_stream_fn_t)(opal_accelerator_stream_t *stream);
+
+/**
+ * Get the number of devices available.
+ *
+ * @param[OUT] stram         Number of devices.
+ *
+ * @return                   OPAL_SUCCESS or error status on failure
+ */
+typedef int (*opal_accelerator_base_module_get_num_devices_fn_t)(int *num_devices);
+
+/**
+ * Get the memory bandwidth of the device.
+ *
+ * @param[IN] device         The device to query.
+ * @param[OUT] bw            The returned bandwidth for the device.
+ *
+ * @return                   OPAL_SUCCESS or error status on failure
+ */
+typedef int (*opal_accelerator_base_module_get_mem_bw_fn_t)(int device, float *bw);
+
+
 /*
  * the standard public API data structure
  */
 typedef struct {
     /* accelerator function table */
+    opal_accelerator_base_get_default_stream_fn_t get_default_stream;
     opal_accelerator_base_module_check_addr_fn_t check_addr;
 
     opal_accelerator_base_module_create_stream_fn_t create_stream;
@@ -572,10 +683,13 @@ typedef struct {
 
     opal_accelerator_base_module_memcpy_async_fn_t mem_copy_async;
     opal_accelerator_base_module_memcpy_fn_t mem_copy;
+    opal_accelerator_base_module_memmove_async_fn_t mem_move_async;
     opal_accelerator_base_module_memmove_fn_t mem_move;
 
     opal_accelerator_base_module_mem_alloc_fn_t mem_alloc;
     opal_accelerator_base_module_mem_release_fn_t mem_release;
+    opal_accelerator_base_module_mem_alloc_stream_fn_t mem_alloc_stream;
+    opal_accelerator_base_module_mem_release_stream_fn_t mem_release_stream;
     opal_accelerator_base_module_get_address_range_fn_t get_address_range;
 
     opal_accelerator_base_module_is_ipc_enabled_fn_t is_ipc_enabled;
@@ -595,6 +709,10 @@ typedef struct {
     opal_accelerator_base_module_device_can_access_peer_fn_t device_can_access_peer;
 
     opal_accelerator_base_module_get_buffer_id_fn_t get_buffer_id;
+
+    opal_accelerator_base_module_wait_stream_fn_t wait_stream;
+    opal_accelerator_base_module_get_num_devices_fn_t num_devices;
+    opal_accelerator_base_module_get_mem_bw_fn_t get_mem_bw;
 } opal_accelerator_base_module_t;
 
 /**
diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.c b/opal/mca/accelerator/cuda/accelerator_cuda.c
index edabb864c3d..91d9cac4f59 100644
--- a/opal/mca/accelerator/cuda/accelerator_cuda.c
+++ b/opal/mca/accelerator/cuda/accelerator_cuda.c
@@ -6,6 +6,9 @@
  *                         All rights reserved.
  * Copyright (c)           Amazon.com, Inc. or its affiliates.
  *                         All Rights reserved.
+ * Copyright (c) 2024      The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -23,6 +26,7 @@
 #include "opal/util/show_help.h"
 #include "opal/util/proc.h"
 /* Accelerator API's */
+static int accelerator_cuda_get_default_stream(int dev_id, opal_accelerator_stream_t **stream);
 static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *flags);
 static int accelerator_cuda_create_stream(int dev_id, opal_accelerator_stream_t **stream);
 
@@ -35,10 +39,16 @@ static int accelerator_cuda_memcpy_async(int dest_dev_id, int src_dev_id, void *
                                   opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type);
 static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest, const void *src,
                             size_t size, opal_accelerator_transfer_type_t type);
+static int accelerator_cuda_memmove_async(int dest_dev_id, int src_dev_id, void *dest, const void *src,
+                                          size_t size, opal_accelerator_stream_t *stream,
+                                          opal_accelerator_transfer_type_t type);
 static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size,
                              opal_accelerator_transfer_type_t type);
 static int accelerator_cuda_mem_alloc(int dev_id, void **ptr, size_t size);
 static int accelerator_cuda_mem_release(int dev_id, void *ptr);
+static int accelerator_cuda_mem_alloc_stream(int dev_id, void **ptr, size_t size,
+                                             opal_accelerator_stream_t *stream);
+static int accelerator_cuda_mem_release_stream(int dev_id, void *ptr, opal_accelerator_stream_t *stream);
 static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void **base,
                                               size_t *size);
 
@@ -67,10 +77,17 @@ static int accelerator_cuda_device_can_access_peer( int *access, int dev1, int d
 
 static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id);
 
+static int accelerator_cuda_wait_stream(opal_accelerator_stream_t *stream);
+static int accelerator_cuda_get_num_devices(int *num_devices);
+static int accelerator_cuda_get_mem_bw(int device, float *bw);
+
+
 #define GET_STREAM(_stream) (_stream == MCA_ACCELERATOR_STREAM_DEFAULT ? 0 : *((CUstream *)_stream->stream))
 
 opal_accelerator_base_module_t opal_accelerator_cuda_module =
 {
+    accelerator_cuda_get_default_stream,
+
     accelerator_cuda_check_addr,
 
     accelerator_cuda_create_stream,
@@ -82,9 +99,12 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module =
 
     accelerator_cuda_memcpy_async,
     accelerator_cuda_memcpy,
+    accelerator_cuda_memmove_async,
     accelerator_cuda_memmove,
     accelerator_cuda_mem_alloc,
     accelerator_cuda_mem_release,
+    accelerator_cuda_mem_alloc_stream,
+    accelerator_cuda_mem_release_stream,
     accelerator_cuda_get_address_range,
 
     accelerator_cuda_is_ipc_enabled,
@@ -103,9 +123,31 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module =
     accelerator_cuda_get_device_pci_attr,
     accelerator_cuda_device_can_access_peer,
 
-    accelerator_cuda_get_buffer_id
+    accelerator_cuda_get_buffer_id,
+
+    accelerator_cuda_wait_stream,
+    accelerator_cuda_get_num_devices,
+    accelerator_cuda_get_mem_bw
 };
 
+static int accelerator_cuda_get_device_id(CUcontext mem_ctx) {
+    /* query the device from the context */
+    int dev_id = -1;
+    CUdevice ptr_dev;
+    cuCtxPushCurrent(mem_ctx);
+    cuCtxGetDevice(&ptr_dev);
+    for (int i = 0; i < opal_accelerator_cuda_num_devices; ++i) {
+        CUdevice dev;
+        cuDeviceGet(&dev, i);
+        if (dev == ptr_dev) {
+            dev_id = i;
+            break;
+        }
+    }
+    cuCtxPopCurrent(&mem_ctx);
+    return dev_id;
+}
+
 static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *flags)
 {
     CUresult result;
@@ -154,6 +196,9 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
     } else if (0 == mem_type) {
         /* This can happen when CUDA is initialized but dbuf is not valid CUDA pointer */
         return 0;
+    } else {
+        /* query the device from the context */
+        *dev_id = accelerator_cuda_get_device_id(mem_ctx);
     }
     /* Must be a device pointer */
     assert(CU_MEMORYTYPE_DEVICE == mem_type);
@@ -169,6 +214,10 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
     } else if (CU_MEMORYTYPE_HOST == mem_type) {
         /* Host memory, nothing to do here */
         return 0;
+    } else {
+        result = cuPointerGetAttribute(&mem_ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dbuf);
+        /* query the device from the context */
+        *dev_id = accelerator_cuda_get_device_id(mem_ctx);
     }
     /* Must be a device pointer */
     assert(CU_MEMORYTYPE_DEVICE == mem_type);
@@ -216,7 +265,7 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
         }
     }
 
-    /* WORKAROUND - They are times when the above code determines a pice of memory
+    /* WORKAROUND - There are times when the above code determines a pice of memory
      * is GPU memory, but it actually is not.  That has been seen on multi-GPU systems
      * with 6 or 8 GPUs on them. Therefore, we will do this extra check.  Note if we
      * made it this far, then the assumption at this point is we have GPU memory.
@@ -240,6 +289,17 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *
     return 1;
 }
 
+static int accelerator_cuda_get_default_stream(int dev_id, opal_accelerator_stream_t **stream)
+{
+    int delayed_init = opal_accelerator_cuda_delayed_init();
+    if (OPAL_UNLIKELY(0 != delayed_init)) {
+        return delayed_init;
+    }
+    *stream = &opal_accelerator_cuda_default_stream.base;
+    return OPAL_SUCCESS;
+}
+
+
 static int accelerator_cuda_create_stream(int dev_id, opal_accelerator_stream_t **stream)
 {
     CUresult result;
@@ -435,34 +495,23 @@ static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest,
         return OPAL_SUCCESS;
     }
 
-    /* Async copy then synchronize is the default behavior as some applications
-     * cannot utilize synchronous copies. In addition, host memory does not need
-     * to be page-locked if an Async memory copy is done (It just makes it synchronous
-     * which is what we want anyway):
-     * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#concurrent-execution-host-device
-     * Additionally, cuMemcpy is not necessarily always synchronous. See:
-     * https://docs.nvidia.com/cuda/cuda-driver-api/api-sync-behavior.html
-     * TODO: Add optimizations for type field */
-    result = cuMemcpyAsync((CUdeviceptr) dest, (CUdeviceptr) src, size, opal_accelerator_cuda_memcpy_stream);
+    result = cuMemcpy((CUdeviceptr) dest, (CUdeviceptr) src, size);
     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
-        opal_show_help("help-accelerator-cuda.txt", "cuMemcpyAsync failed", true, dest, src,
-                       size, result);
-        return OPAL_ERROR;
-    }
-    result = cuStreamSynchronize(opal_accelerator_cuda_memcpy_stream);
-    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
-        opal_show_help("help-accelerator-cuda.txt", "cuStreamSynchronize failed", true,
+        opal_show_help("help-accelerator-cuda.txt", "cuMemcpy failed", true,
                        OPAL_PROC_MY_HOSTNAME, result);
         return OPAL_ERROR;
     }
     return OPAL_SUCCESS;
 }
 
-static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size,
-                             opal_accelerator_transfer_type_t type)
+static int accelerator_cuda_memmove_async(int dest_dev_id, int src_dev_id, void *dest,
+                                          const void *src, size_t size,
+                                          opal_accelerator_stream_t *stream,
+                                          opal_accelerator_transfer_type_t type)
 {
     CUdeviceptr tmp;
     CUresult result;
+    void *ptr;
 
     int delayed_init = opal_accelerator_cuda_delayed_init();
     if (OPAL_UNLIKELY(0 != delayed_init)) {
@@ -473,29 +522,41 @@ static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest,
         return OPAL_ERR_BAD_PARAM;
     }
 
-    result = cuMemAlloc(&tmp, size);
-    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
+    result = accelerator_cuda_mem_alloc_stream(src_dev_id, &ptr, size, stream);
+    if (OPAL_UNLIKELY(OPAL_SUCCESS != result)) {
         return OPAL_ERROR;
     }
-    result = cuMemcpyAsync(tmp, (CUdeviceptr) src, size, opal_accelerator_cuda_memcpy_stream);
+    tmp = (CUdeviceptr)ptr;
+    result = cuMemcpyAsync(tmp, (CUdeviceptr) src, size, *(CUstream*)stream->stream);
     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
         opal_show_help("help-accelerator-cuda.txt", "cuMemcpyAsync failed", true, tmp, src, size,
                        result);
         return OPAL_ERROR;
     }
-    result = cuMemcpyAsync((CUdeviceptr) dest, tmp, size, opal_accelerator_cuda_memcpy_stream);
+    result = cuMemcpyAsync((CUdeviceptr) dest, tmp, size, *(CUstream*)stream->stream);
     if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
         opal_show_help("help-accelerator-cuda.txt", "cuMemcpyAsync failed", true, dest, tmp,
                        size, result);
         return OPAL_ERROR;
     }
-    result = cuStreamSynchronize(opal_accelerator_cuda_memcpy_stream);
-    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
+    return accelerator_cuda_mem_release_stream(src_dev_id, ptr, stream);
+}
+
+static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size,
+                                    opal_accelerator_transfer_type_t type)
+{
+    int ret;
+
+    ret = accelerator_cuda_memmove_async(dest_dev_id, src_dev_id, dest, src, size, &opal_accelerator_cuda_memcpy_stream.base, type);
+    if (OPAL_SUCCESS != ret) {
+        return OPAL_ERROR;
+    }
+    ret = accelerator_cuda_wait_stream(&opal_accelerator_cuda_memcpy_stream.base);
+    if (OPAL_UNLIKELY(OPAL_SUCCESS != result)) {
         opal_show_help("help-accelerator-cuda.txt", "cuStreamSynchronize failed", true,
                        OPAL_PROC_MY_HOSTNAME, result);
         return OPAL_ERROR;
     }
-    cuMemFree(tmp);
     return OPAL_SUCCESS;
 }
 
@@ -512,15 +573,35 @@ static int accelerator_cuda_mem_alloc(int dev_id, void **ptr, size_t size)
         return OPAL_ERR_BAD_PARAM;
     }
 
-    if (size > 0) {
-        result = cuMemAlloc((CUdeviceptr *) ptr, size);
-        if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
-            opal_show_help("help-accelerator-cuda.txt", "cuMemAlloc failed", true,
-                           OPAL_PROC_MY_HOSTNAME, result);
-            return OPAL_ERROR;
-        }
+    result = cuMemAlloc((CUdeviceptr *) ptr, size);
+    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
+        opal_show_help("help-accelerator-cuda.txt", "cuMemAlloc failed", true,
+                        OPAL_PROC_MY_HOSTNAME, result);
+        return OPAL_ERROR;
     }
-    return 0;
+    return OPAL_SUCCESS;
+}
+
+
+
+static int accelerator_cuda_mem_alloc_stream(int dev_id, void **addr, size_t size,
+                                             opal_accelerator_stream_t *stream)
+{
+
+    int delayed_init = opal_accelerator_cuda_delayed_init();
+    if (OPAL_UNLIKELY(0 != delayed_init)) {
+        return delayed_init;
+    }
+
+    /* fall-back to regular stream allocation */
+
+    CUresult result = cuMemAllocAsync((CUdeviceptr*)addr, size, *(CUstream*)stream->stream);
+    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
+        opal_show_help("help-accelerator-cuda.txt", "cuMemAlloc failed", true,
+                        OPAL_PROC_MY_HOSTNAME, result);
+        return OPAL_ERROR;
+    }
+    return OPAL_SUCCESS;
 }
 
 static int accelerator_cuda_mem_release(int dev_id, void *ptr)
@@ -537,6 +618,38 @@ static int accelerator_cuda_mem_release(int dev_id, void *ptr)
     return 0;
 }
 
+static int accelerator_cuda_mem_release_stream(int dev_id, void *addr,
+                                               opal_accelerator_stream_t *stream)
+{
+    CUresult result;
+
+    if (NULL == stream || NULL == addr) {
+        return OPAL_ERR_BAD_PARAM;
+    }
+
+    result = cuMemFreeAsync((CUdeviceptr)addr, *(CUstream*)stream->stream);
+    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
+        opal_show_help("help-accelerator-cuda.txt", "cuMemFree failed", true,
+                        OPAL_PROC_MY_HOSTNAME, result);
+        return OPAL_ERROR;
+    }
+    return OPAL_SUCCESS;
+}
+
+
+static int accelerator_cuda_wait_stream(opal_accelerator_stream_t *stream)
+{
+    CUresult result;
+    result = cuStreamSynchronize(*(CUstream*)stream->stream);
+    if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) {
+        opal_show_help("help-accelerator-cuda.txt", "cuStreamSynchronize failed", true,
+                       OPAL_PROC_MY_HOSTNAME, result);
+        return OPAL_ERROR;
+    }
+    return OPAL_SUCCESS;
+}
+
+
 static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void **base,
                                        size_t *size)
 {
@@ -764,3 +877,29 @@ static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_acc
     }
     return OPAL_SUCCESS;
 }
+
+
+
+static int accelerator_cuda_get_num_devices(int *num_devices)
+{
+
+    int delayed_init = opal_accelerator_cuda_delayed_init();
+    if (OPAL_UNLIKELY(0 != delayed_init)) {
+        return delayed_init;
+    }
+
+    *num_devices = opal_accelerator_cuda_num_devices;
+    return OPAL_SUCCESS;
+}
+
+static int accelerator_cuda_get_mem_bw(int device, float *bw)
+{
+    int delayed_init = opal_accelerator_cuda_delayed_init();
+    if (OPAL_UNLIKELY(0 != delayed_init)) {
+        return delayed_init;
+    }
+    assert(opal_accelerator_cuda_mem_bw != NULL);
+
+    *bw = opal_accelerator_cuda_mem_bw[device];
+    return OPAL_SUCCESS;
+}
\ No newline at end of file
diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.h b/opal/mca/accelerator/cuda/accelerator_cuda.h
index 694a4192231..b00403302b8 100644
--- a/opal/mca/accelerator/cuda/accelerator_cuda.h
+++ b/opal/mca/accelerator/cuda/accelerator_cuda.h
@@ -2,6 +2,9 @@
  * Copyright (c) 2014      Intel, Inc.  All rights reserved.
  * Copyright (c) 2017-2022 Amazon.com, Inc. or its affiliates.
  *                         All Rights reserved.
+ * Copyright (c) 2024      The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -38,13 +41,19 @@ typedef struct opal_accelerator_cuda_event_t opal_accelerator_cuda_event_t;
 OBJ_CLASS_DECLARATION(opal_accelerator_cuda_event_t);
 
 /* Declare extern variables, defined in accelerator_cuda_component.c */
-OPAL_DECLSPEC extern CUstream opal_accelerator_cuda_memcpy_stream;
+OPAL_DECLSPEC extern opal_accelerator_cuda_stream_t opal_accelerator_cuda_memcpy_stream;
+OPAL_DECLSPEC extern CUstream opal_accelerator_cuda_alloc_stream;
+OPAL_DECLSPEC extern opal_accelerator_cuda_stream_t opal_accelerator_cuda_default_stream;
 OPAL_DECLSPEC extern opal_mutex_t opal_accelerator_cuda_stream_lock;
 
 OPAL_DECLSPEC extern opal_accelerator_cuda_component_t mca_accelerator_cuda_component;
 
 OPAL_DECLSPEC extern opal_accelerator_base_module_t opal_accelerator_cuda_module;
 
+OPAL_DECLSPEC extern int opal_accelerator_cuda_num_devices;
+
+OPAL_DECLSPEC extern float *opal_accelerator_cuda_mem_bw;
+
 OPAL_DECLSPEC extern int opal_accelerator_cuda_delayed_init(void);
 
 END_C_DECLS
diff --git a/opal/mca/accelerator/cuda/accelerator_cuda_component.c b/opal/mca/accelerator/cuda/accelerator_cuda_component.c
index d48e29c9f65..0076a19e8cc 100644
--- a/opal/mca/accelerator/cuda/accelerator_cuda_component.c
+++ b/opal/mca/accelerator/cuda/accelerator_cuda_component.c
@@ -6,6 +6,9 @@
  *                         reserved.
  * Copyright (c) 2017-2022 Amazon.com, Inc. or its affiliates.
  *                         All Rights reserved.
+ * Copyright (c) 2024      The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -34,13 +37,18 @@
 #include "opal/sys/atomic.h"
 
 /* Define global variables, used in accelerator_cuda.c */
-CUstream opal_accelerator_cuda_memcpy_stream = NULL;
+opal_accelerator_cuda_stream_t opal_accelerator_cuda_memcpy_stream = {0};
+CUstream opal_accelerator_cuda_alloc_stream = NULL;
+opal_accelerator_cuda_stream_t opal_accelerator_cuda_default_stream = {0};
 opal_mutex_t opal_accelerator_cuda_stream_lock = {0};
+int opal_accelerator_cuda_num_devices = 0;
 
 /* Initialization lock for delayed cuda initialization */
 static opal_mutex_t accelerator_cuda_init_lock;
 static bool accelerator_cuda_init_complete = false;
 
+float *opal_accelerator_cuda_mem_bw = NULL;
+
 #define STRINGIFY2(x) #x
 #define STRINGIFY(x)  STRINGIFY2(x)
 
@@ -122,6 +130,7 @@ static int accelerator_cuda_component_register(void)
 int opal_accelerator_cuda_delayed_init()
 {
     int result = OPAL_SUCCESS;
+    int prio_lo, prio_hi;
     CUcontext cuContext;
 
     /* Double checked locking to avoid having to
@@ -137,6 +146,8 @@ int opal_accelerator_cuda_delayed_init()
         goto out;
     }
 
+    cuDeviceGetCount(&opal_accelerator_cuda_num_devices);
+
     /* Check to see if this process is running in a CUDA context.  If
      * so, all is good.  If not, then disable registration of memory. */
     result = cuCtxGetCurrent(&cuContext);
@@ -145,31 +156,115 @@ int opal_accelerator_cuda_delayed_init()
         goto out;
     } else if ((CUDA_SUCCESS == result) && (NULL == cuContext)) {
         opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent returned NULL context");
-        result = OPAL_ERROR;
-        goto out;
+
+        /* create a context for each device */
+        for (int i = 0; i < opal_accelerator_cuda_num_devices; ++i) {
+            CUdevice dev;
+            result = cuDeviceGet(&dev, i);
+            if (CUDA_SUCCESS != result) {
+                opal_output_verbose(20, opal_accelerator_base_framework.framework_output,
+                                    "CUDA: cuDeviceGet failed");
+                result = OPAL_ERROR;
+                goto out;
+            }
+            result = cuDevicePrimaryCtxRetain(&cuContext, dev);
+            if (CUDA_SUCCESS != result) {
+                opal_output_verbose(20, opal_accelerator_base_framework.framework_output,
+                                    "CUDA: cuDevicePrimaryCtxRetain failed");
+                result = OPAL_ERROR;
+                goto out;
+            }
+            if (0 == i) {
+                result = cuCtxPushCurrent(cuContext);
+                if (CUDA_SUCCESS != result) {
+                    opal_output_verbose(20, opal_accelerator_base_framework.framework_output,
+                                        "CUDA: cuCtxPushCurrent failed");
+                    result = OPAL_ERROR;
+                    goto out;
+                }
+            }
+        }
+
+
     } else {
         opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent succeeded");
     }
 
     /* Create stream for use in cuMemcpyAsync synchronous copies */
-    result = cuStreamCreate(&opal_accelerator_cuda_memcpy_stream, 0);
+    CUstream memcpy_stream;
+    result = cuStreamCreate(&memcpy_stream, 0);
+    if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) {
+        opal_show_help("help-accelerator-cuda.txt", "cuStreamCreate failed", true,
+                       OPAL_PROC_MY_HOSTNAME, result);
+        goto out;
+    }
+    OBJ_CONSTRUCT(&opal_accelerator_cuda_memcpy_stream, opal_accelerator_cuda_stream_t);
+    opal_accelerator_cuda_memcpy_stream.base.stream = malloc(sizeof(CUstream));
+    *(CUstream*)opal_accelerator_cuda_memcpy_stream.base.stream = memcpy_stream;
+
+    /* Create stream for use in cuMemAllocAsync synchronous copies */
+    result = cuStreamCreate(&opal_accelerator_cuda_alloc_stream, 0);
     if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) {
         opal_show_help("help-accelerator-cuda.txt", "cuStreamCreate failed", true,
                        OPAL_PROC_MY_HOSTNAME, result);
         goto out;
     }
 
+    /* Create a default stream to be used by various components.
+     * We try to create a high-priority stream and fall back to a regular stream.
+     */
+    CUstream *default_stream = malloc(sizeof(CUstream));
+    result = cuCtxGetStreamPriorityRange(&prio_lo, &prio_hi);
+    if (CUDA_SUCCESS != result) {
+        result = cuStreamCreateWithPriority(default_stream,
+                                            CU_STREAM_NON_BLOCKING, prio_hi);
+    } else {
+        result = cuStreamCreate(default_stream, 0);
+    }
+    if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) {
+        opal_show_help("help-accelerator-cuda.txt", "cuStreamCreate failed", true,
+                       OPAL_PROC_MY_HOSTNAME, result);
+        goto out;
+    }
+    OBJ_CONSTRUCT(&opal_accelerator_cuda_default_stream, opal_accelerator_cuda_stream_t);
+    opal_accelerator_cuda_default_stream.base.stream = default_stream;
+
     result = cuMemHostRegister(&checkmem, sizeof(int), 0);
     if (result != CUDA_SUCCESS) {
         /* If registering the memory fails, print a message and continue.
          * This is not a fatal error. */
         opal_show_help("help-accelerator-cuda.txt", "cuMemHostRegister during init failed", true,
                        &checkmem, sizeof(int), OPAL_PROC_MY_HOSTNAME, result, "checkmem");
-
     } else {
         opal_output_verbose(20, opal_accelerator_base_framework.framework_output,
                             "CUDA: cuMemHostRegister OK on test region");
     }
+
+    /* determine the memory bandwidth */
+    opal_accelerator_cuda_mem_bw = malloc(sizeof(float)*opal_accelerator_cuda_num_devices);
+    for (int i = 0; i < opal_accelerator_cuda_num_devices; ++i) {
+        CUdevice dev;
+        result = cuDeviceGet(&dev, i);
+        if (CUDA_SUCCESS != result) {
+            opal_output_verbose(20, opal_accelerator_base_framework.framework_output,
+                                "CUDA: cuDeviceGet failed");
+            goto out;
+        }
+        int mem_clock_rate; // kHz
+        result = cuDeviceGetAttribute(&mem_clock_rate,
+                                CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
+                                dev);
+        int bus_width; // bit
+        result = cuDeviceGetAttribute(&bus_width,
+                                CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH,
+                                dev);
+        /* bw = clock_rate * bus width * 2bit multiplier
+         * See https://forums.developer.nvidia.com/t/memory-clock-rate/107940
+         */
+        float bw = ((float)mem_clock_rate*(float)bus_width*2.0) / 1024 / 1024 / 8;
+        opal_accelerator_cuda_mem_bw[i] = bw;
+    }
+
     result = OPAL_SUCCESS;
     opal_atomic_wmb();
     accelerator_cuda_init_complete = true;
@@ -182,6 +277,9 @@ static opal_accelerator_base_module_t* accelerator_cuda_init(void)
 {
     OBJ_CONSTRUCT(&opal_accelerator_cuda_stream_lock, opal_mutex_t);
     OBJ_CONSTRUCT(&accelerator_cuda_init_lock, opal_mutex_t);
+    OBJ_CONSTRUCT(&opal_accelerator_cuda_default_stream, opal_accelerator_stream_t);
+    OBJ_CONSTRUCT(&opal_accelerator_cuda_memcpy_stream, opal_accelerator_stream_t);
+
     /* First check if the support is enabled.  In the case that the user has
      * turned it off, we do not need to continue with any CUDA specific
      * initialization.  Do this after MCA parameter registration. */
@@ -205,9 +303,19 @@ static void accelerator_cuda_finalize(opal_accelerator_base_module_t* module)
     if (CUDA_SUCCESS != result) {
         ctx_ok = 0;
     }
-    if ((NULL != opal_accelerator_cuda_memcpy_stream) && ctx_ok) {
-        cuStreamDestroy(opal_accelerator_cuda_memcpy_stream);
+
+    if ((NULL != opal_accelerator_cuda_memcpy_stream.base.stream) && ctx_ok) {
+        OBJ_DESTRUCT(&opal_accelerator_cuda_memcpy_stream);
     }
+    if ((NULL != opal_accelerator_cuda_alloc_stream) && ctx_ok) {
+        cuStreamDestroy(opal_accelerator_cuda_alloc_stream);
+    }
+    if ((NULL != opal_accelerator_cuda_default_stream.base.stream) && ctx_ok) {
+        OBJ_DESTRUCT(&opal_accelerator_cuda_default_stream);
+    }
+
+    free(opal_accelerator_cuda_mem_bw);
+    opal_accelerator_cuda_mem_bw = NULL;
 
     OBJ_DESTRUCT(&opal_accelerator_cuda_stream_lock);
     OBJ_DESTRUCT(&accelerator_cuda_init_lock);
diff --git a/opal/mca/accelerator/null/accelerator_null_component.c b/opal/mca/accelerator/null/accelerator_null_component.c
index 1bd6e0e2811..1d76790c149 100644
--- a/opal/mca/accelerator/null/accelerator_null_component.c
+++ b/opal/mca/accelerator/null/accelerator_null_component.c
@@ -9,6 +9,9 @@
  * Copyright (c)           Amazon.com, Inc. or its affiliates.
  *                         All Rights reserved.
  * Copyright (c) 2023      Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2024      The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -28,6 +31,8 @@
 const char *opal_accelerator_null_component_version_string
     = "OPAL null accelerator MCA component version " OPAL_VERSION;
 
+static opal_accelerator_stream_t default_stream;
+
 /*
  * Component API functions
  */
@@ -40,6 +45,7 @@ static void accelerator_null_finalize(opal_accelerator_base_module_t* module);
 /* Accelerator API's */
 static int accelerator_null_check_addr(const void *addr, int *dev_id, uint64_t *flags);
 
+static int accelerator_null_get_default_stream(int dev_id, opal_accelerator_stream_t **stream);
 static int accelerator_null_create_stream(int dev_id, opal_accelerator_stream_t **stream);
 static int accelerator_null_create_event(int dev_id, opal_accelerator_event_t **event, bool enable_ipc);
 static int accelerator_null_record_event(int dev_id, opal_accelerator_event_t *event, opal_accelerator_stream_t *stream);
@@ -50,11 +56,15 @@ static int accelerator_null_memcpy_async(int dest_dev_id, int src_dev_id, void *
                                          opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type);
 static int accelerator_null_memcpy(int dest_dev_id, int src_dev_id, void *dest, const void *src,
                                    size_t size, opal_accelerator_transfer_type_t type);
+static int accelerator_null_memmove_async(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size,
+                                          opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type);
 static int accelerator_null_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size,
                                     opal_accelerator_transfer_type_t type);
 
 static int accelerator_null_mem_alloc(int dev_id, void **ptr, size_t size);
 static int accelerator_null_mem_release(int dev_id, void *ptr);
+static int accelerator_null_mem_alloc_stream(int dev_id, void **ptr, size_t size, opal_accelerator_stream_t* stream);
+static int accelerator_null_mem_release_stream(int dev_id, void *ptr, opal_accelerator_stream_t *stream);
 static int accelerator_null_get_address_range(int dev_id, const void *ptr, void **base, size_t *size);
 
 static bool accelerator_null_is_ipc_enabled(void);
@@ -82,6 +92,12 @@ static int accelerator_null_device_can_access_peer(int *access, int dev1, int de
 
 static int accelerator_null_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id);
 
+static int accelerator_null_wait_stream(opal_accelerator_stream_t *stream);
+
+static int accelerator_null_get_num_devices(int *num_devices);
+
+static int accelerator_null_get_mem_bw(int device, float *bw);
+
 /*
  * Instantiate the public struct with all of our public information
  * and pointers to our public functions in it
@@ -122,6 +138,8 @@ opal_accelerator_null_component_t mca_accelerator_null_component = {{
 
 opal_accelerator_base_module_t opal_accelerator_null_module =
 {
+    accelerator_null_get_default_stream,
+
     accelerator_null_check_addr,
 
     accelerator_null_create_stream,
@@ -133,9 +151,12 @@ opal_accelerator_base_module_t opal_accelerator_null_module =
 
     accelerator_null_memcpy_async,
     accelerator_null_memcpy,
+    accelerator_null_memmove_async,
     accelerator_null_memmove,
     accelerator_null_mem_alloc,
     accelerator_null_mem_release,
+    accelerator_null_mem_alloc_stream,
+    accelerator_null_mem_release_stream,
     accelerator_null_get_address_range,
 
     accelerator_null_is_ipc_enabled,
@@ -154,7 +175,11 @@ opal_accelerator_base_module_t opal_accelerator_null_module =
     accelerator_null_get_device_pci_attr,
     accelerator_null_device_can_access_peer,
 
-    accelerator_null_get_buffer_id
+    accelerator_null_get_buffer_id,
+
+    accelerator_null_wait_stream,
+    accelerator_null_get_num_devices,
+    accelerator_null_get_mem_bw
 };
 
 static int accelerator_null_open(void)
@@ -189,6 +214,12 @@ static int accelerator_null_check_addr(const void *addr, int *dev_id, uint64_t *
     return 0;
 }
 
+static int accelerator_null_get_default_stream(int dev_id, opal_accelerator_stream_t **stream)
+{
+    *stream = &default_stream;
+    return OPAL_SUCCESS;
+}
+
 static int accelerator_null_create_stream(int dev_id, opal_accelerator_stream_t **stream)
 {
     *stream = OBJ_NEW(opal_accelerator_stream_t);
@@ -237,6 +268,13 @@ static int accelerator_null_memmove(int dest_dev_id, int src_dev_id, void *dest,
     return OPAL_SUCCESS;
 }
 
+static int accelerator_null_memmove_async(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size,
+                                          opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type)
+{
+    memmove(dest, src, size);
+    return OPAL_SUCCESS;
+}
+
 static int accelerator_null_mem_alloc(int dev_id, void **ptr, size_t size)
 {
     *ptr = malloc(size);
@@ -249,6 +287,23 @@ static int accelerator_null_mem_release(int dev_id, void *ptr)
     return OPAL_SUCCESS;
 }
 
+
+static int accelerator_null_mem_alloc_stream(int dev_id, void **ptr, size_t size,
+                                             opal_accelerator_stream_t *stream)
+{
+    (void)stream;
+    *ptr = malloc(size);
+    return OPAL_SUCCESS;
+}
+
+static int accelerator_null_mem_release_stream(int dev_id, void *ptr,
+                                               opal_accelerator_stream_t *stream)
+{
+    (void)stream;
+    free(ptr);
+    return OPAL_SUCCESS;
+}
+
 static int accelerator_null_get_address_range(int dev_id, const void *ptr, void **base,
                                               size_t *size)
 {
@@ -331,3 +386,21 @@ static int accelerator_null_get_buffer_id(int dev_id, const void *addr, opal_acc
 {
     return OPAL_ERR_NOT_IMPLEMENTED;
 }
+
+static int accelerator_null_wait_stream(opal_accelerator_stream_t *stream)
+{
+    return OPAL_SUCCESS;
+}
+
+static int accelerator_null_get_num_devices(int *num_devices)
+{
+    *num_devices = 0;
+    return OPAL_SUCCESS;
+}
+
+
+static int accelerator_null_get_mem_bw(int device, float *bw)
+{
+    *bw = 1.0; // return something that is not 0
+    return OPAL_SUCCESS;
+}
diff --git a/opal/mca/accelerator/rocm/accelerator_rocm.h b/opal/mca/accelerator/rocm/accelerator_rocm.h
index 38409778ad4..510702cdb4e 100644
--- a/opal/mca/accelerator/rocm/accelerator_rocm.h
+++ b/opal/mca/accelerator/rocm/accelerator_rocm.h
@@ -1,5 +1,8 @@
 /*
  * Copyright (c) 2022-2023  Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2024      The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
  *
  * $COPYRIGHT$
  *
@@ -67,12 +70,18 @@ struct opal_accelerator_rocm_ipc_event_handle_t {
 typedef struct opal_accelerator_rocm_ipc_event_handle_t opal_accelerator_rocm_ipc_event_handle_t;
 OBJ_CLASS_DECLARATION(opal_accelerator_rocm_ipc_event_handle_t);
 
-OPAL_DECLSPEC extern hipStream_t opal_accelerator_rocm_MemcpyStream;
+OPAL_DECLSPEC extern hipStream_t *opal_accelerator_rocm_MemcpyStream;
 OPAL_DECLSPEC extern int opal_accelerator_rocm_memcpy_async;
 OPAL_DECLSPEC extern int opal_accelerator_rocm_verbose;
 OPAL_DECLSPEC extern size_t opal_accelerator_rocm_memcpyH2D_limit;
 OPAL_DECLSPEC extern size_t opal_accelerator_rocm_memcpyD2H_limit;
 
+OPAL_DECLSPEC extern hipStream_t opal_accelerator_alloc_stream;
+OPAL_DECLSPEC extern opal_accelerator_rocm_stream_t opal_accelerator_rocm_default_stream;
+OPAL_DECLSPEC extern opal_mutex_t opal_accelerator_rocm_stream_lock;
+OPAL_DECLSPEC extern int opal_accelerator_rocm_num_devices;
+OPAL_DECLSPEC extern float *opal_accelerator_rocm_mem_bw;
+
 OPAL_DECLSPEC extern int opal_accelerator_rocm_lazy_init(void);
 
 #endif
diff --git a/opal/mca/accelerator/rocm/accelerator_rocm_component.c b/opal/mca/accelerator/rocm/accelerator_rocm_component.c
index 8f1bbbb53a5..28f2d7506c4 100644
--- a/opal/mca/accelerator/rocm/accelerator_rocm_component.c
+++ b/opal/mca/accelerator/rocm/accelerator_rocm_component.c
@@ -19,7 +19,9 @@
 #include <stdio.h>
 
 #include "opal/mca/dl/base/base.h"
+#include "opal/mca/accelerator/base/base.h"
 #include "opal/runtime/opal_params.h"
+#include "opal/util/proc.h"
 #include "accelerator_rocm.h"
 
 int opal_accelerator_rocm_memcpy_async = 0;
@@ -31,7 +33,13 @@ size_t opal_accelerator_rocm_memcpyH2D_limit=1048576;
 static opal_mutex_t accelerator_rocm_init_lock;
 static bool accelerator_rocm_init_complete = false;
 
-hipStream_t opal_accelerator_rocm_MemcpyStream = NULL;
+/* Define global variables, used in accelerator_rocm.c */
+hipStream_t opal_accelerator_rocm_alloc_stream = NULL;
+opal_accelerator_rocm_stream_t opal_accelerator_rocm_default_stream = {0};
+opal_mutex_t opal_accelerator_rocm_stream_lock = {0};
+int opal_accelerator_rocm_num_devices = 0;
+float *opal_accelerator_rocm_mem_bw = NULL;
+hipStream_t *opal_accelerator_rocm_MemcpyStream = NULL;
 
 /*
  * Public string showing the accelerator rocm component version number
@@ -159,6 +167,7 @@ static int accelerator_rocm_component_register(void)
 
 int opal_accelerator_rocm_lazy_init()
 {
+    int prio_hi, prio_lo;
     int err = OPAL_SUCCESS;
 
     /* Double checked locking to avoid having to
@@ -174,13 +183,62 @@ int opal_accelerator_rocm_lazy_init()
         goto out;
     }
 
-    err = hipStreamCreate(&opal_accelerator_rocm_MemcpyStream);
+    hipGetDeviceCount(&opal_accelerator_rocm_num_devices);
+
+    /* Create stream for use in cuMemcpyAsync synchronous copies */
+    hipStream_t memcpy_stream;
+    err = hipStreamCreate(&memcpy_stream);
+    if (OPAL_UNLIKELY(err != hipSuccess)) {
+        opal_show_help("help-accelerator-rocm.txt", "hipStreamCreateWithFlags failed", true,
+                       OPAL_PROC_MY_HOSTNAME, err);
+        goto out;
+    }
+    opal_accelerator_rocm_MemcpyStream = malloc(sizeof(hipStream_t));
+    *(hipStream_t*)opal_accelerator_rocm_MemcpyStream = memcpy_stream;
+
+    /* Create stream for use in cuMemcpyAsync synchronous copies */
+    err = hipStreamCreateWithFlags(&opal_accelerator_rocm_alloc_stream, 0);
+    if (OPAL_UNLIKELY(err != hipSuccess)) {
+        opal_show_help("help-accelerator-rocm.txt", "hipStreamCreateWithFlags failed", true,
+                       OPAL_PROC_MY_HOSTNAME, err);
+        goto out;
+    }
+
+    /* Create a default stream to be used by various components.
+     * We try to create a high-priority stream and fall back to a regular stream.
+     */
+    hipStream_t *default_stream = malloc(sizeof(hipStream_t));
+    err = hipDeviceGetStreamPriorityRange(&prio_lo, &prio_hi);
     if (hipSuccess != err) {
+        err = hipStreamCreateWithPriority(default_stream,
+                                            hipStreamNonBlocking, prio_hi);
+    } else {
+        err = hipStreamCreateWithFlags(default_stream, 0);
+    }
+    if (OPAL_UNLIKELY(err != hipSuccess)) {
         opal_output(0, "Could not create hipStream, err=%d %s\n",
                 err, hipGetErrorString(err));
         goto out;
     }
-
+    OBJ_CONSTRUCT(&opal_accelerator_rocm_default_stream, opal_accelerator_rocm_stream_t);
+    opal_accelerator_rocm_default_stream.base.stream = default_stream;
+
+    opal_accelerator_rocm_mem_bw = malloc(sizeof(float)*opal_accelerator_rocm_num_devices);
+    for (int i = 0; i < opal_accelerator_rocm_num_devices; ++i) {
+        int mem_clock_rate; // kHz
+        err = hipDeviceGetAttribute(&mem_clock_rate,
+                                    hipDeviceAttributeMemoryClockRate,
+                                    i);
+        int bus_width; // bit
+        err = hipDeviceGetAttribute(&bus_width,
+                                    hipDeviceAttributeMemoryBusWidth,
+                                    i);
+        /* bw = clock_rate * bus width * 2bit multiplier
+         * See https://forums.developer.nvidia.com/t/memory-clock-rate/107940
+         */
+        float bw = ((float)mem_clock_rate*(float)bus_width*2.0) / 1024 / 1024 / 8;
+        opal_accelerator_rocm_mem_bw[i] = bw;
+    }
     err = OPAL_SUCCESS;
     opal_atomic_wmb();
     accelerator_rocm_init_complete = true;
@@ -192,7 +250,8 @@ int opal_accelerator_rocm_lazy_init()
 static opal_accelerator_base_module_t* accelerator_rocm_init(void)
 {
     OBJ_CONSTRUCT(&accelerator_rocm_init_lock, opal_mutex_t);
-    
+    OBJ_CONSTRUCT(&opal_accelerator_rocm_stream_lock, opal_mutex_t);
+
     hipError_t err;
 
     if (opal_rocm_runtime_initialized) {
@@ -214,14 +273,20 @@ static opal_accelerator_base_module_t* accelerator_rocm_init(void)
 
 static void accelerator_rocm_finalize(opal_accelerator_base_module_t* module)
 {
-    if (NULL != (void*)opal_accelerator_rocm_MemcpyStream) {
-        hipError_t err = hipStreamDestroy(opal_accelerator_rocm_MemcpyStream);
+    if (NULL != opal_accelerator_rocm_MemcpyStream) {
+        hipError_t err = hipStreamDestroy(*opal_accelerator_rocm_MemcpyStream);
         if (hipSuccess != err) {
             opal_output_verbose(10, 0, "hip_dl_finalize: error while destroying the hipStream\n");
         }
+        free(opal_accelerator_rocm_MemcpyStream);
         opal_accelerator_rocm_MemcpyStream = NULL;
+
+        OBJ_DESTRUCT(&opal_accelerator_rocm_default_stream);
+        free(opal_accelerator_rocm_mem_bw);
+        opal_accelerator_rocm_mem_bw = NULL;
     }
 
     OBJ_DESTRUCT(&accelerator_rocm_init_lock);
+    OBJ_DESTRUCT(&opal_accelerator_rocm_stream_lock);
     return;
 }
diff --git a/opal/mca/accelerator/rocm/accelerator_rocm_module.c b/opal/mca/accelerator/rocm/accelerator_rocm_module.c
index 6db5e0d4927..66c94b1b01a 100644
--- a/opal/mca/accelerator/rocm/accelerator_rocm_module.c
+++ b/opal/mca/accelerator/rocm/accelerator_rocm_module.c
@@ -1,6 +1,9 @@
 /*
  * Copyright (c) 2022-2023  Advanced Micro Devices, Inc. All Rights reserved.
  * $COPYRIGHT$
+ * Copyright (c) 2024      The University of Tennessee and The University
+ *                         of Tennessee Research Foundation.  All rights
+ *                         reserved.
  *
  * Additional copyrights may follow
  *
@@ -15,6 +18,7 @@
 #include "opal/util/output.h"
 
 /* Accelerator API's */
+static int mca_accelerator_rocm_get_default_stream(int dev_id, opal_accelerator_stream_t **stream);
 static int mca_accelerator_rocm_check_addr(const void *addr, int *dev_id, uint64_t *flags);
 static int mca_accelerator_rocm_create_stream(int dev_id, opal_accelerator_stream_t **stream);
 
@@ -27,10 +31,17 @@ static int mca_accelerator_rocm_memcpy_async(int dest_dev_id, int src_dev_id, vo
                                   opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type);
 static int mca_accelerator_rocm_memcpy(int dest_dev_id, int src_dev_id, void *dest, const void *src,
                             size_t size, opal_accelerator_transfer_type_t type);
+static int mca_accelerator_rocm_memmove_async(int dest_dev_id, int src_dev_id, void *dest,
+                                              const void *src, size_t size,
+                                              opal_accelerator_stream_t *stream,
+                                              opal_accelerator_transfer_type_t type);
 static int mca_accelerator_rocm_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size,
                                         opal_accelerator_transfer_type_t type);
 static int mca_accelerator_rocm_mem_alloc(int dev_id, void **ptr, size_t size);
 static int mca_accelerator_rocm_mem_release(int dev_id, void *ptr);
+static int mca_accelerator_rocm_mem_alloc_stream(int dev_id, void **ptr, size_t size,
+                                                 opal_accelerator_stream_t *stream);
+static int mca_accelerator_rocm_mem_release_stream(int dev_id, void *ptr, opal_accelerator_stream_t *stream);
 static int mca_accelerator_rocm_get_address_range(int dev_id, const void *ptr, void **base,
                                                   size_t *size);
 
@@ -59,11 +70,17 @@ static int mca_accelerator_rocm_device_can_access_peer( int *access, int dev1, i
 
 static int mca_accelerator_rocm_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id);
 
+static int mca_accelerator_rocm_wait_stream(opal_accelerator_stream_t *stream);
+
+static int mca_accelerator_rocm_get_num_devices(int *num_devices);
+
+static int mca_accelerator_rocm_get_mem_bw(int device, float *bw);
 
 #define GET_STREAM(_stream) (_stream == MCA_ACCELERATOR_STREAM_DEFAULT ? 0 : *((hipStream_t *)_stream->stream))
 
 opal_accelerator_base_module_t opal_accelerator_rocm_module =
 {
+    mca_accelerator_rocm_get_default_stream,
     mca_accelerator_rocm_check_addr,
 
     mca_accelerator_rocm_create_stream,
@@ -75,9 +92,12 @@ opal_accelerator_base_module_t opal_accelerator_rocm_module =
 
     mca_accelerator_rocm_memcpy_async,
     mca_accelerator_rocm_memcpy,
+    mca_accelerator_rocm_memmove_async,
     mca_accelerator_rocm_memmove,
     mca_accelerator_rocm_mem_alloc,
     mca_accelerator_rocm_mem_release,
+    mca_accelerator_rocm_mem_alloc_stream,
+    mca_accelerator_rocm_mem_release_stream,
     mca_accelerator_rocm_get_address_range,
 
     mca_accelerator_rocm_is_ipc_enabled,
@@ -96,7 +116,11 @@ opal_accelerator_base_module_t opal_accelerator_rocm_module =
     mca_accelerator_rocm_get_device_pci_attr,
     mca_accelerator_rocm_device_can_access_peer,
 
-    mca_accelerator_rocm_get_buffer_id
+    mca_accelerator_rocm_get_buffer_id,
+
+    mca_accelerator_rocm_wait_stream,
+    mca_accelerator_rocm_get_num_devices,
+    mca_accelerator_rocm_get_mem_bw
 };
 
 
@@ -123,6 +147,10 @@ static int mca_accelerator_rocm_check_addr (const void *addr, int *dev_id, uint6
             opal_accelerator_rocm_lazy_init();
             *dev_id = srcAttr.device;
             ret = 1;
+            // TODO: on Frontier the host can access any device memory
+            // is that true everywhere?
+            //*flags |= MCA_ACCELERATOR_FLAGS_UNIFIED_MEMORY;
+            //*dev_id = srcAttr.device;
 #if HIP_VERSION >= 50731921
         } else if (hipMemoryTypeUnified == srcAttr.type) {
 #else
@@ -138,6 +166,16 @@ static int mca_accelerator_rocm_check_addr (const void *addr, int *dev_id, uint6
     return ret;
 }
 
+static int mca_accelerator_rocm_get_default_stream(int dev_id, opal_accelerator_stream_t **stream)
+{
+    int delayed_init = opal_accelerator_rocm_lazy_init();
+    if (OPAL_UNLIKELY(0 != delayed_init)) {
+        return delayed_init;
+    }
+    *stream = &opal_accelerator_rocm_default_stream.base;
+    return OPAL_SUCCESS;
+}
+
 static int mca_accelerator_rocm_create_stream(int dev_id, opal_accelerator_stream_t **stream)
 {
     if (NULL == stream) {
@@ -233,7 +271,7 @@ OBJ_CLASS_INSTANCE(
     opal_accelerator_event_t,
     NULL,
     mca_accelerator_rocm_event_destruct);
- 
+
 static int mca_accelerator_rocm_record_event(int dev_id, opal_accelerator_event_t *event,
                                              opal_accelerator_stream_t *stream)
 {
@@ -348,14 +386,14 @@ static int mca_accelerator_rocm_memcpy(int dest_dev_id, int src_dev_id, void *de
 
     if (opal_accelerator_rocm_memcpy_async) {
         err = hipMemcpyAsync(dest, src, size, hipMemcpyDefault,
-                                       opal_accelerator_rocm_MemcpyStream);
+                                       *opal_accelerator_rocm_MemcpyStream);
         if (hipSuccess != err ) {
             opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
                                 "error starting async copy\n");
             return OPAL_ERROR;
         }
 
-        err = hipStreamSynchronize(opal_accelerator_rocm_MemcpyStream);
+        err = hipStreamSynchronize(*opal_accelerator_rocm_MemcpyStream);
         if (hipSuccess != err ) {
             opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
                                 "error synchronizing stream after async copy\n");
@@ -373,6 +411,43 @@ static int mca_accelerator_rocm_memcpy(int dest_dev_id, int src_dev_id, void *de
     return OPAL_SUCCESS;
 }
 
+static int mca_accelerator_rocm_memmove_async(int dest_dev_id, int src_dev_id, void *dest, const void *src,
+                                              size_t size, opal_accelerator_stream_t *stream,
+                                              opal_accelerator_transfer_type_t type)
+{
+    hipDeviceptr_t tmp;
+    hipError_t result;
+    void *ptr;
+
+    int delayed_init = opal_accelerator_rocm_lazy_init();
+    if (OPAL_UNLIKELY(0 != delayed_init)) {
+        return delayed_init;
+    }
+
+    if (NULL == dest || NULL == src || size <= 0) {
+        return OPAL_ERR_BAD_PARAM;
+    }
+
+    result = mca_accelerator_rocm_mem_alloc_stream(src_dev_id, &ptr, size, stream);
+    if (OPAL_UNLIKELY(OPAL_SUCCESS != result)) {
+        return OPAL_ERROR;
+    }
+    tmp = (hipDeviceptr_t)ptr;
+    result = hipMemcpyAsync(tmp, (hipDeviceptr_t) src, size, hipMemcpyDefault, *(hipStream_t*)stream->stream);
+    if (OPAL_UNLIKELY(hipSuccess != result)) {
+        opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
+                            "error during synchronous copy\n");
+        return OPAL_ERROR;
+    }
+    result = hipMemcpyAsync((hipDeviceptr_t) dest, tmp, size, hipMemcpyDefault, *(hipStream_t*)stream->stream);
+    if (OPAL_UNLIKELY(hipSuccess != result)) {
+        opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
+                            "error during synchronous copy\n");
+        return OPAL_ERROR;
+    }
+    return mca_accelerator_rocm_mem_release_stream(src_dev_id, ptr, stream);
+}
+
 static int mca_accelerator_rocm_memmove(int dest_dev_id, int src_dev_id, void *dest,
                                         const void *src, size_t size,
                                         opal_accelerator_transfer_type_t type)
@@ -393,7 +468,7 @@ static int mca_accelerator_rocm_memmove(int dest_dev_id, int src_dev_id, void *d
 
     if (opal_accelerator_rocm_memcpy_async) {
         err = hipMemcpyAsync(tmp, src, size, hipMemcpyDefault,
-                                       opal_accelerator_rocm_MemcpyStream);
+                                       *opal_accelerator_rocm_MemcpyStream);
         if (hipSuccess != err ) {
             opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
                                 "error in async memcpy for memmove\n");
@@ -401,14 +476,14 @@ static int mca_accelerator_rocm_memmove(int dest_dev_id, int src_dev_id, void *d
         }
 
         err = hipMemcpyAsync(dest, tmp, size, hipMemcpyDefault,
-                                       opal_accelerator_rocm_MemcpyStream);
+                                       *opal_accelerator_rocm_MemcpyStream);
         if (hipSuccess != err ) {
             opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
                                 "error in async memcpy for memmove\n");
             return OPAL_ERROR;
         }
 
-        err = hipStreamSynchronize(opal_accelerator_rocm_MemcpyStream);
+        err = hipStreamSynchronize(*opal_accelerator_rocm_MemcpyStream);
         if (hipSuccess != err ) {
             opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
                                 "error synchronizing stream for memmove\n");
@@ -535,7 +610,7 @@ static int mca_accelerator_rocm_get_ipc_handle(int dev_id, void *dev_ptr,
                             "Error in hipIpcGetMemHandle dev_ptr %p", dev_ptr);
         OBJ_DESTRUCT(rocm_handle);
         return OPAL_ERROR;
-    }    
+    }
     memcpy(rocm_handle->base.handle, &rocm_ipc_handle, IPC_MAX_HANDLE_SIZE);
 
     return OPAL_SUCCESS;
@@ -597,7 +672,7 @@ static int mca_accelerator_rocm_compare_ipc_handles(uint8_t handle_1[IPC_MAX_HAN
 
 static void mca_accelerator_rocm_ipc_event_handle_destruct(opal_accelerator_rocm_ipc_handle_t *handle)
 {
-    // Just a place holder, there is no hipIpcCloseEventHandle. 
+    // Just a place holder, there is no hipIpcCloseEventHandle.
 }
 
 OBJ_CLASS_INSTANCE(
@@ -617,7 +692,7 @@ static int mca_accelerator_rocm_get_ipc_event_handle(opal_accelerator_event_t *e
     hipIpcEventHandle_t rocm_ipc_handle;
     opal_accelerator_rocm_ipc_event_handle_t *rocm_handle = (opal_accelerator_rocm_ipc_event_handle_t *) handle;
     OBJ_CONSTRUCT(rocm_handle, opal_accelerator_rocm_ipc_event_handle_t);
-    
+
     memset(rocm_ipc_handle.reserved, 0, HIP_IPC_HANDLE_SIZE);
     hipError_t err = hipIpcGetEventHandle(&rocm_ipc_handle,
                                           *((hipEvent_t *)event->event));
@@ -626,7 +701,7 @@ static int mca_accelerator_rocm_get_ipc_event_handle(opal_accelerator_event_t *e
                             "error in hipIpcGetEventHandle");
         OBJ_DESTRUCT(rocm_handle);
         return OPAL_ERROR;
-    }    
+    }
     memcpy(rocm_handle->base.handle, &rocm_ipc_handle, IPC_MAX_HANDLE_SIZE);
 
     return OPAL_SUCCESS;
@@ -664,7 +739,7 @@ static int mca_accelerator_rocm_open_ipc_event_handle(opal_accelerator_ipc_event
         opal_output_verbose(10, opal_accelerator_base_framework.framework_output,
                             "error in hipIpcOpenEventHandle");
         return OPAL_ERROR;
-    }    
+    }
 
     return OPAL_SUCCESS;
 }
@@ -802,3 +877,81 @@ static int mca_accelerator_rocm_get_buffer_id(int dev_id, const void *addr, opal
 #endif
     return OPAL_SUCCESS;
 }
+
+
+static int mca_accelerator_rocm_mem_alloc_stream(
+    int dev_id,
+    void **addr,
+    size_t size,
+    opal_accelerator_stream_t *stream)
+{
+    hipError_t result;
+
+    int delayed_init = opal_accelerator_rocm_lazy_init();
+    if (OPAL_UNLIKELY(0 != delayed_init)) {
+        return delayed_init;
+    }
+
+    if (NULL == stream || NULL == addr || 0 == size) {
+        return OPAL_ERR_BAD_PARAM;
+    }
+
+    result = hipMallocAsync(addr, size, *(hipStream_t*)stream->stream);
+    if (OPAL_UNLIKELY(hipSuccess != result)) {
+        opal_show_help("help-accelerator-rocm.txt", "hipMalloc failed", true,
+                        OPAL_PROC_MY_HOSTNAME, result);
+        return OPAL_ERROR;
+    }
+    return OPAL_SUCCESS;
+}
+
+static int mca_accelerator_rocm_mem_release_stream(
+    int dev_id,
+    void *addr,
+    opal_accelerator_stream_t *stream)
+{
+    hipError_t result;
+
+    if (NULL == stream || NULL == addr) {
+        return OPAL_ERR_BAD_PARAM;
+    }
+
+    result = hipFreeAsync(addr, *(hipStream_t*)stream->stream);
+    if (OPAL_UNLIKELY(hipSuccess != result)) {
+        opal_show_help("help-accelerator-rocm.txt", "hipMalloc failed", true,
+                        OPAL_PROC_MY_HOSTNAME, result);
+        return OPAL_ERROR;
+    }
+    return OPAL_SUCCESS;
+}
+
+static int mca_accelerator_rocm_wait_stream(opal_accelerator_stream_t *stream)
+{
+    hipError_t result;
+    result = hipStreamSynchronize(*(hipStream_t*)stream->stream);
+    if (OPAL_UNLIKELY(hipSuccess != result)) {
+        opal_show_help("help-accelerator-rocm.txt", "hipStreamSynchronize failed", true,
+                       OPAL_PROC_MY_HOSTNAME, result);
+        return OPAL_ERROR;
+    }
+    return OPAL_SUCCESS;
+}
+
+
+static int mca_accelerator_rocm_get_num_devices(int *num_devices)
+{
+    *num_devices = opal_accelerator_rocm_num_devices;
+    return OPAL_SUCCESS;
+}
+
+static int mca_accelerator_rocm_get_mem_bw(int device, float *bw)
+{
+    int delayed_init = opal_accelerator_rocm_lazy_init();
+    if (OPAL_UNLIKELY(0 != delayed_init)) {
+        return delayed_init;
+    }
+    assert(opal_accelerator_rocm_mem_bw != NULL);
+
+    *bw = opal_accelerator_rocm_mem_bw[device];
+    return OPAL_SUCCESS;
+}
\ No newline at end of file
diff --git a/opal/mca/accelerator/ze/accelerator_ze_module.c b/opal/mca/accelerator/ze/accelerator_ze_module.c
index 38b49cf4290..33edab901b7 100644
--- a/opal/mca/accelerator/ze/accelerator_ze_module.c
+++ b/opal/mca/accelerator/ze/accelerator_ze_module.c
@@ -20,6 +20,7 @@
 #include "opal/util/output.h"
 
 /* Accelerator API's */
+static int mca_accelerator_ze_get_default_stream(int dev_id, opal_accelerator_stream_t **stream);
 static int mca_accelerator_ze_check_addr(const void *addr, int *dev_id, uint64_t *flags);
 static int mca_accelerator_ze_create_stream(int dev_id, opal_accelerator_stream_t **stream);
 
@@ -32,10 +33,17 @@ static int mca_accelerator_ze_memcpy_async(int dest_dev_id, int src_dev_id, void
                                   opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type);
 static int mca_accelerator_ze_memcpy(int dest_dev_id, int src_dev_id, void *dest, const void *src,
                             size_t size, opal_accelerator_transfer_type_t type);
+static int mca_accelerator_ze_memmove_async(int dest_dev_id, int src_dev_id, void *dest,
+                                            const void *src, size_t size,
+                                            opal_accelerator_stream_t *stream,
+                                            opal_accelerator_transfer_type_t type);
 static int mca_accelerator_ze_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size,
                                         opal_accelerator_transfer_type_t type);
 static int mca_accelerator_ze_mem_alloc(int dev_id, void **ptr, size_t size);
 static int mca_accelerator_ze_mem_release(int dev_id, void *ptr);
+static int mca_accelerator_ze_mem_alloc_stream(int dev_id, void **ptr, size_t size,
+                                               opal_accelerator_stream_t *stream);
+static int mca_accelerator_ze_mem_release_stream(int dev_id, void *ptr, opal_accelerator_stream_t *stream);
 static int mca_accelerator_ze_get_address_range(int dev_id, const void *ptr, void **base,
                                                   size_t *size);
 
@@ -65,8 +73,15 @@ static int mca_accelerator_ze_get_device_pci_attr(int dev_id, opal_accelerator_p
 
 static int mca_accelerator_ze_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id);
 
+static int mca_accelerator_ze_wait_stream(opal_accelerator_stream_t *stream);
+
+static int mca_accelerator_ze_get_num_devices(int *num_devices);
+
+static int mca_accelerator_ze_get_mem_bw(int device, float *bw);
+
 opal_accelerator_base_module_t opal_accelerator_ze_module =
 {
+    .get_default_stream = mca_accelerator_rocm_get_default_stream,
     .check_addr = mca_accelerator_ze_check_addr,
 
     .create_stream = mca_accelerator_ze_create_stream,
@@ -77,10 +92,13 @@ opal_accelerator_base_module_t opal_accelerator_ze_module =
 
     .mem_copy_async = mca_accelerator_ze_memcpy_async,
     .mem_copy = mca_accelerator_ze_memcpy,
+    .mem_move_async = mca_accelerator_ze_memmove_async,
     .mem_move = mca_accelerator_ze_memmove,
 
     .mem_alloc = mca_accelerator_ze_mem_alloc,
     .mem_release = mca_accelerator_ze_mem_release,
+    .mem_alloc_stream = mca_accelerator_ze_mem_alloc_stream,
+    .mem_release_stream = mca_accelerator_ze_mem_release_stream,
     .get_address_range = mca_accelerator_ze_get_address_range,
 
     .is_ipc_enabled = mca_accelerator_ze_is_ipc_enabled,
@@ -99,7 +117,10 @@ opal_accelerator_base_module_t opal_accelerator_ze_module =
     .get_device_pci_attr = mca_accelerator_ze_get_device_pci_attr,
     .device_can_access_peer = mca_accelerator_ze_device_can_access_peer,
 
-    .get_buffer_id = mca_accelerator_ze_get_buffer_id
+    .get_buffer_id = mca_accelerator_ze_get_buffer_id,
+    .wait_stream = mca_accelerator_ze_wait_stream,
+    .num_devices = mca_accelerator_ze_get_num_devices,
+    .get_mem_bw = mca_accelerator_ze_get_mem_bw
 };
 
 static int accelerator_ze_dev_handle_to_dev_id(ze_device_handle_t hDevice)
@@ -116,6 +137,14 @@ static int accelerator_ze_dev_handle_to_dev_id(ze_device_handle_t hDevice)
     return ret;
 }
 
+static int mca_accelerator_ze_get_default_stream(int dev_id, opal_accelerator_stream_t **stream)
+{
+    /*
+     * TODO
+     */
+    return OPAL_ERR_NOT_IMPLEMENTED;
+}
+
 static int mca_accelerator_ze_check_addr (const void *addr, int *dev_id, uint64_t *flags)
 {
     ze_result_t zret;
@@ -548,6 +577,17 @@ static int mca_accelerator_ze_memmove(int dest_dev_id, int src_dev_id, void *des
     return OPAL_ERR_NOT_IMPLEMENTED;
 }
 
+static int mca_accelerator_ze_memmove_async(int dest_dev_id, int src_dev_id, void *dest,
+                                            const void *src, size_t size,
+                                            opal_accelerator_stream_t *stream,
+                                            opal_accelerator_transfer_type_t type)
+{
+    /*
+     * TODO
+     */
+    return OPAL_ERR_NOT_IMPLEMENTED;
+}
+
 static int mca_accelerator_ze_mem_alloc(int dev_id, void **ptr, size_t size)
 {
    ze_result_t zret;
@@ -603,6 +643,23 @@ static int mca_accelerator_ze_mem_release(int dev_id, void *ptr)
     return OPAL_ERROR;
 }
 
+static int mca_accelerator_ze_mem_alloc_stream(int dev_id, void **ptr, size_t size,
+                                               opal_accelerator_stream_t *stream)
+{
+    /*
+     * TODO
+     */
+    return OPAL_ERR_NOT_IMPLEMENTED;
+}
+
+static int mca_accelerator_ze_mem_release_stream(int dev_id, void *ptr, opal_accelerator_stream_t *stream)
+{
+    /*
+     * TODO
+     */
+    return OPAL_ERR_NOT_IMPLEMENTED;
+}
+
 static int mca_accelerator_ze_get_address_range(int dev_id, const void *ptr, void **base,
 						  size_t *size)
 {
@@ -801,3 +858,27 @@ static int mca_accelerator_ze_get_buffer_id(int dev_id, const void *addr, opal_a
                                    
     return OPAL_SUCCESS;
 }
+
+static int mca_accelerator_ze_wait_stream(opal_accelerator_stream_t *stream)
+{
+    /*
+     * TODO
+     */
+    return OPAL_ERR_NOT_IMPLEMENTED;
+}
+
+static int mca_accelerator_ze_get_num_devices(int *num_devices)
+{
+    /*
+     * TODO
+     */
+    return OPAL_ERR_NOT_IMPLEMENTED;
+}
+
+static int mca_accelerator_ze_get_mem_bw(int device, float *bw)
+{
+    /*
+     * TODO
+     */
+    return OPAL_ERR_NOT_IMPLEMENTED;
+}
\ No newline at end of file