diff --git a/opal/mca/accelerator/accelerator.h b/opal/mca/accelerator/accelerator.h index 0d660725acc..caac15970dc 100644 --- a/opal/mca/accelerator/accelerator.h +++ b/opal/mca/accelerator/accelerator.h @@ -5,6 +5,9 @@ * Copyright (c) Amazon.com, Inc. or its affiliates. * All Rights reserved. * Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights reserved. + * Copyright (c) 2024 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. * * $COPYRIGHT$ * @@ -184,6 +187,19 @@ typedef int (*opal_accelerator_base_module_check_addr_fn_t)( typedef int (*opal_accelerator_base_module_create_stream_fn_t)( int dev_id, opal_accelerator_stream_t **stream); + +/** + * Query the default stream. + * + * @param[IN] dev_id Associated device for the stream or + * MCA_ACCELERATOR_NO_DEVICE_ID + * @param[OUT] stream Set to the default stream. + * + * @return OPAL_SUCCESS or error status on failure + */ +typedef int (*opal_accelerator_base_get_default_stream_fn_t)( + int dev_id, opal_accelerator_stream_t **stream); + /** * Creates an event. An event is a synchronization marker that can be * appended to a stream to monitor device progress or synchronize the @@ -193,7 +209,7 @@ typedef int (*opal_accelerator_base_module_create_stream_fn_t)( * @param[IN] dev_id Associated device for the event or * MCA_ACCELERATOR_NO_DEVICE_ID * @param[OUT] event Event to create - * @param[IN] enable_ipc support inter-process tracking of the event + * @param[IN] enable_ipc support inter-process tracking of the event * * @return OPAL_SUCCESS or error status on failure. */ @@ -310,6 +326,31 @@ typedef int (*opal_accelerator_base_module_memmove_fn_t)( int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); + +/** + * Copies memory asynchronously from src to dest. Memory of dest and src + * may overlap. Optionally can specify the transfer type to + * avoid pointer detection for performance. The operations will be enqueued + * into the provided stream but are not guaranteed to be complete upon return. + * + * @param[IN] dest_dev_id Associated device to copy to or + * MCA_ACCELERATOR_NO_DEVICE_ID + * @param[IN] src_dev_id Associated device to copy from or + * MCA_ACCELERATOR_NO_DEVICE_ID + * @param[IN] dest Destination to copy memory to + * @param[IN] src Source to copy memory from + * @param[IN] size Size of memory to copy + * @param[IN] stream Stream to perform asynchronous move on + * @param[IN] type Transfer type field for performance + * Can be set to MCA_ACCELERATOR_TRANSFER_UNSPEC + * if caller is unsure of the transfer direction. + * + * @return OPAL_SUCCESS or error status on failure + */ +typedef int (*opal_accelerator_base_module_memmove_async_fn_t)( + int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, + opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type); + /** * Allocates size bytes memory from the device and sets ptr to the * pointer of the allocated memory. The memory is not initialized. @@ -340,6 +381,46 @@ typedef int (*opal_accelerator_base_module_mem_alloc_fn_t)( typedef int (*opal_accelerator_base_module_mem_release_fn_t)( int dev_id, void *ptr); + +/** + * Allocates size bytes memory from the device and sets ptr to the + * pointer of the allocated memory. The memory is not initialized. + * The allocation request is placed into the stream object. + * Any use of the memory must succeed the completion of this + * operation on the stream. + * + * @param[IN] dev_id Associated device for the allocation or + * MCA_ACCELERATOR_NO_DEVICE_ID + * @param[OUT] ptr Returns pointer to allocated memory + * @param[IN] size Size of memory to allocate + * @param[IN] stream Stream into which to insert the allocation request + * + * @return OPAL_SUCCESS or error status on failure + */ +typedef int (*opal_accelerator_base_module_mem_alloc_stream_fn_t)( + int dev_id, void **ptr, size_t size, opal_accelerator_stream_t *stream); + +/** + * Frees the memory space pointed to by ptr which has been returned by + * a previous call to an opal_accelerator_base_module_mem_alloc_stream_fn_t(). + * If the function is called on a ptr that has already been freed, + * undefined behavior occurs. If ptr is NULL, no operation is performed, + * and the function returns OPAL_SUCCESS. + * The release of the memory will be inserted into the stream and occurs after + * all previous operations have completed. + * + * @param[IN] dev_id Associated device for the allocation or + * MCA_ACCELERATOR_NO_DEVICE_ID + * @param[IN] ptr Pointer to free + * @param[IN] stream Stream into which to insert the free operation + * + * @return OPAL_SUCCESS or error status on failure + */ +typedef int (*opal_accelerator_base_module_mem_release_stream_fn_t)( + int dev_id, void *ptr, opal_accelerator_stream_t *stream); + + + /** * Retrieves the base address and/or size of a memory allocation of the * device. @@ -557,11 +638,41 @@ typedef int (*opal_accelerator_base_module_device_can_access_peer_fn_t)( typedef int (*opal_accelerator_base_module_get_buffer_id_fn_t)( int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id); +/** + * Wait for the completion of all operations inserted into the stream. + * + * @param[IN] stram The stream to wait for. + * + * @return OPAL_SUCCESS or error status on failure + */ +typedef int (*opal_accelerator_base_module_wait_stream_fn_t)(opal_accelerator_stream_t *stream); + +/** + * Get the number of devices available. + * + * @param[OUT] stram Number of devices. + * + * @return OPAL_SUCCESS or error status on failure + */ +typedef int (*opal_accelerator_base_module_get_num_devices_fn_t)(int *num_devices); + +/** + * Get the memory bandwidth of the device. + * + * @param[IN] device The device to query. + * @param[OUT] bw The returned bandwidth for the device. + * + * @return OPAL_SUCCESS or error status on failure + */ +typedef int (*opal_accelerator_base_module_get_mem_bw_fn_t)(int device, float *bw); + + /* * the standard public API data structure */ typedef struct { /* accelerator function table */ + opal_accelerator_base_get_default_stream_fn_t get_default_stream; opal_accelerator_base_module_check_addr_fn_t check_addr; opal_accelerator_base_module_create_stream_fn_t create_stream; @@ -572,10 +683,13 @@ typedef struct { opal_accelerator_base_module_memcpy_async_fn_t mem_copy_async; opal_accelerator_base_module_memcpy_fn_t mem_copy; + opal_accelerator_base_module_memmove_async_fn_t mem_move_async; opal_accelerator_base_module_memmove_fn_t mem_move; opal_accelerator_base_module_mem_alloc_fn_t mem_alloc; opal_accelerator_base_module_mem_release_fn_t mem_release; + opal_accelerator_base_module_mem_alloc_stream_fn_t mem_alloc_stream; + opal_accelerator_base_module_mem_release_stream_fn_t mem_release_stream; opal_accelerator_base_module_get_address_range_fn_t get_address_range; opal_accelerator_base_module_is_ipc_enabled_fn_t is_ipc_enabled; @@ -595,6 +709,10 @@ typedef struct { opal_accelerator_base_module_device_can_access_peer_fn_t device_can_access_peer; opal_accelerator_base_module_get_buffer_id_fn_t get_buffer_id; + + opal_accelerator_base_module_wait_stream_fn_t wait_stream; + opal_accelerator_base_module_get_num_devices_fn_t num_devices; + opal_accelerator_base_module_get_mem_bw_fn_t get_mem_bw; } opal_accelerator_base_module_t; /** diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.c b/opal/mca/accelerator/cuda/accelerator_cuda.c index edabb864c3d..91d9cac4f59 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda.c @@ -6,6 +6,9 @@ * All rights reserved. * Copyright (c) Amazon.com, Inc. or its affiliates. * All Rights reserved. + * Copyright (c) 2024 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -23,6 +26,7 @@ #include "opal/util/show_help.h" #include "opal/util/proc.h" /* Accelerator API's */ +static int accelerator_cuda_get_default_stream(int dev_id, opal_accelerator_stream_t **stream); static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *flags); static int accelerator_cuda_create_stream(int dev_id, opal_accelerator_stream_t **stream); @@ -35,10 +39,16 @@ static int accelerator_cuda_memcpy_async(int dest_dev_id, int src_dev_id, void * opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type); static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); +static int accelerator_cuda_memmove_async(int dest_dev_id, int src_dev_id, void *dest, const void *src, + size_t size, opal_accelerator_stream_t *stream, + opal_accelerator_transfer_type_t type); static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); static int accelerator_cuda_mem_alloc(int dev_id, void **ptr, size_t size); static int accelerator_cuda_mem_release(int dev_id, void *ptr); +static int accelerator_cuda_mem_alloc_stream(int dev_id, void **ptr, size_t size, + opal_accelerator_stream_t *stream); +static int accelerator_cuda_mem_release_stream(int dev_id, void *ptr, opal_accelerator_stream_t *stream); static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void **base, size_t *size); @@ -67,10 +77,17 @@ static int accelerator_cuda_device_can_access_peer( int *access, int dev1, int d static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id); +static int accelerator_cuda_wait_stream(opal_accelerator_stream_t *stream); +static int accelerator_cuda_get_num_devices(int *num_devices); +static int accelerator_cuda_get_mem_bw(int device, float *bw); + + #define GET_STREAM(_stream) (_stream == MCA_ACCELERATOR_STREAM_DEFAULT ? 0 : *((CUstream *)_stream->stream)) opal_accelerator_base_module_t opal_accelerator_cuda_module = { + accelerator_cuda_get_default_stream, + accelerator_cuda_check_addr, accelerator_cuda_create_stream, @@ -82,9 +99,12 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module = accelerator_cuda_memcpy_async, accelerator_cuda_memcpy, + accelerator_cuda_memmove_async, accelerator_cuda_memmove, accelerator_cuda_mem_alloc, accelerator_cuda_mem_release, + accelerator_cuda_mem_alloc_stream, + accelerator_cuda_mem_release_stream, accelerator_cuda_get_address_range, accelerator_cuda_is_ipc_enabled, @@ -103,9 +123,31 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module = accelerator_cuda_get_device_pci_attr, accelerator_cuda_device_can_access_peer, - accelerator_cuda_get_buffer_id + accelerator_cuda_get_buffer_id, + + accelerator_cuda_wait_stream, + accelerator_cuda_get_num_devices, + accelerator_cuda_get_mem_bw }; +static int accelerator_cuda_get_device_id(CUcontext mem_ctx) { + /* query the device from the context */ + int dev_id = -1; + CUdevice ptr_dev; + cuCtxPushCurrent(mem_ctx); + cuCtxGetDevice(&ptr_dev); + for (int i = 0; i < opal_accelerator_cuda_num_devices; ++i) { + CUdevice dev; + cuDeviceGet(&dev, i); + if (dev == ptr_dev) { + dev_id = i; + break; + } + } + cuCtxPopCurrent(&mem_ctx); + return dev_id; +} + static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *flags) { CUresult result; @@ -154,6 +196,9 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t * } else if (0 == mem_type) { /* This can happen when CUDA is initialized but dbuf is not valid CUDA pointer */ return 0; + } else { + /* query the device from the context */ + *dev_id = accelerator_cuda_get_device_id(mem_ctx); } /* Must be a device pointer */ assert(CU_MEMORYTYPE_DEVICE == mem_type); @@ -169,6 +214,10 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t * } else if (CU_MEMORYTYPE_HOST == mem_type) { /* Host memory, nothing to do here */ return 0; + } else { + result = cuPointerGetAttribute(&mem_ctx, CU_POINTER_ATTRIBUTE_CONTEXT, dbuf); + /* query the device from the context */ + *dev_id = accelerator_cuda_get_device_id(mem_ctx); } /* Must be a device pointer */ assert(CU_MEMORYTYPE_DEVICE == mem_type); @@ -216,7 +265,7 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t * } } - /* WORKAROUND - They are times when the above code determines a pice of memory + /* WORKAROUND - There are times when the above code determines a pice of memory * is GPU memory, but it actually is not. That has been seen on multi-GPU systems * with 6 or 8 GPUs on them. Therefore, we will do this extra check. Note if we * made it this far, then the assumption at this point is we have GPU memory. @@ -240,6 +289,17 @@ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t * return 1; } +static int accelerator_cuda_get_default_stream(int dev_id, opal_accelerator_stream_t **stream) +{ + int delayed_init = opal_accelerator_cuda_delayed_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + *stream = &opal_accelerator_cuda_default_stream.base; + return OPAL_SUCCESS; +} + + static int accelerator_cuda_create_stream(int dev_id, opal_accelerator_stream_t **stream) { CUresult result; @@ -435,34 +495,23 @@ static int accelerator_cuda_memcpy(int dest_dev_id, int src_dev_id, void *dest, return OPAL_SUCCESS; } - /* Async copy then synchronize is the default behavior as some applications - * cannot utilize synchronous copies. In addition, host memory does not need - * to be page-locked if an Async memory copy is done (It just makes it synchronous - * which is what we want anyway): - * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#concurrent-execution-host-device - * Additionally, cuMemcpy is not necessarily always synchronous. See: - * https://docs.nvidia.com/cuda/cuda-driver-api/api-sync-behavior.html - * TODO: Add optimizations for type field */ - result = cuMemcpyAsync((CUdeviceptr) dest, (CUdeviceptr) src, size, opal_accelerator_cuda_memcpy_stream); + result = cuMemcpy((CUdeviceptr) dest, (CUdeviceptr) src, size); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-accelerator-cuda.txt", "cuMemcpyAsync failed", true, dest, src, - size, result); - return OPAL_ERROR; - } - result = cuStreamSynchronize(opal_accelerator_cuda_memcpy_stream); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-accelerator-cuda.txt", "cuStreamSynchronize failed", true, + opal_show_help("help-accelerator-cuda.txt", "cuMemcpy failed", true, OPAL_PROC_MY_HOSTNAME, result); return OPAL_ERROR; } return OPAL_SUCCESS; } -static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, - opal_accelerator_transfer_type_t type) +static int accelerator_cuda_memmove_async(int dest_dev_id, int src_dev_id, void *dest, + const void *src, size_t size, + opal_accelerator_stream_t *stream, + opal_accelerator_transfer_type_t type) { CUdeviceptr tmp; CUresult result; + void *ptr; int delayed_init = opal_accelerator_cuda_delayed_init(); if (OPAL_UNLIKELY(0 != delayed_init)) { @@ -473,29 +522,41 @@ static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest, return OPAL_ERR_BAD_PARAM; } - result = cuMemAlloc(&tmp, size); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + result = accelerator_cuda_mem_alloc_stream(src_dev_id, &ptr, size, stream); + if (OPAL_UNLIKELY(OPAL_SUCCESS != result)) { return OPAL_ERROR; } - result = cuMemcpyAsync(tmp, (CUdeviceptr) src, size, opal_accelerator_cuda_memcpy_stream); + tmp = (CUdeviceptr)ptr; + result = cuMemcpyAsync(tmp, (CUdeviceptr) src, size, *(CUstream*)stream->stream); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuMemcpyAsync failed", true, tmp, src, size, result); return OPAL_ERROR; } - result = cuMemcpyAsync((CUdeviceptr) dest, tmp, size, opal_accelerator_cuda_memcpy_stream); + result = cuMemcpyAsync((CUdeviceptr) dest, tmp, size, *(CUstream*)stream->stream); if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuMemcpyAsync failed", true, dest, tmp, size, result); return OPAL_ERROR; } - result = cuStreamSynchronize(opal_accelerator_cuda_memcpy_stream); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + return accelerator_cuda_mem_release_stream(src_dev_id, ptr, stream); +} + +static int accelerator_cuda_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, + opal_accelerator_transfer_type_t type) +{ + int ret; + + ret = accelerator_cuda_memmove_async(dest_dev_id, src_dev_id, dest, src, size, &opal_accelerator_cuda_memcpy_stream.base, type); + if (OPAL_SUCCESS != ret) { + return OPAL_ERROR; + } + ret = accelerator_cuda_wait_stream(&opal_accelerator_cuda_memcpy_stream.base); + if (OPAL_UNLIKELY(OPAL_SUCCESS != result)) { opal_show_help("help-accelerator-cuda.txt", "cuStreamSynchronize failed", true, OPAL_PROC_MY_HOSTNAME, result); return OPAL_ERROR; } - cuMemFree(tmp); return OPAL_SUCCESS; } @@ -512,15 +573,35 @@ static int accelerator_cuda_mem_alloc(int dev_id, void **ptr, size_t size) return OPAL_ERR_BAD_PARAM; } - if (size > 0) { - result = cuMemAlloc((CUdeviceptr *) ptr, size); - if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { - opal_show_help("help-accelerator-cuda.txt", "cuMemAlloc failed", true, - OPAL_PROC_MY_HOSTNAME, result); - return OPAL_ERROR; - } + result = cuMemAlloc((CUdeviceptr *) ptr, size); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_show_help("help-accelerator-cuda.txt", "cuMemAlloc failed", true, + OPAL_PROC_MY_HOSTNAME, result); + return OPAL_ERROR; } - return 0; + return OPAL_SUCCESS; +} + + + +static int accelerator_cuda_mem_alloc_stream(int dev_id, void **addr, size_t size, + opal_accelerator_stream_t *stream) +{ + + int delayed_init = opal_accelerator_cuda_delayed_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + + /* fall-back to regular stream allocation */ + + CUresult result = cuMemAllocAsync((CUdeviceptr*)addr, size, *(CUstream*)stream->stream); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_show_help("help-accelerator-cuda.txt", "cuMemAlloc failed", true, + OPAL_PROC_MY_HOSTNAME, result); + return OPAL_ERROR; + } + return OPAL_SUCCESS; } static int accelerator_cuda_mem_release(int dev_id, void *ptr) @@ -537,6 +618,38 @@ static int accelerator_cuda_mem_release(int dev_id, void *ptr) return 0; } +static int accelerator_cuda_mem_release_stream(int dev_id, void *addr, + opal_accelerator_stream_t *stream) +{ + CUresult result; + + if (NULL == stream || NULL == addr) { + return OPAL_ERR_BAD_PARAM; + } + + result = cuMemFreeAsync((CUdeviceptr)addr, *(CUstream*)stream->stream); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_show_help("help-accelerator-cuda.txt", "cuMemFree failed", true, + OPAL_PROC_MY_HOSTNAME, result); + return OPAL_ERROR; + } + return OPAL_SUCCESS; +} + + +static int accelerator_cuda_wait_stream(opal_accelerator_stream_t *stream) +{ + CUresult result; + result = cuStreamSynchronize(*(CUstream*)stream->stream); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_show_help("help-accelerator-cuda.txt", "cuStreamSynchronize failed", true, + OPAL_PROC_MY_HOSTNAME, result); + return OPAL_ERROR; + } + return OPAL_SUCCESS; +} + + static int accelerator_cuda_get_address_range(int dev_id, const void *ptr, void **base, size_t *size) { @@ -764,3 +877,29 @@ static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_acc } return OPAL_SUCCESS; } + + + +static int accelerator_cuda_get_num_devices(int *num_devices) +{ + + int delayed_init = opal_accelerator_cuda_delayed_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + + *num_devices = opal_accelerator_cuda_num_devices; + return OPAL_SUCCESS; +} + +static int accelerator_cuda_get_mem_bw(int device, float *bw) +{ + int delayed_init = opal_accelerator_cuda_delayed_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + assert(opal_accelerator_cuda_mem_bw != NULL); + + *bw = opal_accelerator_cuda_mem_bw[device]; + return OPAL_SUCCESS; +} \ No newline at end of file diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.h b/opal/mca/accelerator/cuda/accelerator_cuda.h index 694a4192231..b00403302b8 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda.h +++ b/opal/mca/accelerator/cuda/accelerator_cuda.h @@ -2,6 +2,9 @@ * Copyright (c) 2014 Intel, Inc. All rights reserved. * Copyright (c) 2017-2022 Amazon.com, Inc. or its affiliates. * All Rights reserved. + * Copyright (c) 2024 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -38,13 +41,19 @@ typedef struct opal_accelerator_cuda_event_t opal_accelerator_cuda_event_t; OBJ_CLASS_DECLARATION(opal_accelerator_cuda_event_t); /* Declare extern variables, defined in accelerator_cuda_component.c */ -OPAL_DECLSPEC extern CUstream opal_accelerator_cuda_memcpy_stream; +OPAL_DECLSPEC extern opal_accelerator_cuda_stream_t opal_accelerator_cuda_memcpy_stream; +OPAL_DECLSPEC extern CUstream opal_accelerator_cuda_alloc_stream; +OPAL_DECLSPEC extern opal_accelerator_cuda_stream_t opal_accelerator_cuda_default_stream; OPAL_DECLSPEC extern opal_mutex_t opal_accelerator_cuda_stream_lock; OPAL_DECLSPEC extern opal_accelerator_cuda_component_t mca_accelerator_cuda_component; OPAL_DECLSPEC extern opal_accelerator_base_module_t opal_accelerator_cuda_module; +OPAL_DECLSPEC extern int opal_accelerator_cuda_num_devices; + +OPAL_DECLSPEC extern float *opal_accelerator_cuda_mem_bw; + OPAL_DECLSPEC extern int opal_accelerator_cuda_delayed_init(void); END_C_DECLS diff --git a/opal/mca/accelerator/cuda/accelerator_cuda_component.c b/opal/mca/accelerator/cuda/accelerator_cuda_component.c index d48e29c9f65..0076a19e8cc 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda_component.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda_component.c @@ -6,6 +6,9 @@ * reserved. * Copyright (c) 2017-2022 Amazon.com, Inc. or its affiliates. * All Rights reserved. + * Copyright (c) 2024 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -34,13 +37,18 @@ #include "opal/sys/atomic.h" /* Define global variables, used in accelerator_cuda.c */ -CUstream opal_accelerator_cuda_memcpy_stream = NULL; +opal_accelerator_cuda_stream_t opal_accelerator_cuda_memcpy_stream = {0}; +CUstream opal_accelerator_cuda_alloc_stream = NULL; +opal_accelerator_cuda_stream_t opal_accelerator_cuda_default_stream = {0}; opal_mutex_t opal_accelerator_cuda_stream_lock = {0}; +int opal_accelerator_cuda_num_devices = 0; /* Initialization lock for delayed cuda initialization */ static opal_mutex_t accelerator_cuda_init_lock; static bool accelerator_cuda_init_complete = false; +float *opal_accelerator_cuda_mem_bw = NULL; + #define STRINGIFY2(x) #x #define STRINGIFY(x) STRINGIFY2(x) @@ -122,6 +130,7 @@ static int accelerator_cuda_component_register(void) int opal_accelerator_cuda_delayed_init() { int result = OPAL_SUCCESS; + int prio_lo, prio_hi; CUcontext cuContext; /* Double checked locking to avoid having to @@ -137,6 +146,8 @@ int opal_accelerator_cuda_delayed_init() goto out; } + cuDeviceGetCount(&opal_accelerator_cuda_num_devices); + /* Check to see if this process is running in a CUDA context. If * so, all is good. If not, then disable registration of memory. */ result = cuCtxGetCurrent(&cuContext); @@ -145,31 +156,115 @@ int opal_accelerator_cuda_delayed_init() goto out; } else if ((CUDA_SUCCESS == result) && (NULL == cuContext)) { opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent returned NULL context"); - result = OPAL_ERROR; - goto out; + + /* create a context for each device */ + for (int i = 0; i < opal_accelerator_cuda_num_devices; ++i) { + CUdevice dev; + result = cuDeviceGet(&dev, i); + if (CUDA_SUCCESS != result) { + opal_output_verbose(20, opal_accelerator_base_framework.framework_output, + "CUDA: cuDeviceGet failed"); + result = OPAL_ERROR; + goto out; + } + result = cuDevicePrimaryCtxRetain(&cuContext, dev); + if (CUDA_SUCCESS != result) { + opal_output_verbose(20, opal_accelerator_base_framework.framework_output, + "CUDA: cuDevicePrimaryCtxRetain failed"); + result = OPAL_ERROR; + goto out; + } + if (0 == i) { + result = cuCtxPushCurrent(cuContext); + if (CUDA_SUCCESS != result) { + opal_output_verbose(20, opal_accelerator_base_framework.framework_output, + "CUDA: cuCtxPushCurrent failed"); + result = OPAL_ERROR; + goto out; + } + } + } + + } else { opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuCtxGetCurrent succeeded"); } /* Create stream for use in cuMemcpyAsync synchronous copies */ - result = cuStreamCreate(&opal_accelerator_cuda_memcpy_stream, 0); + CUstream memcpy_stream; + result = cuStreamCreate(&memcpy_stream, 0); + if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) { + opal_show_help("help-accelerator-cuda.txt", "cuStreamCreate failed", true, + OPAL_PROC_MY_HOSTNAME, result); + goto out; + } + OBJ_CONSTRUCT(&opal_accelerator_cuda_memcpy_stream, opal_accelerator_cuda_stream_t); + opal_accelerator_cuda_memcpy_stream.base.stream = malloc(sizeof(CUstream)); + *(CUstream*)opal_accelerator_cuda_memcpy_stream.base.stream = memcpy_stream; + + /* Create stream for use in cuMemAllocAsync synchronous copies */ + result = cuStreamCreate(&opal_accelerator_cuda_alloc_stream, 0); if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) { opal_show_help("help-accelerator-cuda.txt", "cuStreamCreate failed", true, OPAL_PROC_MY_HOSTNAME, result); goto out; } + /* Create a default stream to be used by various components. + * We try to create a high-priority stream and fall back to a regular stream. + */ + CUstream *default_stream = malloc(sizeof(CUstream)); + result = cuCtxGetStreamPriorityRange(&prio_lo, &prio_hi); + if (CUDA_SUCCESS != result) { + result = cuStreamCreateWithPriority(default_stream, + CU_STREAM_NON_BLOCKING, prio_hi); + } else { + result = cuStreamCreate(default_stream, 0); + } + if (OPAL_UNLIKELY(result != CUDA_SUCCESS)) { + opal_show_help("help-accelerator-cuda.txt", "cuStreamCreate failed", true, + OPAL_PROC_MY_HOSTNAME, result); + goto out; + } + OBJ_CONSTRUCT(&opal_accelerator_cuda_default_stream, opal_accelerator_cuda_stream_t); + opal_accelerator_cuda_default_stream.base.stream = default_stream; + result = cuMemHostRegister(&checkmem, sizeof(int), 0); if (result != CUDA_SUCCESS) { /* If registering the memory fails, print a message and continue. * This is not a fatal error. */ opal_show_help("help-accelerator-cuda.txt", "cuMemHostRegister during init failed", true, &checkmem, sizeof(int), OPAL_PROC_MY_HOSTNAME, result, "checkmem"); - } else { opal_output_verbose(20, opal_accelerator_base_framework.framework_output, "CUDA: cuMemHostRegister OK on test region"); } + + /* determine the memory bandwidth */ + opal_accelerator_cuda_mem_bw = malloc(sizeof(float)*opal_accelerator_cuda_num_devices); + for (int i = 0; i < opal_accelerator_cuda_num_devices; ++i) { + CUdevice dev; + result = cuDeviceGet(&dev, i); + if (CUDA_SUCCESS != result) { + opal_output_verbose(20, opal_accelerator_base_framework.framework_output, + "CUDA: cuDeviceGet failed"); + goto out; + } + int mem_clock_rate; // kHz + result = cuDeviceGetAttribute(&mem_clock_rate, + CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, + dev); + int bus_width; // bit + result = cuDeviceGetAttribute(&bus_width, + CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, + dev); + /* bw = clock_rate * bus width * 2bit multiplier + * See https://forums.developer.nvidia.com/t/memory-clock-rate/107940 + */ + float bw = ((float)mem_clock_rate*(float)bus_width*2.0) / 1024 / 1024 / 8; + opal_accelerator_cuda_mem_bw[i] = bw; + } + result = OPAL_SUCCESS; opal_atomic_wmb(); accelerator_cuda_init_complete = true; @@ -182,6 +277,9 @@ static opal_accelerator_base_module_t* accelerator_cuda_init(void) { OBJ_CONSTRUCT(&opal_accelerator_cuda_stream_lock, opal_mutex_t); OBJ_CONSTRUCT(&accelerator_cuda_init_lock, opal_mutex_t); + OBJ_CONSTRUCT(&opal_accelerator_cuda_default_stream, opal_accelerator_stream_t); + OBJ_CONSTRUCT(&opal_accelerator_cuda_memcpy_stream, opal_accelerator_stream_t); + /* First check if the support is enabled. In the case that the user has * turned it off, we do not need to continue with any CUDA specific * initialization. Do this after MCA parameter registration. */ @@ -205,9 +303,19 @@ static void accelerator_cuda_finalize(opal_accelerator_base_module_t* module) if (CUDA_SUCCESS != result) { ctx_ok = 0; } - if ((NULL != opal_accelerator_cuda_memcpy_stream) && ctx_ok) { - cuStreamDestroy(opal_accelerator_cuda_memcpy_stream); + + if ((NULL != opal_accelerator_cuda_memcpy_stream.base.stream) && ctx_ok) { + OBJ_DESTRUCT(&opal_accelerator_cuda_memcpy_stream); } + if ((NULL != opal_accelerator_cuda_alloc_stream) && ctx_ok) { + cuStreamDestroy(opal_accelerator_cuda_alloc_stream); + } + if ((NULL != opal_accelerator_cuda_default_stream.base.stream) && ctx_ok) { + OBJ_DESTRUCT(&opal_accelerator_cuda_default_stream); + } + + free(opal_accelerator_cuda_mem_bw); + opal_accelerator_cuda_mem_bw = NULL; OBJ_DESTRUCT(&opal_accelerator_cuda_stream_lock); OBJ_DESTRUCT(&accelerator_cuda_init_lock); diff --git a/opal/mca/accelerator/null/accelerator_null_component.c b/opal/mca/accelerator/null/accelerator_null_component.c index 1bd6e0e2811..1d76790c149 100644 --- a/opal/mca/accelerator/null/accelerator_null_component.c +++ b/opal/mca/accelerator/null/accelerator_null_component.c @@ -9,6 +9,9 @@ * Copyright (c) Amazon.com, Inc. or its affiliates. * All Rights reserved. * Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2024 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -28,6 +31,8 @@ const char *opal_accelerator_null_component_version_string = "OPAL null accelerator MCA component version " OPAL_VERSION; +static opal_accelerator_stream_t default_stream; + /* * Component API functions */ @@ -40,6 +45,7 @@ static void accelerator_null_finalize(opal_accelerator_base_module_t* module); /* Accelerator API's */ static int accelerator_null_check_addr(const void *addr, int *dev_id, uint64_t *flags); +static int accelerator_null_get_default_stream(int dev_id, opal_accelerator_stream_t **stream); static int accelerator_null_create_stream(int dev_id, opal_accelerator_stream_t **stream); static int accelerator_null_create_event(int dev_id, opal_accelerator_event_t **event, bool enable_ipc); static int accelerator_null_record_event(int dev_id, opal_accelerator_event_t *event, opal_accelerator_stream_t *stream); @@ -50,11 +56,15 @@ static int accelerator_null_memcpy_async(int dest_dev_id, int src_dev_id, void * opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type); static int accelerator_null_memcpy(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); +static int accelerator_null_memmove_async(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, + opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type); static int accelerator_null_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); static int accelerator_null_mem_alloc(int dev_id, void **ptr, size_t size); static int accelerator_null_mem_release(int dev_id, void *ptr); +static int accelerator_null_mem_alloc_stream(int dev_id, void **ptr, size_t size, opal_accelerator_stream_t* stream); +static int accelerator_null_mem_release_stream(int dev_id, void *ptr, opal_accelerator_stream_t *stream); static int accelerator_null_get_address_range(int dev_id, const void *ptr, void **base, size_t *size); static bool accelerator_null_is_ipc_enabled(void); @@ -82,6 +92,12 @@ static int accelerator_null_device_can_access_peer(int *access, int dev1, int de static int accelerator_null_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id); +static int accelerator_null_wait_stream(opal_accelerator_stream_t *stream); + +static int accelerator_null_get_num_devices(int *num_devices); + +static int accelerator_null_get_mem_bw(int device, float *bw); + /* * Instantiate the public struct with all of our public information * and pointers to our public functions in it @@ -122,6 +138,8 @@ opal_accelerator_null_component_t mca_accelerator_null_component = {{ opal_accelerator_base_module_t opal_accelerator_null_module = { + accelerator_null_get_default_stream, + accelerator_null_check_addr, accelerator_null_create_stream, @@ -133,9 +151,12 @@ opal_accelerator_base_module_t opal_accelerator_null_module = accelerator_null_memcpy_async, accelerator_null_memcpy, + accelerator_null_memmove_async, accelerator_null_memmove, accelerator_null_mem_alloc, accelerator_null_mem_release, + accelerator_null_mem_alloc_stream, + accelerator_null_mem_release_stream, accelerator_null_get_address_range, accelerator_null_is_ipc_enabled, @@ -154,7 +175,11 @@ opal_accelerator_base_module_t opal_accelerator_null_module = accelerator_null_get_device_pci_attr, accelerator_null_device_can_access_peer, - accelerator_null_get_buffer_id + accelerator_null_get_buffer_id, + + accelerator_null_wait_stream, + accelerator_null_get_num_devices, + accelerator_null_get_mem_bw }; static int accelerator_null_open(void) @@ -189,6 +214,12 @@ static int accelerator_null_check_addr(const void *addr, int *dev_id, uint64_t * return 0; } +static int accelerator_null_get_default_stream(int dev_id, opal_accelerator_stream_t **stream) +{ + *stream = &default_stream; + return OPAL_SUCCESS; +} + static int accelerator_null_create_stream(int dev_id, opal_accelerator_stream_t **stream) { *stream = OBJ_NEW(opal_accelerator_stream_t); @@ -237,6 +268,13 @@ static int accelerator_null_memmove(int dest_dev_id, int src_dev_id, void *dest, return OPAL_SUCCESS; } +static int accelerator_null_memmove_async(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, + opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type) +{ + memmove(dest, src, size); + return OPAL_SUCCESS; +} + static int accelerator_null_mem_alloc(int dev_id, void **ptr, size_t size) { *ptr = malloc(size); @@ -249,6 +287,23 @@ static int accelerator_null_mem_release(int dev_id, void *ptr) return OPAL_SUCCESS; } + +static int accelerator_null_mem_alloc_stream(int dev_id, void **ptr, size_t size, + opal_accelerator_stream_t *stream) +{ + (void)stream; + *ptr = malloc(size); + return OPAL_SUCCESS; +} + +static int accelerator_null_mem_release_stream(int dev_id, void *ptr, + opal_accelerator_stream_t *stream) +{ + (void)stream; + free(ptr); + return OPAL_SUCCESS; +} + static int accelerator_null_get_address_range(int dev_id, const void *ptr, void **base, size_t *size) { @@ -331,3 +386,21 @@ static int accelerator_null_get_buffer_id(int dev_id, const void *addr, opal_acc { return OPAL_ERR_NOT_IMPLEMENTED; } + +static int accelerator_null_wait_stream(opal_accelerator_stream_t *stream) +{ + return OPAL_SUCCESS; +} + +static int accelerator_null_get_num_devices(int *num_devices) +{ + *num_devices = 0; + return OPAL_SUCCESS; +} + + +static int accelerator_null_get_mem_bw(int device, float *bw) +{ + *bw = 1.0; // return something that is not 0 + return OPAL_SUCCESS; +} diff --git a/opal/mca/accelerator/rocm/accelerator_rocm.h b/opal/mca/accelerator/rocm/accelerator_rocm.h index 38409778ad4..510702cdb4e 100644 --- a/opal/mca/accelerator/rocm/accelerator_rocm.h +++ b/opal/mca/accelerator/rocm/accelerator_rocm.h @@ -1,5 +1,8 @@ /* * Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2024 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. * * $COPYRIGHT$ * @@ -67,12 +70,18 @@ struct opal_accelerator_rocm_ipc_event_handle_t { typedef struct opal_accelerator_rocm_ipc_event_handle_t opal_accelerator_rocm_ipc_event_handle_t; OBJ_CLASS_DECLARATION(opal_accelerator_rocm_ipc_event_handle_t); -OPAL_DECLSPEC extern hipStream_t opal_accelerator_rocm_MemcpyStream; +OPAL_DECLSPEC extern hipStream_t *opal_accelerator_rocm_MemcpyStream; OPAL_DECLSPEC extern int opal_accelerator_rocm_memcpy_async; OPAL_DECLSPEC extern int opal_accelerator_rocm_verbose; OPAL_DECLSPEC extern size_t opal_accelerator_rocm_memcpyH2D_limit; OPAL_DECLSPEC extern size_t opal_accelerator_rocm_memcpyD2H_limit; +OPAL_DECLSPEC extern hipStream_t opal_accelerator_alloc_stream; +OPAL_DECLSPEC extern opal_accelerator_rocm_stream_t opal_accelerator_rocm_default_stream; +OPAL_DECLSPEC extern opal_mutex_t opal_accelerator_rocm_stream_lock; +OPAL_DECLSPEC extern int opal_accelerator_rocm_num_devices; +OPAL_DECLSPEC extern float *opal_accelerator_rocm_mem_bw; + OPAL_DECLSPEC extern int opal_accelerator_rocm_lazy_init(void); #endif diff --git a/opal/mca/accelerator/rocm/accelerator_rocm_component.c b/opal/mca/accelerator/rocm/accelerator_rocm_component.c index 8f1bbbb53a5..28f2d7506c4 100644 --- a/opal/mca/accelerator/rocm/accelerator_rocm_component.c +++ b/opal/mca/accelerator/rocm/accelerator_rocm_component.c @@ -19,7 +19,9 @@ #include #include "opal/mca/dl/base/base.h" +#include "opal/mca/accelerator/base/base.h" #include "opal/runtime/opal_params.h" +#include "opal/util/proc.h" #include "accelerator_rocm.h" int opal_accelerator_rocm_memcpy_async = 0; @@ -31,7 +33,13 @@ size_t opal_accelerator_rocm_memcpyH2D_limit=1048576; static opal_mutex_t accelerator_rocm_init_lock; static bool accelerator_rocm_init_complete = false; -hipStream_t opal_accelerator_rocm_MemcpyStream = NULL; +/* Define global variables, used in accelerator_rocm.c */ +hipStream_t opal_accelerator_rocm_alloc_stream = NULL; +opal_accelerator_rocm_stream_t opal_accelerator_rocm_default_stream = {0}; +opal_mutex_t opal_accelerator_rocm_stream_lock = {0}; +int opal_accelerator_rocm_num_devices = 0; +float *opal_accelerator_rocm_mem_bw = NULL; +hipStream_t *opal_accelerator_rocm_MemcpyStream = NULL; /* * Public string showing the accelerator rocm component version number @@ -159,6 +167,7 @@ static int accelerator_rocm_component_register(void) int opal_accelerator_rocm_lazy_init() { + int prio_hi, prio_lo; int err = OPAL_SUCCESS; /* Double checked locking to avoid having to @@ -174,13 +183,62 @@ int opal_accelerator_rocm_lazy_init() goto out; } - err = hipStreamCreate(&opal_accelerator_rocm_MemcpyStream); + hipGetDeviceCount(&opal_accelerator_rocm_num_devices); + + /* Create stream for use in cuMemcpyAsync synchronous copies */ + hipStream_t memcpy_stream; + err = hipStreamCreate(&memcpy_stream); + if (OPAL_UNLIKELY(err != hipSuccess)) { + opal_show_help("help-accelerator-rocm.txt", "hipStreamCreateWithFlags failed", true, + OPAL_PROC_MY_HOSTNAME, err); + goto out; + } + opal_accelerator_rocm_MemcpyStream = malloc(sizeof(hipStream_t)); + *(hipStream_t*)opal_accelerator_rocm_MemcpyStream = memcpy_stream; + + /* Create stream for use in cuMemcpyAsync synchronous copies */ + err = hipStreamCreateWithFlags(&opal_accelerator_rocm_alloc_stream, 0); + if (OPAL_UNLIKELY(err != hipSuccess)) { + opal_show_help("help-accelerator-rocm.txt", "hipStreamCreateWithFlags failed", true, + OPAL_PROC_MY_HOSTNAME, err); + goto out; + } + + /* Create a default stream to be used by various components. + * We try to create a high-priority stream and fall back to a regular stream. + */ + hipStream_t *default_stream = malloc(sizeof(hipStream_t)); + err = hipDeviceGetStreamPriorityRange(&prio_lo, &prio_hi); if (hipSuccess != err) { + err = hipStreamCreateWithPriority(default_stream, + hipStreamNonBlocking, prio_hi); + } else { + err = hipStreamCreateWithFlags(default_stream, 0); + } + if (OPAL_UNLIKELY(err != hipSuccess)) { opal_output(0, "Could not create hipStream, err=%d %s\n", err, hipGetErrorString(err)); goto out; } - + OBJ_CONSTRUCT(&opal_accelerator_rocm_default_stream, opal_accelerator_rocm_stream_t); + opal_accelerator_rocm_default_stream.base.stream = default_stream; + + opal_accelerator_rocm_mem_bw = malloc(sizeof(float)*opal_accelerator_rocm_num_devices); + for (int i = 0; i < opal_accelerator_rocm_num_devices; ++i) { + int mem_clock_rate; // kHz + err = hipDeviceGetAttribute(&mem_clock_rate, + hipDeviceAttributeMemoryClockRate, + i); + int bus_width; // bit + err = hipDeviceGetAttribute(&bus_width, + hipDeviceAttributeMemoryBusWidth, + i); + /* bw = clock_rate * bus width * 2bit multiplier + * See https://forums.developer.nvidia.com/t/memory-clock-rate/107940 + */ + float bw = ((float)mem_clock_rate*(float)bus_width*2.0) / 1024 / 1024 / 8; + opal_accelerator_rocm_mem_bw[i] = bw; + } err = OPAL_SUCCESS; opal_atomic_wmb(); accelerator_rocm_init_complete = true; @@ -192,7 +250,8 @@ int opal_accelerator_rocm_lazy_init() static opal_accelerator_base_module_t* accelerator_rocm_init(void) { OBJ_CONSTRUCT(&accelerator_rocm_init_lock, opal_mutex_t); - + OBJ_CONSTRUCT(&opal_accelerator_rocm_stream_lock, opal_mutex_t); + hipError_t err; if (opal_rocm_runtime_initialized) { @@ -214,14 +273,20 @@ static opal_accelerator_base_module_t* accelerator_rocm_init(void) static void accelerator_rocm_finalize(opal_accelerator_base_module_t* module) { - if (NULL != (void*)opal_accelerator_rocm_MemcpyStream) { - hipError_t err = hipStreamDestroy(opal_accelerator_rocm_MemcpyStream); + if (NULL != opal_accelerator_rocm_MemcpyStream) { + hipError_t err = hipStreamDestroy(*opal_accelerator_rocm_MemcpyStream); if (hipSuccess != err) { opal_output_verbose(10, 0, "hip_dl_finalize: error while destroying the hipStream\n"); } + free(opal_accelerator_rocm_MemcpyStream); opal_accelerator_rocm_MemcpyStream = NULL; + + OBJ_DESTRUCT(&opal_accelerator_rocm_default_stream); + free(opal_accelerator_rocm_mem_bw); + opal_accelerator_rocm_mem_bw = NULL; } OBJ_DESTRUCT(&accelerator_rocm_init_lock); + OBJ_DESTRUCT(&opal_accelerator_rocm_stream_lock); return; } diff --git a/opal/mca/accelerator/rocm/accelerator_rocm_module.c b/opal/mca/accelerator/rocm/accelerator_rocm_module.c index 6db5e0d4927..66c94b1b01a 100644 --- a/opal/mca/accelerator/rocm/accelerator_rocm_module.c +++ b/opal/mca/accelerator/rocm/accelerator_rocm_module.c @@ -1,6 +1,9 @@ /* * Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All Rights reserved. * $COPYRIGHT$ + * Copyright (c) 2024 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. * * Additional copyrights may follow * @@ -15,6 +18,7 @@ #include "opal/util/output.h" /* Accelerator API's */ +static int mca_accelerator_rocm_get_default_stream(int dev_id, opal_accelerator_stream_t **stream); static int mca_accelerator_rocm_check_addr(const void *addr, int *dev_id, uint64_t *flags); static int mca_accelerator_rocm_create_stream(int dev_id, opal_accelerator_stream_t **stream); @@ -27,10 +31,17 @@ static int mca_accelerator_rocm_memcpy_async(int dest_dev_id, int src_dev_id, vo opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type); static int mca_accelerator_rocm_memcpy(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); +static int mca_accelerator_rocm_memmove_async(int dest_dev_id, int src_dev_id, void *dest, + const void *src, size_t size, + opal_accelerator_stream_t *stream, + opal_accelerator_transfer_type_t type); static int mca_accelerator_rocm_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); static int mca_accelerator_rocm_mem_alloc(int dev_id, void **ptr, size_t size); static int mca_accelerator_rocm_mem_release(int dev_id, void *ptr); +static int mca_accelerator_rocm_mem_alloc_stream(int dev_id, void **ptr, size_t size, + opal_accelerator_stream_t *stream); +static int mca_accelerator_rocm_mem_release_stream(int dev_id, void *ptr, opal_accelerator_stream_t *stream); static int mca_accelerator_rocm_get_address_range(int dev_id, const void *ptr, void **base, size_t *size); @@ -59,11 +70,17 @@ static int mca_accelerator_rocm_device_can_access_peer( int *access, int dev1, i static int mca_accelerator_rocm_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id); +static int mca_accelerator_rocm_wait_stream(opal_accelerator_stream_t *stream); + +static int mca_accelerator_rocm_get_num_devices(int *num_devices); + +static int mca_accelerator_rocm_get_mem_bw(int device, float *bw); #define GET_STREAM(_stream) (_stream == MCA_ACCELERATOR_STREAM_DEFAULT ? 0 : *((hipStream_t *)_stream->stream)) opal_accelerator_base_module_t opal_accelerator_rocm_module = { + mca_accelerator_rocm_get_default_stream, mca_accelerator_rocm_check_addr, mca_accelerator_rocm_create_stream, @@ -75,9 +92,12 @@ opal_accelerator_base_module_t opal_accelerator_rocm_module = mca_accelerator_rocm_memcpy_async, mca_accelerator_rocm_memcpy, + mca_accelerator_rocm_memmove_async, mca_accelerator_rocm_memmove, mca_accelerator_rocm_mem_alloc, mca_accelerator_rocm_mem_release, + mca_accelerator_rocm_mem_alloc_stream, + mca_accelerator_rocm_mem_release_stream, mca_accelerator_rocm_get_address_range, mca_accelerator_rocm_is_ipc_enabled, @@ -96,7 +116,11 @@ opal_accelerator_base_module_t opal_accelerator_rocm_module = mca_accelerator_rocm_get_device_pci_attr, mca_accelerator_rocm_device_can_access_peer, - mca_accelerator_rocm_get_buffer_id + mca_accelerator_rocm_get_buffer_id, + + mca_accelerator_rocm_wait_stream, + mca_accelerator_rocm_get_num_devices, + mca_accelerator_rocm_get_mem_bw }; @@ -123,6 +147,10 @@ static int mca_accelerator_rocm_check_addr (const void *addr, int *dev_id, uint6 opal_accelerator_rocm_lazy_init(); *dev_id = srcAttr.device; ret = 1; + // TODO: on Frontier the host can access any device memory + // is that true everywhere? + //*flags |= MCA_ACCELERATOR_FLAGS_UNIFIED_MEMORY; + //*dev_id = srcAttr.device; #if HIP_VERSION >= 50731921 } else if (hipMemoryTypeUnified == srcAttr.type) { #else @@ -138,6 +166,16 @@ static int mca_accelerator_rocm_check_addr (const void *addr, int *dev_id, uint6 return ret; } +static int mca_accelerator_rocm_get_default_stream(int dev_id, opal_accelerator_stream_t **stream) +{ + int delayed_init = opal_accelerator_rocm_lazy_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + *stream = &opal_accelerator_rocm_default_stream.base; + return OPAL_SUCCESS; +} + static int mca_accelerator_rocm_create_stream(int dev_id, opal_accelerator_stream_t **stream) { if (NULL == stream) { @@ -233,7 +271,7 @@ OBJ_CLASS_INSTANCE( opal_accelerator_event_t, NULL, mca_accelerator_rocm_event_destruct); - + static int mca_accelerator_rocm_record_event(int dev_id, opal_accelerator_event_t *event, opal_accelerator_stream_t *stream) { @@ -348,14 +386,14 @@ static int mca_accelerator_rocm_memcpy(int dest_dev_id, int src_dev_id, void *de if (opal_accelerator_rocm_memcpy_async) { err = hipMemcpyAsync(dest, src, size, hipMemcpyDefault, - opal_accelerator_rocm_MemcpyStream); + *opal_accelerator_rocm_MemcpyStream); if (hipSuccess != err ) { opal_output_verbose(10, opal_accelerator_base_framework.framework_output, "error starting async copy\n"); return OPAL_ERROR; } - err = hipStreamSynchronize(opal_accelerator_rocm_MemcpyStream); + err = hipStreamSynchronize(*opal_accelerator_rocm_MemcpyStream); if (hipSuccess != err ) { opal_output_verbose(10, opal_accelerator_base_framework.framework_output, "error synchronizing stream after async copy\n"); @@ -373,6 +411,43 @@ static int mca_accelerator_rocm_memcpy(int dest_dev_id, int src_dev_id, void *de return OPAL_SUCCESS; } +static int mca_accelerator_rocm_memmove_async(int dest_dev_id, int src_dev_id, void *dest, const void *src, + size_t size, opal_accelerator_stream_t *stream, + opal_accelerator_transfer_type_t type) +{ + hipDeviceptr_t tmp; + hipError_t result; + void *ptr; + + int delayed_init = opal_accelerator_rocm_lazy_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + + if (NULL == dest || NULL == src || size <= 0) { + return OPAL_ERR_BAD_PARAM; + } + + result = mca_accelerator_rocm_mem_alloc_stream(src_dev_id, &ptr, size, stream); + if (OPAL_UNLIKELY(OPAL_SUCCESS != result)) { + return OPAL_ERROR; + } + tmp = (hipDeviceptr_t)ptr; + result = hipMemcpyAsync(tmp, (hipDeviceptr_t) src, size, hipMemcpyDefault, *(hipStream_t*)stream->stream); + if (OPAL_UNLIKELY(hipSuccess != result)) { + opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "error during synchronous copy\n"); + return OPAL_ERROR; + } + result = hipMemcpyAsync((hipDeviceptr_t) dest, tmp, size, hipMemcpyDefault, *(hipStream_t*)stream->stream); + if (OPAL_UNLIKELY(hipSuccess != result)) { + opal_output_verbose(10, opal_accelerator_base_framework.framework_output, + "error during synchronous copy\n"); + return OPAL_ERROR; + } + return mca_accelerator_rocm_mem_release_stream(src_dev_id, ptr, stream); +} + static int mca_accelerator_rocm_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type) @@ -393,7 +468,7 @@ static int mca_accelerator_rocm_memmove(int dest_dev_id, int src_dev_id, void *d if (opal_accelerator_rocm_memcpy_async) { err = hipMemcpyAsync(tmp, src, size, hipMemcpyDefault, - opal_accelerator_rocm_MemcpyStream); + *opal_accelerator_rocm_MemcpyStream); if (hipSuccess != err ) { opal_output_verbose(10, opal_accelerator_base_framework.framework_output, "error in async memcpy for memmove\n"); @@ -401,14 +476,14 @@ static int mca_accelerator_rocm_memmove(int dest_dev_id, int src_dev_id, void *d } err = hipMemcpyAsync(dest, tmp, size, hipMemcpyDefault, - opal_accelerator_rocm_MemcpyStream); + *opal_accelerator_rocm_MemcpyStream); if (hipSuccess != err ) { opal_output_verbose(10, opal_accelerator_base_framework.framework_output, "error in async memcpy for memmove\n"); return OPAL_ERROR; } - err = hipStreamSynchronize(opal_accelerator_rocm_MemcpyStream); + err = hipStreamSynchronize(*opal_accelerator_rocm_MemcpyStream); if (hipSuccess != err ) { opal_output_verbose(10, opal_accelerator_base_framework.framework_output, "error synchronizing stream for memmove\n"); @@ -535,7 +610,7 @@ static int mca_accelerator_rocm_get_ipc_handle(int dev_id, void *dev_ptr, "Error in hipIpcGetMemHandle dev_ptr %p", dev_ptr); OBJ_DESTRUCT(rocm_handle); return OPAL_ERROR; - } + } memcpy(rocm_handle->base.handle, &rocm_ipc_handle, IPC_MAX_HANDLE_SIZE); return OPAL_SUCCESS; @@ -597,7 +672,7 @@ static int mca_accelerator_rocm_compare_ipc_handles(uint8_t handle_1[IPC_MAX_HAN static void mca_accelerator_rocm_ipc_event_handle_destruct(opal_accelerator_rocm_ipc_handle_t *handle) { - // Just a place holder, there is no hipIpcCloseEventHandle. + // Just a place holder, there is no hipIpcCloseEventHandle. } OBJ_CLASS_INSTANCE( @@ -617,7 +692,7 @@ static int mca_accelerator_rocm_get_ipc_event_handle(opal_accelerator_event_t *e hipIpcEventHandle_t rocm_ipc_handle; opal_accelerator_rocm_ipc_event_handle_t *rocm_handle = (opal_accelerator_rocm_ipc_event_handle_t *) handle; OBJ_CONSTRUCT(rocm_handle, opal_accelerator_rocm_ipc_event_handle_t); - + memset(rocm_ipc_handle.reserved, 0, HIP_IPC_HANDLE_SIZE); hipError_t err = hipIpcGetEventHandle(&rocm_ipc_handle, *((hipEvent_t *)event->event)); @@ -626,7 +701,7 @@ static int mca_accelerator_rocm_get_ipc_event_handle(opal_accelerator_event_t *e "error in hipIpcGetEventHandle"); OBJ_DESTRUCT(rocm_handle); return OPAL_ERROR; - } + } memcpy(rocm_handle->base.handle, &rocm_ipc_handle, IPC_MAX_HANDLE_SIZE); return OPAL_SUCCESS; @@ -664,7 +739,7 @@ static int mca_accelerator_rocm_open_ipc_event_handle(opal_accelerator_ipc_event opal_output_verbose(10, opal_accelerator_base_framework.framework_output, "error in hipIpcOpenEventHandle"); return OPAL_ERROR; - } + } return OPAL_SUCCESS; } @@ -802,3 +877,81 @@ static int mca_accelerator_rocm_get_buffer_id(int dev_id, const void *addr, opal #endif return OPAL_SUCCESS; } + + +static int mca_accelerator_rocm_mem_alloc_stream( + int dev_id, + void **addr, + size_t size, + opal_accelerator_stream_t *stream) +{ + hipError_t result; + + int delayed_init = opal_accelerator_rocm_lazy_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + + if (NULL == stream || NULL == addr || 0 == size) { + return OPAL_ERR_BAD_PARAM; + } + + result = hipMallocAsync(addr, size, *(hipStream_t*)stream->stream); + if (OPAL_UNLIKELY(hipSuccess != result)) { + opal_show_help("help-accelerator-rocm.txt", "hipMalloc failed", true, + OPAL_PROC_MY_HOSTNAME, result); + return OPAL_ERROR; + } + return OPAL_SUCCESS; +} + +static int mca_accelerator_rocm_mem_release_stream( + int dev_id, + void *addr, + opal_accelerator_stream_t *stream) +{ + hipError_t result; + + if (NULL == stream || NULL == addr) { + return OPAL_ERR_BAD_PARAM; + } + + result = hipFreeAsync(addr, *(hipStream_t*)stream->stream); + if (OPAL_UNLIKELY(hipSuccess != result)) { + opal_show_help("help-accelerator-rocm.txt", "hipMalloc failed", true, + OPAL_PROC_MY_HOSTNAME, result); + return OPAL_ERROR; + } + return OPAL_SUCCESS; +} + +static int mca_accelerator_rocm_wait_stream(opal_accelerator_stream_t *stream) +{ + hipError_t result; + result = hipStreamSynchronize(*(hipStream_t*)stream->stream); + if (OPAL_UNLIKELY(hipSuccess != result)) { + opal_show_help("help-accelerator-rocm.txt", "hipStreamSynchronize failed", true, + OPAL_PROC_MY_HOSTNAME, result); + return OPAL_ERROR; + } + return OPAL_SUCCESS; +} + + +static int mca_accelerator_rocm_get_num_devices(int *num_devices) +{ + *num_devices = opal_accelerator_rocm_num_devices; + return OPAL_SUCCESS; +} + +static int mca_accelerator_rocm_get_mem_bw(int device, float *bw) +{ + int delayed_init = opal_accelerator_rocm_lazy_init(); + if (OPAL_UNLIKELY(0 != delayed_init)) { + return delayed_init; + } + assert(opal_accelerator_rocm_mem_bw != NULL); + + *bw = opal_accelerator_rocm_mem_bw[device]; + return OPAL_SUCCESS; +} \ No newline at end of file diff --git a/opal/mca/accelerator/ze/accelerator_ze_module.c b/opal/mca/accelerator/ze/accelerator_ze_module.c index 38b49cf4290..33edab901b7 100644 --- a/opal/mca/accelerator/ze/accelerator_ze_module.c +++ b/opal/mca/accelerator/ze/accelerator_ze_module.c @@ -20,6 +20,7 @@ #include "opal/util/output.h" /* Accelerator API's */ +static int mca_accelerator_ze_get_default_stream(int dev_id, opal_accelerator_stream_t **stream); static int mca_accelerator_ze_check_addr(const void *addr, int *dev_id, uint64_t *flags); static int mca_accelerator_ze_create_stream(int dev_id, opal_accelerator_stream_t **stream); @@ -32,10 +33,17 @@ static int mca_accelerator_ze_memcpy_async(int dest_dev_id, int src_dev_id, void opal_accelerator_stream_t *stream, opal_accelerator_transfer_type_t type); static int mca_accelerator_ze_memcpy(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); +static int mca_accelerator_ze_memmove_async(int dest_dev_id, int src_dev_id, void *dest, + const void *src, size_t size, + opal_accelerator_stream_t *stream, + opal_accelerator_transfer_type_t type); static int mca_accelerator_ze_memmove(int dest_dev_id, int src_dev_id, void *dest, const void *src, size_t size, opal_accelerator_transfer_type_t type); static int mca_accelerator_ze_mem_alloc(int dev_id, void **ptr, size_t size); static int mca_accelerator_ze_mem_release(int dev_id, void *ptr); +static int mca_accelerator_ze_mem_alloc_stream(int dev_id, void **ptr, size_t size, + opal_accelerator_stream_t *stream); +static int mca_accelerator_ze_mem_release_stream(int dev_id, void *ptr, opal_accelerator_stream_t *stream); static int mca_accelerator_ze_get_address_range(int dev_id, const void *ptr, void **base, size_t *size); @@ -65,8 +73,15 @@ static int mca_accelerator_ze_get_device_pci_attr(int dev_id, opal_accelerator_p static int mca_accelerator_ze_get_buffer_id(int dev_id, const void *addr, opal_accelerator_buffer_id_t *buf_id); +static int mca_accelerator_ze_wait_stream(opal_accelerator_stream_t *stream); + +static int mca_accelerator_ze_get_num_devices(int *num_devices); + +static int mca_accelerator_ze_get_mem_bw(int device, float *bw); + opal_accelerator_base_module_t opal_accelerator_ze_module = { + .get_default_stream = mca_accelerator_rocm_get_default_stream, .check_addr = mca_accelerator_ze_check_addr, .create_stream = mca_accelerator_ze_create_stream, @@ -77,10 +92,13 @@ opal_accelerator_base_module_t opal_accelerator_ze_module = .mem_copy_async = mca_accelerator_ze_memcpy_async, .mem_copy = mca_accelerator_ze_memcpy, + .mem_move_async = mca_accelerator_ze_memmove_async, .mem_move = mca_accelerator_ze_memmove, .mem_alloc = mca_accelerator_ze_mem_alloc, .mem_release = mca_accelerator_ze_mem_release, + .mem_alloc_stream = mca_accelerator_ze_mem_alloc_stream, + .mem_release_stream = mca_accelerator_ze_mem_release_stream, .get_address_range = mca_accelerator_ze_get_address_range, .is_ipc_enabled = mca_accelerator_ze_is_ipc_enabled, @@ -99,7 +117,10 @@ opal_accelerator_base_module_t opal_accelerator_ze_module = .get_device_pci_attr = mca_accelerator_ze_get_device_pci_attr, .device_can_access_peer = mca_accelerator_ze_device_can_access_peer, - .get_buffer_id = mca_accelerator_ze_get_buffer_id + .get_buffer_id = mca_accelerator_ze_get_buffer_id, + .wait_stream = mca_accelerator_ze_wait_stream, + .num_devices = mca_accelerator_ze_get_num_devices, + .get_mem_bw = mca_accelerator_ze_get_mem_bw }; static int accelerator_ze_dev_handle_to_dev_id(ze_device_handle_t hDevice) @@ -116,6 +137,14 @@ static int accelerator_ze_dev_handle_to_dev_id(ze_device_handle_t hDevice) return ret; } +static int mca_accelerator_ze_get_default_stream(int dev_id, opal_accelerator_stream_t **stream) +{ + /* + * TODO + */ + return OPAL_ERR_NOT_IMPLEMENTED; +} + static int mca_accelerator_ze_check_addr (const void *addr, int *dev_id, uint64_t *flags) { ze_result_t zret; @@ -548,6 +577,17 @@ static int mca_accelerator_ze_memmove(int dest_dev_id, int src_dev_id, void *des return OPAL_ERR_NOT_IMPLEMENTED; } +static int mca_accelerator_ze_memmove_async(int dest_dev_id, int src_dev_id, void *dest, + const void *src, size_t size, + opal_accelerator_stream_t *stream, + opal_accelerator_transfer_type_t type) +{ + /* + * TODO + */ + return OPAL_ERR_NOT_IMPLEMENTED; +} + static int mca_accelerator_ze_mem_alloc(int dev_id, void **ptr, size_t size) { ze_result_t zret; @@ -603,6 +643,23 @@ static int mca_accelerator_ze_mem_release(int dev_id, void *ptr) return OPAL_ERROR; } +static int mca_accelerator_ze_mem_alloc_stream(int dev_id, void **ptr, size_t size, + opal_accelerator_stream_t *stream) +{ + /* + * TODO + */ + return OPAL_ERR_NOT_IMPLEMENTED; +} + +static int mca_accelerator_ze_mem_release_stream(int dev_id, void *ptr, opal_accelerator_stream_t *stream) +{ + /* + * TODO + */ + return OPAL_ERR_NOT_IMPLEMENTED; +} + static int mca_accelerator_ze_get_address_range(int dev_id, const void *ptr, void **base, size_t *size) { @@ -801,3 +858,27 @@ static int mca_accelerator_ze_get_buffer_id(int dev_id, const void *addr, opal_a return OPAL_SUCCESS; } + +static int mca_accelerator_ze_wait_stream(opal_accelerator_stream_t *stream) +{ + /* + * TODO + */ + return OPAL_ERR_NOT_IMPLEMENTED; +} + +static int mca_accelerator_ze_get_num_devices(int *num_devices) +{ + /* + * TODO + */ + return OPAL_ERR_NOT_IMPLEMENTED; +} + +static int mca_accelerator_ze_get_mem_bw(int device, float *bw) +{ + /* + * TODO + */ + return OPAL_ERR_NOT_IMPLEMENTED; +} \ No newline at end of file