From 443bfa8692c945539bec21e2b910fc22f67f7bc0 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 15:27:00 +0800
Subject: [PATCH 001/393] more

---
 csrc/deep_ep.cpp | 6 ++++++
 csrc/deep_ep.hpp | 7 +++++++
 2 files changed, 13 insertions(+)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index 9c90178b..8bae5e79 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -10,6 +10,12 @@
 #include "kernels/api.cuh"
 #include "kernels/configs.cuh"
 
+namespace shared_memory {
+    void get_mem_handle(bool enable_fabric, ) {
+    }
+
+}
+
 namespace deep_ep {
 
 Buffer::Buffer(int rank, int num_ranks, int64_t num_nvl_bytes, int64_t num_rdma_bytes, bool low_latency_mode):
diff --git a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp
index dfa2202d..f984735c 100644
--- a/csrc/deep_ep.hpp
+++ b/csrc/deep_ep.hpp
@@ -20,6 +20,13 @@
 #define TORCH_EXTENSION_NAME deep_ep_cpp
 #endif
 
+namespace shared_memory {
+typedef union {
+    cudaIpcMemHandle_t cuda_ipc_mem_handle;
+    CUmemFabricHandle cu_mem_fabric_handle;
+} MemHandle;
+}
+
 namespace deep_ep {
 
 struct Buffer {

From b986cce27bf39167488415fefcf0ee2dfee24e0c Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 15:27:50 +0800
Subject: [PATCH 002/393] more

---
 csrc/deep_ep.cpp | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index 8bae5e79..6f71f71f 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -11,7 +11,12 @@
 #include "kernels/configs.cuh"
 
 namespace shared_memory {
-    void get_mem_handle(bool enable_fabric, ) {
+    void get_mem_handle(bool enable_fabric, MemHandle* handle, void* ptr) {
+        if (enable_fabric) {
+            TODO;
+        } else {
+            CUDA_CHECK(cudaIpcGetMemHandle(handle, ptr));
+        }
     }
 
 }
@@ -52,7 +57,7 @@ Buffer::Buffer(int rank, int num_ranks, int64_t num_nvl_bytes, int64_t num_rdma_
     if (num_nvl_bytes > 0) {
         // Local IPC: alloc local memory and set local IPC handles
         CUDA_CHECK(cudaMalloc(&buffer_ptrs[nvl_rank], num_nvl_bytes + barrier_signal_bytes + buffer_ptr_bytes + barrier_signal_ptr_bytes));
-        CUDA_CHECK(cudaIpcGetMemHandle(&ipc_handles[nvl_rank], buffer_ptrs[nvl_rank]));
+        CUDA_CHECK(shared_memory::get_mem_handle(&ipc_handles[nvl_rank], buffer_ptrs[nvl_rank]));
         buffer_ptrs_gpu = reinterpret_cast<void**>(static_cast<uint8_t*>(buffer_ptrs[nvl_rank]) + num_nvl_bytes + barrier_signal_bytes);
 
         // Set barrier signals
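Note: the MemHandle union introduced in patch 001 gives the two transports one fixed-size wire format: the same bytes hold either a legacy cudaIpcMemHandle_t or a CUmemFabricHandle, and sizeof(MemHandle) is the size of the larger member. A minimal standalone sketch of the idea (the serialize/deserialize helpers are illustrative, not part of the patches):

    #include <cuda.h>
    #include <cuda_runtime.h>
    #include <cstring>
    #include <string>

    union MemHandle {
        cudaIpcMemHandle_t cuda_ipc_mem_handle;  // legacy CUDA IPC path
        CUmemFabricHandle cu_mem_fabric_handle;  // NVLink/IMEX fabric path
    };

    // Both members are plain data, so the union can travel as raw bytes;
    // sender and receiver must agree out-of-band on which member is active.
    std::string serialize(const MemHandle& h) {
        return std::string(reinterpret_cast<const char*>(&h), sizeof(h));
    }

    MemHandle deserialize(const std::string& s) {
        MemHandle h;
        std::memcpy(&h, s.data(), sizeof(h));
        return h;
    }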
From 3ea6f58265cfccec4112bc00a195458dda5503d5 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 15:29:26 +0800
Subject: [PATCH 003/393] more

---
 csrc/deep_ep.cpp | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index 6f71f71f..a9b72efc 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -11,6 +11,14 @@
 #include "kernels/configs.cuh"
 
 namespace shared_memory {
+    void malloc(void** ptr, size_t size) {
+        if (enable_fabric) {
+            TODO;
+        } else {
+            CUDA_CHECK(cudaMalloc(ptr, size));
+        }
+    }
+
     void get_mem_handle(bool enable_fabric, MemHandle* handle, void* ptr) {
         if (enable_fabric) {
             TODO;
@@ -56,7 +64,7 @@ Buffer::Buffer(int rank, int num_ranks, int64_t num_nvl_bytes, int64_t num_rdma_
     if (num_nvl_bytes > 0) {
         // Local IPC: alloc local memory and set local IPC handles
-        CUDA_CHECK(cudaMalloc(&buffer_ptrs[nvl_rank], num_nvl_bytes + barrier_signal_bytes + buffer_ptr_bytes + barrier_signal_ptr_bytes));
+        CUDA_CHECK(shared_memory::malloc(&buffer_ptrs[nvl_rank], num_nvl_bytes + barrier_signal_bytes + buffer_ptr_bytes + barrier_signal_ptr_bytes));
         CUDA_CHECK(shared_memory::get_mem_handle(&ipc_handles[nvl_rank], buffer_ptrs[nvl_rank]));
         buffer_ptrs_gpu = reinterpret_cast<void**>(static_cast<uint8_t*>(buffer_ptrs[nvl_rank]) + num_nvl_bytes + barrier_signal_bytes);
 
         // Set barrier signals

From 5d3513bbd3fda45cbd83b40214ecaa736267f4f0 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 15:29:37 +0800
Subject: [PATCH 004/393] more

---
 csrc/deep_ep.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp
index f984735c..3c4284f0 100644
--- a/csrc/deep_ep.hpp
+++ b/csrc/deep_ep.hpp
@@ -51,7 +51,7 @@ struct Buffer {
     int num_device_sms;
     int rank, rdma_rank, nvl_rank;
     int num_ranks, num_rdma_ranks, num_nvl_ranks;
-    cudaIpcMemHandle_t ipc_handles[NUM_MAX_NVL_PEERS];
+    shared_memory::MemHandle ipc_handles[NUM_MAX_NVL_PEERS];
 
     // Stream for communication
     at::cuda::CUDAStream comm_stream;
From bda56951540b9ea4b7c110e27bb6f877cc69786b Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 15:30:00 +0800
Subject: [PATCH 005/393] more

---
 csrc/deep_ep.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index a9b72efc..90304b09 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -23,7 +23,7 @@ namespace shared_memory {
         if (enable_fabric) {
             TODO;
         } else {
-            CUDA_CHECK(cudaIpcGetMemHandle(handle, ptr));
+            CUDA_CHECK(cudaIpcGetMemHandle(handle->cuda_ipc_mem_handle, ptr));
         }
     }
 

From 3740762ccfe54edc147472c7c6af81196cfb1243 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 15:30:13 +0800
Subject: [PATCH 006/393] more

---
 csrc/deep_ep.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index 90304b09..d1a299eb 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -23,7 +23,7 @@ namespace shared_memory {
         if (enable_fabric) {
             TODO;
         } else {
-            CUDA_CHECK(cudaIpcGetMemHandle(handle->cuda_ipc_mem_handle, ptr));
+            CUDA_CHECK(cudaIpcGetMemHandle(&handle->cuda_ipc_mem_handle, ptr));
         }
     }
 

From ad4aee8bfcaa3b6c02c0eda2ec5affb637574722 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 15:31:16 +0800
Subject: [PATCH 007/393] more

---
 csrc/deep_ep.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index d1a299eb..ab04202a 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -161,7 +161,8 @@ int Buffer::get_local_device_id() const {
 }
 
 pybind11::bytearray Buffer::get_local_ipc_handle() const {
-    return {ipc_handles[nvl_rank].reserved, CUDA_IPC_HANDLE_SIZE};
+    const shared_memory::MemHandle& handle = ipc_handles[nvl_rank];
+    return {reinterpret_cast<const char*>(&handle), sizeof(handle)};
 }
 
 pybind11::bytearray Buffer::get_local_nvshmem_unique_id() const {

From b5e4aad6597e7d3487718a05bfd74dac1c7e3687 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 15:33:05 +0800
Subject: [PATCH 008/393] more

---
 csrc/deep_ep.cpp | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index ab04202a..3c8709a4 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -27,6 +27,13 @@ namespace shared_memory {
         }
     }
 
+    void open_mem_handle(bool enable_fabric, void** ptr, MemHandle* handle) {
+        if (enable_fabric) {
+            TODO;
+        } else {
+            CUDA_CHECK(cudaIpcOpenMemHandle(ptr, handle, cudaIpcMemLazyEnablePeerAccess));
+        }
+    }
 }
 
 namespace deep_ep {
@@ -198,7 +205,7 @@ void Buffer::sync(const std::vector<int> &device_ids,
         EP_HOST_ASSERT(handle_str.size() == CUDA_IPC_HANDLE_SIZE);
         if (offset + i != rank) {
             std::memcpy(ipc_handles[i].reserved, handle_str.c_str(), CUDA_IPC_HANDLE_SIZE);
-            CUDA_CHECK(cudaIpcOpenMemHandle(&buffer_ptrs[i], ipc_handles[i], cudaIpcMemLazyEnablePeerAccess));
+            CUDA_CHECK(shared_memory::open_mem_handle(&buffer_ptrs[i], &ipc_handles[i]));
             barrier_signal_ptrs[i] = reinterpret_cast<int*>(static_cast<uint8_t*>(buffer_ptrs[i]) + num_nvl_bytes);
         } else {
             EP_HOST_ASSERT(std::memcmp(ipc_handles[i].reserved, handle_str.c_str(), CUDA_IPC_HANDLE_SIZE) == 0);
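Patches 005-008 wire up the legacy branch of the get/open pair. For orientation, the underlying cudaIpc* lifecycle that this branch wraps looks as follows (a sketch of the standard two-process flow, independent of the patches):

    // Process A: owns the allocation and exports a handle.
    size_t bytes = 1 << 20;
    void* ptr;
    CUDA_CHECK(cudaMalloc(&ptr, bytes));
    cudaIpcMemHandle_t handle;
    CUDA_CHECK(cudaIpcGetMemHandle(&handle, ptr));
    // ... ship the 64-byte handle to process B over any transport ...

    // Process B: maps A's allocation into its own address space.
    void* peer_ptr;
    CUDA_CHECK(cudaIpcOpenMemHandle(&peer_ptr, handle, cudaIpcMemLazyEnablePeerAccess));
    // ... use peer_ptr in kernels ...
    CUDA_CHECK(cudaIpcCloseMemHandle(peer_ptr));

    // Process A, once B is done:
    CUDA_CHECK(cudaFree(ptr));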
From 240d0582533b6b0b3864f1a9835e2596bd9fc2cd Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 15:41:25 +0800
Subject: [PATCH 009/393] more

---
 csrc/deep_ep.cpp | 6 +++---
 csrc/deep_ep.hpp | 2 ++
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index 3c8709a4..7174e8a7 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -202,13 +202,13 @@ void Buffer::sync(const std::vector<int> &device_ids,
     for (int i = 0, offset = rdma_rank * num_nvl_ranks; i < num_nvl_ranks; ++ i) {
         EP_HOST_ASSERT(all_gathered_handles[offset + i].has_value());
         auto handle_str = std::string(all_gathered_handles[offset + i].value());
-        EP_HOST_ASSERT(handle_str.size() == CUDA_IPC_HANDLE_SIZE);
+        EP_HOST_ASSERT(handle_str.size() == shared_memory::HANDLE_SIZE);
         if (offset + i != rank) {
-            std::memcpy(ipc_handles[i].reserved, handle_str.c_str(), CUDA_IPC_HANDLE_SIZE);
+            std::memcpy(&ipc_handles[i], handle_str.c_str(), shared_memory::HANDLE_SIZE);
             CUDA_CHECK(shared_memory::open_mem_handle(&buffer_ptrs[i], &ipc_handles[i]));
             barrier_signal_ptrs[i] = reinterpret_cast<int*>(static_cast<uint8_t*>(buffer_ptrs[i]) + num_nvl_bytes);
         } else {
-            EP_HOST_ASSERT(std::memcmp(ipc_handles[i].reserved, handle_str.c_str(), CUDA_IPC_HANDLE_SIZE) == 0);
+            EP_HOST_ASSERT(std::memcmp(&ipc_handles[i], handle_str.c_str(), shared_memory::HANDLE_SIZE) == 0);
         }
     }
 
diff --git a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp
index 3c4284f0..dbb4df72 100644
--- a/csrc/deep_ep.hpp
+++ b/csrc/deep_ep.hpp
@@ -25,6 +25,8 @@ typedef union {
     cudaIpcMemHandle_t cuda_ipc_mem_handle;
     CUmemFabricHandle cu_mem_fabric_handle;
 } MemHandle;
+
+constexpr size_t HANDLE_SIZE = sizeof(MemHandle);
 }
 
 namespace deep_ep {

From 5379d59f6889acdb2063df8177165c282348b4a1 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 15:41:58 +0800
Subject: [PATCH 010/393] more

---
 csrc/deep_ep.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index 7174e8a7..9667961c 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -31,7 +31,7 @@ namespace shared_memory {
         if (enable_fabric) {
             TODO;
         } else {
-            CUDA_CHECK(cudaIpcOpenMemHandle(ptr, handle, cudaIpcMemLazyEnablePeerAccess));
+            CUDA_CHECK(cudaIpcOpenMemHandle(ptr, handle->cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess));
         }
     }
 }

From 4fc8e79646295def0d90b49859c44154a3ccbda9 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 15:43:21 +0800
Subject: [PATCH 011/393] more

---
 csrc/deep_ep.cpp | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index 9667961c..fa2e9def 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -19,6 +19,14 @@ namespace shared_memory {
         }
     }
 
+    void free(void* ptr) {
+        if (enable_fabric) {
+            TODO;
+        } else {
+            CUDA_CHECK(cudaFree(buffer_ptrs[nvl_rank]));
+        }
+    }
+
     void get_mem_handle(bool enable_fabric, MemHandle* handle, void* ptr) {
         if (enable_fabric) {
             TODO;
@@ -34,6 +42,14 @@ namespace shared_memory {
             CUDA_CHECK(cudaIpcOpenMemHandle(ptr, handle->cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess));
         }
     }
+
+    void close_mem_handle(bool enable_fabric, void* ptr) {
+        if (enable_fabric) {
+            TODO;
+        } else {
+            CUDA_CHECK(cudaIpcCloseMemHandle(buffer_ptrs[i]));
+        }
+    }
 }
@@ -71,8 +87,8 @@ Buffer::Buffer(int rank, int num_ranks, int64_t num_nvl_bytes, int64_t num_rdma_
     if (num_nvl_bytes > 0) {
         // Local IPC: alloc local memory and set local IPC handles
-        CUDA_CHECK(shared_memory::malloc(&buffer_ptrs[nvl_rank], num_nvl_bytes + barrier_signal_bytes + buffer_ptr_bytes + barrier_signal_ptr_bytes));
-        CUDA_CHECK(shared_memory::get_mem_handle(&ipc_handles[nvl_rank], buffer_ptrs[nvl_rank]));
+        shared_memory::malloc(&buffer_ptrs[nvl_rank], num_nvl_bytes + barrier_signal_bytes + buffer_ptr_bytes + barrier_signal_ptr_bytes);
+        shared_memory::get_mem_handle(&ipc_handles[nvl_rank], buffer_ptrs[nvl_rank]);
         buffer_ptrs_gpu = reinterpret_cast<void**>(static_cast<uint8_t*>(buffer_ptrs[nvl_rank]) + num_nvl_bytes + barrier_signal_bytes);
 
         // Set barrier signals
@@ -118,11 +134,11 @@ Buffer::~Buffer() noexcept(false) {
         // Close remote IPC
         if (is_available()) {
             for (int i = 0; i < num_nvl_ranks; ++ i) if (i != nvl_rank)
-                CUDA_CHECK(cudaIpcCloseMemHandle(buffer_ptrs[i]));
+                shared_memory::close_mem_handle(buffer_ptrs[i]);
         }
 
         // Free local buffer and error flag
-        CUDA_CHECK(cudaFree(buffer_ptrs[nvl_rank]));
+        shared_memory::free(buffer_ptrs[nvl_rank]);
     }
 
     // Free NVSHMEM
@@ -205,7 +221,7 @@ void Buffer::sync(const std::vector<int> &device_ids,
         EP_HOST_ASSERT(handle_str.size() == shared_memory::HANDLE_SIZE);
         if (offset + i != rank) {
             std::memcpy(&ipc_handles[i], handle_str.c_str(), shared_memory::HANDLE_SIZE);
-            CUDA_CHECK(shared_memory::open_mem_handle(&buffer_ptrs[i], &ipc_handles[i]));
+            shared_memory::open_mem_handle(&buffer_ptrs[i], &ipc_handles[i]);
             barrier_signal_ptrs[i] = reinterpret_cast<int*>(static_cast<uint8_t*>(buffer_ptrs[i]) + num_nvl_bytes);
         } else {
             EP_HOST_ASSERT(std::memcmp(&ipc_handles[i], handle_str.c_str(), shared_memory::HANDLE_SIZE) == 0);

From 2e90afea36cbfe613f89fd7be34268d3e0c3ed2d Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 15:43:35 +0800
Subject: [PATCH 012/393] more

---
 csrc/deep_ep.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index fa2e9def..453993f2 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -47,7 +47,7 @@ namespace shared_memory {
         if (enable_fabric) {
             TODO;
         } else {
-            CUDA_CHECK(cudaIpcCloseMemHandle(buffer_ptrs[i]));
+            CUDA_CHECK(cudaIpcCloseMemHandle(ptr));
        }
     }
 }
From 3639a57cd511d4bb9539bc2aaef4938eaa0c6614 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 15:48:24 +0800
Subject: [PATCH 013/393] more

---
 csrc/deep_ep.cpp           | 35 ++++++++++++++++++++++++++++++++++-
 csrc/kernels/exception.cuh | 12 ++++++++++++
 2 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index 453993f2..40cfff62 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -13,7 +13,40 @@ namespace shared_memory {
 
     void malloc(void** ptr, size_t size) {
         if (enable_fabric) {
-            TODO;
+            CUmemGenericAllocationHandle handle;
+
+            int cudaDev;
+            CUDA_CHECK(cudaGetDevice(&cudaDev));
+
+            CUdevice currentDev;
+            CU_CHECK(cuDeviceGet(&currentDev, cudaDev));
+
+            CUmemAllocationProp prop = {};
+            prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+            prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+            prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
+            prop.location.id = currentDev;
+
+            size_t granularity = 0;
+            CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
+
+            size = (size + granularity - 1) & ~(granularity - 1);
+            if (size == 0) size = granularity;
+
+            CU_CHECK(cuMemCreate(&handle, size, &prop, 0));
+            CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0));
+            CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
+
+            int device_count;
+            CUDA_CHECK(cudaGetDeviceCount(&device_count));
+
+            CUmemAccessDesc accessDesc[device_count];
+            for (int idx = 0; idx < device_count; ++idx) {
+                accessDesc[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+                accessDesc[idx].location.id = idx;
+                accessDesc[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+            }
+            CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, accessDesc, device_count));
         } else {
             CUDA_CHECK(cudaMalloc(ptr, size));
         }
diff --git a/csrc/kernels/exception.cuh b/csrc/kernels/exception.cuh
index 7db0ddb7..9eeedadd 100644
--- a/csrc/kernels/exception.cuh
+++ b/csrc/kernels/exception.cuh
@@ -31,6 +31,18 @@
 } while (0)
 #endif
 
+#ifndef CU_CHECK
+#define CU_CHECK(cmd) \
+do { \
+    CUresult e = (cmd); \
+    if (e != CUDA_SUCCESS) { \
+        const char *e_str = nullptr; \
+        cuGetErrorString(e, &e_str); \
+        throw EPException("CUDA", __FILE__, __LINE__, std::string(e_str ? e_str : "unknown")); \
+    } \
+} while (0)
+#endif
+
 #ifndef EP_HOST_ASSERT
 #define EP_HOST_ASSERT(cond) \
 do { \

From 4ef8f05afb241c810ee064438f98ef1a5a38f400 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 15:48:38 +0800
Subject: [PATCH 014/393] more

---
 csrc/deep_ep.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index 40cfff62..5fbf20ac 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -13,8 +13,6 @@ namespace shared_memory {
 
     void malloc(void** ptr, size_t size) {
         if (enable_fabric) {
-            CUmemGenericAllocationHandle handle;
-
             int cudaDev;
             CUDA_CHECK(cudaGetDevice(&cudaDev));
 
@@ -33,6 +31,7 @@ namespace shared_memory {
             size = (size + granularity - 1) & ~(granularity - 1);
             if (size == 0) size = granularity;
 
+            CUmemGenericAllocationHandle handle;
             CU_CHECK(cuMemCreate(&handle, size, &prop, 0));
             CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0));
             CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
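The fabric branch added in patch 013 follows the standard CUDA virtual memory management (VMM) sequence: round the request up to the allocation granularity, create a physical allocation that can be exported as a fabric handle, reserve a virtual address range, map the allocation into it, and grant access. Condensed, with error checks elided (a sketch of the driver-API flow, assuming `device` is the active CUdevice; not the patch verbatim):

    CUmemAllocationProp prop = {};
    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id = device;
    prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;

    size_t granularity;
    cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
    size = (size + granularity - 1) / granularity * granularity;  // round up

    CUmemGenericAllocationHandle handle;
    CUdeviceptr dptr;
    cuMemCreate(&handle, size, &prop, 0);                 // physical allocation
    cuMemAddressReserve(&dptr, size, granularity, 0, 0);  // virtual range
    cuMemMap(dptr, size, 0, handle, 0);                   // map physical -> virtual

    CUmemAccessDesc desc = {};
    desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    desc.location.id = device;
    desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    cuMemSetAccess(dptr, size, &desc, 1);                 // enable read/write access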
From 047656e98affce1903a9987f5fa0d789d1323bc8 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 15:49:13 +0800
Subject: [PATCH 015/393] more

---
 csrc/deep_ep.cpp | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index 5fbf20ac..50b67923 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -13,17 +13,14 @@ namespace shared_memory {
 
     void malloc(void** ptr, size_t size) {
         if (enable_fabric) {
-            int cudaDev;
-            CUDA_CHECK(cudaGetDevice(&cudaDev));
-
-            CUdevice currentDev;
-            CU_CHECK(cuDeviceGet(&currentDev, cudaDev));
+            CUdevice device;
+            CU_CHECK(cuCtxGetDevice(&device));
 
             CUmemAllocationProp prop = {};
             prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
             prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
             prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
-            prop.location.id = currentDev;
+            prop.location.id = device;

From c21f36dccdd7ce0061cc3f362f65312095d9620a Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 15:51:01 +0800
Subject: [PATCH 016/393] more

---
 csrc/deep_ep.cpp | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index 50b67923..cf5abd8f 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -50,7 +50,15 @@ namespace shared_memory {
 
     void free(void* ptr) {
         if (enable_fabric) {
-            TODO;
+            CUmemGenericAllocationHandle handle;
+            CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr));
+
+            size_t size = 0;
+            CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
+
+            CU_CHECK(cuMemUnmap((CUdeviceptr)ptr, size));
+            CU_CHECK(cuMemAddressFree((CUdeviceptr)ptr, size));
+            CU_CHECK(cuMemRelease(handle));
         } else {
             CUDA_CHECK(cudaFree(buffer_ptrs[nvl_rank]));
         }
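The free path in patch 016 leans on cuMemRetainAllocationHandle, which recovers the allocation handle behind a mapped pointer and adds a reference to it; teardown then has to undo the mapping and the VA reservation and drop that reference. One way to read the sequence (a sketch; whether the creation-time reference from cuMemCreate also needs a matching cuMemRelease depends on whether it was released right after mapping):

    // Teardown pattern for a VMM allocation, given only its pointer.
    void vmm_free(void* ptr) {
        CUmemGenericAllocationHandle h;
        cuMemRetainAllocationHandle(&h, ptr);              // +1 ref, names the allocation
        size_t size;
        cuMemGetAddressRange(nullptr, &size, (CUdeviceptr)ptr);
        cuMemUnmap((CUdeviceptr)ptr, size);                // remove the mapping
        cuMemAddressFree((CUdeviceptr)ptr, size);          // return the VA range
        cuMemRelease(h);                                   // drop the retained ref
    }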
From 7f3e4c088c268a700e928bf7a3edfd47f552d9c3 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 15:52:50 +0800
Subject: [PATCH 017/393] more

---
 csrc/deep_ep.cpp | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index cf5abd8f..0c7e63d4 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -64,19 +64,22 @@ namespace shared_memory {
         }
     }
 
-    void get_mem_handle(bool enable_fabric, MemHandle* handle, void* ptr) {
+    void get_mem_handle(bool enable_fabric, MemHandle* mem_handle, void* ptr) {
         if (enable_fabric) {
-            TODO;
+            CUmemGenericAllocationHandle handle;
+            CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr));
+
+            CU_CHECK(cuMemExportToShareableHandle(&mem_handle->cu_mem_fabric_handle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0));
         } else {
-            CUDA_CHECK(cudaIpcGetMemHandle(&handle->cuda_ipc_mem_handle, ptr));
+            CUDA_CHECK(cudaIpcGetMemHandle(&mem_handle->cuda_ipc_mem_handle, ptr));
         }
     }
 
-    void open_mem_handle(bool enable_fabric, void** ptr, MemHandle* handle) {
+    void open_mem_handle(bool enable_fabric, void** ptr, MemHandle* mem_handle) {
         if (enable_fabric) {
             TODO;
         } else {
-            CUDA_CHECK(cudaIpcOpenMemHandle(ptr, handle->cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess));
+            CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess));
         }
     }
 

From 92fb573703c5977c0c3af8adfd4f99d32aa927ba Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 15:54:01 +0800
Subject: [PATCH 018/393] more

---
 csrc/deep_ep.cpp | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index 0c7e63d4..e4b1d5e7 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -77,7 +77,24 @@ namespace shared_memory {
 
     void open_mem_handle(bool enable_fabric, void** ptr, MemHandle* mem_handle) {
         if (enable_fabric) {
-            TODO;
+            CUmemFabricHandle export_handle;
+            memcpy(&export_handle, output_buffer.data(), sizeof(export_handle));
+            void *shm_addr = nullptr;
+            CUmemGenericAllocationHandle handle;
+            CU_CHECK(cuMemImportFromShareableHandle(&handle, &export_handle, CU_MEM_HANDLE_TYPE_FABRIC));
+            CU_CHECK(cuMemAddressReserve((CUdeviceptr *)&shm_addr, entry.length, 0, 0, 0));
+            CU_CHECK(cuMemMap((CUdeviceptr)shm_addr, entry.length, 0, handle, 0));
+
+            int device_count;
+            CUDA_CHECK(cudaGetDeviceCount(&device_count));
+
+            CUmemAccessDesc accessDesc[device_count];
+            for (int device_id = 0; device_id < device_count; ++device_id) {
+                accessDesc[device_id].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+                accessDesc[device_id].location.id = device_id;
+                accessDesc[device_id].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+            }
+            CU_CHECK(cuMemSetAccess((CUdeviceptr)shm_addr, entry.length, accessDesc, device_count));
         } else {
             CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess));
         }

From 29f86f3537cf363207b5968b5f1a0db0c1b11314 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 15:55:06 +0800
Subject: [PATCH 019/393] more

---
 csrc/deep_ep.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index e4b1d5e7..dfbb103e 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -77,13 +77,12 @@ namespace shared_memory {
 
     void open_mem_handle(bool enable_fabric, void** ptr, MemHandle* mem_handle) {
         if (enable_fabric) {
-            CUmemFabricHandle export_handle;
-            memcpy(&export_handle, output_buffer.data(), sizeof(export_handle));
-            void *shm_addr = nullptr;
+            TODO_size;
+
             CUmemGenericAllocationHandle handle;
-            CU_CHECK(cuMemImportFromShareableHandle(&handle, &export_handle, CU_MEM_HANDLE_TYPE_FABRIC));
-            CU_CHECK(cuMemAddressReserve((CUdeviceptr *)&shm_addr, entry.length, 0, 0, 0));
-            CU_CHECK(cuMemMap((CUdeviceptr)shm_addr, entry.length, 0, handle, 0));
+            CU_CHECK(cuMemImportFromShareableHandle(&handle, &mem_handle->cu_mem_fabric_handle, CU_MEM_HANDLE_TYPE_FABRIC));
+            CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0));
+            CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
From 5557e70cb3562d31fd87c811d0c640e24aba6c18 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 15:56:07 +0800
Subject: [PATCH 020/393] more

---
 csrc/deep_ep.cpp | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index dfbb103e..6b530f4a 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -11,6 +11,20 @@
 #include "kernels/configs.cuh"
 
 namespace shared_memory {
+    void cu_mem_set_access_all(void* ptr, size_t size) {
+        int device_count;
+        CUDA_CHECK(cudaGetDeviceCount(&device_count));
+
+        CUmemAccessDesc accessDesc[device_count];
+        for (int idx = 0; idx < device_count; ++idx) {
+            accessDesc[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+            accessDesc[idx].location.id = idx;
+            accessDesc[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+        }
+
+        CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, accessDesc, device_count));
+    }
+
     void malloc(void** ptr, size_t size) {
         if (enable_fabric) {
             CUdevice device;
@@ -33,16 +47,7 @@ namespace shared_memory {
             CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0));
             CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
 
-            int device_count;
-            CUDA_CHECK(cudaGetDeviceCount(&device_count));
-
-            CUmemAccessDesc accessDesc[device_count];
-            for (int idx = 0; idx < device_count; ++idx) {
-                accessDesc[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-                accessDesc[idx].location.id = idx;
-                accessDesc[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
-            }
-            CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, accessDesc, device_count));
+            cu_mem_set_access_all(*ptr, size);
         } else {
             CUDA_CHECK(cudaMalloc(ptr, size));
         }
@@ -87,13 +92,7 @@ namespace shared_memory {
             CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0));
             CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
 
-            int device_count;
-            CUDA_CHECK(cudaGetDeviceCount(&device_count));
-
-            CUmemAccessDesc accessDesc[device_count];
-            for (int device_id = 0; device_id < device_count; ++device_id) {
-                accessDesc[device_id].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-                accessDesc[device_id].location.id = device_id;
-                accessDesc[device_id].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
-            }
-            CU_CHECK(cuMemSetAccess((CUdeviceptr)shm_addr, entry.length, accessDesc, device_count));
+            cu_mem_set_access_all(*ptr, size);
         } else {
             CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess));
         }

From 9fd34e757bc5c35aee509a173d7d5d008a789d2b Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 15:57:53 +0800
Subject: [PATCH 021/393] more

---
 csrc/deep_ep.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index 6b530f4a..a994f918 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -86,6 +86,7 @@ namespace shared_memory {
 
             CUmemGenericAllocationHandle handle;
             CU_CHECK(cuMemImportFromShareableHandle(&handle, &mem_handle->cu_mem_fabric_handle, CU_MEM_HANDLE_TYPE_FABRIC));
+
             CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0));
             CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
 
From 64173931ff7887bd3b40dcca5100e83798348f3d Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 15:58:39 +0800
Subject: [PATCH 022/393] more

---
 csrc/deep_ep.cpp | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index a994f918..bf576ded 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -101,7 +101,15 @@ namespace shared_memory {
 
     void close_mem_handle(bool enable_fabric, void* ptr) {
         if (enable_fabric) {
-            TODO;
+            CUmemGenericAllocationHandle handle;
+            CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr));
+
+            size_t size = 0;
+            CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
+
+            CU_CHECK(cuMemUnmap((CUdeviceptr)ptr, size));
+            CU_CHECK(cuMemAddressFree((CUdeviceptr)ptr, size));
+            CU_CHECK(cuMemRelease(handle));
         } else {
             CUDA_CHECK(cudaIpcCloseMemHandle(ptr));
         }

From faaeaadfdaf741f482eb5b3292782fe85c2feeb0 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 16:00:13 +0800
Subject: [PATCH 023/393] more

---
 csrc/deep_ep.cpp | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index bf576ded..ef4e940e 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -25,7 +25,7 @@ namespace shared_memory {
         CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, accessDesc, device_count));
     }
 
-    void malloc(void** ptr, size_t size) {
+    void malloc(bool enable_fabric, void** ptr, size_t size) {
         if (enable_fabric) {
             CUdevice device;
             CU_CHECK(cuCtxGetDevice(&device));
@@ -53,7 +53,7 @@ namespace shared_memory {
         }
     }
 
-    void free(void* ptr) {
+    void free(bool enable_fabric, void* ptr) {
         if (enable_fabric) {
@@ -101,15 +101,7 @@ namespace shared_memory {
     void close_mem_handle(bool enable_fabric, void* ptr) {
         if (enable_fabric) {
-            CUmemGenericAllocationHandle handle;
-            CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr));
-
-            size_t size = 0;
-            CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
-
-            CU_CHECK(cuMemUnmap((CUdeviceptr)ptr, size));
-            CU_CHECK(cuMemAddressFree((CUdeviceptr)ptr, size));
-            CU_CHECK(cuMemRelease(handle));
+            free(true, ptr);
         } else {
             CUDA_CHECK(cudaIpcCloseMemHandle(ptr));
         }
From c38dbeddd903279cf6235c76e524e134ff250a59 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 16:00:53 +0800
Subject: [PATCH 024/393] more

---
 csrc/deep_ep.cpp | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index ef4e940e..e0d6e047 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -25,6 +25,18 @@ namespace shared_memory {
         CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, accessDesc, device_count));
     }
 
+    void cu_mem_free(void* ptr) {
+        CUmemGenericAllocationHandle handle;
+        CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr));
+
+        size_t size = 0;
+        CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
+
+        CU_CHECK(cuMemUnmap((CUdeviceptr)ptr, size));
+        CU_CHECK(cuMemAddressFree((CUdeviceptr)ptr, size));
+        CU_CHECK(cuMemRelease(handle));
+    }
+
     void malloc(bool enable_fabric, void** ptr, size_t size) {
         if (enable_fabric) {
             CUdevice device;
@@ -55,17 +67,9 @@ namespace shared_memory {
     void free(bool enable_fabric, void* ptr) {
         if (enable_fabric) {
-            CUmemGenericAllocationHandle handle;
-            CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr));
-
-            size_t size = 0;
-            CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
-
-            CU_CHECK(cuMemUnmap((CUdeviceptr)ptr, size));
-            CU_CHECK(cuMemAddressFree((CUdeviceptr)ptr, size));
-            CU_CHECK(cuMemRelease(handle));
+            cu_mem_free(ptr);
         } else {
-            CUDA_CHECK(cudaFree(buffer_ptrs[nvl_rank]));
+            CUDA_CHECK(cudaFree(ptr));
         }
     }
@@ -101,7 +105,7 @@ namespace shared_memory {
     void close_mem_handle(bool enable_fabric, void* ptr) {
         if (enable_fabric) {
-            free(true, ptr);
+            cu_mem_free(ptr);
         } else {
             CUDA_CHECK(cudaIpcCloseMemHandle(ptr));
         }

From dc74c0a9f3428c52201ad14bcbbcd64e304d8669 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 16:01:24 +0800
Subject: [PATCH 025/393] more

---
 csrc/deep_ep.cpp | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index e0d6e047..a48a7e15 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -56,9 +56,9 @@ namespace shared_memory {
 
             CUmemGenericAllocationHandle handle;
             CU_CHECK(cuMemCreate(&handle, size, &prop, 0));
+
             CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0));
             CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
-
             cu_mem_set_access_all(*ptr, size);
         } else {
             CUDA_CHECK(cudaMalloc(ptr, size));
         }
@@ -93,10 +93,6 @@ namespace shared_memory {
 
             CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0));
             CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
-
-            int device_count;
-            CUDA_CHECK(cudaGetDeviceCount(&device_count));
-
             cu_mem_set_access_all(*ptr, size);
         } else {
             CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess));
         }

From 61dea30b60ac87409148213a96e3bce0d5c2ab26 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 16:06:00 +0800
Subject: [PATCH 026/393] more

---
 csrc/deep_ep.cpp | 12 +++++++-----
 csrc/deep_ep.hpp | 10 ++++++++--
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index a48a7e15..54b3a941 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -74,28 +74,30 @@ namespace shared_memory {
     }
 
     void get_mem_handle(bool enable_fabric, MemHandle* mem_handle, void* ptr) {
+        mem_handle->size = TODO;
+
         if (enable_fabric) {
             CUmemGenericAllocationHandle handle;
             CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr));
 
-            CU_CHECK(cuMemExportToShareableHandle(&mem_handle->cu_mem_fabric_handle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0));
+            CU_CHECK(cuMemExportToShareableHandle(&mem_handle->inner.cu_mem_fabric_handle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0));
         } else {
-            CUDA_CHECK(cudaIpcGetMemHandle(&mem_handle->cuda_ipc_mem_handle, ptr));
+            CUDA_CHECK(cudaIpcGetMemHandle(&mem_handle->inner.cuda_ipc_mem_handle, ptr));
         }
     }
 
     void open_mem_handle(bool enable_fabric, void** ptr, MemHandle* mem_handle) {
         if (enable_fabric) {
-            TODO_size;
+            size_t size = mem_handle->size;
 
             CUmemGenericAllocationHandle handle;
-            CU_CHECK(cuMemImportFromShareableHandle(&handle, &mem_handle->cu_mem_fabric_handle, CU_MEM_HANDLE_TYPE_FABRIC));
+            CU_CHECK(cuMemImportFromShareableHandle(&handle, &mem_handle->inner.cu_mem_fabric_handle, CU_MEM_HANDLE_TYPE_FABRIC));
 
             CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0));
             CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
             cu_mem_set_access_all(*ptr, size);
         } else {
-            CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess));
+            CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->inner.cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess));
         }
     }
diff --git a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp
index dbb4df72..1a015f5a 100644
--- a/csrc/deep_ep.hpp
+++ b/csrc/deep_ep.hpp
@@ -21,10 +21,16 @@
 #endif
 
 namespace shared_memory {
-typedef union {
+
+union MemHandleInner {
     cudaIpcMemHandle_t cuda_ipc_mem_handle;
     CUmemFabricHandle cu_mem_fabric_handle;
-} MemHandle;
+};
+
+struct MemHandle {
+    MemHandleInner inner;
+    size_t size;
+};
 
 constexpr size_t HANDLE_SIZE = sizeof(MemHandle);
 }
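The struct introduced in patch 026 marks the key asymmetry between the two transports: cudaIpcOpenMemHandle infers the mapping size by itself, but the cuMem* import path must be told how many bytes to reserve and map, and a CUmemFabricHandle carries no length. MemHandle therefore bundles the handle bytes with the allocation size, filled on export and consumed on import. A sketch of the import side under that layout (error macros as in the patches; simplified member names):

    struct MemHandle {
        union { cudaIpcMemHandle_t ipc; CUmemFabricHandle fabric; } inner;
        size_t size;  // filled by the exporting rank, consumed on import
    };

    void* import_fabric(MemHandle& mh) {
        CUmemGenericAllocationHandle handle;
        CU_CHECK(cuMemImportFromShareableHandle(&handle, &mh.inner.fabric, CU_MEM_HANDLE_TYPE_FABRIC));
        CUdeviceptr ptr;
        CU_CHECK(cuMemAddressReserve(&ptr, mh.size, 0, 0, 0));  // size must come from the handle
        CU_CHECK(cuMemMap(ptr, mh.size, 0, handle, 0));
        return reinterpret_cast<void*>(ptr);  // access flags still need cuMemSetAccess
    }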
From 7d4bc93e2d204cc1f16c10a1b67e594e8cb65491 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 16:07:00 +0800
Subject: [PATCH 027/393] more

---
 csrc/deep_ep.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index 54b3a941..cf350ce3 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -74,7 +74,10 @@ namespace shared_memory {
     }
 
     void get_mem_handle(bool enable_fabric, MemHandle* mem_handle, void* ptr) {
-        mem_handle->size = TODO;
+        size_t size = 0;
+        CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
+
+        mem_handle->size = size;
 
         if (enable_fabric) {
             CUmemGenericAllocationHandle handle;

From 5b78f2243341a6bd314bfd98faefdf8cbad3e31e Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 16:07:18 +0800
Subject: [PATCH 028/393] more

---
 csrc/deep_ep.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index cf350ce3..a16583b8 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -15,14 +15,14 @@ namespace shared_memory {
         int device_count;
         CUDA_CHECK(cudaGetDeviceCount(&device_count));
 
-        CUmemAccessDesc accessDesc[device_count];
+        CUmemAccessDesc access_desc[device_count];
         for (int idx = 0; idx < device_count; ++idx) {
-            accessDesc[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-            accessDesc[idx].location.id = idx;
-            accessDesc[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+            access_desc[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+            access_desc[idx].location.id = idx;
+            access_desc[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
         }
 
-        CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, accessDesc, device_count));
+        CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, access_desc, device_count));
     }

From 75351cd380ba45cff664683ee3312740a01968ad Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 16:08:34 +0800
Subject: [PATCH 029/393] more

---
 csrc/deep_ep.cpp | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index a16583b8..991c94b0 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -37,7 +37,16 @@ namespace shared_memory {
         CU_CHECK(cuMemRelease(handle));
     }
 
-    void malloc(bool enable_fabric, void** ptr, size_t size) {
+    size_t get_size_align_to_granularity(size_t size_raw) {
+        size_t granularity = 0;
+        CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
+
+        size_t size = (size_raw + granularity - 1) & ~(granularity - 1);
+        if (size == 0) size = granularity;
+        return size;
+    }
+
+    void malloc(bool enable_fabric, void** ptr, size_t size_raw) {
         if (enable_fabric) {
             CUdevice device;
             CU_CHECK(cuCtxGetDevice(&device));
@@ -47,12 +56,8 @@ namespace shared_memory {
             prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
             prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
             prop.location.id = device;
-
-            size_t granularity = 0;
-            CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
-
-            size = (size + granularity - 1) & ~(granularity - 1);
-            if (size == 0) size = granularity;
+
+            size_t size = get_size_align_to_granularity(size_raw, prop);
 
             CUmemGenericAllocationHandle handle;
             CU_CHECK(cuMemCreate(&handle, size, &prop, 0));
From 7bb12d4e66569af1f8d0a6c37840e139cd8570b0 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 16:08:49 +0800
Subject: [PATCH 030/393] more

---
 csrc/deep_ep.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index 991c94b0..fe099af2 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -37,7 +37,7 @@ namespace shared_memory {
         CU_CHECK(cuMemRelease(handle));
     }
 
-    size_t get_size_align_to_granularity(size_t size_raw) {
+    size_t get_size_align_to_granularity(size_t size_raw, CUmemAllocationProp& prop) {
         size_t granularity = 0;
         CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));

From 0e5a15509dceb76f82feea6aac775111cc28b4c5 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 16:11:56 +0800
Subject: [PATCH 031/393] more

---
 csrc/deep_ep.cpp | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index fe099af2..42537a2a 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -46,6 +46,21 @@ namespace shared_memory {
         return size;
     }
 
+    bool support_fabric() {
+        int device_count;
+        CUDA_CHECK(cudaGetDeviceCount(&device_count));
+
+        for (int device = 0; device < device_count; ++device) {
+            int support = 0;
+            CU_CHECK(cuDeviceGetAttribute(&support, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device));
+            if (!support) {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
     void malloc(bool enable_fabric, void** ptr, size_t size_raw) {
         if (enable_fabric) {
             CUdevice device;
@@ -56,7 +71,7 @@ namespace shared_memory {
             prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
             prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
             prop.location.id = device;
-            
+
             size_t size = get_size_align_to_granularity(size_raw, prop);
 
             CUmemGenericAllocationHandle handle;
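support_fabric() (patch 031) only enables the fabric path when every visible device reports the capability, since one allocator instance serves buffers that any local GPU may map. The attribute probe is cheap and usable standalone; note it reports driver/GPU capability only, so cuMemCreate with CU_MEM_HANDLE_TYPE_FABRIC can still fail at runtime (for example when IMEX channels are not configured), an assumption worth validating with a trial allocation on real deployments:

    int support = 0;
    CU_CHECK(cuDeviceGetAttribute(&support,
                                  CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED,
                                  /*device=*/0));
    // support == 1: the driver can export CU_MEM_HANDLE_TYPE_FABRIC handles
    // for device 0; it does not guarantee that a fabric allocation succeeds.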
From 87b398034dee42cde66fa362e70f992f2ca08ffa Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 16:13:52 +0800
Subject: [PATCH 032/393] more

---
 csrc/deep_ep.cpp | 10 +++++-----
 csrc/deep_ep.hpp | 5 +++++
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index 42537a2a..0b2cbbe1 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -61,7 +61,7 @@ namespace shared_memory {
         return true;
     }
 
-    void malloc(bool enable_fabric, void** ptr, size_t size_raw) {
+    void malloc(void** ptr, size_t size_raw) {
         if (enable_fabric) {
             CUdevice device;
             CU_CHECK(cuCtxGetDevice(&device));
@@ -85,7 +85,7 @@ namespace shared_memory {
         }
     }
 
-    void free(bool enable_fabric, void* ptr) {
+    void free(void* ptr) {
         if (enable_fabric) {
             cu_mem_free(ptr);
         } else {
@@ -93,7 +93,7 @@ namespace shared_memory {
         }
     }
 
-    void get_mem_handle(bool enable_fabric, MemHandle* mem_handle, void* ptr) {
+    void get_mem_handle(MemHandle* mem_handle, void* ptr) {
         size_t size = 0;
         CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
 
@@ -109,7 +109,7 @@ namespace shared_memory {
         }
     }
 
-    void open_mem_handle(bool enable_fabric, void** ptr, MemHandle* mem_handle) {
+    void open_mem_handle(void** ptr, MemHandle* mem_handle) {
         if (enable_fabric) {
             size_t size = mem_handle->size;
 
@@ -124,7 +124,7 @@ namespace shared_memory {
         }
     }
 
-    void close_mem_handle(bool enable_fabric, void* ptr) {
+    void close_mem_handle(void* ptr) {
         if (enable_fabric) {
             cu_mem_free(ptr);
         } else {
diff --git a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp
index 1a015f5a..017c465a 100644
--- a/csrc/deep_ep.hpp
+++ b/csrc/deep_ep.hpp
@@ -33,6 +33,11 @@ struct MemHandle {
 };
 
 constexpr size_t HANDLE_SIZE = sizeof(MemHandle);
+
+class SharedMemoryAllocator {
+private:
+    bool enable_fabric;
+};
 }
 
 namespace deep_ep {
From 4398b5ce9737b0f2e2d92f9897a981f5567e5a6d Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 16:14:21 +0800
Subject: [PATCH 033/393] more

---
 csrc/deep_ep.cpp | 189 ++++++++++++++++++++++++-----------------------
 csrc/deep_ep.hpp |   4 -
 2 files changed, 98 insertions(+), 95 deletions(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index 0b2cbbe1..bae8a622 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -11,127 +11,134 @@
 #include "kernels/configs.cuh"
 
 namespace shared_memory {
-    void cu_mem_set_access_all(void* ptr, size_t size) {
-        int device_count;
-        CUDA_CHECK(cudaGetDeviceCount(&device_count));
-
-        CUmemAccessDesc access_desc[device_count];
-        for (int idx = 0; idx < device_count; ++idx) {
-            access_desc[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-            access_desc[idx].location.id = idx;
-            access_desc[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
-        }
-
-        CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, access_desc, device_count));
-    }
-
-    void cu_mem_free(void* ptr) {
-        CUmemGenericAllocationHandle handle;
-        CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr));
-
-        size_t size = 0;
-        CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
-
-        CU_CHECK(cuMemUnmap((CUdeviceptr)ptr, size));
-        CU_CHECK(cuMemAddressFree((CUdeviceptr)ptr, size));
-        CU_CHECK(cuMemRelease(handle));
-    }
-
-    size_t get_size_align_to_granularity(size_t size_raw, CUmemAllocationProp& prop) {
-        size_t granularity = 0;
-        CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
-
-        size_t size = (size_raw + granularity - 1) & ~(granularity - 1);
-        if (size == 0) size = granularity;
-        return size;
-    }
-
-    bool support_fabric() {
-        int device_count;
-        CUDA_CHECK(cudaGetDeviceCount(&device_count));
-
-        for (int device = 0; device < device_count; ++device) {
-            int support = 0;
-            CU_CHECK(cuDeviceGetAttribute(&support, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device));
-            if (!support) {
-                return false;
-            }
-        }
-
-        return true;
-    }
-
-    void malloc(void** ptr, size_t size_raw) {
-        if (enable_fabric) {
-            CUdevice device;
-            CU_CHECK(cuCtxGetDevice(&device));
-
-            CUmemAllocationProp prop = {};
-            prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
-            prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-            prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
-            prop.location.id = device;
-
-            size_t size = get_size_align_to_granularity(size_raw, prop);
-
-            CUmemGenericAllocationHandle handle;
-            CU_CHECK(cuMemCreate(&handle, size, &prop, 0));
-
-            CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0));
-            CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
-            cu_mem_set_access_all(*ptr, size);
-        } else {
-            CUDA_CHECK(cudaMalloc(ptr, size_raw));
-        }
-    }
-
-    void free(void* ptr) {
-        if (enable_fabric) {
-            cu_mem_free(ptr);
-        } else {
-            CUDA_CHECK(cudaFree(ptr));
-        }
-    }
-
-    void get_mem_handle(MemHandle* mem_handle, void* ptr) {
-        size_t size = 0;
-        CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
-
-        mem_handle->size = size;
-
-        if (enable_fabric) {
-            CUmemGenericAllocationHandle handle;
-            CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr));
-
-            CU_CHECK(cuMemExportToShareableHandle(&mem_handle->inner.cu_mem_fabric_handle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0));
-        } else {
-            CUDA_CHECK(cudaIpcGetMemHandle(&mem_handle->inner.cuda_ipc_mem_handle, ptr));
-        }
-    }
-
-    void open_mem_handle(void** ptr, MemHandle* mem_handle) {
-        if (enable_fabric) {
-            size_t size = mem_handle->size;
-
-            CUmemGenericAllocationHandle handle;
-            CU_CHECK(cuMemImportFromShareableHandle(&handle, &mem_handle->inner.cu_mem_fabric_handle, CU_MEM_HANDLE_TYPE_FABRIC));
-
-            CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0));
-            CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
-            cu_mem_set_access_all(*ptr, size);
-        } else {
-            CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->inner.cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess));
-        }
-    }
-
-    void close_mem_handle(void* ptr) {
-        if (enable_fabric) {
-            cu_mem_free(ptr);
-        } else {
-            CUDA_CHECK(cudaIpcCloseMemHandle(ptr));
-        }
-    }
+void cu_mem_set_access_all(void* ptr, size_t size) {
+    int device_count;
+    CUDA_CHECK(cudaGetDeviceCount(&device_count));
+
+    CUmemAccessDesc access_desc[device_count];
+    for (int idx = 0; idx < device_count; ++idx) {
+        access_desc[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+        access_desc[idx].location.id = idx;
+        access_desc[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+    }
+
+    CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, access_desc, device_count));
+}
+
+void cu_mem_free(void* ptr) {
+    CUmemGenericAllocationHandle handle;
+    CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr));
+
+    size_t size = 0;
+    CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
+
+    CU_CHECK(cuMemUnmap((CUdeviceptr)ptr, size));
+    CU_CHECK(cuMemAddressFree((CUdeviceptr)ptr, size));
+    CU_CHECK(cuMemRelease(handle));
+}
+
+size_t get_size_align_to_granularity(size_t size_raw, CUmemAllocationProp& prop) {
+    size_t granularity = 0;
+    CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
+
+    size_t size = (size_raw + granularity - 1) & ~(granularity - 1);
+    if (size == 0) size = granularity;
+    return size;
+}
+
+bool support_fabric() {
+    int device_count;
+    CUDA_CHECK(cudaGetDeviceCount(&device_count));
+
+    for (int device = 0; device < device_count; ++device) {
+        int support = 0;
+        CU_CHECK(cuDeviceGetAttribute(&support, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device));
+        if (!support) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+class SharedMemoryAllocator {
+public:
+    void malloc(void** ptr, size_t size_raw);
+private:
+    bool enable_fabric;
+};
+
+void malloc(void** ptr, size_t size_raw) {
+    if (enable_fabric) {
+        CUdevice device;
+        CU_CHECK(cuCtxGetDevice(&device));
+
+        CUmemAllocationProp prop = {};
+        prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+        prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+        prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
+        prop.location.id = device;
+
+        size_t size = get_size_align_to_granularity(size_raw, prop);
+
+        CUmemGenericAllocationHandle handle;
+        CU_CHECK(cuMemCreate(&handle, size, &prop, 0));
+
+        CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0));
+        CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
+        cu_mem_set_access_all(*ptr, size);
+    } else {
+        CUDA_CHECK(cudaMalloc(ptr, size_raw));
+    }
+}
+
+void free(void* ptr) {
+    if (enable_fabric) {
+        cu_mem_free(ptr);
+    } else {
+        CUDA_CHECK(cudaFree(ptr));
+    }
+}
+
+void get_mem_handle(MemHandle* mem_handle, void* ptr) {
+    size_t size = 0;
+    CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
+
+    mem_handle->size = size;
+
+    if (enable_fabric) {
+        CUmemGenericAllocationHandle handle;
+        CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr));
+
+        CU_CHECK(cuMemExportToShareableHandle(&mem_handle->inner.cu_mem_fabric_handle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0));
+    } else {
+        CUDA_CHECK(cudaIpcGetMemHandle(&mem_handle->inner.cuda_ipc_mem_handle, ptr));
+    }
+}
+
+void open_mem_handle(void** ptr, MemHandle* mem_handle) {
+    if (enable_fabric) {
+        size_t size = mem_handle->size;
+
+        CUmemGenericAllocationHandle handle;
+        CU_CHECK(cuMemImportFromShareableHandle(&handle, &mem_handle->inner.cu_mem_fabric_handle, CU_MEM_HANDLE_TYPE_FABRIC));
+
+        CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0));
+        CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
+        cu_mem_set_access_all(*ptr, size);
+    } else {
+        CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->inner.cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess));
+    }
+}
+
+void close_mem_handle(void* ptr) {
+    if (enable_fabric) {
+        cu_mem_free(ptr);
+    } else {
+        CUDA_CHECK(cudaIpcCloseMemHandle(ptr));
+    }
+}
 }
 
 namespace deep_ep {
diff --git a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp
index 017c465a..14bbaf7a 100644
--- a/csrc/deep_ep.hpp
+++ b/csrc/deep_ep.hpp
@@ -34,10 +34,6 @@ struct MemHandle {
 
 constexpr size_t HANDLE_SIZE = sizeof(MemHandle);
 
-class SharedMemoryAllocator {
-private:
-    bool enable_fabric;
-};
 }
 
 namespace deep_ep {
From d7e9ce380c689a5ca42de187a6cdf6633fdf5d5a Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 16:16:08 +0800
Subject: [PATCH 034/393] more

---
 csrc/deep_ep.cpp | 189 +++++++++++++++++++++++------------------------
 csrc/deep_ep.hpp |  10 +++
 2 files changed, 101 insertions(+), 98 deletions(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index bae8a622..a2650617 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -11,134 +11,127 @@
 #include "kernels/configs.cuh"
 
 namespace shared_memory {
-void cu_mem_set_access_all(void* ptr, size_t size) {
-    int device_count;
-    CUDA_CHECK(cudaGetDeviceCount(&device_count));
-
-    CUmemAccessDesc access_desc[device_count];
-    for (int idx = 0; idx < device_count; ++idx) {
-        access_desc[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-        access_desc[idx].location.id = idx;
-        access_desc[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
-    }
-
-    CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, access_desc, device_count));
-}
-
-void cu_mem_free(void* ptr) {
-    CUmemGenericAllocationHandle handle;
-    CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr));
-
-    size_t size = 0;
-    CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
-
-    CU_CHECK(cuMemUnmap((CUdeviceptr)ptr, size));
-    CU_CHECK(cuMemAddressFree((CUdeviceptr)ptr, size));
-    CU_CHECK(cuMemRelease(handle));
-}
-
-size_t get_size_align_to_granularity(size_t size_raw, CUmemAllocationProp& prop) {
-    size_t granularity = 0;
-    CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
-
-    size_t size = (size_raw + granularity - 1) & ~(granularity - 1);
-    if (size == 0) size = granularity;
-    return size;
-}
-
-bool support_fabric() {
-    int device_count;
-    CUDA_CHECK(cudaGetDeviceCount(&device_count));
-
-    for (int device = 0; device < device_count; ++device) {
-        int support = 0;
-        CU_CHECK(cuDeviceGetAttribute(&support, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device));
-        if (!support) {
-            return false;
-        }
-    }
-
-    return true;
-}
-
-class SharedMemoryAllocator {
-public:
-    void malloc(void** ptr, size_t size_raw);
-private:
-    bool enable_fabric;
-};
-
-void malloc(void** ptr, size_t size_raw) {
-    if (enable_fabric) {
-        CUdevice device;
-        CU_CHECK(cuCtxGetDevice(&device));
-
-        CUmemAllocationProp prop = {};
-        prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
-        prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-        prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
-        prop.location.id = device;
-
-        size_t size = get_size_align_to_granularity(size_raw, prop);
-
-        CUmemGenericAllocationHandle handle;
-        CU_CHECK(cuMemCreate(&handle, size, &prop, 0));
-
-        CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0));
-        CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
-        cu_mem_set_access_all(*ptr, size);
-    } else {
-        CUDA_CHECK(cudaMalloc(ptr, size_raw));
-    }
-}
-
-void free(void* ptr) {
-    if (enable_fabric) {
-        cu_mem_free(ptr);
-    } else {
-        CUDA_CHECK(cudaFree(ptr));
-    }
-}
-
-void get_mem_handle(MemHandle* mem_handle, void* ptr) {
-    size_t size = 0;
-    CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
-
-    mem_handle->size = size;
-
-    if (enable_fabric) {
-        CUmemGenericAllocationHandle handle;
-        CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr));
-
-        CU_CHECK(cuMemExportToShareableHandle(&mem_handle->inner.cu_mem_fabric_handle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0));
-    } else {
-        CUDA_CHECK(cudaIpcGetMemHandle(&mem_handle->inner.cuda_ipc_mem_handle, ptr));
-    }
-}
-
-void open_mem_handle(void** ptr, MemHandle* mem_handle) {
-    if (enable_fabric) {
-        size_t size = mem_handle->size;
-
-        CUmemGenericAllocationHandle handle;
-        CU_CHECK(cuMemImportFromShareableHandle(&handle, &mem_handle->inner.cu_mem_fabric_handle, CU_MEM_HANDLE_TYPE_FABRIC));
-
-        CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0));
-        CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
-        cu_mem_set_access_all(*ptr, size);
-    } else {
-        CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->inner.cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess));
-    }
-}
-
-void close_mem_handle(void* ptr) {
-    if (enable_fabric) {
-        cu_mem_free(ptr);
-    } else {
-        CUDA_CHECK(cudaIpcCloseMemHandle(ptr));
-    }
-}
+    void cu_mem_set_access_all(void* ptr, size_t size) {
+        int device_count;
+        CUDA_CHECK(cudaGetDeviceCount(&device_count));
+
+        CUmemAccessDesc access_desc[device_count];
+        for (int idx = 0; idx < device_count; ++idx) {
+            access_desc[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+            access_desc[idx].location.id = idx;
+            access_desc[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+        }
+
+        CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, access_desc, device_count));
+    }
+
+    void cu_mem_free(void* ptr) {
+        CUmemGenericAllocationHandle handle;
+        CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr));
+
+        size_t size = 0;
+        CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
+
+        CU_CHECK(cuMemUnmap((CUdeviceptr)ptr, size));
+        CU_CHECK(cuMemAddressFree((CUdeviceptr)ptr, size));
+        CU_CHECK(cuMemRelease(handle));
+    }
+
+    size_t get_size_align_to_granularity(size_t size_raw, CUmemAllocationProp& prop) {
+        size_t granularity = 0;
+        CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
+
+        size_t size = (size_raw + granularity - 1) & ~(granularity - 1);
+        if (size == 0) size = granularity;
+        return size;
+    }
+
+    bool support_fabric() {
+        int device_count;
+        CUDA_CHECK(cudaGetDeviceCount(&device_count));
+
+        for (int device = 0; device < device_count; ++device) {
+            int support = 0;
+            CU_CHECK(cuDeviceGetAttribute(&support, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device));
+            if (!support) {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+    void SharedMemoryAllocator::malloc(void** ptr, size_t size_raw) {
+        if (enable_fabric) {
+            CUdevice device;
+            CU_CHECK(cuCtxGetDevice(&device));
+
+            CUmemAllocationProp prop = {};
+            prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+            prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+            prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
+            prop.location.id = device;
+
+            size_t size = get_size_align_to_granularity(size_raw, prop);
+
+            CUmemGenericAllocationHandle handle;
+            CU_CHECK(cuMemCreate(&handle, size, &prop, 0));
+
+            CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0));
+            CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
+            cu_mem_set_access_all(*ptr, size);
+        } else {
+            CUDA_CHECK(cudaMalloc(ptr, size_raw));
+        }
+    }
+
+    void SharedMemoryAllocator::free(void* ptr) {
+        if (enable_fabric) {
+            cu_mem_free(ptr);
+        } else {
+            CUDA_CHECK(cudaFree(ptr));
+        }
+    }
+
+    void SharedMemoryAllocator::get_mem_handle(MemHandle* mem_handle, void* ptr) {
+        size_t size = 0;
+        CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
+
+        mem_handle->size = size;
+
+        if (enable_fabric) {
+            CUmemGenericAllocationHandle handle;
+            CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr));
+
+            CU_CHECK(cuMemExportToShareableHandle(&mem_handle->inner.cu_mem_fabric_handle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0));
+        } else {
+            CUDA_CHECK(cudaIpcGetMemHandle(&mem_handle->inner.cuda_ipc_mem_handle, ptr));
+        }
+    }
+
+    void SharedMemoryAllocator::open_mem_handle(void** ptr, MemHandle* mem_handle) {
+        if (enable_fabric) {
+            size_t size = mem_handle->size;
+
+            CUmemGenericAllocationHandle handle;
+            CU_CHECK(cuMemImportFromShareableHandle(&handle, &mem_handle->inner.cu_mem_fabric_handle, CU_MEM_HANDLE_TYPE_FABRIC));
+
+            CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0));
+            CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
+            cu_mem_set_access_all(*ptr, size);
+        } else {
+            CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->inner.cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess));
+        }
+    }
+
+    void SharedMemoryAllocator::close_mem_handle(void* ptr) {
+        if (enable_fabric) {
+            cu_mem_free(ptr);
+        } else {
+            CUDA_CHECK(cudaIpcCloseMemHandle(ptr));
+        }
+    }
 }
 
 namespace deep_ep {
diff --git a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp
index 14bbaf7a..5df87429 100644
--- a/csrc/deep_ep.hpp
+++ b/csrc/deep_ep.hpp
@@ -34,6 +34,16 @@ struct MemHandle {
 
 constexpr size_t HANDLE_SIZE = sizeof(MemHandle);
 
+class SharedMemoryAllocator {
+public:
+    void malloc(void** ptr, size_t size);
+    void free(void* ptr);
+    void get_mem_handle(MemHandle* mem_handle, void* ptr);
+    void open_mem_handle(void** ptr, MemHandle* mem_handle);
+    void close_mem_handle(void* ptr);
+private:
+    bool enable_fabric;
+};
 }
 
 namespace deep_ep {

From 5b83cb85878f39b897432f9b38184bcd17a36f9d Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 16:16:41 +0800
Subject: [PATCH 035/393] more

---
 csrc/deep_ep.cpp | 2 ++
 csrc/deep_ep.hpp | 1 +
 2 files changed, 3 insertions(+)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index a2650617..6e2c084f 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -60,6 +60,8 @@ namespace shared_memory {
 
         return true;
     }
+
+    SharedMemoryAllocator::SharedMemoryAllocator() : enable_fabric(support_fabric()) {}
 
     void SharedMemoryAllocator::malloc(void** ptr, size_t size_raw) {
         if (enable_fabric) {
diff --git a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp
index 5df87429..a8c73f75 100644
--- a/csrc/deep_ep.hpp
+++ b/csrc/deep_ep.hpp
@@ -36,6 +36,7 @@ constexpr size_t HANDLE_SIZE = sizeof(MemHandle);
 
 class SharedMemoryAllocator {
 public:
+    SharedMemoryAllocator();
     void malloc(void** ptr, size_t size);
     void free(void* ptr);
     void get_mem_handle(MemHandle* mem_handle, void* ptr);
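With the constructor in place the allocator is functionally complete: construction picks the transport once, and the five methods mirror the cudaMalloc/cudaIpc* lifecycle on either path. An end-to-end sketch of the intended two-rank flow (names from the patches; the byte transport between ranks is elided):

    using namespace shared_memory;

    SharedMemoryAllocator alloc;            // probes fabric support once

    // Rank A: allocate and export.
    void* buf = nullptr;
    alloc.malloc(&buf, 1 << 20);
    MemHandle handle;
    alloc.get_mem_handle(&handle, buf);
    // ... send sizeof(MemHandle) bytes of `handle` to rank B ...

    // Rank B: import, use, and close.
    void* peer = nullptr;
    alloc.open_mem_handle(&peer, &handle);
    // ... kernels on rank B may now read/write rank A's buffer via `peer` ...
    alloc.close_mem_handle(peer);

    // Rank A, at teardown:
    alloc.free(buf);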
From f024df5938cb2f81060974cd4ef238b2e596f1bf Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 16:16:45 +0800
Subject: [PATCH 036/393] more

---
 csrc/deep_ep.cpp | 186 +++++++++++++++++++++++------------------------
 1 file changed, 93 insertions(+), 93 deletions(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index 6e2c084f..6d6deda4 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -11,129 +11,129 @@
 #include "kernels/configs.cuh"
 
 namespace shared_memory {
-    void cu_mem_set_access_all(void* ptr, size_t size) {
-        int device_count;
-        CUDA_CHECK(cudaGetDeviceCount(&device_count));
-
-        CUmemAccessDesc access_desc[device_count];
-        for (int idx = 0; idx < device_count; ++idx) {
-            access_desc[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-            access_desc[idx].location.id = idx;
-            access_desc[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
-        }
-
-        CU_CHECK(cuMemSetAccess((CUdeviceptr)ptr, size, access_desc, device_count));
-    }
-
-    void cu_mem_free(void* ptr) {
-        CUmemGenericAllocationHandle handle;
-        CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr));
-
-        size_t size = 0;
-        CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
-
-        CU_CHECK(cuMemUnmap((CUdeviceptr)ptr, size));
-        CU_CHECK(cuMemAddressFree((CUdeviceptr)ptr, size));
-        CU_CHECK(cuMemRelease(handle));
-    }
-
-    size_t get_size_align_to_granularity(size_t size_raw, CUmemAllocationProp& prop) {
-        size_t granularity = 0;
-        CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
-
-        size_t size = (size_raw + granularity - 1) & ~(granularity - 1);
-        if (size == 0) size = granularity;
-        return size;
-    }
-
-    bool support_fabric() {
-        int device_count;
-        CUDA_CHECK(cudaGetDeviceCount(&device_count));
-
-        for (int device = 0; device < device_count; ++device) {
-            int support = 0;
-            CU_CHECK(cuDeviceGetAttribute(&support, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device));
-            if (!support) {
-                return false;
-            }
-        }
-
-        return true;
-    }
-
-    SharedMemoryAllocator::SharedMemoryAllocator() : enable_fabric(support_fabric()) {}
-
-    void SharedMemoryAllocator::malloc(void** ptr, size_t size_raw) {
-        if (enable_fabric) {
-            CUdevice device;
-            CU_CHECK(cuCtxGetDevice(&device));
-
-            CUmemAllocationProp prop = {};
-            prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
-            prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-            prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
-            prop.location.id = device;
-
-            size_t size = get_size_align_to_granularity(size_raw, prop);
-
-            CUmemGenericAllocationHandle handle;
-            CU_CHECK(cuMemCreate(&handle, size, &prop, 0));
-
-            CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0));
-            CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
-            cu_mem_set_access_all(*ptr, size);
-        } else {
-            CUDA_CHECK(cudaMalloc(ptr, size_raw));
-        }
-    }
-
-    void SharedMemoryAllocator::free(void* ptr) {
-        if (enable_fabric) {
-            cu_mem_free(ptr);
-        } else {
-            CUDA_CHECK(cudaFree(ptr));
-        }
-    }
-
-    void SharedMemoryAllocator::get_mem_handle(MemHandle* mem_handle, void* ptr) {
-        size_t size = 0;
-        CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr));
-
-        mem_handle->size = size;
-
-        if (enable_fabric) {
-            CUmemGenericAllocationHandle handle;
-            CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr));
-
-            CU_CHECK(cuMemExportToShareableHandle(&mem_handle->inner.cu_mem_fabric_handle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0));
-        } else {
-            CUDA_CHECK(cudaIpcGetMemHandle(&mem_handle->inner.cuda_ipc_mem_handle, ptr));
-        }
-    }
-
-    void SharedMemoryAllocator::open_mem_handle(void** ptr, MemHandle* mem_handle) {
-        if (enable_fabric) {
-            size_t size = mem_handle->size;
-
-            CUmemGenericAllocationHandle handle;
-            CU_CHECK(cuMemImportFromShareableHandle(&handle, &mem_handle->inner.cu_mem_fabric_handle, CU_MEM_HANDLE_TYPE_FABRIC));
-
-            CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0));
-            CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
-            cu_mem_set_access_all(*ptr, size);
-        } else {
-            CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->inner.cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess));
-        }
-    }
-
-    void SharedMemoryAllocator::close_mem_handle(void* ptr) {
-        if (enable_fabric) {
-            cu_mem_free(ptr);
-        } else {
-            CUDA_CHECK(cudaIpcCloseMemHandle(ptr));
-        }
-    }
CU_CHECK(cuDeviceGetAttribute(&support, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device)); - if (!support) { - return false; - } - } +bool support_fabric() { + int device_count; + CUDA_CHECK(cudaGetDeviceCount(&device_count)); - return true; + for (int device = 0; device < device_count; ++device) { + int support = 0; + CU_CHECK(cuDeviceGetAttribute(&support, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device)); + if (!support) { + return false; + } } - - SharedMemoryAllocator::SharedMemoryAllocator() : enable_fabric(support_fabric()) {} - void SharedMemoryAllocator::malloc(void** ptr, size_t size_raw) { - if (enable_fabric) { - CUdevice device; - CURESULT_CHECK(cuCtxGetDevice(&device)); + return true; +} - CUmemAllocationProp prop = {}; - prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; - prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; - prop.location.id = device; +SharedMemoryAllocator::SharedMemoryAllocator() : enable_fabric(support_fabric()) {} - size_t size = get_size_align_to_granularity(size_raw, prop); +void SharedMemoryAllocator::malloc(void** ptr, size_t size_raw) { + if (enable_fabric) { + CUdevice device; + CURESULT_CHECK(cuCtxGetDevice(&device)); - CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemCreate(&handle, size, &prop, 0)); + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; + prop.location.id = device; - CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0)); - CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); - cu_mem_set_access_all(*ptr, size); - } else { - CUDA_CHECK(cudaMalloc(ptr, size)); - } + size_t size = get_size_align_to_granularity(size_raw, prop); + + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemCreate(&handle, size, &prop, 0)); + + CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0)); + CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); + cu_mem_set_access_all(*ptr, size); + } else { + CUDA_CHECK(cudaMalloc(ptr, size)); } +} - void SharedMemoryAllocator::free(void* ptr) { - if (enable_fabric) { - cu_mem_free(ptr); - } else { - CUDA_CHECK(cudaFree(ptr)); - } +void SharedMemoryAllocator::free(void* ptr) { + if (enable_fabric) { + cu_mem_free(ptr); + } else { + CUDA_CHECK(cudaFree(ptr)); } +} - void SharedMemoryAllocator::get_mem_handle(MemHandle* mem_handle, void* ptr) { - size_t size = 0; - CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); +void SharedMemoryAllocator::get_mem_handle(MemHandle* mem_handle, void* ptr) { + size_t size = 0; + CU_CHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); - mem_handle->size = size; + mem_handle->size = size; - if (enable_fabric) { - CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); + if (enable_fabric) { + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemRetainAllocationHandle(&handle, ptr)); - CU_CHECK(cuMemExportToShareableHandle(&mem_handle->inner.cu_mem_fabric_handle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0)); - } else { - CUDA_CHECK(cudaIpcGetMemHandle(&mem_handle->inner.cuda_ipc_mem_handle, ptr)); - } + CU_CHECK(cuMemExportToShareableHandle(&mem_handle->inner.cu_mem_fabric_handle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0)); + } else { + CUDA_CHECK(cudaIpcGetMemHandle(&mem_handle->inner.cuda_ipc_mem_handle, ptr)); } +} - void SharedMemoryAllocator::open_mem_handle(void** ptr, MemHandle* 
mem_handle) { - if (enable_fabric) { - size_t size = mem_handle->size; +void SharedMemoryAllocator::open_mem_handle(void** ptr, MemHandle* mem_handle) { + if (enable_fabric) { + size_t size = mem_handle->size; - CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemImportFromShareableHandle(&handle, &mem_handle->inner.cu_mem_fabric_handle, CU_MEM_HANDLE_TYPE_FABRIC)); + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemImportFromShareableHandle(&handle, &mem_handle->inner.cu_mem_fabric_handle, CU_MEM_HANDLE_TYPE_FABRIC)); - CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0)); - CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); - cu_mem_set_access_all(*ptr, size); - } else { - CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->inner.cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess)); - } + CU_CHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, 0, 0, 0)); + CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); + cu_mem_set_access_all(*ptr, size); + } else { + CUDA_CHECK(cudaIpcOpenMemHandle(ptr, mem_handle->inner.cuda_ipc_mem_handle, cudaIpcMemLazyEnablePeerAccess)); } +} - void SharedMemoryAllocator::close_mem_handle(void* ptr) { - if (enable_fabric) { - cu_mem_free(ptr); - } else { - CUDA_CHECK(cudaIpcCloseMemHandle(ptr)); - } +void SharedMemoryAllocator::close_mem_handle(void* ptr) { + if (enable_fabric) { + cu_mem_free(ptr); + } else { + CUDA_CHECK(cudaIpcCloseMemHandle(ptr)); } } +} namespace deep_ep { From 5a7b2f2ab15ceb2492d9bd13eb6300a5fc71325e Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Tue, 17 Jun 2025 16:20:10 +0800 Subject: [PATCH 037/393] more --- csrc/deep_ep.cpp | 10 +++++----- csrc/deep_ep.hpp | 2 ++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp index 6d6deda4..8bdfc3e9 100644 --- a/csrc/deep_ep.cpp +++ b/csrc/deep_ep.cpp @@ -170,8 +170,8 @@ Buffer::Buffer(int rank, int num_ranks, int64_t num_nvl_bytes, int64_t num_rdma_ if (num_nvl_bytes > 0) { // Local IPC: alloc local memory and set local IPC handles - shared_memory::malloc(&buffer_ptrs[nvl_rank], num_nvl_bytes + barrier_signal_bytes + buffer_ptr_bytes + barrier_signal_ptr_bytes); - shared_memory::get_mem_handle(&ipc_handles[nvl_rank], buffer_ptrs[nvl_rank]); + shared_memory_allocator.malloc(&buffer_ptrs[nvl_rank], num_nvl_bytes + barrier_signal_bytes + buffer_ptr_bytes + barrier_signal_ptr_bytes); + shared_memory_allocator.get_mem_handle(&ipc_handles[nvl_rank], buffer_ptrs[nvl_rank]); buffer_ptrs_gpu = reinterpret_cast(static_cast(buffer_ptrs[nvl_rank]) + num_nvl_bytes + barrier_signal_bytes); // Set barrier signals @@ -217,11 +217,11 @@ Buffer::~Buffer() noexcept(false) { // Close remote IPC if (is_available()) { for (int i = 0; i < num_nvl_ranks; ++ i) if (i != nvl_rank) - shared_memory::close_mem_handle(buffer_ptrs[i]); + shared_memory_allocator.close_mem_handle(buffer_ptrs[i]); } // Free local buffer and error flag - shared_memory::free(buffer_ptrs[nvl_rank])); + shared_memory_allocator.free(buffer_ptrs[nvl_rank])); } // Free NVSHMEM @@ -304,7 +304,7 @@ void Buffer::sync(const std::vector &device_ids, EP_HOST_ASSERT(handle_str.size() == shared_memory::HANDLE_SIZE); if (offset + i != rank) { std::memcpy(ipc_handles[i], handle_str.c_str(), shared_memory::HANDLE_SIZE); - shared_memory::open_mem_handle(&buffer_ptrs[i], ipc_handles[i]); + shared_memory_allocator.open_mem_handle(&buffer_ptrs[i], ipc_handles[i]); barrier_signal_ptrs[i] = reinterpret_cast(static_cast(buffer_ptrs[i]) + num_nvl_bytes); } else { 
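With patch 037 the allocator becomes a `Buffer` member, and each rank's `MemHandle` travels through the existing handle all-gather as raw bytes. A hedged sketch of that serialization round-trip (`bytes` and `recovered` are illustrative names, not from the patch):

    // Sender side: view the POD handle as bytes, as get_local_ipc_handle does.
    shared_memory::MemHandle handle;                   // filled by get_mem_handle
    std::string bytes(reinterpret_cast<const char*>(&handle), shared_memory::HANDLE_SIZE);

    // Receiver side: copy the gathered bytes back into a MemHandle before opening it.
    shared_memory::MemHandle recovered;
    EP_HOST_ASSERT(bytes.size() == shared_memory::HANDLE_SIZE);
    std::memcpy(&recovered, bytes.data(), shared_memory::HANDLE_SIZE);

Note that the call sites above still pass `ipc_handles[i]` rather than `&ipc_handles[i]` to `open_mem_handle`; patches 044-045 below correct exactly that.
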
From 60523797d95a353b61754e885654115ff1123858 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 16:22:32 +0800
Subject: [PATCH 038/393] more

---
 csrc/kernels/exception.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/kernels/exception.cuh b/csrc/kernels/exception.cuh
index 9eeedadd..4be59122 100644
--- a/csrc/kernels/exception.cuh
+++ b/csrc/kernels/exception.cuh
@@ -36,7 +36,7 @@ do { \
 do { \
     CUresult e = (cmd); \
     if (e != CUDA_SUCCESS) { \
-        throw EPException("CUDA", __FILE__, __LINE__, std::string(e)); \
+        throw EPException("CU", __FILE__, __LINE__, cuGetErrorName(e)); \
     } \
 } while (0)
 #endif

From befcd27066f1524737892446900113344dd22dbd Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 16:22:50 +0800
Subject: [PATCH 039/393] more

---
 csrc/kernels/exception.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/kernels/exception.cuh b/csrc/kernels/exception.cuh
index 4be59122..80aae935 100644
--- a/csrc/kernels/exception.cuh
+++ b/csrc/kernels/exception.cuh
@@ -36,7 +36,7 @@ do { \
 do { \
     CUresult e = (cmd); \
     if (e != CUDA_SUCCESS) { \
-        throw EPException("CU", __FILE__, __LINE__, cuGetErrorName(e)); \
+        throw EPException("CU", __FILE__, __LINE__, cuGetErrorString(e)); \
     } \
 } while (0)
 #endif

From df598ea7ac8306ab8a80a5130133eab3650d6fd5 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 16:24:54 +0800
Subject: [PATCH 040/393] more

---
 csrc/kernels/exception.cuh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/csrc/kernels/exception.cuh b/csrc/kernels/exception.cuh
index 80aae935..3026374b 100644
--- a/csrc/kernels/exception.cuh
+++ b/csrc/kernels/exception.cuh
@@ -36,7 +36,9 @@ do { \
 do { \
     CUresult e = (cmd); \
     if (e != CUDA_SUCCESS) { \
-        throw EPException("CU", __FILE__, __LINE__, cuGetErrorString(e)); \
+        const char *error_str = NULL; \
+        cuGetErrorString(e, &error_str); \
+        throw EPException("CU", __FILE__, __LINE__, std::string(error_str)); \
     } \
 } while (0)
 #endif

From 5b23a8ad2190514697523f375d8e18b2571aff4b Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 16:25:15 +0800
Subject: [PATCH 041/393] more

---
 csrc/deep_ep.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp
index 8ceee158..9b99d5e8 100644
--- a/csrc/deep_ep.hpp
+++ b/csrc/deep_ep.hpp
@@ -32,7 +32,7 @@ struct MemHandle {
     size_t size;
 };
 
-constexpr usize_t HANDLE_SIZE = sizeof(MemHandle);
+constexpr size_t HANDLE_SIZE = sizeof(MemHandle);
 
 class SharedMemoryAllocator {
 public:
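Patches 038-040 converge on the driver API's actual error-string interface: unlike the runtime's `cudaGetErrorString(cudaError_t)`, which returns a `const char*` directly, `cuGetErrorString` takes an out-parameter and itself returns a `CUresult` (it can fail on an unknown code and leave the pointer untouched). A standalone illustration of the final macro's logic (the `check_cu` helper is hypothetical, not from the patch):

    #include <cuda.h>
    #include <cstdio>

    // Same shape as the fixed CU_CHECK: fetch the message via the out-parameter.
    static void check_cu(CUresult e, const char* file, int line) {
        if (e != CUDA_SUCCESS) {
            const char* error_str = NULL;
            cuGetErrorString(e, &error_str);  // may leave NULL for an invalid code
            std::fprintf(stderr, "CU error at %s:%d: %s\n", file, line,
                         error_str ? error_str : "unknown");
        }
    }
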
From 210e4997026bd372b69632f3589551a454aea81f Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 16:26:28 +0800
Subject: [PATCH 042/393] more

---
 csrc/deep_ep.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp
index 9b99d5e8..185fd33b 100644
--- a/csrc/deep_ep.hpp
+++ b/csrc/deep_ep.hpp
@@ -98,7 +98,7 @@ struct Buffer {
     volatile int* moe_recv_rdma_counter = nullptr;
     int* moe_recv_rdma_counter_mapped = nullptr;
 
-    SharedMemoryAllocator shared_memory_allocator;
+    shared_memory::SharedMemoryAllocator shared_memory_allocator;
 
 public:
     Buffer(int rank, int num_ranks, int64_t num_nvl_bytes, int64_t num_rdma_bytes, bool low_latency_mode);

From 379ac2447d8415679c493edd8d78855d8e742d5e Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 16:28:11 +0800
Subject: [PATCH 043/393] more

---
 csrc/deep_ep.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index 8bdfc3e9..0e725f88 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -37,10 +37,7 @@ void cu_mem_free(void* ptr) {
     CU_CHECK(cuMemRelease(handle));
 }
 
-void get_size_align_to_granularity(size_t size_raw, CUmemAllocationProp& prop) {
-    size_t granularity = 0;
-    CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
-
+size_t get_size_align_to_granularity(size_t size_raw, size_t granularity) {
     size_t size = (size_raw + granularity - 1) & ~(granularity - 1);
     if (size == 0) size = granularity;
     return size;
@@ -66,7 +63,7 @@ SharedMemoryAllocator::SharedMemoryAllocator() : enable_fabric(support_fabric())
 void SharedMemoryAllocator::malloc(void** ptr, size_t size_raw) {
     if (enable_fabric) {
         CUdevice device;
-        CURESULT_CHECK(cuCtxGetDevice(&device));
+        CU_CHECK(cuCtxGetDevice(&device));
 
         CUmemAllocationProp prop = {};
         prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
@@ -74,7 +71,10 @@ void SharedMemoryAllocator::malloc(void** ptr, size_t size_raw) {
         prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
         prop.location.id = device;
 
-        size_t size = get_size_align_to_granularity(size_raw, prop);
+        size_t granularity = 0;
+        CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
+
+        size_t size = get_size_align_to_granularity(size_raw, granularity);
 
         CUmemGenericAllocationHandle handle;
         CU_CHECK(cuMemCreate(&handle, size, &prop, 0));

From 43999dc0d16aaf1436b168ec10f803c4eeaa3142 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 16:29:02 +0800
Subject: [PATCH 044/393] more

---
 csrc/deep_ep.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index 0e725f88..40b3cd1e 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -303,11 +303,11 @@ void Buffer::sync(const std::vector<int> &device_ids,
             auto handle_str = std::string(all_gathered_handles[offset + i].value());
             EP_HOST_ASSERT(handle_str.size() == shared_memory::HANDLE_SIZE);
             if (offset + i != rank) {
-                std::memcpy(ipc_handles[i], handle_str.c_str(), shared_memory::HANDLE_SIZE);
+                std::memcpy(&ipc_handles[i], handle_str.c_str(), shared_memory::HANDLE_SIZE);
                 shared_memory_allocator.open_mem_handle(&buffer_ptrs[i], ipc_handles[i]);
                 barrier_signal_ptrs[i] = reinterpret_cast<int*>(static_cast<uint8_t*>(buffer_ptrs[i]) + num_nvl_bytes);
             } else {
-                EP_HOST_ASSERT(std::memcmp(ipc_handles[i], handle_str.c_str(), shared_memory::HANDLE_SIZE) == 0);
+                EP_HOST_ASSERT(std::memcmp(&ipc_handles[i], handle_str.c_str(), shared_memory::HANDLE_SIZE) == 0);
             }
         }
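The helper factored out in patch 043 is a standard power-of-two round-up: with a 2 MiB granularity, a 3 MiB request maps to 4 MiB, and a zero-byte request still reserves one granule. The mask trick assumes the granularity is a power of two, which is what `cuMemGetAllocationGranularity` reports in practice. A host-side check of the arithmetic (standalone, for illustration only):

    #include <cassert>
    #include <cstddef>

    static size_t get_size_align_to_granularity(size_t size_raw, size_t granularity) {
        size_t size = (size_raw + granularity - 1) & ~(granularity - 1);
        if (size == 0) size = granularity;
        return size;
    }

    int main() {
        const size_t granule = size_t(2) << 20;  // 2 MiB
        assert(get_size_align_to_granularity(size_t(3) << 20, granule) == size_t(4) << 20);
        assert(get_size_align_to_granularity(granule, granule) == granule);
        assert(get_size_align_to_granularity(0, granule) == granule);
        return 0;
    }
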
From 791601101bdfb82881b6e7446c17b6bbf9c28815 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 16:30:16 +0800
Subject: [PATCH 045/393] more

---
 csrc/deep_ep.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
index 40b3cd1e..d872c037 100644
--- a/csrc/deep_ep.cpp
+++ b/csrc/deep_ep.cpp
@@ -83,7 +83,7 @@ void SharedMemoryAllocator::malloc(void** ptr, size_t size_raw) {
         CU_CHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0));
         cu_mem_set_access_all(*ptr, size);
     } else {
-        CUDA_CHECK(cudaMalloc(ptr, size));
+        CUDA_CHECK(cudaMalloc(ptr, size_raw));
     }
 }
 
@@ -221,7 +221,7 @@ Buffer::~Buffer() noexcept(false) {
         }
 
         // Free local buffer and error flag
-        shared_memory_allocator.free(buffer_ptrs[nvl_rank]));
+        shared_memory_allocator.free(buffer_ptrs[nvl_rank]);
     }
 
     // Free NVSHMEM
@@ -304,7 +304,7 @@ void Buffer::sync(const std::vector<int> &device_ids,
             EP_HOST_ASSERT(handle_str.size() == shared_memory::HANDLE_SIZE);
             if (offset + i != rank) {
                 std::memcpy(&ipc_handles[i], handle_str.c_str(), shared_memory::HANDLE_SIZE);
-                shared_memory_allocator.open_mem_handle(&buffer_ptrs[i], ipc_handles[i]);
+                shared_memory_allocator.open_mem_handle(&buffer_ptrs[i], &ipc_handles[i]);
                 barrier_signal_ptrs[i] = reinterpret_cast<int*>(static_cast<uint8_t*>(buffer_ptrs[i]) + num_nvl_bytes);
             } else {
                 EP_HOST_ASSERT(std::memcmp(&ipc_handles[i], handle_str.c_str(), shared_memory::HANDLE_SIZE) == 0);

From 0525f8f79e8b96b8e934130d01bd853c3386b5c2 Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Tue, 17 Jun 2025 16:44:00 +0800
Subject: [PATCH 046/393] more

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index b16310a7..93294f74 100644
--- a/setup.py
+++ b/setup.py
@@ -19,7 +19,7 @@
     include_dirs = ['csrc/']
     library_dirs = []
     nvcc_dlink = []
-    extra_link_args = []
+    extra_link_args = ['-lcuda']
 
     # NVSHMEM flags
     if disable_nvshmem:

From 3b3e8ce72ab0d955ecb8be11d85d06d6cd4fa34d Mon Sep 17 00:00:00 2001
From: fzyzcjy
Date: Mon, 25 Aug 2025 19:27:49 +0800
Subject: [PATCH 047/393] wait_signal

---
 csrc/kernels/utils.cuh | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/csrc/kernels/utils.cuh b/csrc/kernels/utils.cuh
index 9ef4d7aa..e1a784e9 100644
--- a/csrc/kernels/utils.cuh
+++ b/csrc/kernels/utils.cuh
@@ -582,4 +582,16 @@ __forceinline__ __device__ T warp_reduce_or(T value) {
     return warp_reduce(value, ReduceOr{});
 }
 
+__device__ __forceinline__ void wait_signal(uint32_t* addr, uint32_t expect_value) {
+    uint32_t ready = *addr;
+    while (ready != expect_value) {
+        // TODO correct?
+        asm volatile("ld.acquire.gpu.global.u32 %0, [%1];"
+                     : "=r"(ready)
+                     : "l"(addr)
+                     : "memory");
+        asm volatile("nanosleep.u32 20;");
+    };
+}
+
 } // namespace deep_ep
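The `ld.acquire.gpu.global.u32` in `wait_signal` is a device-scope acquire load, so the `// TODO correct?` is really asking whether the inline PTX matches the intended memory-ordering semantics. For comparison, an equivalent spin-wait written with libcu++ atomics (a sketch only, not from the patch; `__nanosleep` needs sm_70+, and the patch's initial plain read is folded into the loop here):

    #include <cuda/atomic>

    // Device-scope acquire polling, matching wait_signal's intent.
    __device__ __forceinline__ void wait_signal_atomic(uint32_t* addr, uint32_t expect_value) {
        cuda::atomic_ref<uint32_t, cuda::thread_scope_device> ref(*addr);
        while (ref.load(cuda::memory_order_acquire) != expect_value)
            __nanosleep(20);  // back off ~20 ns between polls
    }
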
+ asm volatile("ld.acquire.gpu.global.u32 %0, [%1];" + : "=r"(ready) + : "l"(addr) + : "memory"); + asm volatile("nanosleep.u32 20;"); + }; +} + } // namespace deep_ep From 56b4d96f7a44cb5f3e12a6f1f14bb1cf5515ca0f Mon Sep 17 00:00:00 2001 From: fzyzcjy Date: Mon, 25 Aug 2025 19:29:40 +0800 Subject: [PATCH 048/393] copy v2 --- csrc/kernels/internode_ll.cu | 2 + csrc/kernels/internode_ll_v2.cuh | 799 +++++++++++++++++++++++++++++++ 2 files changed, 801 insertions(+) create mode 100644 csrc/kernels/internode_ll_v2.cuh diff --git a/csrc/kernels/internode_ll.cu b/csrc/kernels/internode_ll.cu index 9cd2bfb5..e499c472 100644 --- a/csrc/kernels/internode_ll.cu +++ b/csrc/kernels/internode_ll.cu @@ -3,6 +3,8 @@ #include "launch.cuh" #include "ibgda_device.cuh" +#include "internode_ll_v2.cuh" + namespace deep_ep { namespace internode_ll { diff --git a/csrc/kernels/internode_ll_v2.cuh b/csrc/kernels/internode_ll_v2.cuh new file mode 100644 index 00000000..28c301f6 --- /dev/null +++ b/csrc/kernels/internode_ll_v2.cuh @@ -0,0 +1,799 @@ +#include "configs.cuh" +#include "exception.cuh" +#include "launch.cuh" +#include "ibgda_device.cuh" + +namespace deep_ep { +namespace internode_ll { + +template +__global__ __launch_bounds__(1024, 1) void +dispatch_v2(void* packed_recv_x, void* packed_recv_x_scales, + int* packed_recv_src_info, int64_t* packed_recv_layout_range, + int* packed_recv_count, + int* cumulative_local_expert_recv_stats, + int64_t* dispatch_wait_recv_cost_stats, + void* rdma_recv_x, int* rdma_recv_count, void* rdma_x, + const void* x, const int64_t* topk_idx, + int* atomic_counter_per_expert, int* atomic_finish_counter_per_expert, + int* next_clean, int num_next_clean_int, + int num_tokens, int num_max_dispatch_tokens_per_rank, + int num_topk, int num_experts, int rank, int num_ranks, + int num_warp_groups, int num_warps_per_group, + bool round_scale, int phases) { + const auto sm_id = static_cast(blockIdx.x); + const auto thread_id = static_cast(threadIdx.x); + const auto warp_id = thread_id / 32, lane_id = get_lane_id(); + const auto num_sms = static_cast(gridDim.x); + const auto num_warps = num_warp_groups * num_warps_per_group; + const auto num_local_experts = num_experts / num_ranks; + const auto warp_group_id = warp_id / num_warps_per_group; + const auto sub_warp_id = warp_id % num_warps_per_group; + const auto responsible_expert_idx = sm_id * num_warp_groups + warp_group_id; + + // May extract UE8M0 from the scales + using scale_t = std::conditional_t; + using packed_t = std::conditional_t; + EP_STATIC_ASSERT(sizeof(packed_t) % sizeof(scale_t) == 0, "Invalid vector length"); + + // FP8 staffs + constexpr int kNumPerChannels = 128; + const int num_scales = kHidden / kNumPerChannels; + const size_t hidden_bytes = kHidden * (kUseFP8 ? sizeof(__nv_fp8_storage_t) : sizeof(nv_bfloat16)); + const size_t hidden_int4 = hidden_bytes / sizeof(int4); + + // Message package: hidden data, FP8 scales, index at source + // NOTES: currently we have 3 reserved int fields for future use + using vec_t = std::conditional_t; + const size_t num_bytes_per_msg = sizeof(int4) + (kUseFP8 ? 
(kHidden + num_scales * sizeof(float)) : (kHidden * sizeof(nv_bfloat16))); + const size_t num_int4_per_msg = num_bytes_per_msg / sizeof(int4); + EP_DEVICE_ASSERT(num_bytes_per_msg % sizeof(int4) == 0); + + // Expert counts + constexpr int kNumMaxWarpGroups = 32; + __shared__ int shared_num_tokens_sent_per_expert[kNumMaxWarpGroups]; + + // Sending phase + if ((phases & LOW_LATENCY_SEND_PHASE) == 0) + goto LOW_LATENCY_DISPATCH_RECV; + + // There are 2 kinds of warps in this part: + // 1. The first-kind warps for FP8 cast and sending top-k tokens + // 2. The last warp for reading `topk_idx` and count for per-expert information + if (warp_id < num_warps - 1) { + constexpr int kNumElemsPerRead = sizeof(int4) / sizeof(nv_bfloat16); + EP_STATIC_ASSERT(kHidden % (32 * kNumElemsPerRead) == 0, "Invalid hidden"); + EP_STATIC_ASSERT(kNumElemsPerRead * 32 % kNumPerChannels == 0, "Invalid vectorization"); + const auto num_threads = (num_warps - 1) * 32; + const size_t hidden_bf16_int4 = kHidden / kNumElemsPerRead; + + for (int token_idx = sm_id; token_idx < num_tokens; token_idx += num_sms) { + const auto x_int4 = static_cast(x) + token_idx * hidden_bf16_int4; + const auto rdma_x_src_idx = reinterpret_cast(static_cast(rdma_x) + token_idx * num_bytes_per_msg); + const auto rdma_x_vec = reinterpret_cast(reinterpret_cast(rdma_x_src_idx) + sizeof(int4)); + const auto rdma_x_scales = reinterpret_cast(reinterpret_cast(rdma_x_vec) + hidden_bytes); + + // Overlap top-k index read and source token index writes + auto dst_expert_idx = warp_id < num_topk ? static_cast(__ldg(topk_idx + token_idx * num_topk + warp_id)) : -1; + thread_id == 0 ? (*rdma_x_src_idx = token_idx) : 0; + + // FP8 cast + EP_STATIC_ASSERT(hidden_bf16_int4 % 32 == 0, "Must use the full warp to reduce"); + #pragma unroll + for (int i = thread_id; i < hidden_bf16_int4; i += num_threads) { + // Read + auto int4_value = __ldg(x_int4 + i); + + if constexpr (kUseFP8) { + // Calculate local amax + auto bf16_values = reinterpret_cast(&int4_value); + float fp32_values[kNumElemsPerRead]; + float amax = kFP8Margin, scale, scale_inv; + #pragma unroll + for (int j = 0; j < kNumElemsPerRead; ++ j) { + fp32_values[j] = static_cast(bf16_values[j]); + amax = fmaxf(amax, fabsf(fp32_values[j])); + } + + // Reduce amax and scale + EP_STATIC_ASSERT(kNumElemsPerRead * 32 / kNumPerChannels == 2, "Invalid vectorization"); + amax = warp_reduce_max<16>(amax); + calculate_fp8_scales(amax, scale, scale_inv, round_scale); + if (lane_id == 0 or lane_id == 16) + rdma_x_scales[i * kNumElemsPerRead / 128] = scale_inv; + + // Cast into send buffer + vec_t int2_value; + auto fp8x2_values = reinterpret_cast<__nv_fp8x2_storage_t*>(&int2_value); + #pragma unroll + for (int j = 0; j < kNumElemsPerRead; j += 2) { + float2 fp32x2 = {fp32_values[j] * scale, fp32_values[j + 1] * scale}; + fp8x2_values[j / 2] = __nv_cvt_float2_to_fp8x2(fp32x2, __NV_SATFINITE, __NV_E4M3); + } + rdma_x_vec[i] = int2_value; + } else { + // Reinterpret-cast is for C++14 compatibility + rdma_x_vec[i] = *reinterpret_cast(&int4_value); + } + } + asm volatile("bar.sync 1, %0;" :: "r"(num_threads)); + + // Issue IBGDA sends + if (dst_expert_idx >= 0) { + int slot_idx = lane_id == 0 ? 
atomicAdd(atomic_counter_per_expert + dst_expert_idx, 1) : 0; + slot_idx = __shfl_sync(0xffffffff, slot_idx, 0); + const auto dst_rank = dst_expert_idx / num_local_experts; + const auto dst_expert_local_idx = dst_expert_idx % num_local_experts; + const auto src_ptr = reinterpret_cast(rdma_x_src_idx); + const auto dst_ptr = reinterpret_cast(rdma_recv_x) + + dst_expert_local_idx * num_ranks * num_max_dispatch_tokens_per_rank * num_bytes_per_msg + + rank * num_max_dispatch_tokens_per_rank * num_bytes_per_msg + + slot_idx * num_bytes_per_msg; + const auto dst_p2p_ptr = nvshmemi_get_p2p_ptr(dst_ptr, rank, dst_rank); + if (dst_p2p_ptr == 0) { + nvshmemi_ibgda_put_nbi_warp(dst_ptr, src_ptr, num_bytes_per_msg, dst_rank, dst_expert_local_idx, lane_id, slot_idx); + } else { + // NOTES: only 2 load iterations for 7K hidden with 8 unrolls + const auto* src_int4_ptr = reinterpret_cast(src_ptr); + const auto* dst_int4_ptr = reinterpret_cast(dst_p2p_ptr); + UNROLLED_WARP_COPY(8, lane_id, num_int4_per_msg, dst_int4_ptr, src_int4_ptr, ld_nc_global, st_na_global); + } + + // Increase counter after finishing + __syncwarp(); + lane_id == 0 ? atomic_add_release_global(atomic_finish_counter_per_expert + dst_expert_idx, 1) : 0; + } + } + } else if (warp_id == num_warps - 1) { + EP_DEVICE_ASSERT(num_sms > 1); + if (sm_id == 0) { + // The first SM is also responsible for checking QPs + EP_DEVICE_ASSERT(ibgda_get_state()->num_rc_per_pe >= num_local_experts); + + // The first SM is also responsible for cleaning the next buffer + #pragma unroll + for (int i = lane_id; i < num_next_clean_int; i += 32) + next_clean[i] = 0; + + // Notify before executing `int_p` + __syncwarp(); + #pragma unroll + for (int i = lane_id; i < num_experts; i += 32) + atomic_add_release_global(atomic_finish_counter_per_expert + i, FINISHED_SUM_TAG); + } + + // This SM should be responsible for some destination experts, read `topk_idx` for them + int expert_count[kNumMaxWarpGroups] = {0}; + const auto expert_begin_idx = sm_id * num_warp_groups; + const auto expert_end_idx = min(expert_begin_idx + num_warp_groups, num_experts); + + // Per lane count + #pragma unroll 8 + for (int i = lane_id; i < num_tokens * num_topk; i += 32) { + auto idx = static_cast(__ldg(topk_idx + i)); + if (idx >= expert_begin_idx and idx < expert_end_idx) + expert_count[idx - expert_begin_idx] ++; + } + + // Warp reduce + #pragma unroll + for (int i = expert_begin_idx; i < expert_end_idx; ++ i) { + auto sum = warp_reduce_sum(expert_count[i - expert_begin_idx]); + if (lane_id == 0) { + shared_num_tokens_sent_per_expert[i - expert_begin_idx] = sum; + atomic_add_release_global(atomic_finish_counter_per_expert + i, FINISHED_SUM_TAG - sum); + } + } + } + __syncthreads(); + + // Issue count sends + if (responsible_expert_idx < num_experts and sub_warp_id == 0 and lane_id == 0) { + const auto dst_rank = responsible_expert_idx / num_local_experts; + const auto dst_expert_local_idx = responsible_expert_idx % num_local_experts; + const auto num_tokens_sent = shared_num_tokens_sent_per_expert[responsible_expert_idx - sm_id * num_warp_groups]; + + // Wait local sends issued and send expert counts + while (ld_acquire_global(atomic_finish_counter_per_expert + responsible_expert_idx) != FINISHED_SUM_TAG * 2); + auto dst_ptr = reinterpret_cast(rdma_recv_count + dst_expert_local_idx * num_ranks + rank); + auto dst_p2p_ptr = nvshmemi_get_p2p_ptr(dst_ptr, rank, dst_rank); + if (dst_p2p_ptr == 0) { + nvshmemi_ibgda_amo_nonfetch_add(reinterpret_cast(dst_ptr), -num_tokens_sent - 1, dst_rank, 
dst_expert_local_idx); + } else { + st_release_sys_global(reinterpret_cast(dst_p2p_ptr), -num_tokens_sent - 1); + } + + // Clean workspace for next use + atomic_counter_per_expert[responsible_expert_idx] = 0; + atomic_finish_counter_per_expert[responsible_expert_idx] = 0; + + // Clean `packed_recv_count` + if (dst_rank == 0) + packed_recv_count[dst_expert_local_idx] = 0; + } + __syncwarp(); + + // Receiving phase + LOW_LATENCY_DISPATCH_RECV: + if ((phases & LOW_LATENCY_RECV_PHASE) == 0) + return; + + // For send-and-recv kernels, we need a grid sync for making `packed_recv_count` visible + if (phases & LOW_LATENCY_SEND_PHASE) + cg::this_grid().sync(); + + // Receiving and packing + if (responsible_expert_idx < num_experts) { + const auto src_rank = responsible_expert_idx / num_local_experts; + const auto local_expert_idx = responsible_expert_idx % num_local_experts; + const auto rdma_recv_x_uint8 = static_cast(rdma_recv_x) + + local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank * num_bytes_per_msg + + src_rank * num_max_dispatch_tokens_per_rank * num_bytes_per_msg; + const auto recv_x_int4 = static_cast(packed_recv_x) + + local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank * hidden_int4; + const auto recv_src_info = packed_recv_src_info + local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank; + const auto recv_range = packed_recv_layout_range + local_expert_idx * num_ranks; + const auto num_aligned_scales = align(num_scales, sizeof(float) / sizeof(scale_t)); + const auto recv_x_scales = static_cast(packed_recv_x_scales) + local_expert_idx * num_ranks * num_max_dispatch_tokens_per_rank * num_aligned_scales; + + // Shared between sub-warps in warp groups + __shared__ int shared_num_recv_tokens[kNumMaxWarpGroups], shared_recv_token_begin_idx[kNumMaxWarpGroups]; + + // Wait tokens to arrive + // NOTES: using sub-warp 1 to overlap with sub-warp 0 + int num_recv_tokens, recv_token_begin_idx; + EP_DEVICE_ASSERT(num_warps_per_group > 1 and num_warp_groups < 15); + if (sub_warp_id == 1 and lane_id == 0) { + auto start_time = clock64(); + while ((num_recv_tokens = ld_acquire_sys_global(rdma_recv_count + local_expert_idx * num_ranks + src_rank)) == 0); + auto wait_recv_cost = clock64() - start_time; + num_recv_tokens = -num_recv_tokens - 1; + recv_token_begin_idx = atomicAdd(packed_recv_count + local_expert_idx, num_recv_tokens); + shared_num_recv_tokens[warp_group_id] = num_recv_tokens; + shared_recv_token_begin_idx[warp_group_id] = recv_token_begin_idx; + recv_range[src_rank] = pack2(num_recv_tokens, recv_token_begin_idx); + + // Add stats for diagnosis + if (cumulative_local_expert_recv_stats != nullptr) + atomicAdd(cumulative_local_expert_recv_stats + local_expert_idx, num_recv_tokens); + if (dispatch_wait_recv_cost_stats != nullptr) + atomicAdd(reinterpret_cast(dispatch_wait_recv_cost_stats + src_rank), wait_recv_cost); + } + asm volatile("bar.sync %0, %1;" :: "r"(warp_group_id + 2), "r"(num_warps_per_group * 32)); + num_recv_tokens = shared_num_recv_tokens[warp_group_id]; + recv_token_begin_idx = shared_recv_token_begin_idx[warp_group_id]; + + // Copy tokens + EP_DEVICE_ASSERT(num_scales <= 64); + for (int i = sub_warp_id; i < num_recv_tokens; i += num_warps_per_group) { + // Copy source info + const auto src_src_idx = reinterpret_cast(rdma_recv_x_uint8 + i * num_bytes_per_msg); + if (lane_id == 0) + recv_src_info[recv_token_begin_idx + i] = ld_nc_global(src_src_idx); + __syncwarp(); + + // Copy data + // NOTES: only 2 load iterations for 7K hidden with 7 
unrolls + const auto src_data = reinterpret_cast(reinterpret_cast(src_src_idx) + sizeof(int4)); + const auto dst_data = recv_x_int4 + (recv_token_begin_idx + i) * hidden_int4; + UNROLLED_WARP_COPY(7, lane_id, hidden_int4, dst_data, src_data, ld_nc_global, st_na_global); + + // Copy scales + if constexpr (kUseFP8) { + // Equivalent CuTe layout: + // (num_tokens, (num_packed, num_elems_per_pack)):(num_elems_per_pack, (num_tokens * num_elems_per_pack, 1)) + const auto src_scales = reinterpret_cast(reinterpret_cast(src_data) + hidden_bytes); + const auto num_elems_per_pack = static_cast(sizeof(packed_t) / sizeof(scale_t)); + const auto token_idx = recv_token_begin_idx + i; + const auto token_stride = num_elems_per_pack; + const auto pack_stride = num_ranks * num_max_dispatch_tokens_per_rank * num_elems_per_pack; + if (lane_id < num_scales) { + const auto pack_idx = lane_id / num_elems_per_pack; + const auto elem_idx = lane_id % num_elems_per_pack; + auto scale = extract_required_scale_format(ld_nc_global(src_scales + lane_id)); + recv_x_scales[token_idx * token_stride + pack_idx * pack_stride + elem_idx] = scale; + } + if (lane_id + 32 < num_scales) { + const auto pack_idx = (lane_id + 32) / num_elems_per_pack; + const auto elem_idx = (lane_id + 32) % num_elems_per_pack; + auto scale = extract_required_scale_format(ld_nc_global(src_scales + lane_id + 32)); + recv_x_scales[token_idx * token_stride + pack_idx * pack_stride + elem_idx] = scale; + } + } + } + } +} + +void dispatch_v2(void* packed_recv_x, void* packed_recv_x_scales, + int* packed_recv_src_info, int64_t* packed_recv_layout_range, + int* packed_recv_count, + int* cumulative_local_expert_recv_stats, + int64_t* dispatch_wait_recv_cost_stats, + void* rdma_recv_x, int* rdma_recv_count, void* rdma_x, + const void* x, const int64_t* topk_idx, + int* next_clean, int num_next_clean_int, + int num_tokens, int hidden, int num_max_dispatch_tokens_per_rank, + int num_topk, int num_experts, int rank, int num_ranks, + bool use_fp8, bool round_scale, bool use_ue8m0, + void* workspace, int num_device_sms, + cudaStream_t stream, int phases) { + constexpr int kNumMaxTopK = 9; + const int num_warp_groups = ceil_div(num_experts, num_device_sms); + const int num_warps_per_group = 32 / num_warp_groups; + EP_HOST_ASSERT(num_warp_groups > 0 and num_warps_per_group > 0); + EP_HOST_ASSERT(kNumMaxTopK + 1 <= num_warp_groups * num_warps_per_group); + + const auto num_warps = num_warp_groups * num_warps_per_group; + const auto num_sms = ceil_div(num_experts, num_warp_groups); + EP_HOST_ASSERT(num_topk <= kNumMaxTopK); + + // Workspace checks + auto atomic_counter_per_expert = static_cast(workspace); + auto atomic_finish_counter_per_expert = atomic_counter_per_expert + num_experts; + EP_HOST_ASSERT(num_experts * sizeof(int) * 2 <= NUM_WORKSPACE_BYTES); + + // FP8 checks + if (use_ue8m0) + EP_HOST_ASSERT(round_scale and "UE8M0 SF requires `round_scale=True`"); + +#define DISPATCH_LAUNCH_CASE(hidden) { \ +auto dispatch_func = dispatch