Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 11 additions & 39 deletions src/host/proxy/proxy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -687,51 +687,23 @@ int process_channel_amo(proxy_state_t *state, proxy_channel_t *ch, int *is_proce
}

/**
 * Flush GPUDirect RDMA writes from the proxy thread when required.
 *
 * Behaviour:
 *  - Returns immediately when nvshmemi_options.BYPASS_FLUSH is set.
 *  - Otherwise, when proxy_state->gdr_device_native_ordering compares below
 *    CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_OWNER and
 *    CUPFN(nvshmemi_cuda_syms, cuFlushGPUDirectRDMAWrites) evaluates
 *    non-zero (presumably: the driver symbol was resolved -- confirm against
 *    the CUPFN definition), calls cuFlushGPUDirectRDMAWrites for the current
 *    context with "to owner" scope.
 *  - A non-CUDA_SUCCESS result is reported through NVSHMEMI_ERROR_EXIT.
 *
 * @param proxy_state  Proxy state; only gdr_device_native_ordering is read
 *                     here.
 */
void enforce_cst(proxy_state_t *proxy_state) {
    int status = 0;

    if (nvshmemi_options.BYPASS_FLUSH) return;

    /* An explicit flush is issued only when the recorded native ordering
     * value is below the "to owner" scope and the flush symbol is usable. */
    if (CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_OWNER > proxy_state->gdr_device_native_ordering &&
        CUPFN(nvshmemi_cuda_syms, cuFlushGPUDirectRDMAWrites)) {
        status =
            CUPFN(nvshmemi_cuda_syms,
                  cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX,
                                             CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_OWNER));
        /** We would want to use cudaFlushGPUDirectRDMAWritesToAllDevices when we enable
            consistent access of data on any GPU (and not just self GPU) with
            wait_until, quiet, barrier, etc. **/
        if (status != CUDA_SUCCESS) {
            NVSHMEMI_ERROR_EXIT("cuFlushGPUDirectRDMAWrites() failed in the proxy thread \n");
        }
    }
}

inline void quiet_ack_channels(proxy_state_t *proxy_state) {
Expand Down
Loading