diff --git a/src/ucp/core/ucp_ep.c b/src/ucp/core/ucp_ep.c index 3562dfd1e6b..a6b50caf86e 100644 --- a/src/ucp/core/ucp_ep.c +++ b/src/ucp/core/ucp_ep.c @@ -684,12 +684,18 @@ ucs_status_t ucp_worker_mem_type_eps_create(ucp_worker_h worker) ucs_status_t status; void *address_buffer; size_t address_length; - ucp_tl_bitmap_t mem_access_tls; + ucp_tl_bitmap_t mem_access_tls, host_mem_access_tls; char ep_name[UCP_WORKER_ADDRESS_NAME_MAX]; unsigned addr_indices[UCP_MAX_LANES]; + ucp_lane_index_t num_lanes; ucs_memory_type_for_each(mem_type) { ucp_context_memaccess_tl_bitmap(context, mem_type, 0, &mem_access_tls); + /* Mem type EP requires host memory support */ + ucp_context_memaccess_tl_bitmap(context, UCS_MEMORY_TYPE_HOST, 0, + &host_mem_access_tls); + UCS_STATIC_BITMAP_AND_INPLACE(&mem_access_tls, host_mem_access_tls); + if (UCP_MEM_IS_HOST(mem_type) || UCS_STATIC_BITMAP_IS_ZERO(mem_access_tls)) { continue; @@ -725,6 +731,9 @@ ucs_status_t ucp_worker_mem_type_eps_create(ucp_worker_h worker) goto err_free_address_list; } + /* Mem type EP cannot have more than one lane */ + num_lanes = ucp_ep_num_lanes(worker->mem_type_ep[mem_type]); + ucs_assertv_always(num_lanes == 1, "num_lanes=%u", num_lanes); UCS_ASYNC_UNBLOCK(&worker->async); ucs_free(local_address.address_list); diff --git a/src/ucp/rma/flush.c b/src/ucp/rma/flush.c index 43af459493c..07d6638cef8 100644 --- a/src/ucp/rma/flush.c +++ b/src/ucp/rma/flush.c @@ -530,7 +530,7 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_ep_flush_nbx, (ep, param), return request; } -static ucs_status_t ucp_worker_flush_check(ucp_worker_h worker) +ucs_status_t ucp_worker_flush_check(ucp_worker_h worker) { ucp_rsc_index_t iface_id; ucp_worker_iface_t *wiface; diff --git a/src/ucp/rma/rma.h b/src/ucp/rma/rma.h index 14516c1ce7f..9242d2468b4 100644 --- a/src/ucp/rma/rma.h +++ b/src/ucp/rma/rma.h @@ -100,6 +100,7 @@ extern ucp_amo_proto_t ucp_amo_sw_proto; extern const ucp_rma_proto_t *ucp_rma_proto_list[]; extern const ucp_amo_proto_t 
*ucp_amo_proto_list[]; +ucs_status_t ucp_worker_flush_check(ucp_worker_h worker); ucs_status_t ucp_rma_request_advance(ucp_request_t *req, ssize_t frag_length, ucs_status_t status, diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c b/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c index 116f3791f35..e7436c41fc8 100644 --- a/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c +++ b/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c @@ -74,10 +74,6 @@ static ucs_config_field_t uct_cuda_ipc_iface_config_table[] = { "Estimated CPU overhead for transferring GPU memory", ucs_offsetof(uct_cuda_ipc_iface_config_t, params.overhead), UCS_CONFIG_TYPE_TIME}, - {"ENABLE_SAME_PROCESS", "n", - "Enable same process same device communication for cuda_ipc", - ucs_offsetof(uct_cuda_ipc_iface_config_t, params.enable_same_process), UCS_CONFIG_TYPE_BOOL}, - {NULL} }; @@ -146,12 +142,6 @@ uct_cuda_ipc_iface_is_reachable_v2(const uct_iface_h tl_iface, dev_addr = (const uct_cuda_ipc_device_addr_t *)params->device_addr; same_uuid = (ucs_get_system_id() == dev_addr->system_uuid); - if ((getpid() == *(pid_t*)params->iface_addr) && same_uuid && - !iface->config.enable_same_process) { - uct_iface_fill_info_str_buf(params, "same process"); - return 0; - } - if (same_uuid || uct_cuda_ipc_iface_mnnvl_supported(md, dev_addr, dev_addr_len)) { return uct_iface_scope_is_reachable(tl_iface, params); diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_iface.h b/src/uct/cuda/cuda_ipc/cuda_ipc_iface.h index c749fcff9e0..663cece255f 100644 --- a/src/uct/cuda/cuda_ipc/cuda_ipc_iface.h +++ b/src/uct/cuda/cuda_ipc/cuda_ipc_iface.h @@ -28,7 +28,6 @@ typedef struct { double bandwidth; /* estimated bandwidth */ double latency; /* estimated latency */ double overhead; /* estimated CPU overhead */ - int enable_same_process; /* enable cuda_ipc for same pid same device */ } uct_cuda_ipc_iface_config_params_t; diff --git a/test/gtest/ucp/test_ucp_device.cc b/test/gtest/ucp/test_ucp_device.cc index 06d2fb8a420..6521d09b77a 100644 --- 
a/test/gtest/ucp/test_ucp_device.cc +++ b/test/gtest/ucp/test_ucp_device.cc @@ -69,7 +69,6 @@ void test_ucp_device::get_test_variants(std::vector &variants) void test_ucp_device::init() { - m_env.push_back(new ucs::scoped_setenv("UCX_CUDA_IPC_ENABLE_SAME_PROCESS", "y")); m_env.push_back(new ucs::scoped_setenv("UCX_IB_GDA_MAX_SYS_LATENCY", "1us")); ucp_test::init(); sender().connect(&receiver(), get_ep_params()); diff --git a/test/gtest/ucp/test_ucp_memheap.cc b/test/gtest/ucp/test_ucp_memheap.cc index d60f6f3af03..991c14dc7ec 100644 --- a/test/gtest/ucp/test_ucp_memheap.cc +++ b/test/gtest/ucp/test_ucp_memheap.cc @@ -10,6 +10,10 @@ #include #include #include +extern "C" { +#include +} + #include @@ -95,6 +99,9 @@ void test_ucp_memheap::test_xfer(send_func_t send_func, size_t size, flush_ep(sender()); } else { flush_worker(sender()); + while(ucp_worker_flush_check(sender().worker()) != UCS_OK) { + progress(); + } } /* Validate data */ diff --git a/test/gtest/ucp/test_ucp_peer_failure.cc b/test/gtest/ucp/test_ucp_peer_failure.cc index 80d459568aa..f01d1ddaea7 100644 --- a/test/gtest/ucp/test_ucp_peer_failure.cc +++ b/test/gtest/ucp/test_ucp_peer_failure.cc @@ -987,7 +987,7 @@ UCS_TEST_P(test_ucp_peer_failure_rndv_put_ppln_abort, rtr_mtype) } UCS_TEST_P(test_ucp_peer_failure_rndv_put_ppln_abort, pipeline, - "RNDV_FRAG_SIZE=host:8K") + "RNDV_FRAG_SIZE=host:8K,cuda:8K") { rndv_progress_failure_test(rndv_mode::put_ppln, true); } diff --git a/test/gtest/uct/test_uct_iface.cc b/test/gtest/uct/test_uct_iface.cc index 233f0864d88..f6a0578ab9d 100644 --- a/test/gtest/uct/test_uct_iface.cc +++ b/test/gtest/uct/test_uct_iface.cc @@ -24,11 +24,6 @@ class test_uct_iface : public uct_test { } void test_is_reachable(); - - virtual bool is_self_reachable() const - { - return true; - } }; void test_uct_iface::test_is_reachable() @@ -63,7 +58,7 @@ void test_uct_iface::test_is_reachable() ASSERT_UCS_OK(status); bool is_reachable = uct_iface_is_reachable_v2(iface, &params); - 
EXPECT_EQ(is_self_reachable(), is_reachable); + EXPECT_TRUE(is_reachable); // Allocate corrupted address buffers, make it larger than the correct // buffer size in case the corrupted data indicates a larger address length @@ -98,18 +93,4 @@ UCS_TEST_P(test_uct_iface, is_reachable) } UCT_INSTANTIATE_TEST_CASE(test_uct_iface) - -class test_uct_iface_self_unreachable : public test_uct_iface { -protected: - bool is_self_reachable() const override - { - return false; - } -}; - -UCS_TEST_P(test_uct_iface_self_unreachable, is_reachable) -{ - test_is_reachable(); -} - -UCT_INSTANTIATE_CUDA_IPC_TEST_CASE(test_uct_iface_self_unreachable) +UCT_INSTANTIATE_CUDA_IPC_TEST_CASE(test_uct_iface)