diff --git a/src/ucp/core/ucp_ep.c b/src/ucp/core/ucp_ep.c
index 3562dfd1e6b..a6b50caf86e 100644
--- a/src/ucp/core/ucp_ep.c
+++ b/src/ucp/core/ucp_ep.c
@@ -684,12 +684,18 @@ ucs_status_t ucp_worker_mem_type_eps_create(ucp_worker_h worker)
     ucs_status_t status;
     void *address_buffer;
     size_t address_length;
-    ucp_tl_bitmap_t mem_access_tls;
+    ucp_tl_bitmap_t mem_access_tls, host_mem_access_tls;
     char ep_name[UCP_WORKER_ADDRESS_NAME_MAX];
     unsigned addr_indices[UCP_MAX_LANES];
+    ucp_lane_index_t num_lanes;
 
     ucs_memory_type_for_each(mem_type) {
         ucp_context_memaccess_tl_bitmap(context, mem_type, 0, &mem_access_tls);
+        /* Mem type EP needs host memory too: keep TLs that access both */
+        ucp_context_memaccess_tl_bitmap(context, UCS_MEMORY_TYPE_HOST, 0,
+                                        &host_mem_access_tls);
+        UCS_STATIC_BITMAP_AND_INPLACE(&mem_access_tls, host_mem_access_tls);
+
         if (UCP_MEM_IS_HOST(mem_type) ||
             UCS_STATIC_BITMAP_IS_ZERO(mem_access_tls)) {
             continue;
@@ -725,6 +731,9 @@ ucs_status_t ucp_worker_mem_type_eps_create(ucp_worker_h worker)
             goto err_free_address_list;
         }
 
+        /* Mem type EP must have exactly one lane */
+        num_lanes = ucp_ep_num_lanes(worker->mem_type_ep[mem_type]);
+        ucs_assertv_always(num_lanes == 1, "num_lanes=%u", num_lanes);
         UCS_ASYNC_UNBLOCK(&worker->async);
 
         ucs_free(local_address.address_list);
diff --git a/src/ucp/rma/flush.c b/src/ucp/rma/flush.c
index 43af459493c..07d6638cef8 100644
--- a/src/ucp/rma/flush.c
+++ b/src/ucp/rma/flush.c
@@ -530,7 +530,7 @@ UCS_PROFILE_FUNC(ucs_status_ptr_t, ucp_ep_flush_nbx, (ep, param),
     return request;
 }
 
-static ucs_status_t ucp_worker_flush_check(ucp_worker_h worker)
+ucs_status_t ucp_worker_flush_check(ucp_worker_h worker)
 {
     ucp_rsc_index_t iface_id;
     ucp_worker_iface_t *wiface;
diff --git a/src/ucp/rma/rma.h b/src/ucp/rma/rma.h
index 14516c1ce7f..9242d2468b4 100644
--- a/src/ucp/rma/rma.h
+++ b/src/ucp/rma/rma.h
@@ -100,6 +100,7 @@ extern ucp_amo_proto_t ucp_amo_sw_proto;
 extern const ucp_rma_proto_t *ucp_rma_proto_list[];
 extern const ucp_amo_proto_t *ucp_amo_proto_list[];
 
+ucs_status_t ucp_worker_flush_check(ucp_worker_h worker);
 
 ucs_status_t ucp_rma_request_advance(ucp_request_t *req, ssize_t frag_length,
                                      ucs_status_t status,
diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c b/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c
index 116f3791f35..e7436c41fc8 100644
--- a/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c
+++ b/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c
@@ -74,10 +74,6 @@ static ucs_config_field_t uct_cuda_ipc_iface_config_table[] = {
      "Estimated CPU overhead for transferring GPU memory",
      ucs_offsetof(uct_cuda_ipc_iface_config_t, params.overhead), UCS_CONFIG_TYPE_TIME},
 
-    {"ENABLE_SAME_PROCESS", "n",
-     "Enable same process same device communication for cuda_ipc",
-     ucs_offsetof(uct_cuda_ipc_iface_config_t, params.enable_same_process), UCS_CONFIG_TYPE_BOOL},
-
     {NULL}
 };
 
@@ -146,12 +142,6 @@ uct_cuda_ipc_iface_is_reachable_v2(const uct_iface_h tl_iface,
     dev_addr     = (const uct_cuda_ipc_device_addr_t *)params->device_addr;
     same_uuid    = (ucs_get_system_id() == dev_addr->system_uuid);
 
-    if ((getpid() == *(pid_t*)params->iface_addr) && same_uuid &&
-        !iface->config.enable_same_process) {
-        uct_iface_fill_info_str_buf(params, "same process");
-        return 0;
-    }
-
     if (same_uuid ||
         uct_cuda_ipc_iface_mnnvl_supported(md, dev_addr, dev_addr_len)) {
         return uct_iface_scope_is_reachable(tl_iface, params);
diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_iface.h b/src/uct/cuda/cuda_ipc/cuda_ipc_iface.h
index c749fcff9e0..663cece255f 100644
--- a/src/uct/cuda/cuda_ipc/cuda_ipc_iface.h
+++ b/src/uct/cuda/cuda_ipc/cuda_ipc_iface.h
@@ -28,7 +28,6 @@ typedef struct {
     double                  bandwidth;           /* estimated bandwidth */
     double                  latency;             /* estimated latency */
     double                  overhead;            /* estimated CPU overhead */
-    int                     enable_same_process; /* enable cuda_ipc for same pid same device */
 } uct_cuda_ipc_iface_config_params_t;
 
 
diff --git a/test/gtest/ucp/test_ucp_device.cc b/test/gtest/ucp/test_ucp_device.cc
index 06d2fb8a420..6521d09b77a 100644
--- a/test/gtest/ucp/test_ucp_device.cc
+++ b/test/gtest/ucp/test_ucp_device.cc
@@ -69,7 +69,6 @@ void test_ucp_device::get_test_variants(std::vector<ucp_test_variant> &variants)
 
 void test_ucp_device::init()
 {
-    m_env.push_back(new ucs::scoped_setenv("UCX_CUDA_IPC_ENABLE_SAME_PROCESS", "y"));
     m_env.push_back(new ucs::scoped_setenv("UCX_IB_GDA_MAX_SYS_LATENCY", "1us"));
     ucp_test::init();
     sender().connect(&receiver(), get_ep_params());
diff --git a/test/gtest/ucp/test_ucp_memheap.cc b/test/gtest/ucp/test_ucp_memheap.cc
index d60f6f3af03..991c14dc7ec 100644
--- a/test/gtest/ucp/test_ucp_memheap.cc
+++ b/test/gtest/ucp/test_ucp_memheap.cc
@@ -10,6 +10,10 @@
 #include <common/mem_buffer.h>
 #include <common/test_helpers.h>
 #include <ucs/sys/sys.h>
+extern "C" {
+#include <ucp/rma/rma.h>
+}
+
 #include <ucs/sys/ptr_arith.h>
 
 
@@ -95,6 +99,9 @@ void test_ucp_memheap::test_xfer(send_func_t send_func, size_t size,
         flush_ep(sender());
     } else {
         flush_worker(sender());
+        while(ucp_worker_flush_check(sender().worker()) != UCS_OK) {
+            progress();
+        }
     }
 
     /* Validate data */
diff --git a/test/gtest/ucp/test_ucp_peer_failure.cc b/test/gtest/ucp/test_ucp_peer_failure.cc
index 80d459568aa..f01d1ddaea7 100644
--- a/test/gtest/ucp/test_ucp_peer_failure.cc
+++ b/test/gtest/ucp/test_ucp_peer_failure.cc
@@ -987,7 +987,7 @@ UCS_TEST_P(test_ucp_peer_failure_rndv_put_ppln_abort, rtr_mtype)
 }
 
 UCS_TEST_P(test_ucp_peer_failure_rndv_put_ppln_abort, pipeline,
-           "RNDV_FRAG_SIZE=host:8K")
+           "RNDV_FRAG_SIZE=host:8K,cuda:8K")
 {
     rndv_progress_failure_test(rndv_mode::put_ppln, true);
 }
diff --git a/test/gtest/uct/test_uct_iface.cc b/test/gtest/uct/test_uct_iface.cc
index 233f0864d88..f6a0578ab9d 100644
--- a/test/gtest/uct/test_uct_iface.cc
+++ b/test/gtest/uct/test_uct_iface.cc
@@ -24,11 +24,6 @@ class test_uct_iface : public uct_test {
     }
 
     void test_is_reachable();
-
-    virtual bool is_self_reachable() const
-    {
-        return true;
-    }
 };
 
 void test_uct_iface::test_is_reachable()
@@ -63,7 +58,7 @@ void test_uct_iface::test_is_reachable()
     ASSERT_UCS_OK(status);
 
     bool is_reachable = uct_iface_is_reachable_v2(iface, &params);
-    EXPECT_EQ(is_self_reachable(), is_reachable);
+    EXPECT_TRUE(is_reachable);
 
     // Allocate corrupted address buffers, make it larger than the correct
     // buffer size in case the corrupted data indicates a larger address length
@@ -98,18 +93,4 @@ UCS_TEST_P(test_uct_iface, is_reachable)
 }
 
 UCT_INSTANTIATE_TEST_CASE(test_uct_iface)
-
-class test_uct_iface_self_unreachable : public test_uct_iface {
-protected:
-    bool is_self_reachable() const override
-    {
-        return false;
-    }
-};
-
-UCS_TEST_P(test_uct_iface_self_unreachable, is_reachable)
-{
-    test_is_reachable();
-}
-
-UCT_INSTANTIATE_CUDA_IPC_TEST_CASE(test_uct_iface_self_unreachable)
+UCT_INSTANTIATE_CUDA_IPC_TEST_CASE(test_uct_iface)