Skip to content

Commit 75df7ab

Browse files
committed
TEST/GTEST/UCT: Retry when it cannot allocate MEMIC memory
1 parent 233b133 commit 75df7ab

File tree

3 files changed

+49
-28
lines changed

3 files changed

+49
-28
lines changed

test/gtest/uct/test_atomic_key_reg_rdma_mem_type.cc

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -31,7 +31,7 @@ UCS_TEST_SKIP_COND_P(uct_atomic_key_reg_rdma_mem_type, fadd64,
3131
!check_rdma_memory())
3232
{
3333
mapped_buffer recvbuf(sizeof(uint64_t), receiver(), 0UL,
34-
UCS_MEMORY_TYPE_RDMA);
34+
UCS_MEMORY_TYPE_RDMA, UCT_MD_MEM_ACCESS_ALL, 10);
3535
uint64_t add = rand64();
3636

3737
run_workers(static_cast<send_func_t>(

test/gtest/uct/uct_test.cc

Lines changed: 41 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -931,11 +931,12 @@ uct_test::entity::entity(const resource& resource, uct_md_config_t *md_config,
931931

932932
void uct_test::entity::mem_alloc(size_t length, unsigned mem_flags,
933933
uct_allocated_memory_t *mem,
934-
ucs_memory_type_t mem_type) const
934+
ucs_memory_type_t mem_type,
935+
unsigned num_retries) const
935936
{
936-
void *address = NULL;
937-
uct_md_h uct_md = md();
938-
ucs_status_t status;
937+
void *address = NULL;
938+
uct_md_h uct_md = md();
939+
ucs_status_t status = UCS_OK;
939940
uct_mem_alloc_params_t params;
940941

941942
params.field_mask = UCT_MEM_ALLOC_PARAM_FIELD_FLAGS |
@@ -947,22 +948,38 @@ void uct_test::entity::mem_alloc(size_t length, unsigned mem_flags,
947948
params.mem_type = mem_type;
948949
params.address = address;
949950

950-
if ((md_attr().flags & (UCT_MD_FLAG_ALLOC | UCT_MD_FLAG_REG)) &&
951-
(mem_type == UCS_MEMORY_TYPE_HOST)) {
952-
status = uct_iface_mem_alloc(m_iface, length, mem_flags, "uct_test",
953-
mem);
954-
ASSERT_UCS_OK(status);
955-
} else {
956-
uct_alloc_method_t alloc_methods[] = {UCT_ALLOC_METHOD_MMAP,
957-
UCT_ALLOC_METHOD_MD};
958-
params.field_mask |= UCT_MEM_ALLOC_PARAM_FIELD_MDS;
959-
params.mds.mds = &uct_md;
960-
params.mds.count = 1;
961-
status = uct_mem_alloc(length, alloc_methods,
962-
ucs_static_array_size(alloc_methods), &params,
963-
mem);
964-
ASSERT_UCS_OK(status);
951+
for (unsigned i = 0; i <= num_retries; ++i) {
952+
scoped_log_handler slh(wrap_errors_logger);
953+
if ((md_attr().flags & (UCT_MD_FLAG_ALLOC | UCT_MD_FLAG_REG)) &&
954+
(mem_type == UCS_MEMORY_TYPE_HOST)) {
955+
status = uct_iface_mem_alloc(m_iface, length, mem_flags, "uct_test",
956+
mem);
957+
} else {
958+
uct_alloc_method_t alloc_methods[] = {UCT_ALLOC_METHOD_MMAP,
959+
UCT_ALLOC_METHOD_MD};
960+
params.field_mask |= UCT_MEM_ALLOC_PARAM_FIELD_MDS;
961+
params.mds.mds = &uct_md;
962+
params.mds.count = 1;
963+
status = uct_mem_alloc(length, alloc_methods,
964+
ucs_static_array_size(alloc_methods),
965+
&params, mem);
966+
}
967+
968+
if (status != UCS_ERR_NO_MEMORY) {
969+
break;
970+
}
971+
972+
if (i < num_retries) {
973+
UCS_TEST_MESSAGE << "Retry " << (i + 1) << "/" << num_retries
974+
<< ": Allocation failed - "
975+
<< ucs_status_string(status);
976+
/* Sleep only if there are more retries remaining */
977+
usleep(ucs::rand() % 10000);
978+
}
965979
}
980+
981+
ASSERT_UCS_OK(status);
982+
966983
ucs_assert(mem->mem_type == mem_type);
967984
}
968985

@@ -1414,16 +1431,16 @@ void uct_test::mapped_buffer::reset()
14141431
uct_test::mapped_buffer::mapped_buffer(size_t size, uint64_t seed,
14151432
const entity &entity, size_t offset,
14161433
ucs_memory_type_t mem_type,
1417-
unsigned mem_flags) :
1418-
mapped_buffer(size, entity, offset, mem_type, mem_flags)
1434+
unsigned mem_flags, unsigned num_retries) :
1435+
mapped_buffer(size, entity, offset, mem_type, mem_flags, num_retries)
14191436
{
14201437
pattern_fill(seed);
14211438
}
14221439

1423-
uct_test::mapped_buffer::mapped_buffer(size_t size,
1440+
uct_test::mapped_buffer::mapped_buffer(size_t size,
14241441
const entity &entity, size_t offset,
14251442
ucs_memory_type_t mem_type,
1426-
unsigned mem_flags) :
1443+
unsigned mem_flags, unsigned num_retries) :
14271444
m_entity(entity)
14281445
{
14291446
if (size == 0) {
@@ -1433,7 +1450,7 @@ uct_test::mapped_buffer::mapped_buffer(size_t size,
14331450

14341451
size_t alloc_size = size + offset;
14351452
if ((mem_type == UCS_MEMORY_TYPE_HOST) || (mem_type == UCS_MEMORY_TYPE_RDMA)) {
1436-
m_entity.mem_alloc(alloc_size, mem_flags, &m_mem, mem_type);
1453+
m_entity.mem_alloc(alloc_size, mem_flags, &m_mem, mem_type, num_retries);
14371454
} else {
14381455
m_mem.method = UCT_ALLOC_METHOD_LAST;
14391456
m_mem.address = mem_buffer::allocate(alloc_size, mem_type);

test/gtest/uct/uct_test.h

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -140,7 +140,8 @@ class uct_test : public testing::TestWithParam<const resource*>,
140140

141141
void mem_alloc(size_t length, unsigned mem_flags,
142142
uct_allocated_memory_t *mem,
143-
ucs_memory_type_t mem_type = UCS_MEMORY_TYPE_HOST) const;
143+
ucs_memory_type_t mem_type = UCS_MEMORY_TYPE_HOST,
144+
unsigned num_retries = 0) const;
144145

145146
void mem_free(const uct_allocated_memory_t *mem) const;
146147

@@ -245,12 +246,15 @@ class uct_test : public testing::TestWithParam<const resource*>,
245246
public:
246247
mapped_buffer(size_t size, const entity &entity, size_t offset = 0,
247248
ucs_memory_type_t mem_type = UCS_MEMORY_TYPE_HOST,
248-
unsigned mem_flags = UCT_MD_MEM_ACCESS_ALL);
249+
unsigned mem_flags = UCT_MD_MEM_ACCESS_ALL,
250+
unsigned num_retries = 0);
249251

250252
mapped_buffer(size_t size, uint64_t seed, const entity &entity,
251253
size_t offset = 0,
252254
ucs_memory_type_t mem_type = UCS_MEMORY_TYPE_HOST,
253-
unsigned mem_flags = UCT_MD_MEM_ACCESS_ALL);
255+
unsigned mem_flags = UCT_MD_MEM_ACCESS_ALL,
256+
unsigned num_retries = 0);
257+
254258
virtual ~mapped_buffer();
255259

256260
mapped_buffer(mapped_buffer &&other);

0 commit comments

Comments
 (0)