Skip to content

Commit

Permalink
cherry-pick: open-mpi#11796
Browse files Browse the repository at this point in the history
SHMEM_LOCKS: MCS implementation of SHMEM LOCKS

Adding MCS algorithm-based implementation for shmem_locks
to improve performance for large scale SHMEM applications using locks.
MCS lock is now the default algorithm, use the following MCA parameter
to disable. --mca oshmem_enable_mcs_lock 0 to disable mcs locks and
revert to default ticket locking. --mca oshmem_api_verbose 10 for debug
information on shmem_locks.

Signed-off-by: Vishwanath Venkatesan <[email protected]>
  • Loading branch information
Vishwanath Venkatesan committed Oct 18, 2023
1 parent 386b88d commit 4cecc52
Show file tree
Hide file tree
Showing 8 changed files with 298 additions and 7 deletions.
20 changes: 17 additions & 3 deletions oshmem/runtime/oshmem_shmem_params.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,10 @@
#include "oshmem/constants.h"


int oshmem_shmem_lock_recursive = 0;
int oshmem_shmem_api_verbose = 0;
int oshmem_preconnect_all = 0;
int oshmem_shmem_lock_recursive = 0;
int oshmem_shmem_api_verbose = 0;
int oshmem_shmem_enable_mcs_locks = 1;
int oshmem_preconnect_all = 0;

int oshmem_shmem_register_params(void)
{
Expand All @@ -38,6 +39,19 @@ int oshmem_shmem_register_params(void)
MCA_BASE_VAR_SCOPE_READONLY,
&oshmem_shmem_lock_recursive);

(void) mca_base_var_register("oshmem",
"oshmem",
NULL,
"enable_mcs_lock",
"enable mcs locks",
MCA_BASE_VAR_TYPE_INT,
NULL,
1,
MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&oshmem_shmem_enable_mcs_locks);

(void) mca_base_var_register("oshmem",
"oshmem",
NULL,
Expand Down
7 changes: 7 additions & 0 deletions oshmem/runtime/params.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,13 @@ OSHMEM_DECLSPEC extern int oshmem_shmem_api_verbose;
*/
OSHMEM_DECLSPEC extern int oshmem_preconnect_all;


/**
* Whether to force SHMEM processes to use MCS locking
* for shmem_locks
*/
OSHMEM_DECLSPEC extern int oshmem_shmem_enable_mcs_locks;

END_C_DECLS

#endif /* OSHMEM_RUNTIME_PARAMS_H */
3 changes: 2 additions & 1 deletion oshmem/shmem/c/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ if OSHMEM_PROFILING
endif

OSHMEM_AUX_SOURCES = \
shmem_lock.c
shmem_lock.c \
shmem_mcs_lock.c

OSHMEM_API_SOURCES = \
shmem_init.c \
Expand Down
11 changes: 10 additions & 1 deletion oshmem/shmem/c/shmem_clear_lock.c
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
/*
* Copyright (c) 2023 NVIDIA Corporation.
* All rights reserved.
* Copyright (c) 2013-2016 Mellanox Technologies, Inc.
* All rights reserved.
* $COPYRIGHT$
Expand All @@ -18,6 +20,7 @@
#include "oshmem/shmem/shmem_api_logger.h"
#include "oshmem/runtime/runtime.h"
#include "oshmem/shmem/shmem_lock.h"
#include "oshmem/runtime/params.h"

#if OSHMEM_PROFILING
#include "oshmem/include/pshmem.h"
Expand All @@ -27,5 +30,11 @@

void shmem_clear_lock(volatile long *lock)
{
_shmem_clear_lock((void *)lock, sizeof(long));
if (oshmem_shmem_enable_mcs_locks) {
SHMEM_API_VERBOSE(10, "Clear Lock with MCS Lock implementation");
_shmem_mcs_clear_lock((long *)lock);
} else {
SHMEM_API_VERBOSE(10, "Clear Lock with Ticket Lock implementation");
_shmem_clear_lock((void *)lock, sizeof(long));
}
}
239 changes: 239 additions & 0 deletions oshmem/shmem/c/shmem_mcs_lock.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@
/*
* Copyright (c) 2023 NVIDIA Corporation.
* All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/

#include "oshmem_config.h"

#include "oshmem/constants.h"
#include "oshmem/include/shmem.h"
#include "oshmem/runtime/params.h"
#include "oshmem/runtime/runtime.h"
#include <stdlib.h>
#include <memory.h>

#include "oshmem/shmem/shmem_api_logger.h"
#include "oshmem/shmem/shmem_lock.h"
#include "oshmem/mca/memheap/memheap.h"
#include "oshmem/mca/memheap/base/base.h"
#include "oshmem/mca/atomic/atomic.h"

#define OPAL_BITWISE_SIZEOF_LONG (SIZEOF_LONG * 8)


/** Use basic MCS distributed lock algorithm for lock */
struct shmem_mcs_lock {
/** has meaning only on MCSQ_TAIL OWNER */
int tail;
/** It has meaning on all PEs */
/** The next pointer is a combination of the PE ID and wait signal */
int next;
};
typedef struct shmem_mcs_lock shmem_mcs_lock_t;

#define SHMEM_MCSL_TAIL_OWNER(lock_ptr)\
(((uintptr_t)(lock_ptr) / sizeof(long)) % shmem_n_pes())

#define SHMEM_MCSL_NEXT_MASK 0x7FFFFFFFU
#define SHMEM_MCSL_SIGNAL_MASK 0x80000000U /** Wait signal mask */
#define SHMEM_MCSL_NEXT(lock_val) ((lock_val) & SHMEM_MCSL_NEXT_MASK)
/** Improve readability */
#define SHMEM_MCSL_GET_PE(tail_val) ((tail_val) & SHMEM_MCSL_NEXT_MASK)
#define SHMEM_MCSL_SIGNAL(lock_val) ((lock_val) & SHMEM_MCSL_SIGNAL_MASK)
#define SHMEM_MCSL_SET_SIGNAL(lock_val) ((lock_val) | SHMEM_MCSL_SIGNAL_MASK)

void
_shmem_mcs_set_lock(long *lockp)
{
shmem_mcs_lock_t *lock = (shmem_mcs_lock_t *) lockp;
int mcs_tail_owner = SHMEM_MCSL_TAIL_OWNER(lock);
int new_tail_req = 0;
int *tail = &(lock->tail);
int *next = &(lock->next);
int my_pe = shmem_my_pe();
int curr = 0;
int out_value = 0;
int prev_tail = 0;
int prev_tailpe = 0;
int tval = 0;
int tmp_val = 0;
int retv = 0;
uint64_t value_tmp = 0;

RUNTIME_CHECK_INIT();
/**
* Initializing next pointer to next mask
* Done atomically to avoid races as NEXT pointer
* can be modified by other PEs while acquiring or
* releasing it.
*/
/**
* Can make this to be shmem_atomic_set to be safe
* in non-cc architectures
* has an impact on performance
*/
value_tmp = SHMEM_MCSL_NEXT_MASK;
out_value = SHMEM_MCSL_NEXT_MASK;
retv = MCA_ATOMIC_CALL(swap(oshmem_ctx_default, (void*)next,
(void*)&out_value, value_tmp,
sizeof(int), my_pe));
RUNTIME_CHECK_RC(retv);
MCA_SPML_CALL(quiet(oshmem_ctx_default));

/** Signal for setting lock */
new_tail_req = SHMEM_MCSL_SET_SIGNAL(my_pe);
/**
* Swap and make me the new tail and update in tail owner
* Get the previous tail PE.
*/
retv = MCA_ATOMIC_CALL(swap(oshmem_ctx_default, (void *)tail,
(void*)&prev_tail,
OSHMEM_ATOMIC_PTR_2_INT(&new_tail_req,
sizeof(new_tail_req)),
sizeof(int), mcs_tail_owner));
RUNTIME_CHECK_RC(retv);

prev_tailpe = SHMEM_MCSL_GET_PE(prev_tail);
if (SHMEM_MCSL_SIGNAL(prev_tail)) {
/**
* Someone else has got the lock before this PE
* Adding this PE to the previous tail PE's Next pointer
* Substract the SIGNAL Bit to avoid changing it.
*/
tmp_val = my_pe - SHMEM_MCSL_NEXT_MASK;
retv = MCA_ATOMIC_CALL(add(oshmem_ctx_default, (void*)next, tmp_val,
sizeof(int), prev_tailpe));
RUNTIME_CHECK_RC(retv);
/**
* This value to be changed eventually by predecessor
* when its lock is released.
* Need to be done atomically to avoid any races where
* next pointer is modified by another PE acquiring or
* releasing this.
*/
retv = MCA_ATOMIC_CALL(add(oshmem_ctx_default, (void *)next,
SHMEM_MCSL_SIGNAL_MASK, sizeof(int),
my_pe));
RUNTIME_CHECK_RC(retv);
MCA_SPML_CALL(quiet(oshmem_ctx_default));
/** Wait for predecessor release lock to this PE signal to false. */
retv = MCA_ATOMIC_CALL(fadd(oshmem_ctx_default, (void*)next,
(void*)&curr, tval, sizeof(int), my_pe));
RUNTIME_CHECK_RC(retv);

while (SHMEM_MCSL_SIGNAL(curr)) {
retv = MCA_SPML_CALL(wait((void*)next, SHMEM_CMP_NE,
(void*)&curr, SHMEM_INT));
RUNTIME_CHECK_RC(retv);
retv = MCA_ATOMIC_CALL(fadd(oshmem_ctx_default, (void*)next,
(void*)&curr, tval, sizeof(int),
my_pe));
RUNTIME_CHECK_RC(retv);
}
}
/** else.. this pe has got the lock as no one else had it */
}

void
_shmem_mcs_clear_lock(long *lockp)
{
shmem_mcs_lock_t *lock = (shmem_mcs_lock_t *) lockp;
int mcs_tail_owner = SHMEM_MCSL_TAIL_OWNER(lock);
int *tail = &(lock->tail);
int *next = &(lock->next);
int my_pe = shmem_my_pe();
int next_value = 0;
int swap_cond = 0;
int prev_value = 0;
int tval = 0;
int val_tmp = 0;
int nmask = 0;
int a_val = 0;
int retv = 0;

/**
* Can make atomic fetch to be safe in non-cc architectures
* Has impact on performance
*/
retv = MCA_ATOMIC_CALL(fadd(oshmem_ctx_default, (void*)next,
(void*)&next_value, tval, sizeof(int),
my_pe));
RUNTIME_CHECK_RC(retv);
MCA_SPML_CALL(quiet(oshmem_ctx_default));

if (next_value == SHMEM_MCSL_NEXT_MASK) {
swap_cond = SHMEM_MCSL_SET_SIGNAL(my_pe);
retv = MCA_ATOMIC_CALL(cswap(oshmem_ctx_default,
(void *)tail, (uint64_t *)&(prev_value),
OSHMEM_ATOMIC_PTR_2_INT(&swap_cond,
sizeof(swap_cond)),
OSHMEM_ATOMIC_PTR_2_INT(&val_tmp,
sizeof(val_tmp)), sizeof(int),
mcs_tail_owner));
RUNTIME_CHECK_RC(retv);

/** I am the tail.. and lock is released */
if (prev_value == swap_cond) {
return;
}
/**
* I am not the tail, another PE maybe racing to acquire lock,
* let them complete setting themselves as our next
*/
nmask = SHMEM_MCSL_NEXT_MASK;
while(next_value == nmask) {
retv = MCA_SPML_CALL(wait((void*)next, SHMEM_CMP_NE,
(void*)&nmask, SHMEM_INT));
RUNTIME_CHECK_RC(retv);
retv = MCA_ATOMIC_CALL(fadd(oshmem_ctx_default, (void*)next,
(void*)&next_value, tval,
sizeof(int), my_pe));
RUNTIME_CHECK_RC(retv);
}
}
/** There is a successor release lock to the successor */
a_val = SHMEM_MCSL_SIGNAL_MASK;
retv = MCA_ATOMIC_CALL(add(oshmem_ctx_default,
(void *)next, a_val, sizeof(a_val),
SHMEM_MCSL_NEXT(next_value)));
RUNTIME_CHECK_RC(retv);
MCA_SPML_CALL(quiet(oshmem_ctx_default));
}

int
_shmem_mcs_test_lock(long *lockp)
{
shmem_mcs_lock_t *lock = (shmem_mcs_lock_t *) lockp;
int mcs_tail_owner = SHMEM_MCSL_TAIL_OWNER(lock);
int new_tail_req = 0;
int prev_tail = 0;
int tmp_cond = 0;
int *tail = &(lock->tail);
int *next = &(lock->next);
int my_pe = shmem_my_pe();
int retv = 0;

/** Initializing next pointer to next mask */
*next = SHMEM_MCSL_NEXT_MASK;

/** Signal for setting lock */
new_tail_req = SHMEM_MCSL_SET_SIGNAL(my_pe);

/** Check if previously cleared before swapping */
retv = MCA_ATOMIC_CALL(cswap(oshmem_ctx_default,
(void *)tail, (uint64_t *)&(prev_tail),
OSHMEM_ATOMIC_PTR_2_INT(&tmp_cond,
sizeof(tmp_cond)),
OSHMEM_ATOMIC_PTR_2_INT(&new_tail_req,
sizeof(new_tail_req)),
sizeof(int), mcs_tail_owner));
RUNTIME_CHECK_RC(retv);

return (0 != prev_tail);
}
11 changes: 10 additions & 1 deletion oshmem/shmem/c/shmem_set_lock.c
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
/*
* Copyright (c) 2023 NVIDIA Corporation.
* All rights reserved.
* Copyright (c) 2013-2016 Mellanox Technologies, Inc.
* All rights reserved.
* $COPYRIGHT$
Expand All @@ -18,6 +20,7 @@
#include "oshmem/shmem/shmem_api_logger.h"
#include "oshmem/runtime/runtime.h"
#include "oshmem/shmem/shmem_lock.h"
#include "oshmem/runtime/params.h"

#if OSHMEM_PROFILING
#include "oshmem/include/pshmem.h"
Expand All @@ -27,5 +30,11 @@

void shmem_set_lock(volatile long *lock)
{
_shmem_set_lock((void *)lock, sizeof(long));
if (oshmem_shmem_enable_mcs_locks) {
SHMEM_API_VERBOSE(10, "Set Lock with MCS Lock implementation");
_shmem_mcs_set_lock((long *)lock);
} else {
SHMEM_API_VERBOSE(10, "Set Lock with Ticket Lock implementation");
_shmem_set_lock((void *)lock, sizeof(long));
}
}
11 changes: 10 additions & 1 deletion oshmem/shmem/c/shmem_test_lock.c
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
/*
* Copyright (c) 2023 NVIDIA Corporation.
* All rights reserved.
* Copyright (c) 2013-2016 Mellanox Technologies, Inc.
* All rights reserved.
* $COPYRIGHT$
Expand All @@ -18,6 +20,7 @@
#include "oshmem/include/shmem.h"
#include "oshmem/shmem/shmem_api_logger.h"
#include "oshmem/runtime/runtime.h"
#include "oshmem/runtime/params.h"
#include "oshmem/shmem/shmem_lock.h"

#if OSHMEM_PROFILING
Expand All @@ -28,5 +31,11 @@

int shmem_test_lock(volatile long *lock)
{
return _shmem_test_lock((void *)lock, sizeof(long));
if (oshmem_shmem_enable_mcs_locks) {
SHMEM_API_VERBOSE(10, "Test lock using MCS Lock implementation");
return _shmem_mcs_test_lock((long *)lock);
} else {
SHMEM_API_VERBOSE(10, "Test_lock using Ticket Lock implementation");
return _shmem_test_lock((void *)lock, sizeof(long));
}
}
3 changes: 3 additions & 0 deletions oshmem/shmem/shmem_lock.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,8 @@ void _shmem_set_lock(void *lock, int lock_size);
int _shmem_test_lock(void *lock, int lock_size);
void _shmem_clear_lock(void *lock, int lock_size);

void _shmem_mcs_set_lock(long *lock);
void _shmem_mcs_clear_lock(long *lock);
int _shmem_mcs_test_lock(long *lock);

#endif /*SHMEM_LOCK_H*/

0 comments on commit 4cecc52

Please sign in to comment.