Skip to content

Commit

Permalink
pml/ob1: make no. of events an mca parameter
Browse files Browse the repository at this point in the history
Make the number of events created by the pml/ob1 component an mca
parameter. The error message in case we run out of events already
suggested increasing a particular mca parameter, but a) I couldn't find
that mca parameter, and b) it was definitely not used.

Signed-off-by: Edgar Gabriel <[email protected]>
(cherry picked from commit 628ccef)
  • Loading branch information
edgargabriel authored and wenduwan committed Jul 11, 2024
1 parent 8e848da commit af6c9c1
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 16 deletions.
2 changes: 2 additions & 0 deletions ompi/mca/pml/ob1/pml_ob1.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,8 @@ typedef struct mca_pml_ob1_t mca_pml_ob1_t;
extern mca_pml_ob1_t mca_pml_ob1;
extern int mca_pml_ob1_output;
extern bool mca_pml_ob1_matching_protection;
extern int mca_pml_ob1_accelerator_events_max;

/*
* PML interface functions.
*/
Expand Down
31 changes: 15 additions & 16 deletions ompi/mca/pml/ob1/pml_ob1_accelerator.c
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ static int accelerator_event_dtoh_first_used, accelerator_event_htod_first_used;
static volatile int accelerator_event_dtoh_num_used, accelerator_event_htod_num_used;

/* Size of array holding events */
static int accelerator_event_max = 400;
static int accelerator_event_htod_most = 0;

int mca_pml_ob1_record_htod_event(char *msg, struct mca_btl_base_descriptor_t *frag)
Expand All @@ -87,9 +86,9 @@ int mca_pml_ob1_record_htod_event(char *msg, struct mca_btl_base_descriptor_t *f
* return an error. The error message will tell the user to try and
* run again, but with a larger array for storing events. */
OPAL_THREAD_LOCK(&pml_ob1_accelerator_htod_lock);
if (accelerator_event_htod_num_used == accelerator_event_max) {
opal_output_verbose(1, mca_pml_ob1_output, "Out of event handles. Max: %d. Suggested to rerun with new max with --mca mpi_common_accelerator_event_max %d.",
accelerator_event_max, accelerator_event_max + 100);
if (accelerator_event_htod_num_used == mca_pml_ob1_accelerator_events_max) {
opal_output_verbose(1, mca_pml_ob1_output, "Out of event handles. Max: %d. Suggested to rerun with new max with --mca pml_ob1_accelerator_events_max %d.",
mca_pml_ob1_accelerator_events_max, mca_pml_ob1_accelerator_events_max + 100);
OPAL_THREAD_UNLOCK(&pml_ob1_accelerator_htod_lock);
return OPAL_ERR_OUT_OF_RESOURCE;
}
Expand All @@ -113,7 +112,7 @@ int mca_pml_ob1_record_htod_event(char *msg, struct mca_btl_base_descriptor_t *f

/* Bump up the first available slot and number used by 1 */
accelerator_event_htod_first_avail++;
if (accelerator_event_htod_first_avail >= accelerator_event_max) {
if (accelerator_event_htod_first_avail >= mca_pml_ob1_accelerator_events_max) {
accelerator_event_htod_first_avail = 0;
}
accelerator_event_htod_num_used++;
Expand Down Expand Up @@ -169,7 +168,7 @@ int mca_pml_ob1_progress_one_htod_event(struct mca_btl_base_descriptor_t **frag)
/* Bump counters, loop around the circular buffer if necessary */
--accelerator_event_htod_num_used;
++accelerator_event_htod_first_used;
if (accelerator_event_htod_first_used >= accelerator_event_max) {
if (accelerator_event_htod_first_used >= mca_pml_ob1_accelerator_events_max) {
accelerator_event_htod_first_used = 0;
}
/* A return value of 1 indicates an event completed and a frag was returned */
Expand Down Expand Up @@ -214,16 +213,16 @@ int mca_pml_ob1_accelerator_init(void)
accelerator_event_dtoh_first_avail = 0;
accelerator_event_dtoh_first_used = 0;

accelerator_event_dtoh_array = calloc(accelerator_event_max, sizeof(opal_accelerator_event_t *));
accelerator_event_dtoh_array = calloc(mca_pml_ob1_accelerator_events_max, sizeof(opal_accelerator_event_t *));
if (NULL == accelerator_event_dtoh_array) {
opal_output_verbose(1, mca_pml_ob1_output, "No memory.");
rc = OPAL_ERROR;
goto cleanup_and_error;
}

/* Create the events since they can be reused. */
for (i = 0; i < accelerator_event_max; i++) {
result = opal_accelerator.create_event(MCA_ACCELERATOR_NO_DEVICE_ID, &accelerator_event_dtoh_array[i]);
for (i = 0; i < mca_pml_ob1_accelerator_events_max; i++) {
result = opal_accelerator.create_event(MCA_ACCELERATOR_NO_DEVICE_ID, &accelerator_event_dtoh_array[i], false);
if (OPAL_SUCCESS != result) {
opal_output_verbose(1, mca_pml_ob1_output, "Accelerator create event failed.");
rc = OPAL_ERROR;
Expand All @@ -234,7 +233,7 @@ int mca_pml_ob1_accelerator_init(void)
/* The first available status index is 0. Make an empty frag
array. */
accelerator_event_dtoh_frag_array = (struct mca_btl_base_descriptor_t **) malloc(
sizeof(struct mca_btl_base_descriptor_t *) * accelerator_event_max);
sizeof(struct mca_btl_base_descriptor_t *) * mca_pml_ob1_accelerator_events_max);
if (NULL == accelerator_event_dtoh_frag_array) {
opal_output_verbose(1, mca_pml_ob1_output, "No memory.");
rc = OPAL_ERROR;
Expand All @@ -247,16 +246,16 @@ int mca_pml_ob1_accelerator_init(void)
accelerator_event_htod_first_avail = 0;
accelerator_event_htod_first_used = 0;

accelerator_event_htod_array = calloc(accelerator_event_max, sizeof(opal_accelerator_event_t *));
accelerator_event_htod_array = calloc(mca_pml_ob1_accelerator_events_max, sizeof(opal_accelerator_event_t *));
if (NULL == accelerator_event_htod_array) {
opal_output_verbose(1, mca_pml_ob1_output, "No memory.");
rc = OPAL_ERROR;
goto cleanup_and_error;
}

/* Create the events since they can be reused. */
for (i = 0; i < accelerator_event_max; i++) {
result = opal_accelerator.create_event(MCA_ACCELERATOR_NO_DEVICE_ID, &accelerator_event_htod_array[i]);
for (i = 0; i < mca_pml_ob1_accelerator_events_max; i++) {
result = opal_accelerator.create_event(MCA_ACCELERATOR_NO_DEVICE_ID, &accelerator_event_htod_array[i], false);
if (OPAL_SUCCESS != result) {
opal_output_verbose(1, mca_pml_ob1_output, "Accelerator create event failed.");
rc = OPAL_ERROR;
Expand All @@ -267,7 +266,7 @@ int mca_pml_ob1_accelerator_init(void)
/* The first available status index is 0. Make an empty frag
array. */
accelerator_event_htod_frag_array = (struct mca_btl_base_descriptor_t **) malloc(
sizeof(struct mca_btl_base_descriptor_t *) * accelerator_event_max);
sizeof(struct mca_btl_base_descriptor_t *) * mca_pml_ob1_accelerator_events_max);
if (NULL == accelerator_event_htod_frag_array) {
opal_output_verbose(1, mca_pml_ob1_output, "No memory.");
rc = OPAL_ERROR;
Expand Down Expand Up @@ -304,7 +303,7 @@ void mca_pml_ob1_accelerator_fini(void)
}

if (NULL != accelerator_event_htod_array) {
for (i = 0; i < accelerator_event_max; i++) {
for (i = 0; i < mca_pml_ob1_accelerator_events_max; i++) {
if (NULL != accelerator_event_htod_array[i]) {
OBJ_RELEASE(accelerator_event_htod_array[i]);
}
Expand All @@ -313,7 +312,7 @@ void mca_pml_ob1_accelerator_fini(void)
}

if (NULL != accelerator_event_dtoh_array) {
for (i = 0; i < accelerator_event_max; i++) {
for (i = 0; i < mca_pml_ob1_accelerator_events_max; i++) {
if (NULL != accelerator_event_dtoh_array[i]) {
OBJ_RELEASE(accelerator_event_dtoh_array[i]);
}
Expand Down
7 changes: 7 additions & 0 deletions ompi/mca/pml/ob1/pml_ob1_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ static int mca_pml_ob1_component_fini(void);
int mca_pml_ob1_output = 0;
static int mca_pml_ob1_verbose = 0;
bool mca_pml_ob1_matching_protection = false;
int mca_pml_ob1_accelerator_events_max = 400;

mca_pml_base_component_2_1_0_t mca_pml_ob1_component = {
/* First, the mca_base_component_t struct containing meta
Expand Down Expand Up @@ -242,6 +243,12 @@ static int mca_pml_ob1_component_register(void)
MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS,
mca_pml_ob1_get_posted_recvq_size, NULL, mca_pml_ob1_comm_size_notify, NULL);

mca_pml_ob1_accelerator_events_max = 400;
(void) mca_base_component_var_register(&mca_pml_ob1_component.pmlm_version, "accelerator_events_max",
"Number of events created by the ob1 component internally",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY, &mca_pml_ob1_accelerator_events_max);

return OMPI_SUCCESS;
}

Expand Down

0 comments on commit af6c9c1

Please sign in to comment.