Skip to content

Commit

Permalink
Add acoll collective component
Browse files Browse the repository at this point in the history
acoll is a collective component optimized for AMD "Zen"-based
processors. It supports Bcast, Allreduce, Reduce, Barrier, Gather and
Allgather APIs.

Signed-off-by: Nithya V S <[email protected]>
  • Loading branch information
amd-nithyavs committed Jul 16, 2024
1 parent 1b95379 commit 70e653d
Show file tree
Hide file tree
Showing 15 changed files with 4,189 additions and 0 deletions.
1 change: 1 addition & 0 deletions LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ Copyright (c) 2020-2021 Cornelis Networks, Inc. All rights reserved.
Copyright (c) 2021 Nanook Consulting
Copyright (c) 2017-2019 Iowa State University Research Foundation, Inc.
All rights reserved.
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.

$COPYRIGHT$

Expand Down
45 changes: 45 additions & 0 deletions ompi/mca/coll/acoll/Makefile.am
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#

# Extra preprocessor flags for this component.
AM_CPPFLAGS = $(coll_acoll_CPPFLAGS)

# Headers and sources that make up the acoll component.
sources = \
        coll_acoll.h \
        coll_acoll_utils.h \
        coll_acoll_allgather.c \
        coll_acoll_bcast.c \
        coll_acoll_gather.c \
        coll_acoll_reduce.c \
        coll_acoll_allreduce.c \
        coll_acoll_barrier.c \
        coll_acoll_component.c \
        coll_acoll_module.c

# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).

if MCA_BUILD_ompi_coll_acoll_DSO
component_noinst =
component_install = mca_coll_acoll.la
else
component_noinst = libmca_coll_acoll.la
component_install =
endif

# DSO build: the module is installed into $(ompilibdir) and linked
# against the MPI library whose name is substituted by configure
# through @OMPI_LIBMPI_NAME@.
mcacomponentdir = $(ompilibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_coll_acoll_la_SOURCES = $(sources)
mca_coll_acoll_la_LDFLAGS = -module -avoid-version $(coll_acoll_LDFLAGS)
mca_coll_acoll_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la $(coll_acoll_LIBS)

# Static build: a non-installed library built from the same sources.
noinst_LTLIBRARIES = $(component_noinst)
libmca_coll_acoll_la_SOURCES = $(sources)
libmca_coll_acoll_la_LIBADD = $(coll_acoll_LIBS)
libmca_coll_acoll_la_LDFLAGS = -module -avoid-version $(coll_acoll_LDFLAGS)
16 changes: 16 additions & 0 deletions ompi/mca/coll/acoll/README
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.

$COPYRIGHT$

Additional copyrights may follow

$HEADER$

===========================================================================

The collective component, AMD Coll (“acoll”), is a high-performance MPI collective component for the OpenMPI library that is optimized for AMD "Zen"-based processors. “acoll” is optimized for communications within a single node of AMD “Zen”-based processors and provides the following commonly used collective algorithms: broadcast (MPI_Bcast), allreduce (MPI_Allreduce), reduce (MPI_Reduce), gather (MPI_Gather), allgather (MPI_Allgather), and barrier (MPI_Barrier).

At present, “acoll” has been tested with OpenMPI v5.0.2 and can be built as part of OpenMPI.

To run an application with acoll, use the following command-line parameters:
- mpirun <common mpi runtime parameters> --mca coll acoll,tuned,libnbc,basic --mca coll_acoll_priority 40 <executable>
225 changes: 225 additions & 0 deletions ompi/mca/coll/acoll/coll_acoll.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/

#ifndef MCA_COLL_ACOLL_EXPORT_H
#define MCA_COLL_ACOLL_EXPORT_H

#include "ompi_config.h"

#include "mpi.h"
#include "ompi/communicator/communicator.h"
#include "ompi/mca/coll/base/coll_base_functions.h"
#include "ompi/mca/coll/coll.h"
#include "ompi/mca/mca.h"
#include "ompi/request/request.h"

#ifdef HAVE_XPMEM_H
#include "opal/mca/rcache/base/base.h"
#include <xpmem.h>
#endif

#include "opal/mca/shmem/base/base.h"
#include "opal/mca/shmem/shmem.h"

BEGIN_C_DECLS

/* Globally exported variables */
/* Component descriptor for acoll; only declared here, defined in another translation unit. */
OMPI_DECLSPEC extern const mca_coll_base_component_3_0_0_t mca_coll_acoll_component;
/*
 * Component-wide integer settings; only declared here, defined elsewhere.
 * NOTE(review): the names suggest these hold the values of the
 * coll_acoll_* MCA parameters (the README passes coll_acoll_priority on
 * the mpirun command line) -- confirm against the component source.
 */
extern int mca_coll_acoll_priority;
extern int mca_coll_acoll_sg_size;
extern int mca_coll_acoll_sg_scale;
extern int mca_coll_acoll_node_size;
extern int mca_coll_acoll_use_dynamic_rules;
extern int mca_coll_acoll_mnode_enable;
/* Bcast-related settings (per their names; exact meaning not visible here). */
extern int mca_coll_acoll_bcast_lin0;
extern int mca_coll_acoll_bcast_lin1;
extern int mca_coll_acoll_bcast_lin2;
extern int mca_coll_acoll_bcast_nonsg;
/* Allgather-related settings (per their names; exact meaning not visible here). */
extern int mca_coll_acoll_allgather_lin;
extern int mca_coll_acoll_allgather_ring_1;

/* API functions */

/*
 * Component query and module set-up entry points.
 * NOTE(review): presumably called by the coll framework while selecting
 * components for a communicator -- confirm against
 * coll_acoll_component.c / coll_acoll_module.c.
 */
int mca_coll_acoll_init_query(bool enable_progress_threads, bool enable_mpi_threads);

mca_coll_base_module_t *mca_coll_acoll_comm_query(struct ompi_communicator_t *comm,
                                                  int *priority);

int mca_coll_acoll_module_enable(mca_coll_base_module_t *module,
                                 struct ompi_communicator_t *comm);

/*
 * Collective operations provided by acoll: allgather, bcast, gather,
 * reduce, allreduce and barrier.  Every function receives the
 * communicator and the module it operates on and returns an int
 * (presumably an OMPI status code -- confirm in the implementations).
 */
int mca_coll_acoll_allgather(const void *sbuf, size_t scount,
                             struct ompi_datatype_t *sdtype,
                             void *rbuf, size_t rcount,
                             struct ompi_datatype_t *rdtype,
                             struct ompi_communicator_t *comm,
                             mca_coll_base_module_t *module);

int mca_coll_acoll_bcast(void *buff, size_t count,
                         struct ompi_datatype_t *datatype, int root,
                         struct ompi_communicator_t *comm,
                         mca_coll_base_module_t *module);

int mca_coll_acoll_gather_intra(const void *sbuf, size_t scount,
                                struct ompi_datatype_t *sdtype,
                                void *rbuf, size_t rcount,
                                struct ompi_datatype_t *rdtype, int root,
                                struct ompi_communicator_t *comm,
                                mca_coll_base_module_t *module);

int mca_coll_acoll_reduce_intra(const void *sbuf, void *rbuf, size_t count,
                                struct ompi_datatype_t *dtype,
                                struct ompi_op_t *op, int root,
                                struct ompi_communicator_t *comm,
                                mca_coll_base_module_t *module);

int mca_coll_acoll_allreduce_intra(const void *sbuf, void *rbuf, size_t count,
                                   struct ompi_datatype_t *dtype,
                                   struct ompi_op_t *op,
                                   struct ompi_communicator_t *comm,
                                   mca_coll_base_module_t *module);

int mca_coll_acoll_barrier_intra(struct ompi_communicator_t *comm,
                                 mca_coll_base_module_t *module);

END_C_DECLS

/* Capacity of the subc[] array in struct mca_coll_acoll_module_t. */
#define MCA_COLL_ACOLL_MAX_CID 100
/*
 * Threshold on root changes.
 * NOTE(review): presumably compared against num_root_change in
 * coll_acoll_subcomms_t; the use site is not in this header -- confirm.
 */
#define MCA_COLL_ACOLL_ROOT_CHANGE_THRESH 10

/*
 * Values accepted for the sg_size member of mca_coll_acoll_module_t.
 * NOTE(review): "SG" presumably abbreviates "subgroup" -- confirm.
 */
enum MCA_COLL_ACOLL_SG_SIZES {
    MCA_COLL_ACOLL_SG_SIZE_1 = 8,
    MCA_COLL_ACOLL_SG_SIZE_2 = 16
};
typedef enum MCA_COLL_ACOLL_SG_SIZES MCA_COLL_ACOLL_SG_SIZES;

/*
 * Values accepted for the sg_scale member of mca_coll_acoll_module_t.
 * The five constants are the powers of two from 1 to 16.
 */
enum MCA_COLL_ACOLL_SG_SCALES {
    MCA_COLL_ACOLL_SG_SCALE_1 = 1,
    MCA_COLL_ACOLL_SG_SCALE_2 = 2,
    MCA_COLL_ACOLL_SG_SCALE_3 = 4,
    MCA_COLL_ACOLL_SG_SCALE_4 = 8,
    MCA_COLL_ACOLL_SG_SCALE_5 = 16
};
typedef enum MCA_COLL_ACOLL_SG_SCALES MCA_COLL_ACOLL_SG_SCALES;

/*
 * Identifiers for the kinds of sub-communicator acoll distinguishes.
 * Values are consecutive starting at 0, so MCA_COLL_ACOLL_NUM_SC equals
 * the number of identifiers that precede it.
 */
enum MCA_COLL_ACOLL_SUBCOMMS {
    MCA_COLL_ACOLL_NODE_L = 0,
    MCA_COLL_ACOLL_INTRA,   /* 1 */
    MCA_COLL_ACOLL_SOCK_L,  /* 2 */
    MCA_COLL_ACOLL_NUMA_L,  /* 3 */
    MCA_COLL_ACOLL_L3_L,    /* 4 */
    MCA_COLL_ACOLL_LEAF,    /* 5 */
    MCA_COLL_ACOLL_NUM_SC   /* 6: count, keep last */
};
typedef enum MCA_COLL_ACOLL_SUBCOMMS MCA_COLL_ACOLL_SUBCOMMS;

/*
 * Layer index.  MCA_COLL_ACOLL_NUM_LAYERS sizes local_root[] and the
 * second dimension of base_comm[][] / base_root[][] in
 * coll_acoll_subcomms_t.
 */
enum MCA_COLL_ACOLL_LAYERS {
    MCA_COLL_ACOLL_LYR_NODE = 0,
    MCA_COLL_ACOLL_LYR_SOCKET,  /* 1 */
    MCA_COLL_ACOLL_NUM_LAYERS   /* 2: count, keep last */
};
typedef enum MCA_COLL_ACOLL_LAYERS MCA_COLL_ACOLL_LAYERS;

/*
 * Base-layer index.  MCA_COLL_ACOLL_NUM_BASE_LYRS sizes base_rank[] and
 * the first dimension of base_comm[][] / base_root[][] in
 * coll_acoll_subcomms_t.
 */
enum MCA_COLL_ACOLL_BASE_LYRS {
    MCA_COLL_ACOLL_L3CACHE = 0,
    MCA_COLL_ACOLL_NUMA,          /* 1 */
    MCA_COLL_ACOLL_NUM_BASE_LYRS  /* 2: count, keep last */
};
typedef enum MCA_COLL_ACOLL_BASE_LYRS MCA_COLL_ACOLL_BASE_LYRS;

/*
 * Auxiliary data referenced through the `data` pointer of
 * coll_acoll_subcomms_t.
 * NOTE(review): field meanings below are inferred from names only;
 * confirm against the code that fills this structure.
 */
struct coll_acoll_data {
#ifdef HAVE_XPMEM_H
    /* XPMEM-only members (compiled in when xpmem.h is available). */
    xpmem_segid_t *allseg_id;
    xpmem_apid_t *all_apid;
    void **allshm_sbuf;
    void **allshm_rbuf;
    void **xpmem_saddr;
    void **xpmem_raddr;
    mca_rcache_base_module_t **rcache;
    void *scratch;
#endif
    /* OPAL shmem descriptors and associated buffer pointers. */
    opal_shmem_ds_t *allshmseg_id;
    void **allshmmmap_sbuf;

    int comm_size;
    /* Two levels ("l1"/"l2") of local rank and group bookkeeping. */
    int l1_local_rank;
    int l2_local_rank;
    int l1_gp_size;
    int *l1_gp;
    int *l2_gp;
    int l2_gp_size;
    int offset[4];
    int sync[2];
};
typedef struct coll_acoll_data coll_acoll_data_t;

/*
 * Sub-communicator bookkeeping; mca_coll_acoll_module_t holds an array
 * of MCA_COLL_ACOLL_MAX_CID of these (member subc).
 * NOTE(review): the grouping comments below are inferred from member
 * names; confirm against the code that initializes the structure.
 */
struct coll_acoll_subcomms {
    /* Communicator handles. */
    ompi_communicator_t *local_comm;
    ompi_communicator_t *local_r_comm;
    ompi_communicator_t *leader_comm;
    ompi_communicator_t *subgrp_comm;
    ompi_communicator_t *numa_comm;
    ompi_communicator_t *base_comm[MCA_COLL_ACOLL_NUM_BASE_LYRS][MCA_COLL_ACOLL_NUM_LAYERS];
    ompi_communicator_t *orig_comm;
    ompi_communicator_t *socket_comm;
    ompi_communicator_t *socket_ldr_comm;

    /* Topology sizes and "is root" flags. */
    int num_nodes;
    int derived_node_size;
    int is_root_node;
    int is_root_sg;
    int is_root_numa;
    int is_root_socket;

    /* Root ranks within the various sub-communicators. */
    int local_root[MCA_COLL_ACOLL_NUM_LAYERS];
    int outer_grp_root;
    int subgrp_root;
    int numa_root;
    int socket_ldr_root;
    int base_root[MCA_COLL_ACOLL_NUM_BASE_LYRS][MCA_COLL_ACOLL_NUM_LAYERS];
    int base_rank[MCA_COLL_ACOLL_NUM_BASE_LYRS];
    int socket_rank;
    int subgrp_size;

    /* Initialization state and root-change tracking. */
    int initialized;
    int prev_init_root;
    int num_root_change;

    ompi_communicator_t *numa_comm_ldrs;
    ompi_communicator_t *node_comm;
    ompi_communicator_t *inter_comm;
    int cid;
    coll_acoll_data_t *data;
    bool initialized_data;
    bool initialized_shm_data;
#ifdef HAVE_XPMEM_H
    /* XPMEM-only members (compiled in when xpmem.h is available). */
    uint64_t xpmem_buf_size;
    int without_xpmem;
    int xpmem_use_sr_buf;
#endif
};
typedef struct coll_acoll_subcomms coll_acoll_subcomms_t;

/*
 * Descriptor for a reserved memory buffer: a pointer, its size, and two
 * flags.  mca_coll_acoll_module_t embeds one instance (reserve_mem_s).
 * NOTE(review): flag semantics are inferred from the names
 * ("allocate" / "in use"); confirm against the users of this struct.
 */
struct coll_acoll_reserve_mem {
    void *reserve_mem;
    uint64_t reserve_mem_size;
    bool reserve_mem_allocate;
    bool reserve_mem_in_use;
};
typedef struct coll_acoll_reserve_mem coll_acoll_reserve_mem_t;

/*
 * acoll module object.  The first member is the base coll module
 * (`super`); the remaining members hold acoll-specific configuration,
 * the sub-communicator table and the reserved-memory descriptor.
 * NOTE(review): the per-field groupings are inferred from names --
 * confirm against coll_acoll_module.c.
 */
struct mca_coll_acoll_module_t {
    mca_coll_base_module_t super;

    /* Subgroup configuration. */
    MCA_COLL_ACOLL_SG_SIZES sg_size;
    MCA_COLL_ACOLL_SG_SCALES sg_scale;
    int sg_cnt;
    // Todo: Remove log2 variables
    int log2_sg_cnt;
    int node_cnt;
    int log2_node_cnt;
    int use_dyn_rules;

    // Todo: Use substructure for every API related ones
    int use_mnode;
    int use_lin0;
    int use_lin1;
    int use_lin2;
    int mnode_sg_size;
    int mnode_log2_sg_size;
    int allg_lin;
    int allg_ring;

    /* One entry per slot, MCA_COLL_ACOLL_MAX_CID slots in total. */
    coll_acoll_subcomms_t subc[MCA_COLL_ACOLL_MAX_CID];
    coll_acoll_reserve_mem_t reserve_mem_s;
};

#ifdef HAVE_XPMEM_H
/*
 * rcache registration record extended with one extra pointer.
 * `base` is the generic registration and is the first member;
 * `xpmem_vaddr` is an additional address stored alongside it
 * (presumably the locally attached XPMEM address -- confirm at the
 * registration call sites).
 */
struct acoll_xpmem_rcache_reg_t {
    mca_rcache_base_registration_t base;
    void *xpmem_vaddr;
};
#endif /* HAVE_XPMEM_H */

/* Short type name for struct mca_coll_acoll_module_t. */
typedef struct mca_coll_acoll_module_t mca_coll_acoll_module_t;
/* Class declaration for the module type via OBJ_CLASS_DECLARATION; the matching definition is not in this header. */
OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_coll_acoll_module_t);

#endif /* MCA_COLL_ACOLL_EXPORT_H */
Loading

0 comments on commit 70e653d

Please sign in to comment.