diff --git a/.clangd b/.clangd new file mode 100644 index 00000000..fd82f096 --- /dev/null +++ b/.clangd @@ -0,0 +1,17 @@ +CompileFlags: + # 1. Remove NVCC-specific flags that Clang doesn't understand + Remove: + - -forward-unknown-to-host-compiler + - -gencode* + - --generate-code* + - --options-file* + - -Xcompiler* + - --compiler-options* + - -ccbin* + - -restrict + - --expt-extended-lambda + - -rdc* + Add: + - -xcuda + - --cuda-path=/usr/local/cuda + - -std=c++17 # Ensure this matches your project's standard diff --git a/cmake/Options.cmake b/cmake/Options.cmake index df8cc4cd..f3d836da 100644 --- a/cmake/Options.cmake +++ b/cmake/Options.cmake @@ -7,11 +7,16 @@ cmake_dependent_option(TRIBOL_USE_CUDA "Enables Tribol with CUDA support" ON "ENABLE_CUDA" OFF) cmake_dependent_option(TRIBOL_USE_HIP "Enables Tribol with HIP support" ON "ENABLE_HIP" OFF) cmake_dependent_option(TRIBOL_USE_MPI "Enables MPI in Tribol" ON "ENABLE_MPI" OFF) +cmake_dependent_option(TRIBOL_USE_GPU_MPI "Enables GPU-aware MPI in Tribol" ON "ENABLE_GPU_MPI" OFF) cmake_dependent_option(TRIBOL_USE_OPENMP "Enables Tribol with OpenMP support" ON "ENABLE_OPENMP" OFF) cmake_dependent_option(TRIBOL_ENABLE_TESTS "Enables Tribol Tests" ON "ENABLE_TESTS" OFF) cmake_dependent_option(TRIBOL_ENABLE_EXAMPLES "Enables Tribol Examples" ON "ENABLE_EXAMPLES" OFF) cmake_dependent_option(TRIBOL_ENABLE_DOCS "Enables Tribol Docs" ON "ENABLE_DOCS" OFF) +if(TRIBOL_USE_GPU_MPI AND NOT (TRIBOL_USE_CUDA OR TRIBOL_USE_HIP)) + message(FATAL_ERROR "TRIBOL_USE_GPU_MPI requires either TRIBOL_USE_CUDA or TRIBOL_USE_HIP") +endif() + option(TRIBOL_USE_SINGLE_PRECISION "Use single-precision floating point" OFF) option(TRIBOL_USE_64BIT_INDEXTYPE "Use 64-bit index type" OFF) diff --git a/cmake/SetupThirdParty.cmake b/cmake/SetupThirdParty.cmake index 9c328246..b989fec8 100644 --- a/cmake/SetupThirdParty.cmake +++ b/cmake/SetupThirdParty.cmake @@ -18,6 +18,9 @@ include(CMakeFindDependencyMacro) #------------------------------------------------------------------------------ if(TRIBOL_USE_CUDA) set(tribol_device_depends blt::cuda CACHE STRING "" FORCE) + + # This lets clangd see includes with CUDA builds + set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_INCLUDES OFF) endif() if(TRIBOL_USE_HIP) set(tribol_device_depends blt::hip CACHE STRING "" FORCE) diff --git a/cmake/tribol-config.cmake.in b/cmake/tribol-config.cmake.in index aafaac12..dc8f4c8f 100644 --- a/cmake/tribol-config.cmake.in +++ b/cmake/tribol-config.cmake.in @@ -35,6 +35,7 @@ if(NOT TRIBOL_FOUND) # Config options set(TRIBOL_USE_SINGLE_PRECISION "@TRIBOL_USE_SINGLE_PRECISION@") set(TRIBOL_USE_64BIT_INDEXTYPE "@TRIBOL_USE_64BIT_INDEXTYPE@") + set(TRIBOL_USE_GPU_MPI "@TRIBOL_USE_GPU_MPI@") # TPLs set(TRIBOL_USE_UMPIRE "@TRIBOL_USE_UMPIRE@") diff --git a/scripts/spack/configs/linux_ubuntu_24/spack.yaml b/scripts/spack/configs/linux_ubuntu_24/spack.yaml index d7aaccb3..dfb89a57 100644 --- a/scripts/spack/configs/linux_ubuntu_24/spack.yaml +++ b/scripts/spack/configs/linux_ubuntu_24/spack.yaml @@ -32,8 +32,6 @@ spack: when: '%cxx' - spec: '%fortran=gcc' when: '%fortran' - - spec: '%openmpi@4.1.6' - when: '%mpi' gcc_13: - spec: '%c=gcc' when: '%c' @@ -41,8 +39,6 @@ spack: when: '%cxx' - spec: '%fortran=gcc' when: '%fortran' - - spec: '%openmpi@4.1.6' - when: '%mpi' packages: #Compiler packages @@ -74,8 +70,6 @@ spack: buildable: false libllvm: buildable: false - mpi: - buildable: false D: buildable: false golang: @@ -84,99 +78,114 @@ spack: buildable: false c: buildable: false - grep: + autoconf: externals: - - spec: grep@3.11 + - spec: autoconf@2.71 prefix: /usr buildable: false - openssh: + automake: externals: - - spec: openssh@9.6p1 + - spec: automake@1.16.5 prefix: /usr buildable: false - hwloc: + bash: externals: - - spec: hwloc@2.10.0 + - spec: bash@5.2.21 prefix: /usr buildable: false - groff: + bzip2: externals: - - spec: groff@1.23.0 + - spec: bzip2@1.0.8 prefix: /usr buildable: false - file: + binutils: externals: - - spec: file@5.45 + - spec: binutils@2.42+gold~headers prefix: /usr buildable: false - lua: + cmake: externals: - - spec: lua@5.2 + - spec: cmake@3.28.3 prefix: /usr buildable: false - unzip: + coreutils: externals: - - spec: unzip@6.0 + - spec: coreutils@9.4 prefix: /usr buildable: false - ncurses: + cuda: externals: - - spec: ncurses@6.4.20240113+termlib abi=6 + - spec: cuda@12.8.93 + prefix: /usr/local/cuda + buildable: false + curl: + externals: + - spec: curl@8.5.0+gssapi+ldap+nghttp2 prefix: /usr buildable: false - openssl: + diffutils: externals: - - spec: openssl@3.0.13 + - spec: diffutils@3.10 prefix: /usr buildable: false - binutils: + file: externals: - - spec: binutils@2.42+gold~headers + - spec: file@5.45 prefix: /usr buildable: false - git: + findutils: externals: - - spec: git@2.43.0~tcltk + - spec: findutils@4.9.0 prefix: /usr buildable: false - bash: + flex: externals: - - spec: bash@5.2.21 + - spec: flex@2.6.4 prefix: /usr buildable: false - coreutils: + gawk: externals: - - spec: coreutils@9.4 + - spec: gawk@5.2.1 prefix: /usr buildable: false - cuda: + gettext: externals: - - spec: cuda@12.8.93 - prefix: /usr/local/cuda + - spec: gettext@0.21 + prefix: /usr + buildable: false + git: + externals: + - spec: git@2.43.0~tcltk + prefix: /usr + buildable: false + gmake: + externals: + - spec: gmake@4.3 + prefix: /usr buildable: false gnupg: externals: - spec: gnupg@2.4.4 prefix: /usr buildable: false - diffutils: + grep: externals: - - spec: diffutils@3.10 + - spec: grep@3.11 prefix: /usr buildable: false - rsync: + groff: externals: - - spec: rsync@3.2.7 + - spec: groff@1.23.0 prefix: /usr buildable: false - gmake: + hwloc: externals: - - spec: gmake@4.3 + - spec: hwloc@2.10.0 prefix: /usr buildable: false - cmake: + libevent: externals: - - spec: cmake@3.28.3 + - spec: libevent@2.1.12 prefix: /usr buildable: false libfuse: @@ -184,39 +193,44 @@ spack: - spec: libfuse@3.14.0 prefix: /usr buildable: false - python: + libtool: externals: - - spec: python@3.12.3+bz2+crypt+ctypes+dbm+lzma+pyexpat~pythoncmd+readline+sqlite3+ssl~tkinter+uuid+zlib + - spec: libtool@2.4.7 prefix: /usr buildable: false - bzip2: + lua: externals: - - spec: bzip2@1.0.8 + - spec: lua@5.2 prefix: /usr buildable: false - xz: + m4: externals: - - spec: xz@5.4.5 + - spec: m4@1.4.19 prefix: /usr buildable: false - tar: + ncurses: externals: - - spec: tar@1.35 + - spec: ncurses@6.4.20240113+termlib abi=6 prefix: /usr buildable: false - gettext: + numactl: externals: - - spec: gettext@0.21 + - spec: numactl@2.0.18 prefix: /usr buildable: false - openmpi: + openssh: externals: - - spec: openmpi@4.1.6~cuda+cxx~cxx_exceptions+java~memchecker+pmi~static~wrapper-rpath fabrics=ofi,psm,psm2,ucx schedulers=slurm + - spec: openssh@9.6p1 prefix: /usr buildable: false - curl: + openssl: externals: - - spec: curl@8.5.0+gssapi+ldap+nghttp2 + - spec: openssl@3.0.13 + prefix: /usr + buildable: false + pkgconf: + externals: + - spec: pkgconf@1.8.1 prefix: /usr buildable: false perl: @@ -224,23 +238,38 @@ spack: - spec: perl@5.38.2~cpanm+opcode+open+shared+threads prefix: /usr buildable: false + pmix: + externals: + - spec: pmix@5.0.1 + prefix: /usr + buildable: false + python: + externals: + - spec: python@3.12.3+bz2+crypt+ctypes+dbm+lzma+pyexpat~pythoncmd+readline+sqlite3+ssl~tkinter+uuid+zlib + prefix: /usr + buildable: false + rsync: + externals: + - spec: rsync@3.2.7 + prefix: /usr + buildable: false sed: externals: - spec: sed@4.9 prefix: /usr buildable: false - findutils: + tar: externals: - - spec: findutils@4.9.0 + - spec: tar@1.35 prefix: /usr buildable: false - gawk: + unzip: externals: - - spec: gawk@5.2.1 + - spec: unzip@6.0 prefix: /usr buildable: false - elfutils: + xz: externals: - - spec: elfutils@0.190 + - spec: xz@5.4.5 prefix: /usr buildable: false diff --git a/scripts/spack/configs/versions.yaml b/scripts/spack/configs/versions.yaml index 7ebda5d8..4f0203c9 100644 --- a/scripts/spack/configs/versions.yaml +++ b/scripts/spack/configs/versions.yaml @@ -20,7 +20,7 @@ packages: - spec: "@2.26.0" mfem: require: - - spec: "@4.8.0.1" + - spec: "@4.9.0.2" petsc: require: - spec: "@3.21.6" diff --git a/scripts/spack/packages/mfem/package.py b/scripts/spack/packages/mfem/package.py index aadaba4f..ea5e9522 100644 --- a/scripts/spack/packages/mfem/package.py +++ b/scripts/spack/packages/mfem/package.py @@ -11,7 +11,7 @@ class Mfem(BuiltinMfem): # Note: Make sure this sha coincides with the git submodule # Note: We add a number to the end of the real version number to indicate that we have # moved forward past the release. Increment the last number when updating the commit sha. - version("4.9.0", commit="d9d6526cc1749980a2ba1da16e2c1ca1e07d82ec") + version("4.9.0.2", commit="72805e340996a73179082bbe1c7bfb785c2afb47") variant('asan', default=False, description='Add Address Sanitizer flags') diff --git a/scripts/spack/specs.json b/scripts/spack/specs.json index d370b5e5..cf0158e7 100644 --- a/scripts/spack/specs.json +++ b/scripts/spack/specs.json @@ -28,6 +28,7 @@ "linux_ubuntu_24": [ "+raja+umpire+enzyme+profiling ^hypre+int64 %clang_19", - "+cuda+raja+umpire+enzyme+profiling cuda_arch=86 ^hypre+int64 %clang_19" ] + "+cuda+raja+umpire+enzyme cuda_arch=89 ^openmpi+cuda cuda_arch=89 fabrics=ucx ^ucx+cuda cuda_arch=89 ^hypre+int64 %clang_19" + ] } diff --git a/src/redecomp/CMakeLists.txt b/src/redecomp/CMakeLists.txt index 5d94adda..1e205ff4 100644 --- a/src/redecomp/CMakeLists.txt +++ b/src/redecomp/CMakeLists.txt @@ -10,6 +10,7 @@ set( redecomp_headers MultiRedecomp.hpp RedecompMesh.hpp RedecompTransfer.hpp + common/CoordList.hpp common/TypeDefs.hpp partition/PartitionElements.hpp partition/PartitionEntity.hpp diff --git a/src/redecomp/RedecompMesh.cpp b/src/redecomp/RedecompMesh.cpp index c2f97b08..27614b5d 100644 --- a/src/redecomp/RedecompMesh.cpp +++ b/src/redecomp/RedecompMesh.cpp @@ -164,7 +164,7 @@ void RedecompMesh::BuildRedecomp() auto vert_idx_map = std::unordered_map(); auto vert_ct = 0; for ( int r{ 0 }; r < n_ranks; ++r ) { - for ( int v{ 0 }; v < r2p_vert_idx[r].size(); ++v ) { + for ( int v{ 0 }; v < r2p_vert_idx[r].Size(); ++v ) { auto vert_idx_it = vert_idx_map.emplace( r2p_vert_idx[r][v], vert_ct ); r2p_vert_idx[r][v] = vert_idx_it.first->second; if ( vert_idx_it.second ) { @@ -175,11 +175,11 @@ void RedecompMesh::BuildRedecomp() // send vertex coords from parent to redecomp p2r_verts = vertex_transfer.P2RNodeList( false ); - auto redecomp_coords = MPIArray( &mpi_ ); + auto redecomp_coords = MPIArray>( &mpi_ ); redecomp_coords.SendRecvEach( [this, &p2r_verts]( int dest ) { auto parent_coords = axom::Array(); const auto& vert_idx = p2r_verts.first[dest]; - auto n_coords = vert_idx.size(); + auto n_coords = vert_idx.Size(); parent_coords.reserve( 3 * n_coords ); parent_coords.resize( n_coords, 3 ); for ( int v{ 0 }; v < n_coords; ++v ) { @@ -194,7 +194,7 @@ void RedecompMesh::BuildRedecomp() NumOfVertices = vert_idx_map.size(); vertices.SetSize( NumOfVertices ); for ( int r{ 0 }; r < n_ranks; ++r ) { - for ( int v{ 0 }; v < r2p_vert_idx[r].size(); ++v ) { + for ( int v{ 0 }; v < r2p_vert_idx[r].Size(); ++v ) { for ( int d{ 0 }; d < parent_.SpaceDimension(); ++d ) { vertices[r2p_vert_idx[r][v]]( d ) = redecomp_coords[r]( v, d ); } @@ -205,17 +205,18 @@ void RedecompMesh::BuildRedecomp() // Send and receive element types auto redecomp_etypes = MPIArray( &mpi_ ); redecomp_etypes.SendRecvEach( [this]( int dest ) { - auto parent_etypes = axom::Array( p2r_elems_.first[dest].size(), p2r_elems_.first[dest].size() ); - for ( int e{ 0 }; e < p2r_elems_.first[dest].size(); ++e ) { + auto parent_etypes = MPIArray::ArrayT( p2r_elems_.first[dest].Size() ); + for ( int e{ 0 }; e < p2r_elems_.first[dest].Size(); ++e ) { parent_etypes[e] = parent_.GetElementType( p2r_elems_.first[dest][e] ); } return parent_etypes; } ); // Find length of element connectivities - auto parent_tot_conns = axom::Array( n_ranks, n_ranks ); + auto parent_tot_conns = mfem::Array( n_ranks ); + parent_tot_conns = 0; for ( int r{ 0 }; r < n_ranks; ++r ) { const auto& elem_idx = p2r_elems_.first[r]; - auto n_elems = elem_idx.size(); + auto n_elems = elem_idx.Size(); for ( int e{ 0 }; e < n_elems; ++e ) { parent_tot_conns[r] += parent_.GetElement( elem_idx[e] )->GetNVertices(); } @@ -223,10 +224,10 @@ void RedecompMesh::BuildRedecomp() // Send and receive element connectivities auto redecomp_conns = MPIArray( &mpi_ ); redecomp_conns.SendRecvEach( [this, &parent_tot_conns, &parent_vertex_fes]( int dest ) { - auto parent_conns = axom::Array( parent_tot_conns[dest], parent_tot_conns[dest] ); + auto parent_conns = MPIArray::ArrayT( parent_tot_conns[dest] ); parent_tot_conns[dest] = 0; const auto& elem_idx = p2r_elems_.first[dest]; - for ( int e{ 0 }; e < elem_idx.size(); ++e ) { + for ( int e{ 0 }; e < elem_idx.Size(); ++e ) { mfem::Array elem_conn; parent_.GetElement( elem_idx[e] )->GetVertices( elem_conn ); for ( int v{ 0 }; v < elem_conn.Size(); ++v ) { @@ -240,8 +241,8 @@ void RedecompMesh::BuildRedecomp() auto redecomp_attribs = MPIArray( &mpi_ ); redecomp_attribs.SendRecvEach( [this]( int dest ) { const auto& elem_idx = p2r_elems_.first[dest]; - auto n_elems = elem_idx.size(); - auto parent_attribs = axom::Array( n_elems, n_elems ); + auto n_elems = elem_idx.Size(); + auto parent_attribs = MPIArray::ArrayT( n_elems ); for ( int e{ 0 }; e < n_elems; ++e ) { parent_attribs[e] = parent_.GetElement( elem_idx[e] )->GetAttribute(); } @@ -251,13 +252,13 @@ void RedecompMesh::BuildRedecomp() // NOTE: Don't use NumOfElements. AddElement() will increment it. auto n_els = 0; for ( const auto& redecomp_etype : redecomp_etypes ) { - n_els += redecomp_etype.size(); + n_els += redecomp_etype.Size(); } elements.SetSize( n_els ); // Create elements for ( int r{ 0 }; r < n_ranks; ++r ) { auto conn_ct = 0; - for ( int e{ 0 }; e < redecomp_etypes[r].size(); ++e ) { + for ( int e{ 0 }; e < redecomp_etypes[r].Size(); ++e ) { auto el = NewElement( redecomp_etypes[r][e] ); el->SetVertices( &redecomp_conns[r][conn_ct] ); for ( int k{ 0 }; k < el->GetNVertices(); ++k ) { @@ -278,9 +279,9 @@ void RedecompMesh::BuildRedecomp() r2p_elem_offsets_.reserve( n_ranks + 1 ); r2p_elem_offsets_.resize( n_ranks + 1 ); mpi_.SendRecvEach( - type>(), - [this]( int dest ) { return axom::Array( { static_cast( p2r_elems_.first[dest].size() ) } ); }, - [this]( axom::Array&& recv_data, int source ) { r2p_elem_offsets_[source + 1] = recv_data[0]; } ); + type>(), + [this]( int dest ) { return mfem::Array( { static_cast( p2r_elems_.first[dest].Size() ) } ); }, + [this]( mfem::Array&& recv_data, int source ) { r2p_elem_offsets_[source + 1] = recv_data[0]; } ); for ( int i{ 2 }; i < n_ranks + 1; ++i ) { r2p_elem_offsets_[i] += r2p_elem_offsets_[i - 1]; } @@ -289,8 +290,9 @@ void RedecompMesh::BuildRedecomp() // r2p = redecomp to parent r2p_ghost_elems_ = MPIArray( &mpi_ ); r2p_ghost_elems_.SendRecvEach( [this]( int dest ) { - auto n_elems = p2r_elems_.second[dest].size(); - auto parent_gelems = axom::Array( 0, n_elems ); + auto n_elems = p2r_elems_.second[dest].Size(); + auto parent_gelems = MPIArray::ArrayT( 0 ); + parent_gelems.Reserve( n_elems ); for ( int i{ 0 }; i < n_elems; ++i ) { if ( p2r_elems_.second[dest][i] ) { parent_gelems.push_back( i ); diff --git a/src/redecomp/RedecompTransfer.cpp b/src/redecomp/RedecompTransfer.cpp index d60925df..d922c1b5 100644 --- a/src/redecomp/RedecompTransfer.cpp +++ b/src/redecomp/RedecompTransfer.cpp @@ -52,23 +52,21 @@ void RedecompTransfer::TransferToSerial( const mfem::QuadratureFunction& src, mf TRIBOL_MARK_BEGIN( "Send and receive values over MPI" ); auto dst_vals = MPIArray( &redecomp->getMPIUtility() ); dst_vals.SendRecvEach( [redecomp, &src]( int dest ) { - auto src_vals = axom::Array(); + auto src_vals = MPIArray::ArrayT(); const auto& src_elem_idx = redecomp->getParentToRedecompElems().first[dest]; - auto n_els = src_elem_idx.size(); + auto n_els = src_elem_idx.Size(); if ( n_els > 0 ) { auto vals = mfem::Vector(); // guess the size of send_vals based on the size of the first element TRIBOL_MARK_BEGIN( "Getting element values" ); src.GetValues( src_elem_idx[0], vals ); TRIBOL_MARK_END( "Getting element values" ); - src_vals.reserve( vals.Size() * n_els ); - auto quadpt_ct = 0; + src_vals.Reserve( vals.Size() * n_els ); for ( auto src_elem_id : src_elem_idx ) { TRIBOL_MARK_BEGIN( "Getting element values" ); src.GetValues( src_elem_id, vals ); TRIBOL_MARK_END( "Getting element values" ); - src_vals.insert( quadpt_ct, vals.Size(), vals.GetData() ); - quadpt_ct += vals.Size(); + src_vals.Append( vals.GetData(), vals.Size() ); } } return src_vals; @@ -112,7 +110,7 @@ void RedecompTransfer::TransferToParallel( const mfem::QuadratureFunction& src, // send and receive quadrature point values from other ranks auto dst_vals = MPIArray( &redecomp->getMPIUtility() ); dst_vals.SendRecvEach( [redecomp, &src]( int dest ) { - auto src_vals = axom::Array(); + auto src_vals = MPIArray::ArrayT(); auto first_el = redecomp->getRedecompToParentElemOffsets()[dest]; auto last_el = redecomp->getRedecompToParentElemOffsets()[dest + 1]; auto n_els = last_el - first_el; @@ -120,18 +118,16 @@ void RedecompTransfer::TransferToParallel( const mfem::QuadratureFunction& src, auto vals = mfem::Vector(); // guess the size of send_vals based on the size of the first element src.GetValues( first_el, vals ); - src_vals.reserve( vals.Size() * n_els ); - auto quadpt_ct = 0; + src_vals.Reserve( vals.Size() * n_els ); auto ghost_ct = 0; for ( int e{ first_el }; e < last_el; ++e ) { // skip ghost elements - if ( ghost_ct < redecomp->getRedecompToParentGhostElems()[dest].size() && + if ( ghost_ct < redecomp->getRedecompToParentGhostElems()[dest].Size() && redecomp->getRedecompToParentGhostElems()[dest][ghost_ct] == e ) { ++ghost_ct; } else { src.GetValues( e, vals ); - src_vals.insert( quadpt_ct, vals.Size(), vals.GetData() ); - quadpt_ct += vals.Size(); + src_vals.Append( vals.GetData(), vals.Size() ); } } } @@ -144,7 +140,7 @@ void RedecompTransfer::TransferToParallel( const mfem::QuadratureFunction& src, auto n_ranks = redecomp->getMPIUtility().NRanks(); for ( int r{ 0 }; r < n_ranks; ++r ) { auto quadpt_ct = 0; - for ( int e{ 0 }; e < redecomp->getParentToRedecompElems().first[r].size(); ++e ) { + for ( int e{ 0 }; e < redecomp->getParentToRedecompElems().first[r].Size(); ++e ) { // skip ghost elements if ( !redecomp->getParentToRedecompElems().second[r][e] ) { dst.GetValues( redecomp->getParentToRedecompElems().first[r][e], vals ); diff --git a/src/redecomp/common/CoordList.hpp b/src/redecomp/common/CoordList.hpp new file mode 100644 index 00000000..97cc388e --- /dev/null +++ b/src/redecomp/common/CoordList.hpp @@ -0,0 +1,66 @@ +// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and +// other Tribol Project Developers. See the top-level LICENSE file for details. +// +// SPDX-License-Identifier: (MIT) + +#ifndef SRC_REDECOMP_COMMON_COORDLIST_HPP_ +#define SRC_REDECOMP_COMMON_COORDLIST_HPP_ + +#include "mfem.hpp" +#include "redecomp/common/TypeDefs.hpp" + +namespace redecomp { + +/** + * @brief CoordList class that holds a 2D mfem::Vector and provides access to points. + * + * @tparam NDIMS Number of dimensions + * @tparam ORDERING Element DOF ordering (LEXICOGRAPHIC or NATIVE) + */ +template +class CoordList { + public: + /** + * @brief Construct a new Coord List object + * + * @param coords Vector containing coordinates. Expected layout: ByVDIM + * (all x coordinates, then all y coordinates, etc.) + */ + CoordList( mfem::Vector coords ) : coords_( std::move( coords ) ) {} + + /** + * @brief Create a Point object for the k-th node + * + * @param k Index of the node + * @return Point + */ + Point GetPoint( int k ) const + { + Point pt; + for ( int d = 0; d < NDIMS; ++d ) { + pt[d] = coords_( k * NDIMS + d ); + } + return pt; + } + + /** + * @brief Get the underlying coordinate vector + * + * @return const mfem::Vector& + */ + const mfem::Vector& GetCoords() const { return coords_; } + + /** + * @brief Get the number of coordinates + * + * @return int + */ + int GetNumCoords() const { return coords_.Size() / NDIMS; } + + private: + mfem::Vector coords_; +}; + +} // namespace redecomp + +#endif /* SRC_REDECOMP_COMMON_COORDLIST_HPP_ */ diff --git a/src/redecomp/partition/PartitionElements.cpp b/src/redecomp/partition/PartitionElements.cpp index 4dfc8411..0a06e4e4 100644 --- a/src/redecomp/partition/PartitionElements.cpp +++ b/src/redecomp/partition/PartitionElements.cpp @@ -5,21 +5,65 @@ #include "PartitionElements.hpp" +#include "mfem.hpp" + namespace redecomp { template -std::vector>> PartitionElements::EntityCoordinates( +std::vector> PartitionElements::EntityCoordinates( const std::vector& par_meshes ) const { - auto elem_centroids = std::vector>>(); + // check if device is available to MFEM + auto elem_centroids = std::vector>(); elem_centroids.reserve( par_meshes.size() ); + for ( auto par_mesh : par_meshes ) { auto n_elems = par_mesh->GetNE(); - elem_centroids.emplace_back( n_elems, n_elems ); - for ( int i{ 0 }; i < n_elems; ++i ) { - auto vec_centroid = mfem::Vector( elem_centroids.back()[i].data(), NDIMS ); - // TODO: const version of GetElementCenter() - const_cast( par_mesh )->GetElementCenter( i, vec_centroid ); + if ( n_elems == 0 ) { + elem_centroids.emplace_back( mfem::Vector() ); + continue; + } + + auto compute_centroids = [&]( const mfem::ParGridFunction& coords ) { + mfem::Vector mesh_centroids( n_elems * NDIMS ); + mesh_centroids.UseDevice( coords.UseDevice() ); + // Create an IntegrationRule with a single point: the reference center + // Assuming a single element type in the mesh + mfem::Geometry::Type geom_type = par_mesh->GetElementBaseGeometry( 0 ); + mfem::IntegrationRule ir; + ir.Append( mfem::Geometries.GetCenter( geom_type ) ); + + // Setup QuadratureInterpolator. We want to evaluate the 'nodes' GridFunction at the integration points (centers). + auto fes = coords.FESpace(); + const mfem::QuadratureInterpolator* qi = fes->GetQuadratureInterpolator( ir ); + + // We need the ElementRestriction to convert the global 'nodes' vector to E-vector. + // We use LEXICOGRAPHIC ordering which is required for Tensor product evaluation + // often used on device for quads/hexes. + const mfem::Operator* er = fes->GetElementRestriction( mfem::ElementDofOrdering::LEXICOGRAPHIC ); + + mfem::Vector e_vec( er->Height() ); + + // Perform the calculation on device. + // a) Global to Element (E-vector) + er->Mult( coords, e_vec ); + // b) Interpolate to Quadrature points (Q-vector) + qi->Values( e_vec, mesh_centroids ); + + return mesh_centroids; + }; + + auto coords = dynamic_cast( par_mesh->GetNodes() ); + if ( coords ) { + elem_centroids.emplace_back( compute_centroids( *coords ) ); + } else { + // Create temporary nodes + auto* non_const_mesh = const_cast( par_mesh ); + mfem::H1_FECollection fec( 1, NDIMS ); + mfem::ParFiniteElementSpace fes( non_const_mesh, &fec, NDIMS ); + mfem::ParGridFunction nodes( &fes ); + non_const_mesh->GetNodes( nodes ); + elem_centroids.emplace_back( compute_centroids( nodes ) ); } } return elem_centroids; diff --git a/src/redecomp/partition/PartitionElements.hpp b/src/redecomp/partition/PartitionElements.hpp index af61abc8..8a213b05 100644 --- a/src/redecomp/partition/PartitionElements.hpp +++ b/src/redecomp/partition/PartitionElements.hpp @@ -8,6 +8,8 @@ #include "redecomp/partition/PartitionEntity.hpp" +#include "shared/BasicTypes.hpp" + namespace redecomp { /** @@ -24,8 +26,7 @@ class PartitionElements : public PartitionEntity { * @param par_meshes Input meshes * @return Vector of array of points from each entity on each mesh */ - std::vector>> EntityCoordinates( - const std::vector& par_meshes ) const override; + std::vector> EntityCoordinates( const std::vector& par_meshes ) const override; }; using PartitionElements2D = PartitionElements<2>; diff --git a/src/redecomp/partition/PartitionEntity.hpp b/src/redecomp/partition/PartitionEntity.hpp index e42f8c79..5bb6b59f 100644 --- a/src/redecomp/partition/PartitionEntity.hpp +++ b/src/redecomp/partition/PartitionEntity.hpp @@ -10,6 +10,7 @@ #include "mfem.hpp" +#include "redecomp/common/CoordList.hpp" #include "redecomp/common/TypeDefs.hpp" namespace redecomp { @@ -33,7 +34,7 @@ class PartitionEntity { * @param par_meshes Input meshes * @return Vector of array of points from each entity on each mesh */ - virtual std::vector>> EntityCoordinates( + virtual std::vector> EntityCoordinates( const std::vector& par_meshes ) const = 0; /** diff --git a/src/redecomp/partition/PartitionMethod.hpp b/src/redecomp/partition/PartitionMethod.hpp index 47220cfb..befe6a47 100644 --- a/src/redecomp/partition/PartitionMethod.hpp +++ b/src/redecomp/partition/PartitionMethod.hpp @@ -6,8 +6,11 @@ #ifndef SRC_REDECOMP_PARTITION_PARTITIONMETHOD_HPP_ #define SRC_REDECOMP_PARTITION_PARTITIONMETHOD_HPP_ +#include "redecomp/common/CoordList.hpp" #include "redecomp/common/TypeDefs.hpp" +#include "mfem.hpp" + namespace redecomp { /** @@ -25,19 +28,17 @@ template class PartitionMethod { public: /** - * @brief Returns a list of entity ids on each rank/subdomain determined by - * the partitioning method + * @brief Returns a list of entity ids on each rank/subdomain determined by the partitioning method * * @param n_parts Number of subdomains to cut the list of coords into - * @param coords_by_mesh List of on-rank points (one point-per-entity, sorted - * by entity id) to subdivide sorted by mesh - * @param ghost_size Sets length to include entities as ghost on each - * subdomain - * @return List of entity ids and ghost information on each subdomain sorted - * by mesh + * @param coords_by_mesh List of on-rank points (one point-per-entity, sorted by entity id) to subdivide sorted by + * mesh + * @param ghost_size Sets length to include entities as ghost on each subdomain + * @return List of entity ids and ghost information on each subdomain sorted by mesh */ - virtual std::vector generatePartitioning( - int n_parts, const std::vector>>& coords_by_mesh, double ghost_size ) const = 0; + virtual std::vector generatePartitioning( int n_parts, + const std::vector>& coords_by_mesh, + double ghost_size ) const = 0; /** * @brief Returns the MPIUtility diff --git a/src/redecomp/partition/RCB.cpp b/src/redecomp/partition/RCB.cpp index 35fed085..d769a4a7 100644 --- a/src/redecomp/partition/RCB.cpp +++ b/src/redecomp/partition/RCB.cpp @@ -22,8 +22,9 @@ RCB::RCB( const MPI_Comm& comm, double max_out_of_balance, int n_try_new_ } template -std::vector RCB::generatePartitioning( - int n_parts, const std::vector>>& coords_by_mesh, double ghost_len ) const +std::vector RCB::generatePartitioning( int n_parts, + const std::vector>& coords_by_mesh, + double ghost_len ) const { TRIBOL_MARK_FUNCTION; auto partitioning = std::vector(); @@ -37,22 +38,24 @@ std::vector RCB::generatePartitioning( auto rank_jump_interval = this->getMPIUtility().NRanks() / n_parts; for ( const auto& coords : coords_by_mesh ) { + auto num_coords = coords.GetNumCoords(); // Build a list of coords that belong in each of the RCB entity parts auto ent_idx = MPIArray( &this->getMPIUtility() ); auto ent_ghost = MPIArray( &this->getMPIUtility() ); for ( int i{ 0 }; i < n_parts; ++i ) { auto rank = i * rank_jump_interval; - ent_idx[rank].reserve( coords.size() ); - ent_ghost[rank].reserve( coords.size() ); + ent_idx[rank].Reserve( num_coords ); + ent_ghost[rank].Reserve( num_coords ); } - for ( int i{ 0 }; i < coords.size(); ++i ) { - auto dest = DetermineDomain( problem_tree, coords[i] ); + for ( int i{ 0 }; i < num_coords; ++i ) { + Point coord = coords.GetPoint( i ); + auto dest = DetermineDomain( problem_tree, coord ); auto rank = dest * rank_jump_interval; ent_idx[rank].push_back( i ); ent_ghost[rank].push_back( false ); const auto& neighbors = problem_tree( dest ).neighbor_bboxes_; for ( int j{ 0 }; j < neighbors.size(); ++j ) { - if ( problem_tree( neighbors[j] ).ghost_bbox_.contains( coords[i] ) ) { + if ( problem_tree( neighbors[j] ).ghost_bbox_.contains( coord ) ) { auto neighbor_rank = neighbors[j] * rank_jump_interval; ent_idx[neighbor_rank].push_back( i ); ent_ghost[neighbor_rank].push_back( true ); @@ -60,8 +63,10 @@ std::vector RCB::generatePartitioning( } } for ( int i{ 0 }; i < n_parts; ++i ) { - ent_idx[i].shrink(); - ent_ghost[i].shrink(); + auto tmp_idx = ent_idx[i]; + ent_idx[i] = std::move( tmp_idx ); + auto tmp_ghost = ent_ghost[i]; + ent_ghost[i] = std::move( tmp_ghost ); } partitioning.emplace_back( std::move( ent_idx ), std::move( ent_ghost ) ); @@ -72,7 +77,7 @@ std::vector RCB::generatePartitioning( template BisecTree> RCB::BuildProblemTree( int n_parts, - const std::vector>>& coords_by_mesh, + const std::vector>& coords_by_mesh, double ghost_len ) const { TRIBOL_MARK_FUNCTION; @@ -107,7 +112,7 @@ BisecTree> RCB::BuildProblemTree( int n_parts, // compute total number of entities in the domain auto total_ents = 0; for ( const auto& coords : coords_by_mesh ) { - total_ents += coords.size(); + total_ents += coords.GetNumCoords(); } total_ents = TotalEntities( total_ents ); // construct an AABB of the whole domain in the root node @@ -231,13 +236,15 @@ BisecTree> RCB::BuildProblemTree( int n_parts, } template -BoundingBox RCB::DomainBoundingBox( const std::vector>>& coords_by_mesh ) const +BoundingBox RCB::DomainBoundingBox( const std::vector>& coords_by_mesh ) const { auto on_rank_bbox = BoundingBox(); size_t num_coords{ 0 }; for ( const auto& coords : coords_by_mesh ) { - on_rank_bbox.addBox( BoundingBox( coords.data(), coords.size() ) ); - num_coords += coords.size(); + for ( int i = 0; i < coords.GetNumCoords(); ++i ) { + on_rank_bbox.addPoint( coords.GetPoint( i ) ); + } + num_coords += coords.GetNumCoords(); } auto min_coord = on_rank_bbox.getMin(); auto max_coord = on_rank_bbox.getMax(); @@ -256,12 +263,13 @@ BoundingBox RCB::DomainBoundingBox( const std::vector std::pair RCB::CountEntities( const BoundingBox& left_bbox, const BoundingBox& right_bbox, - const std::vector>>& coords_by_mesh ) const + const std::vector>& coords_by_mesh ) const { auto n_ents = std::make_pair( 0, 0 ); for ( const auto& coords : coords_by_mesh ) { - for ( const auto& coord : coords ) { + for ( int i = 0; i < coords.GetNumCoords(); ++i ) { + Point coord = coords.GetPoint( i ); if ( left_bbox.contains( coord ) ) n_ents.first += 1; else if ( right_bbox.contains( coord ) ) diff --git a/src/redecomp/partition/RCB.hpp b/src/redecomp/partition/RCB.hpp index de5274c9..c2203b8f 100644 --- a/src/redecomp/partition/RCB.hpp +++ b/src/redecomp/partition/RCB.hpp @@ -83,8 +83,7 @@ class RCB : public PartitionMethod { * @param ghost_len Sets length to include entities as ghost on each subdomain * @return List of points and ghost entities on each subdomain sorted by mesh */ - std::vector generatePartitioning( int n_parts, - const std::vector>>& coords_by_mesh, + std::vector generatePartitioning( int n_parts, const std::vector>& coords_by_mesh, double ghost_len ) const override; private: @@ -96,7 +95,7 @@ class RCB : public PartitionMethod { * @param ghost_len Sets length to include entities as ghost on each subdomain * @return BisecTree of bounding boxes defining subdomains and ghost subdomains */ - BisecTree> BuildProblemTree( int n_parts, const std::vector>>& coords_by_mesh, + BisecTree> BuildProblemTree( int n_parts, const std::vector>& coords_by_mesh, double ghost_len ) const; /** @@ -105,7 +104,7 @@ class RCB : public PartitionMethod { * @param coords_by_mesh On-rank coords sorted by mesh * @return BoundingBox Bounding box of coords on all ranks */ - BoundingBox DomainBoundingBox( const std::vector>>& coords_by_mesh ) const; + BoundingBox DomainBoundingBox( const std::vector>& coords_by_mesh ) const; /** * @brief Counts the entities across all ranks in the two given bounding boxes @@ -116,7 +115,7 @@ class RCB : public PartitionMethod { * @return std::pair Entity counts in each bounding box */ std::pair CountEntities( const BoundingBox& left_bbox, const BoundingBox& right_bbox, - const std::vector>>& coords_by_mesh ) const; + const std::vector>& coords_by_mesh ) const; /** * @brief Find the domain at the base of the BisecTree the coord belongs to diff --git a/src/redecomp/transfer/MatrixTransfer.cpp b/src/redecomp/transfer/MatrixTransfer.cpp index 5dbc25eb..aef85d81 100644 --- a/src/redecomp/transfer/MatrixTransfer.cpp +++ b/src/redecomp/transfer/MatrixTransfer.cpp @@ -113,13 +113,13 @@ mfem::SparseMatrix MatrixTransfer::TransferToParallelSparse( const axom::Array&& send_vals, axom::IndexType src ) { - if ( recv_trial_elem_dofs[src].empty() ) { + if ( recv_trial_elem_dofs[src].IsEmpty() ) { return; } auto trial_dof_ct = 0; auto dof_ct = 0; // element loop - for ( int e{ 0 }; e < recv_test_elem_offsets[src].size(); ++e ) { + for ( int e{ 0 }; e < recv_test_elem_offsets[src].Size(); ++e ) { auto test_elem_id = test_redecomp->getParentToRedecompElems().first[src][recv_test_elem_offsets[src][e]]; auto test_elem_dofs = mfem::Array(); parent_test_fes_.GetElementVDofs( test_elem_id, test_elem_dofs ); @@ -193,7 +193,7 @@ axom::Array MatrixTransfer::buildRedecomp2ParentElemRank( const RedecompMes r = r2p_elem_rank[e]; ghost_ct = 0; } - if ( ghost_ct < ghost_elems[r].size() && ghost_elems[r][ghost_ct] == e ) { + if ( ghost_ct < ghost_elems[r].Size() && ghost_elems[r][ghost_ct] == e ) { r2p_elem_rank[e] = -1; ++ghost_ct; } @@ -210,7 +210,7 @@ MPIArray MatrixTransfer::buildSendArrayIDs( const axom::Array& test_el auto n_ranks = getMPIUtility().NRanks(); auto est_max_elems = 2 * test_elem_idx.size() / n_ranks; for ( int r{ 0 }; r < n_ranks; ++r ) { - send_array_ids[r].reserve( est_max_elems ); + send_array_ids[r].Reserve( est_max_elems ); } for ( int i{ 0 }; i < test_elem_idx.size(); ++i ) { auto test_e = test_elem_idx[i]; @@ -222,7 +222,8 @@ MPIArray MatrixTransfer::buildSendArrayIDs( const axom::Array& test_el } } for ( auto send_array_ids_rank : send_array_ids ) { - send_array_ids_rank.shrink(); + auto tmp_send_array_ids_rank = send_array_ids_rank; + std::swap( send_array_ids_rank, tmp_send_array_ids_rank ); } return send_array_ids; @@ -250,15 +251,15 @@ axom::Array MatrixTransfer::buildSendNumMatEntries( const axom::Array& return send_num_mat_entries; } -MPIArray MatrixTransfer::buildRecvMatSizes( const axom::Array& test_elem_idx, - const axom::Array& trial_elem_idx ) const +MPIArray> MatrixTransfer::buildRecvMatSizes( const axom::Array& test_elem_idx, + const axom::Array& trial_elem_idx ) const { - auto recv_mat_sizes = MPIArray( &getMPIUtility() ); + auto recv_mat_sizes = MPIArray>( &getMPIUtility() ); // Number of test and trial vdofs for each element matrix to be sent to each // parent test space rank. MPI communication is used to turn this array into // recv_mat_sizes. - auto send_mat_sizes = MPIArray( &getMPIUtility() ); + auto send_mat_sizes = MPIArray>( &getMPIUtility() ); auto n_ranks = getMPIUtility().NRanks(); auto est_max_elems = 2 * test_elem_idx.size() / n_ranks; for ( int r{ 0 }; r < n_ranks; ++r ) { @@ -295,7 +296,7 @@ MPIArray MatrixTransfer::buildRecvTestElemOffsets( const RedecompMesh& test auto n_ranks = getMPIUtility().NRanks(); auto est_max_elems = 2 * test_elem_idx.size() / n_ranks; for ( int r{ 0 }; r < n_ranks; ++r ) { - send_test_elem_offsets[r].reserve( est_max_elems ); + send_test_elem_offsets[r].Reserve( est_max_elems ); } const auto& test_r2p_elem_offsets = test_redecomp.getRedecompToParentElemOffsets(); for ( int i{ 0 }; i < test_elem_idx.size(); ++i ) { @@ -323,13 +324,13 @@ MPIArray MatrixTransfer::buildRecvTrialElemDofs( const RedecompMes // List of trial element offsets sorted by the parent test space rank and the // parent trial space rank it belongs to. Used to get the trial space parent // vdofs (on the parent trial space rank) onto the parent test space rank. - auto send_trial_elem_offsets = MPIArray>( &getMPIUtility() ); + auto send_trial_elem_offsets = MPIArray, std::vector>>( &getMPIUtility() ); // Used to order the trial vdofs by element in the same order as received test // elements. Stores the ordered trial rank of the vdofs. auto send_trial_elem_rank = MPIArray( &getMPIUtility() ); // Used to order the trial vdofs by element in the same order as received test // elements. Stores the start index and length of the vdofs for each element. - auto send_trial_dof_extents = MPIArray( &getMPIUtility() ); + auto send_trial_dof_extents = MPIArray>( &getMPIUtility() ); // Total trial element vdofs to be sent to each parent test space rank auto send_trial_dof_sizes = axom::Array( n_ranks, n_ranks ); // Running count of number of DOFs in each send test/trial rank combo. Used @@ -345,14 +346,13 @@ MPIArray MatrixTransfer::buildRecvTrialElemDofs( const RedecompMes auto est_max_elems = 2 * test_elem_idx.size() / n_ranks; for ( int r{ 0 }; r < n_ranks; ++r ) { send_trial_elem_offsets[r].resize( n_ranks ); - send_trial_elem_offsets[r].shrink(); for ( auto& send_trial_elem_offsets_rank : send_trial_elem_offsets[r] ) { - send_trial_elem_offsets_rank.reserve( est_max_elems / n_ranks ); + send_trial_elem_offsets_rank.Reserve( est_max_elems / n_ranks ); } - send_trial_elem_rank[r].reserve( est_max_elems ); + send_trial_elem_rank[r].Reserve( est_max_elems ); send_trial_dof_extents[r].reserve( 2 * est_max_elems ); - trial_elem_offsets_dof_ct[r].resize( n_ranks ); - trial_elem_offsets_dof_ct[r].shrink(); + trial_elem_offsets_dof_ct[r].SetSize( n_ranks ); + trial_elem_offsets_dof_ct[r] = 0; } const auto& trial_r2p_elem_offsets = trial_redecomp.getRedecompToParentElemOffsets(); @@ -381,20 +381,20 @@ MPIArray MatrixTransfer::buildRecvTrialElemDofs( const RedecompMes } // grab test DOFs from the parent rank and return them to the redecomp rank - auto unsorted_trial_dofs = axom::Array>>( n_ranks, n_ranks ); + auto unsorted_trial_dofs = std::vector>>( n_ranks ); for ( auto& test_dof_rank : unsorted_trial_dofs ) { - test_dof_rank = axom::Array>( n_ranks, n_ranks ); + test_dof_rank = std::vector>( n_ranks ); } const auto& trial_p2r_elems = trial_redecomp.getParentToRedecompElems(); for ( int dst_test_r{ 0 }; dst_test_r < n_ranks; ++dst_test_r ) { // send parent trial element offsets to parent trial rank auto recv_trial_elem_offsets = MPIArray( &getMPIUtility() ); getMPIUtility().SendRecvEach( - type>(), + type>(), [dst_test_r, &send_trial_elem_offsets]( axom::IndexType dst_trial_r ) { return send_trial_elem_offsets[dst_test_r][dst_trial_r]; }, - [&recv_trial_elem_offsets]( axom::Array&& send_vals, axom::IndexType src_trial_r ) { + [&recv_trial_elem_offsets]( mfem::Array&& send_vals, axom::IndexType src_trial_r ) { recv_trial_elem_offsets[src_trial_r] = std::move( send_vals ); } ); // send parent trial vdofs back to test redecomp rank @@ -402,19 +402,19 @@ MPIArray MatrixTransfer::buildRecvTrialElemDofs( const RedecompMes auto trial_dofs_by_rank = MPIArray( &getMPIUtility() ); trial_dofs_by_rank.SendRecvEach( [this, &trial_p2r_elems, &recv_trial_elem_offsets, first_dof]( axom::IndexType src_trial_r ) { - auto trial_dofs = axom::Array(); + auto trial_dofs = MPIArray::ArrayT(); const auto& rank_elem_offsets = recv_trial_elem_offsets[src_trial_r]; - auto n_elems = rank_elem_offsets.size(); + auto n_elems = rank_elem_offsets.Size(); if ( n_elems > 0 ) { auto elem_id = trial_p2r_elems.first[src_trial_r][rank_elem_offsets[0]]; auto est_n_dofs = n_elems * parent_trial_fes_.GetFE( elem_id )->GetDof() * parent_trial_fes_.GetVDim(); - trial_dofs.reserve( est_n_dofs ); + trial_dofs.Reserve( est_n_dofs ); for ( auto elem_offset : rank_elem_offsets ) { elem_id = trial_p2r_elems.first[src_trial_r][elem_offset]; auto n_elem_dofs = parent_trial_fes_.GetFE( elem_id )->GetDof() * parent_trial_fes_.GetVDim(); - trial_dofs.resize( trial_dofs.size() + n_elem_dofs ); - auto dof_array = mfem::Array( &trial_dofs[trial_dofs.size() - n_elem_dofs], n_elem_dofs ); + trial_dofs.SetSize( trial_dofs.Size() + n_elem_dofs ); + auto dof_array = mfem::Array( &trial_dofs[trial_dofs.Size() - n_elem_dofs], n_elem_dofs ); auto int_dof_array = mfem::Array( n_elem_dofs ); parent_trial_fes_.GetElementVDofs( elem_id, int_dof_array ); for ( int i{ 0 }; i < n_elem_dofs; ++i ) { @@ -437,13 +437,13 @@ MPIArray MatrixTransfer::buildRecvTrialElemDofs( const RedecompMes } // construct trial vdof vectors for each test rank for ( int test_r{ 0 }; test_r < n_ranks; ++test_r ) { - send_trial_elem_dofs[test_r].reserve( send_trial_dof_sizes[test_r] ); - auto n_trial_elems = send_trial_elem_rank[test_r].size(); + send_trial_elem_dofs[test_r].Reserve( send_trial_dof_sizes[test_r] ); + auto n_trial_elems = send_trial_elem_rank[test_r].Size(); for ( int e{ 0 }; e < n_trial_elems; ++e ) { - auto trial_elem_dofs = axom::ArrayView( + auto trial_elem_dofs = mfem::Array( &unsorted_trial_dofs[test_r][send_trial_elem_rank[test_r][e]][send_trial_dof_extents[test_r]( e, 0 )], - { send_trial_dof_extents[test_r]( e, 1 ) } ); - send_trial_elem_dofs[test_r].append( trial_elem_dofs ); + send_trial_dof_extents[test_r]( e, 1 ) ); + send_trial_elem_dofs[test_r].Append( trial_elem_dofs ); } } recv_trial_elem_dofs.SendRecvArrayEach( send_trial_elem_dofs ); diff --git a/src/redecomp/transfer/MatrixTransfer.hpp b/src/redecomp/transfer/MatrixTransfer.hpp index 484699ce..5babb99b 100644 --- a/src/redecomp/transfer/MatrixTransfer.hpp +++ b/src/redecomp/transfer/MatrixTransfer.hpp @@ -132,8 +132,8 @@ class MatrixTransfer { * @param trial_elem_idx List of element IDs on the redecomp trial space * @return MPIArray */ - MPIArray buildRecvMatSizes( const axom::Array& test_elem_idx, - const axom::Array& trial_elem_idx ) const; + MPIArray> buildRecvMatSizes( const axom::Array& test_elem_idx, + const axom::Array& trial_elem_idx ) const; /** * @brief List of test element offsets received from test space redecomp ranks diff --git a/src/redecomp/transfer/SparseMatrixTransfer.cpp b/src/redecomp/transfer/SparseMatrixTransfer.cpp index 91b1fdc8..37c2a22f 100644 --- a/src/redecomp/transfer/SparseMatrixTransfer.cpp +++ b/src/redecomp/transfer/SparseMatrixTransfer.cpp @@ -84,10 +84,10 @@ std::unique_ptr SparseMatrixTransfer::TransferToParallel( // guess size and preallocate auto approx_size = 2 * src.NumNonZeroElems() / n_ranks; for ( int r{ 0 }; r < n_ranks; ++r ) { - send_matrix_data[r].reserve( approx_size ); - send_parent_local_row_offsets[r].reserve( approx_size ); - send_parent_local_col_offsets[r].reserve( approx_size ); - send_parent_col_ranks[r].reserve( approx_size ); + send_matrix_data[r].Reserve( approx_size ); + send_parent_local_row_offsets[r].Reserve( approx_size ); + send_parent_local_col_offsets[r].Reserve( approx_size ); + send_parent_col_ranks[r].Reserve( approx_size ); } auto src_I = src.GetI(); @@ -110,7 +110,7 @@ std::unique_ptr SparseMatrixTransfer::TransferToParallel( test_ghost_ct = 0; } // skip unowned elements - if ( test_ghost_ct < test_ghost_elems[test_parent_rank].size() && + if ( test_ghost_ct < test_ghost_elems[test_parent_rank].Size() && test_elem_id == test_ghost_elems[test_parent_rank][test_ghost_ct] ) { // the element is a ghost on this rank. increase the index to check for the next ghost (since they are sorted in // ascending order). @@ -145,11 +145,11 @@ std::unique_ptr SparseMatrixTransfer::TransferToParallel( // do per-rank communication to transform local_col from offsets to element ids auto send_col_offsets_by_rank = MPIArray( &getMPIUtility() ); // guess size and preallocate - auto approx_rank_size = 2 * send_parent_local_col_offsets[r].size() / n_ranks; + auto approx_rank_size = 2 * send_parent_local_col_offsets[r].Size() / n_ranks; for ( auto& send_col_offsets : send_col_offsets_by_rank ) { - send_col_offsets.reserve( approx_rank_size ); + send_col_offsets.Reserve( approx_rank_size ); } - for ( int i{ 0 }; i < send_parent_local_col_offsets[r].size(); ++i ) { + for ( int i{ 0 }; i < send_parent_local_col_offsets[r].Size(); ++i ) { send_col_offsets_by_rank[send_parent_col_ranks[r][i]].push_back( send_parent_local_col_offsets[r][i] ); } auto recv_col_offsets_by_rank = MPIArray( &getMPIUtility() ); @@ -164,7 +164,7 @@ std::unique_ptr SparseMatrixTransfer::TransferToParallel( send_col_offsets_by_rank.SendRecvArrayEach( recv_col_offsets_by_rank ); // fill send_parent_local_col_offsets with local element ids axom::Array per_rank_ct( n_ranks, n_ranks ); - for ( int i{ 0 }; i < send_parent_local_col_offsets[r].size(); ++i ) { + for ( int i{ 0 }; i < send_parent_local_col_offsets[r].Size(); ++i ) { auto col_rank = send_parent_col_ranks[r][i]; send_parent_local_col_offsets[r][i] = send_col_offsets_by_rank[col_rank][per_rank_ct[col_rank]++]; } @@ -203,7 +203,7 @@ std::unique_ptr SparseMatrixTransfer::TransferToParallel( // local offdiagonal matrix to the global offdiagonal matrix. std::map cmap_j_offd; for ( int r{ 0 }; r < n_ranks; ++r ) { - for ( int i{ 0 }; i < recv_parent_col_ranks[r].size(); ++i ) { + for ( int i{ 0 }; i < recv_parent_col_ranks[r].Size(); ++i ) { auto col_rank = recv_parent_col_ranks[r][i]; if ( col_rank == my_rank ) { ++diag_nnz; @@ -243,7 +243,7 @@ std::unique_ptr SparseMatrixTransfer::TransferToParallel( mfem::Memory j_offd( offd_nnz ); mfem::Memory data_offd( offd_nnz ); for ( int r{ 0 }; r < n_ranks; ++r ) { - for ( int i{ 0 }; i < recv_parent_col_ranks[r].size(); ++i ) { + for ( int i{ 0 }; i < recv_parent_col_ranks[r].Size(); ++i ) { auto curr_row = recv_parent_local_row[r][i]; auto curr_col = recv_parent_local_col[r][i]; auto col_rank = recv_parent_col_ranks[r][i]; diff --git a/src/redecomp/transfer/TransferByElements.cpp b/src/redecomp/transfer/TransferByElements.cpp index ef340598..47fceef4 100644 --- a/src/redecomp/transfer/TransferByElements.cpp +++ b/src/redecomp/transfer/TransferByElements.cpp @@ -5,6 +5,8 @@ #include "TransferByElements.hpp" +#include + #include "axom/slic.hpp" #include "shared/infrastructure/Profiling.hpp" @@ -30,40 +32,54 @@ void TransferByElements::TransferToSerial( const mfem::ParGridFunction& src, mfe "not the same." ); // send and receive DOF values from other ranks - auto dst_dofs = MPIArray( &redecomp->getMPIUtility() ); - dst_dofs.SendRecvEach( [redecomp, &src]( int dest ) { - auto src_dofs = axom::Array(); - const auto& src_elem_idx = redecomp->getParentToRedecompElems().first[dest]; - auto n_els = src_elem_idx.size(); - if ( n_els > 0 ) { - auto elem_vdofs = mfem::Array(); - auto dof_vals = mfem::Vector(); - // guess the size of src_dofs based on the size of the first element - src.FESpace()->GetElementVDofs( src_elem_idx[0], elem_vdofs ); - src_dofs.reserve( elem_vdofs.Size() * n_els ); - auto vdof_ct = 0; - for ( int e{ 0 }; e < n_els; ++e ) { - src.FESpace()->GetElementVDofs( src_elem_idx[e], elem_vdofs ); - src.GetSubVector( elem_vdofs, dof_vals ); - src_dofs.insert( vdof_ct, dof_vals.Size(), dof_vals.GetData() ); - vdof_ct += dof_vals.Size(); - } - } - return src_dofs; - } ); + MPIArray dst_dofs( &redecomp->getMPIUtility() ); + dst_dofs.SendRecvEach( + [redecomp, &src]( int dest ) { + mfem::Vector src_dofs; + src_dofs.UseDevice( src.UseDevice() ); + const auto& src_elem_idx = redecomp->getParentToRedecompElems().first[dest]; + auto n_els = src_elem_idx.Size(); + if ( n_els > 0 ) { + mfem::Array elem_vdofs; + mfem::Array all_vdofs; + // guess the size of src_dofs based on the size of the first element + src.FESpace()->GetElementVDofs( src_elem_idx[0], elem_vdofs ); + all_vdofs.Reserve( elem_vdofs.Size() * n_els ); + for ( int e{ 0 }; e < n_els; ++e ) { + src.FESpace()->GetElementVDofs( src_elem_idx[e], elem_vdofs ); + all_vdofs.Append( elem_vdofs ); + } + + all_vdofs.GetMemory().UseDevice( src.UseDevice() ); + src_dofs.SetSize( all_vdofs.Size() ); + src.GetSubVector( all_vdofs, src_dofs ); + } + return src_dofs; + }, + src.UseDevice() ); // map received DOF values to local DOFs - auto elem_vdofs = mfem::Array(); + mfem::Array elem_vdofs; auto n_ranks = redecomp->getMPIUtility().NRanks(); for ( int r{ 0 }; r < n_ranks; ++r ) { - auto vdof_ct = 0; auto first_el = redecomp->getRedecompToParentElemOffsets()[r]; auto last_el = redecomp->getRedecompToParentElemOffsets()[r + 1]; - for ( int e{ first_el }; e < last_el; ++e ) { - dst.FESpace()->GetElementVDofs( e, elem_vdofs ); - auto dof_vals = mfem::Vector( &dst_dofs[r][vdof_ct], elem_vdofs.Size() ); - dst.SetSubVector( elem_vdofs, dof_vals ); - vdof_ct += elem_vdofs.Size(); + auto n_els = last_el - first_el; + if ( n_els > 0 ) { + mfem::Array all_vdofs; + // guess the size of all_vdofs based on the size of the first element + dst.FESpace()->GetElementVDofs( first_el, elem_vdofs ); + all_vdofs.Reserve( elem_vdofs.Size() * n_els ); + for ( int e{ first_el }; e < last_el; ++e ) { + dst.FESpace()->GetElementVDofs( e, elem_vdofs ); + all_vdofs.Append( elem_vdofs ); + } + + all_vdofs.GetMemory().UseDevice( dst.UseDevice() ); + // set explicitly in case e.g. src is on device and dst is on host or vice versa + dst_dofs[r].Read( dst.UseDevice() ); + dst_dofs[r].UseDevice( dst.UseDevice() ); + dst.SetSubVector( all_vdofs, dst_dofs[r] ); } } } @@ -85,49 +101,65 @@ void TransferByElements::TransferToParallel( const mfem::GridFunction& src, mfem "not the same." ); // send and receive non-ghost DOF values from other ranks - auto dst_dofs = MPIArray( &redecomp->getMPIUtility() ); - dst_dofs.SendRecvEach( [redecomp, &src]( int dest ) { - auto src_dofs = axom::Array(); - auto first_el = redecomp->getRedecompToParentElemOffsets()[dest]; - auto last_el = redecomp->getRedecompToParentElemOffsets()[dest + 1]; - auto n_els = last_el - first_el; - if ( n_els > 0 ) { - auto elem_vdofs = mfem::Array(); - auto dof_vals = mfem::Vector(); - // guess the size of src_dofs based on the size of the first element - src.FESpace()->GetElementVDofs( first_el, elem_vdofs ); - src_dofs.reserve( elem_vdofs.Size() * n_els ); - auto vdof_ct = 0; - auto ghost_ct = 0; - for ( int e{ first_el }; e < last_el; ++e ) { - // skip ghost elements - if ( ghost_ct < redecomp->getRedecompToParentGhostElems()[dest].size() && - redecomp->getRedecompToParentGhostElems()[dest][ghost_ct] == e ) { - ++ghost_ct; - } else { - src.FESpace()->GetElementVDofs( e, elem_vdofs ); - src.GetSubVector( elem_vdofs, dof_vals ); - src_dofs.insert( vdof_ct, dof_vals.Size(), dof_vals.GetData() ); - vdof_ct += dof_vals.Size(); + MPIArray dst_dofs( &redecomp->getMPIUtility() ); + dst_dofs.SendRecvEach( + [redecomp, &src]( int dest ) { + mfem::Vector src_dofs; + src_dofs.UseDevice( src.UseDevice() ); + auto first_el = redecomp->getRedecompToParentElemOffsets()[dest]; + auto last_el = redecomp->getRedecompToParentElemOffsets()[dest + 1]; + auto n_els = last_el - first_el; + if ( n_els > 0 ) { + mfem::Array elem_vdofs; + mfem::Array all_vdofs; + // guess the size of src_dofs based on the size of the first element + src.FESpace()->GetElementVDofs( first_el, elem_vdofs ); + all_vdofs.Reserve( elem_vdofs.Size() * n_els ); + auto ghost_ct = 0; + for ( int e{ first_el }; e < last_el; ++e ) { + // skip ghost elements + if ( ghost_ct < redecomp->getRedecompToParentGhostElems()[dest].Size() && + redecomp->getRedecompToParentGhostElems()[dest][ghost_ct] == e ) { + ++ghost_ct; + } else { + src.FESpace()->GetElementVDofs( e, elem_vdofs ); + all_vdofs.Append( elem_vdofs ); + } + } + + all_vdofs.GetMemory().UseDevice( src.UseDevice() ); + src_dofs.SetSize( all_vdofs.Size() ); + src.GetSubVector( all_vdofs, src_dofs ); } - } - } - return src_dofs; - } ); + return src_dofs; + }, + src.UseDevice() ); // map received non-ghost DOF values to local DOFs - auto elem_vdofs = mfem::Array(); + mfem::Array elem_vdofs; auto n_ranks = redecomp->getMPIUtility().NRanks(); for ( int r{ 0 }; r < n_ranks; ++r ) { - auto vdof_ct = 0; - for ( int e{ 0 }; e < redecomp->getParentToRedecompElems().first[r].size(); ++e ) { - // skip ghost elements - if ( !redecomp->getParentToRedecompElems().second[r][e] ) { - dst.FESpace()->GetElementVDofs( redecomp->getParentToRedecompElems().first[r][e], elem_vdofs ); - auto dof_vals = mfem::Vector( &dst_dofs[r][vdof_ct], elem_vdofs.Size() ); - dst.SetSubVector( elem_vdofs, dof_vals ); - vdof_ct += elem_vdofs.Size(); + auto n_els = redecomp->getParentToRedecompElems().first[r].Size(); + if ( n_els > 0 ) { + mfem::Array all_vdofs; + // guess the size of all_vdofs based on the size of the first element + dst.FESpace()->GetElementVDofs( redecomp->getParentToRedecompElems().first[r][0], elem_vdofs ); + all_vdofs.Reserve( n_els * elem_vdofs.Size() ); + for ( int e{ 0 }; e < n_els; ++e ) { + // skip ghost elements + if ( !redecomp->getParentToRedecompElems().second[r][e] ) { + dst.FESpace()->GetElementVDofs( redecomp->getParentToRedecompElems().first[r][e], elem_vdofs ); + all_vdofs.Append( elem_vdofs ); + } + } + + if ( all_vdofs.Size() > 0 ) { + all_vdofs.GetMemory().UseDevice( dst.UseDevice() ); + // set explicitly in case e.g. src is on device and dst is on host or vice versa + dst_dofs[r].Read( dst.UseDevice() ); + dst_dofs[r].UseDevice( dst.UseDevice() ); } + dst.SetSubVector( all_vdofs, dst_dofs[r] ); } } } diff --git a/src/redecomp/transfer/TransferByNodes.cpp b/src/redecomp/transfer/TransferByNodes.cpp index c1ca8dc4..0fefcdeb 100644 --- a/src/redecomp/transfer/TransferByNodes.cpp +++ b/src/redecomp/transfer/TransferByNodes.cpp @@ -62,31 +62,47 @@ void TransferByNodes::TransferToSerial( const mfem::ParGridFunction& src, mfem:: "in TransferByNodes." ); // send and receive DOF values from other ranks - auto dst_dofs = MPIArray( &redecomp_->getMPIUtility() ); - auto src_ptr = src.HostRead(); - dst_dofs.SendRecvEach( [src_ptr, src_fes, &src_nodes]( int dst_rank ) { - auto src_dofs = axom::Array(); - auto n_vdofs = src_fes->GetVDim(); - auto n_src_dofs = src_nodes.first[dst_rank].size(); - src_dofs.reserve( n_vdofs * n_src_dofs ); - src_dofs.resize( axom::ArrayOptions::Uninitialized(), n_vdofs, n_src_dofs ); - for ( int d{ 0 }; d < n_vdofs; ++d ) { - for ( int j{ 0 }; j < n_src_dofs; ++j ) { - src_dofs( d, j ) = src_ptr[src_fes->DofToVDof( src_nodes.first[dst_rank][j], d )]; - } - } - return src_dofs; - } ); + auto dst_dofs = MPIArray( &redecomp_->getMPIUtility() ); + dst_dofs.SendRecvEach( + [src_fes, &src_nodes, &src]( int dst_rank ) { + mfem::Vector src_dofs; + src_dofs.UseDevice( src.UseDevice() ); + auto n_vdofs = src_fes->GetVDim(); + auto n_src_dofs = src_nodes.first[dst_rank].Size(); + if ( n_src_dofs > 0 ) { + mfem::Array all_vdofs; + all_vdofs.Reserve( n_vdofs * n_src_dofs ); + for ( int d{ 0 }; d < n_vdofs; ++d ) { + for ( int j{ 0 }; j < n_src_dofs; ++j ) { + all_vdofs.Append( src_fes->DofToVDof( src_nodes.first[dst_rank][j], d ) ); + } + } + + all_vdofs.GetMemory().UseDevice( src.UseDevice() ); + src_dofs.SetSize( all_vdofs.Size() ); + src.GetSubVector( all_vdofs, src_dofs ); + } + return src_dofs; + }, + src.UseDevice() ); // map received DOF values to local DOFs auto n_vdofs = src_fes->GetVDim(); auto n_ranks = redecomp_->getMPIUtility().NRanks(); - auto dst_ptr = dst.HostWrite(); for ( int i{ 0 }; i < n_ranks; ++i ) { - for ( int d{ 0 }; d < n_vdofs; ++d ) { - for ( int j{ 0 }; j < dst_nodes.first[i].size(); ++j ) { - dst_ptr[dst_fes->DofToVDof( dst_nodes.first[i][j], d )] = dst_dofs[i]( d, j ); + if ( dst_nodes.first[i].Size() > 0 ) { + mfem::Array all_vdofs; + all_vdofs.Reserve( n_vdofs * dst_nodes.first[i].Size() ); + for ( int d{ 0 }; d < n_vdofs; ++d ) { + for ( int j{ 0 }; j < dst_nodes.first[i].Size(); ++j ) { + all_vdofs.Append( dst_fes->DofToVDof( dst_nodes.first[i][j], d ) ); + } } + all_vdofs.GetMemory().UseDevice( dst.UseDevice() ); + // set explicitly in case e.g. src is on device and dst is on host or vice versa + dst_dofs[i].Read( dst.UseDevice() ); + dst_dofs[i].UseDevice( dst.UseDevice() ); + dst.SetSubVector( all_vdofs, dst_dofs[i] ); } } } @@ -111,40 +127,64 @@ void TransferByNodes::TransferToParallel( const mfem::GridFunction& src, mfem::P "in TransferByNodes." ); // send and receive non-ghost DOF values from other ranks - auto dst_dofs = MPIArray( &redecomp_->getMPIUtility() ); - auto src_ptr = src.HostRead(); - dst_dofs.SendRecvEach( [src_ptr, src_fes, &src_nodes]( int dst_rank ) { - auto src_dofs = axom::Array(); - auto n_vdofs = src_fes->GetVDim(); - auto n_src_dofs = src_nodes.first[dst_rank].size(); - src_dofs.reserve( n_vdofs * n_src_dofs ); - src_dofs.resize( axom::ArrayOptions::Uninitialized(), n_vdofs, n_src_dofs ); - auto dof_ct = 0; - for ( int j{ 0 }; j < n_src_dofs; ++j ) { - if ( !src_nodes.second[dst_rank][j] ) { - for ( int d{ 0 }; d < n_vdofs; ++d ) { - src_dofs( d, dof_ct ) = src_ptr[src_fes->DofToVDof( src_nodes.first[dst_rank][j], d )]; + auto dst_dofs = MPIArray( &redecomp_->getMPIUtility() ); + dst_dofs.SendRecvEach( + [src_fes, &src_nodes, &src]( int dst_rank ) { + mfem::Vector src_dofs; + src_dofs.UseDevice( src.UseDevice() ); + auto n_vdofs = src_fes->GetVDim(); + auto n_src_dofs = src_nodes.first[dst_rank].Size(); + auto count = 0; + for ( int j{ 0 }; j < n_src_dofs; ++j ) { + if ( !src_nodes.second[dst_rank][j] ) { + ++count; + } } - ++dof_ct; - } - } - src_dofs.shrink(); - return src_dofs; - } ); + if ( count > 0 ) { + mfem::Array all_vdofs; + all_vdofs.Reserve( n_vdofs * count ); + for ( int j{ 0 }; j < n_src_dofs; ++j ) { + if ( !src_nodes.second[dst_rank][j] ) { + for ( int d{ 0 }; d < n_vdofs; ++d ) { + all_vdofs.Append( src_fes->DofToVDof( src_nodes.first[dst_rank][j], d ) ); + } + } + } + + all_vdofs.GetMemory().UseDevice( src.UseDevice() ); + src_dofs.SetSize( all_vdofs.Size() ); + src.GetSubVector( all_vdofs, src_dofs ); + } + return src_dofs; + }, + src.UseDevice() ); // map received non-ghost DOF values to dst auto n_vdofs = src_fes->GetVDim(); auto n_ranks = redecomp_->getMPIUtility().NRanks(); - auto dst_ptr = dst.HostWrite(); for ( int i{ 0 }; i < n_ranks; ++i ) { - auto dof_ct = 0; - for ( int j{ 0 }; j < dst_nodes.first[i].size(); ++j ) { + auto count = 0; + for ( int j{ 0 }; j < dst_nodes.first[i].Size(); ++j ) { if ( !dst_nodes.second[i][j] ) { - for ( int d{ 0 }; d < n_vdofs; ++d ) { - dst_ptr[dst_fes->DofToVDof( dst_nodes.first[i][j], d )] = dst_dofs[i]( d, dof_ct ); + ++count; + } + } + if ( count > 0 ) { + mfem::Array all_vdofs; + all_vdofs.Reserve( n_vdofs * count ); + for ( int j{ 0 }; j < dst_nodes.first[i].Size(); ++j ) { + if ( !dst_nodes.second[i][j] ) { + for ( int d{ 0 }; d < n_vdofs; ++d ) { + all_vdofs.Append( dst_fes->DofToVDof( dst_nodes.first[i][j], d ) ); + } } - ++dof_ct; } + + all_vdofs.GetMemory().UseDevice( dst.UseDevice() ); + // set explicitly in case e.g. src is on device and dst is on host or vice versa + dst_dofs[i].Read( dst.UseDevice() ); + dst_dofs[i].UseDevice( dst.UseDevice() ); + dst.SetSubVector( all_vdofs, dst_dofs[i] ); } } } @@ -159,14 +199,14 @@ EntityIndexByRank TransferByNodes::P2RNodeList( bool use_global_ids ) const auto& p2r_elem_ghost = redecomp_->getParentToRedecompElems().second; auto n_ranks = redecomp_->getMPIUtility().NRanks(); for ( int r{ 0 }; r < n_ranks; ++r ) { - auto n_els = p2r_elem_idx[r].size(); + auto n_els = p2r_elem_idx[r].Size(); if ( n_els > 0 ) { auto n_dofs = parent_fes_->GetFE( p2r_elem_idx[r][0] )->GetDof(); - p2r_node_idx[r].reserve( n_els * n_dofs ); - p2r_node_ghost[r].reserve( n_els * n_dofs ); + p2r_node_idx[r].Reserve( n_els * n_dofs ); + p2r_node_ghost[r].Reserve( n_els * n_dofs ); auto node_idx_map = std::unordered_map(); auto dof_ct = 0; - for ( int e{ 0 }; e < p2r_elem_idx[r].size(); ++e ) { + for ( int e{ 0 }; e < p2r_elem_idx[r].Size(); ++e ) { auto is_elem_ghost = p2r_elem_ghost[r][e]; auto elem_dofs = mfem::Array(); parent_fes_->GetElementDofs( p2r_elem_idx[r][e], elem_dofs ); @@ -184,8 +224,10 @@ EntityIndexByRank TransferByNodes::P2RNodeList( bool use_global_ids ) } } } - p2r_node_idx[r].shrink(); - p2r_node_ghost[r].shrink(); + auto tmp_p2r_node_idx = p2r_node_idx[r]; + auto tmp_p2r_node_ghost = p2r_node_ghost[r]; + std::swap( p2r_node_idx[r], tmp_p2r_node_idx ); + std::swap( p2r_node_ghost[r], tmp_p2r_node_ghost ); } } return { std::move( p2r_node_idx ), std::move( p2r_node_ghost ) }; @@ -204,14 +246,14 @@ EntityIndexByRank TransferByNodes::R2PNodeList() auto n_els = last_el - first_el; if ( n_els > 0 ) { auto n_dofs = redecomp_fes_->GetFE( first_el )->GetDof(); - r2p_node_idx[r].reserve( n_els * n_dofs ); - r2p_node_ghost[r].reserve( n_els * n_dofs ); + r2p_node_idx[r].Reserve( n_els * n_dofs ); + r2p_node_ghost[r].Reserve( n_els * n_dofs ); auto node_idx_map = std::unordered_map(); auto dof_ct = 0; auto ghost_ct = 0; for ( int e{ first_el }; e < last_el; ++e ) { auto is_elem_ghost = false; - if ( ghost_ct < redecomp_->getRedecompToParentGhostElems()[r].size() && + if ( ghost_ct < redecomp_->getRedecompToParentGhostElems()[r].Size() && redecomp_->getRedecompToParentGhostElems()[r][ghost_ct] == e ) { ++ghost_ct; is_elem_ghost = true; @@ -229,8 +271,10 @@ EntityIndexByRank TransferByNodes::R2PNodeList() } } } - r2p_node_idx[r].shrink(); - r2p_node_idx[r].shrink(); + auto tmp_r2p_node_idx = r2p_node_idx[r]; + auto tmp_r2p_node_ghost = r2p_node_ghost[r]; + std::swap( r2p_node_idx[r], tmp_r2p_node_idx ); + std::swap( r2p_node_ghost[r], tmp_r2p_node_ghost ); } } return { std::move( r2p_node_idx ), std::move( r2p_node_ghost ) }; diff --git a/src/redecomp/utils/MPIArray.hpp b/src/redecomp/utils/MPIArray.hpp index f6ccb24c..4d4f6d92 100644 --- a/src/redecomp/utils/MPIArray.hpp +++ b/src/redecomp/utils/MPIArray.hpp @@ -6,7 +6,7 @@ #ifndef SRC_REDECOMP_UTILS_MPIARRAY_HPP_ #define SRC_REDECOMP_UTILS_MPIARRAY_HPP_ -#include "axom/core.hpp" +#include "mfem.hpp" #include "shared/infrastructure/Profiling.hpp" @@ -15,25 +15,23 @@ namespace redecomp { /** - * @brief Creates and manages per-MPI-rank axom::Arrays + * @brief Creates and manages per-MPI-rank arrays * * @tparam T Array data type - * @tparam DIM Array dimension + * @tparam ArrayType Array type on each rank */ -template >> -class MPIArray : public ArrayType { +template > +class MPIArray : public std::vector { public: + typedef ArrayType ArrayT; /** * @brief Construct a new MPIArray object * * @param mpi MPIUtility to define MPI_Comm for MPI operations * @param array Array data */ - MPIArray( const MPIUtility* mpi, const ArrayType& array ) : ArrayType( array ), mpi_{ mpi } + MPIArray( const MPIUtility* mpi, const std::vector& array ) : std::vector( array ), mpi_{ mpi } { - this->reserve( mpi_->NRanks() ); - this->resize( mpi_->NRanks() ); - this->shrink(); } /** @@ -42,11 +40,9 @@ class MPIArray : public ArrayType { * @param mpi MPIUtility to define MPI_Comm for MPI operations * @param array Array data */ - MPIArray( const MPIUtility* mpi, ArrayType&& array ) : ArrayType( std::move( array ) ), mpi_{ mpi } + MPIArray( const MPIUtility* mpi, std::vector&& array ) + : std::vector( std::move( array ) ), mpi_{ mpi } { - this->reserve( mpi_->NRanks() ); - this->resize( mpi_->NRanks() ); - this->shrink(); } /** @@ -54,7 +50,7 @@ class MPIArray : public ArrayType { * * @param mpi MPIUtility to define MPI_Comm for MPI operations */ - MPIArray( const MPIUtility* mpi ) : MPIArray( mpi, ArrayType( 0, 0 ) ) {} + MPIArray( const MPIUtility* mpi ) : MPIArray( mpi, std::vector( mpi->NRanks() ) ) {} /** * @brief Construct an empty MPIArray object (note: object cannot be used) @@ -62,59 +58,65 @@ class MPIArray : public ArrayType { MPIArray() = default; /** - * @brief Returns the axom::Array at the given rank + * @brief Returns the array at the given rank * * @param rank The MPI rank of the array - * @return axom::Array& holding array values at rank + * @return ArrayType reference holding array values at rank */ - axom::Array& at( axom::IndexType rank ) { return this->operator[]( rank ); } + ArrayType& at( axom::IndexType rank ) { return this->operator[]( rank ); } /** - * @brief Returns the axom::Array at the given rank + * @brief Returns the array at the given rank * * @param rank The MPI rank of the array - * @return axom::Array& holding array values at rank + * @return ArrayType reference holding array values at rank */ - const axom::Array& at( axom::IndexType rank ) const { return this->operator[]( rank ); } + const ArrayType& at( axom::IndexType rank ) const { return this->operator[]( rank ); } /** * @brief Sends the Array data to all other MPI ranks * * @param data Data to send to other ranks */ - static void SendAll( const axom::Array& data ) { data.mpi_.SendAll( data ); } + static void SendAll( const ArrayType& data ) { data.mpi_.SendAll( data ); } /** * @brief Receive data sent from a call to MPIArray::SendAll() * * @param src The source rank of the data + * @param use_device Whether to allocate the received array on device */ - void RecvSendAll( axom::IndexType src ) { at( src ) = mpi_->RecvSendAll( type>(), src ); } + void RecvSendAll( axom::IndexType src, bool use_device = false ) + { + at( src ) = mpi_->RecvSendAll( type(), src, use_device ); + } /** * @brief Sends the MPIArray data to all other MPI ranks while receiving from other ranks * * @param data Data to send to other ranks + * @param use_device Whether to allocate the received array on device */ - void SendRecvArrayEach( const MPIArray& data ) + void SendRecvArrayEach( const MPIArray& data, bool use_device = false ) { mpi_->SendRecvEach( - type>(), [data]( axom::IndexType dst ) { return data.at( dst ); }, - [this]( axom::Array&& recv_data, axom::IndexType src ) { at( src ) = std::move( recv_data ); } ); + type(), [data]( axom::IndexType dst ) { return data.at( dst ); }, + [this]( ArrayType&& recv_data, axom::IndexType src ) { at( src ) = std::move( recv_data ); }, use_device ); } /** * @brief Create data to send to all other MPI ranks while receiving from other ranks * - * @param build_send A lambda which returns an axom::Array to send to the input rank + * @param build_send A lambda which returns an ArrayType to send to the input rank + * @param use_device Whether to allocate the received array on device */ template - void SendRecvEach( F&& build_send ) + void SendRecvEach( F&& build_send, bool use_device = false ) { TRIBOL_MARK_FUNCTION; mpi_->SendRecvEach( - type>(), std::forward( build_send ), - [this]( axom::Array&& recv_data, axom::IndexType src ) { at( src ) = std::move( recv_data ); } ); + type(), std::forward( build_send ), + [this]( ArrayType&& recv_data, axom::IndexType src ) { at( src ) = std::move( recv_data ); }, use_device ); } private: diff --git a/src/redecomp/utils/MPIUtility.cpp b/src/redecomp/utils/MPIUtility.cpp index 9553c273..963445fd 100644 --- a/src/redecomp/utils/MPIUtility.cpp +++ b/src/redecomp/utils/MPIUtility.cpp @@ -19,23 +19,6 @@ MPIUtility::Request::Request( std::unique_ptr request ) : request_{ void MPIUtility::Request::Wait() { MPI_Wait( request_.get(), &status_ ); } -BisecTree MPIUtility::BuildSendTree( int rank ) const -{ - auto send_tree = BisecTree( n_ranks_ ); - auto lvl_it = --send_tree.end(); - *lvl_it = ArrayUtility::IndexArray( n_ranks_ ); - while ( lvl_it != send_tree.begin() ) { - --lvl_it; - for ( auto node_it = send_tree.begin( lvl_it ); node_it != send_tree.end( lvl_it ); ++node_it ) { - auto right_node_it = send_tree.right( lvl_it, node_it ); - *node_it = right_node_it.second == send_tree.end( right_node_it.first ) || *right_node_it.second != rank - ? *send_tree.left( lvl_it, node_it ).second - : rank; - } - } - return send_tree; -} - template <> MPI_Datatype MPIUtility::GetMPIType() const { @@ -66,4 +49,65 @@ MPI_Datatype MPIUtility::GetMPIType() const return MPI_LONG; } -} // end namespace redecomp +void MPIUtility::Send( const mfem::Vector& container, int dest, int tag ) const +{ + const void* data = nullptr; + if ( container.UseDevice() && mfem::Device::GetGPUAwareMPI() ) { + data = container.Read(); + } else { + data = container.HostRead(); + } + MPI_Send( data, container.Size(), GetMPIType(), dest, tag, comm_ ); +} + +std::unique_ptr MPIUtility::Isend( const mfem::Vector& container, int dest, int tag ) const +{ + auto request = std::make_unique(); + const void* data = nullptr; + if ( container.UseDevice() && mfem::Device::GetGPUAwareMPI() ) { + data = container.Read(); + } else { + data = container.HostRead(); + } + MPI_Isend( data, container.Size(), GetMPIType(), dest, tag, comm_, request.get() ); + return std::make_unique( std::move( request ) ); +} + +mfem::Vector MPIUtility::Recv( type, int source, int tag, bool use_device ) const +{ + auto container = mfem::Vector(); + if ( use_device && mfem::Device::GetGPUAwareMPI() ) { + container.UseDevice( true ); + } + MPI_Probe( source, tag, comm_, &status_ ); + int count; + MPI_Get_count( &status_, GetMPIType(), &count ); + container.SetSize( count ); + void* data = nullptr; + if ( container.UseDevice() && mfem::Device::GetGPUAwareMPI() ) { + data = container.Write(); + } else { + data = container.HostWrite(); + } + MPI_Recv( data, count, GetMPIType(), source, tag, comm_, &status_ ); + return container; +} + +BisecTree MPIUtility::BuildSendTree( int rank ) const +{ + auto send_tree = BisecTree( n_ranks_ ); + auto lvl_it = --send_tree.end(); + *lvl_it = ArrayUtility::IndexArray( n_ranks_ ); + while ( lvl_it != send_tree.begin() ) { + --lvl_it; + for ( auto node_it = send_tree.begin( lvl_it ); node_it != send_tree.end( lvl_it ); ++node_it ) { + auto right_node_it = send_tree.right( lvl_it, node_it ); + *node_it = right_node_it.second == send_tree.end( right_node_it.first ) || *right_node_it.second != rank + ? *send_tree.left( lvl_it, node_it ).second + : rank; + } + } + return send_tree; +} + +} // end namespace redecomp \ No newline at end of file diff --git a/src/redecomp/utils/MPIUtility.hpp b/src/redecomp/utils/MPIUtility.hpp index 181fd0e8..65c966a7 100644 --- a/src/redecomp/utils/MPIUtility.hpp +++ b/src/redecomp/utils/MPIUtility.hpp @@ -11,6 +11,8 @@ #include +#include "mfem.hpp" + #include "axom/core.hpp" #include "axom/slic.hpp" @@ -126,6 +128,26 @@ class MPIUtility { template void Send( const T& container, int dest, int tag = 0 ) const; + /** + * @brief Calls MPI_Send on an array stored in an mfem::Array + * + * @tparam T Type stored in mfem::Array + * @param container Stores the data to send + * @param dest Rank to send the data in container to + * @param tag MPI tag for identifying the data + */ + template + void Send( const mfem::Array& container, int dest, int tag = 0 ) const; + + /** + * @brief Calls MPI_Send on a vector stored in an mfem::Vector + * + * @param container Stores the data to send + * @param dest Rank to send the data in container to + * @param tag MPI tag for identifying the data + */ + void Send( const mfem::Vector& container, int dest, int tag = 0 ) const; + /** * @brief Calls MPI_Send on an 2D array stored in an axom::Array * @@ -150,6 +172,28 @@ class MPIUtility { template std::unique_ptr Isend( const T& container, int dest, int tag = 0 ) const; + /** + * @brief Calls MPI_Isend (non-blocking send) on an array stored in an mfem::Array + * + * @tparam T Type stored in mfem::Array + * @param container Stores the data to send + * @param dest Rank to send the data in container to + * @param tag MPI tag for identifying the data + * @return Request object to track completion of the send + */ + template + std::unique_ptr Isend( const mfem::Array& container, int dest, int tag = 0 ) const; + + /** + * @brief Calls MPI_Isend (non-blocking send) on a vector stored in an mfem::Vector + * + * @param container Stores the data to send + * @param dest Rank to send the data in container to + * @param tag MPI tag for identifying the data + * @return Request object to track completion of the send + */ + std::unique_ptr Isend( const mfem::Vector& container, int dest, int tag = 0 ) const; + /** * @brief Calls MPI_Isend (non-blocking send) on a 2D axom::Array * @@ -169,10 +213,33 @@ class MPIUtility { * @tparam T Data type of the data container * @param source MPI rank data is coming from * @param tag MPI tag for identifying the data + * @param use_device Whether to allocate the received array on device (if GPU-aware MPI is available) * @return Container of type T holding data sent */ template - T Recv( type, int source, int tag = 0 ) const; + T Recv( type, int source, int tag = 0, bool use_device = false ) const; + + /** + * @brief Calls MPI_Recv on an array stored in an mfem::Array + * + * @tparam T Data type of the data container + * @param source MPI rank data is coming from + * @param tag MPI tag for identifying the data + * @param use_device Whether to allocate the received array on device (if GPU-aware MPI is available) + * @return Container of type mfem::Array holding data sent + */ + template + mfem::Array Recv( type>, int source, int tag = 0, bool use_device = false ) const; + + /** + * @brief Calls MPI_Recv on a vector stored in an mfem::Vector + * + * @param source MPI rank data is coming from + * @param tag MPI tag for identifying the data + * @param use_device Whether to allocate the received vector on device (if GPU-aware MPI is available) + * @return Container of type mfem::Vector holding data sent + */ + mfem::Vector Recv( type, int source, int tag = 0, bool use_device = false ) const; /** * @brief Calls MPI_Recv on an 2D array stored in an axom::Array @@ -181,10 +248,11 @@ class MPIUtility { * @tparam Sp axom::MemorySpace of the axom::Array * @param source MPI rank data is coming from * @param tag MPI tag for identifying the data + * @param use_device Whether to allocate the received array on device (if GPU-aware MPI is available) * @return 2D axom::Array holding the data sent */ template - axom::Array Recv( type>, int source, int tag = 0 ) const; + axom::Array Recv( type>, int source, int tag = 0, bool use_device = false ) const; /** * @brief Sends the array stored in container to all other ranks @@ -200,10 +268,11 @@ class MPIUtility { * * @tparam T Data type of the data container * @param rank Rank the data originated from + * @param use_device Whether to allocate the received array on device * @return Container of type T holding data sent */ template - T RecvSendAll( type, int rank ) const; + T RecvSendAll( type, int rank, bool use_device = false ) const; /** * @brief Sends and receives a different array to each rank @@ -213,9 +282,10 @@ class MPIUtility { * @tparam F2 Lambda with container type T and source rank parameters * @param build_send Builds a container of type T holding data to be sent to destination rank * @param process_recv Process the data received from another rank + * @param use_device Whether to allocate the received array on device */ template - void SendRecvEach( type, F1&& build_send, F2&& process_recv ) const; + void SendRecvEach( type, F1&& build_send, F2&& process_recv, bool use_device = false ) const; private: /** @@ -306,6 +376,18 @@ void MPIUtility::Send( const T& container, int dest, int tag ) const MPI_Send( container.data(), container.size(), GetMPIDatatype( container.data() ), dest, tag, comm_ ); } +template +void MPIUtility::Send( const mfem::Array& container, int dest, int tag ) const +{ + const void* data = nullptr; + if ( container.UseDevice() && mfem::Device::GetGPUAwareMPI() ) { + data = container.Read(); + } else { + data = container.HostRead(); + } + MPI_Send( data, container.Size(), GetMPIType::type>(), dest, tag, comm_ ); +} + template void MPIUtility::Send( const axom::Array& container, int dest, int tag ) const { @@ -321,6 +403,20 @@ std::unique_ptr MPIUtility::Isend( const T& container, int return std::make_unique( std::move( request ) ); } +template +std::unique_ptr MPIUtility::Isend( const mfem::Array& container, int dest, int tag ) const +{ + auto request = std::make_unique(); + const void* data = nullptr; + if ( container.UseDevice() && mfem::Device::GetGPUAwareMPI() ) { + data = container.Read(); + } else { + data = container.HostRead(); + } + MPI_Isend( data, container.Size(), GetMPIType::type>(), dest, tag, comm_, request.get() ); + return std::make_unique( std::move( request ) ); +} + template std::unique_ptr MPIUtility::Isend( const axom::Array& container, int dest, int tag ) const @@ -332,7 +428,7 @@ std::unique_ptr MPIUtility::Isend( const axom::Array -T MPIUtility::Recv( type, int source, int tag ) const +T MPIUtility::Recv( type, int source, int tag, bool ) const { auto container = T(); MPI_Probe( source, tag, comm_, &status_ ); @@ -344,8 +440,29 @@ T MPIUtility::Recv( type, int source, int tag ) const return container; } +template +mfem::Array MPIUtility::Recv( type>, int source, int tag, bool use_device ) const +{ + auto container = mfem::Array(); + if ( use_device && mfem::Device::GetGPUAwareMPI() ) { + container.GetMemory().UseDevice( true ); + } + MPI_Probe( source, tag, comm_, &status_ ); + int count; + MPI_Get_count( &status_, GetMPIType::type>(), &count ); + container.SetSize( count ); + void* data = nullptr; + if ( container.UseDevice() && mfem::Device::GetGPUAwareMPI() ) { + data = container.Write(); + } else { + data = container.HostWrite(); + } + MPI_Recv( data, count, GetMPIType::type>(), source, tag, comm_, &status_ ); + return container; +} + template -axom::Array MPIUtility::Recv( type>, int source, int tag ) const +axom::Array MPIUtility::Recv( type>, int source, int tag, bool ) const { auto container = axom::Array(); axom::StackArray dim_size; @@ -365,7 +482,7 @@ void MPIUtility::SendAll( const T& container ) const } template -T MPIUtility::RecvSendAll( type, int rank ) const +T MPIUtility::RecvSendAll( type, int rank, bool use_device ) const { SLIC_ERROR_IF( rank == my_rank_, "Send and receive rank are the same." ); const auto send_tree = BuildSendTree( rank ); @@ -377,7 +494,7 @@ T MPIUtility::RecvSendAll( type, int rank ) const node_it = it.second; it = send_tree.root( lvl_it, node_it ); } - auto container = Recv( type(), *it.second ); + auto container = Recv( type(), *it.second, 0, use_device ); SendToRest( container, send_tree, lvl_it, node_it ); return container; } @@ -404,7 +521,7 @@ void MPIUtility::SendToRest( const T& container, const BisecTree& send_tree } template -void MPIUtility::SendRecvEach( type, F1&& build_send, F2&& process_recv ) const +void MPIUtility::SendRecvEach( type, F1&& build_send, F2&& process_recv, bool use_device ) const { for ( int i{ 1 }; i < n_ranks_; ++i ) { // compute which rank we are sending and receiving data to @@ -418,7 +535,7 @@ void MPIUtility::SendRecvEach( type, F1&& build_send, F2&& process_recv ) con auto request = Isend( data, dest ); // receive data and process - process_recv( Recv( type(), source ), source ); + process_recv( Recv( type(), source, 0, use_device ), source ); // wait for send to complete request->Wait(); diff --git a/src/tribol/common/BasicTypes.hpp b/src/shared/BasicTypes.hpp similarity index 94% rename from src/tribol/common/BasicTypes.hpp rename to src/shared/BasicTypes.hpp index f8927524..ecc86344 100644 --- a/src/tribol/common/BasicTypes.hpp +++ b/src/shared/BasicTypes.hpp @@ -3,11 +3,11 @@ // // SPDX-License-Identifier: (MIT) -#ifndef SRC_TRIBOL_COMMON_BASICTYPES_HPP_ -#define SRC_TRIBOL_COMMON_BASICTYPES_HPP_ +#ifndef SHARED_BASICTYPES_HPP_ +#define SHARED_BASICTYPES_HPP_ -// Tribol config include -#include "tribol/config.hpp" +// Shared config include +#include "shared/config.hpp" // C includes #include diff --git a/src/shared/CMakeLists.txt b/src/shared/CMakeLists.txt index 54688413..1a63ba5f 100644 --- a/src/shared/CMakeLists.txt +++ b/src/shared/CMakeLists.txt @@ -6,6 +6,9 @@ ## list of headers set(shared_headers + BasicTypes.hpp + ExecModel.hpp + LoopExec.hpp infrastructure/Profiling.hpp mesh/MeshBuilder.hpp ) @@ -20,6 +23,9 @@ set(shared_sources set(shared_depends mfem) blt_list_append(TO shared_depends ELEMENTS blt::mpi IF TRIBOL_USE_MPI ) blt_list_append(TO shared_depends ELEMENTS caliper IF TRIBOL_USE_CALIPER ) +blt_list_append(TO shared_depends ELEMENTS umpire IF TRIBOL_USE_UMPIRE ) +blt_list_append(TO shared_depends ELEMENTS RAJA IF TRIBOL_USE_RAJA ) +blt_list_append(TO shared_depends ELEMENTS ClangEnzymeFlags IF TRIBOL_USE_ENZYME) message(STATUS "Tribol shared library dependencies: ${shared_depends}") ## create the library diff --git a/src/tribol/common/ExecModel.hpp b/src/shared/ExecModel.hpp similarity index 96% rename from src/tribol/common/ExecModel.hpp rename to src/shared/ExecModel.hpp index ef4ebed2..725110d1 100644 --- a/src/tribol/common/ExecModel.hpp +++ b/src/shared/ExecModel.hpp @@ -3,11 +3,11 @@ // // SPDX-License-Identifier: (MIT) -#ifndef SRC_TRIBOL_COMMON_EXECMODEL_HPP_ -#define SRC_TRIBOL_COMMON_EXECMODEL_HPP_ +#ifndef SHARED_EXECMODEL_HPP_ +#define SHARED_EXECMODEL_HPP_ // Tribol includes -#include "tribol/common/BasicTypes.hpp" +#include "shared/BasicTypes.hpp" // Axom includes #include "axom/core/memory_management.hpp" diff --git a/src/tribol/common/LoopExec.hpp b/src/shared/LoopExec.hpp similarity index 98% rename from src/tribol/common/LoopExec.hpp rename to src/shared/LoopExec.hpp index 5edf3bdf..f721ea45 100644 --- a/src/tribol/common/LoopExec.hpp +++ b/src/shared/LoopExec.hpp @@ -3,14 +3,14 @@ // // SPDX-License-Identifier: (MIT) -#ifndef SRC_TRIBOL_COMMON_LOOPEXEC_HPP_ -#define SRC_TRIBOL_COMMON_LOOPEXEC_HPP_ +#ifndef SHARED_LOOPEXEC_HPP_ +#define SHARED_LOOPEXEC_HPP_ // C++ includes #include // Tribol includes -#include "tribol/common/ExecModel.hpp" +#include "shared/ExecModel.hpp" // Axom includes #include "axom/slic.hpp" diff --git a/src/shared/config.hpp.in b/src/shared/config.hpp.in index c421ee4c..2b5716e7 100644 --- a/src/shared/config.hpp.in +++ b/src/shared/config.hpp.in @@ -3,18 +3,25 @@ // // SPDX-License-Identifier: (MIT) -#ifndef SHARED_CONFIG_HPP_ -#define SHARED_CONFIG_HPP_ +#ifndef SRC_SHARED_CONFIG_HPP_ +#define SRC_SHARED_CONFIG_HPP_ /* - * Note: Use only C-style comments in this file - * since it might be included from a C file + * Note: Use only C-style comments in this file since it might be included from a C file */ /* * Build information */ +#cmakedefine TRIBOL_USE_SINGLE_PRECISION #cmakedefine TRIBOL_USE_CALIPER #cmakedefine TRIBOL_USE_MPI +#cmakedefine TRIBOL_USE_UMPIRE +#cmakedefine TRIBOL_USE_RAJA +#cmakedefine TRIBOL_USE_ENZYME +#cmakedefine TRIBOL_USE_CUDA +#cmakedefine TRIBOL_USE_HIP +#cmakedefine TRIBOL_USE_OPENMP +#cmakedefine TRIBOL_USE_GPU_MPI -#endif /* SHARED_CONFIG_HPP_ */ +#endif /* SRC_SHARED_CONFIG_HPP_ */ diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index 5b0a90ac..69156c88 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -52,7 +52,8 @@ foreach( test ${tribol_tests} ) FOLDER tribol/tests ) blt_add_test( NAME ${test_name} - COMMAND ${test_name}_test ) + COMMAND ${test_name}_test + NUM_MPI_TASKS 1 ) if (ENABLE_CUDA) set_target_properties(${test_name}_test PROPERTIES CUDA_SEPARABLE_COMPILATION On) @@ -82,6 +83,7 @@ if ( BUILD_REDECOMP AND TRIBOL_USE_MPI ) redecomp_massmatrix.cpp redecomp_rectmatrix.cpp redecomp_sparsematrix.cpp + redecomp_gpu_mpi.cpp ) set(redecomp_test_depends tribol gtest) @@ -107,6 +109,9 @@ if ( BUILD_REDECOMP AND TRIBOL_USE_MPI ) if (ENABLE_CUDA) set_target_properties(${test_name}_test PROPERTIES CUDA_SEPARABLE_COMPILATION On) + target_compile_options(${test_name}_test PRIVATE + $<$,$>:-G> + ) endif() endforeach() @@ -192,7 +197,8 @@ if( TRIBOL_USE_ENZYME ) FOLDER tribol/tests ) blt_add_test( NAME ${test_name} - COMMAND ${test_name}_test ) + COMMAND ${test_name}_test + NUM_MPI_TASKS 1 ) if (ENABLE_CUDA) set_target_properties(${test_name}_test PROPERTIES CUDA_SEPARABLE_COMPILATION On) diff --git a/src/tests/redecomp_gpu_mpi.cpp b/src/tests/redecomp_gpu_mpi.cpp new file mode 100644 index 00000000..dcc95a9b --- /dev/null +++ b/src/tests/redecomp_gpu_mpi.cpp @@ -0,0 +1,171 @@ +// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and +// other Tribol Project Developers. See the top-level LICENSE file for details. +// +// SPDX-License-Identifier: (MIT) + +#include "shared/config.hpp" + +#include + +#include + +#include "mfem.hpp" +#include "redecomp/redecomp.hpp" + +namespace redecomp { + +#ifdef TRIBOL_USE_GPU_MPI + +class GpuMpiTest : public testing::Test { + protected: + mfem::ParMesh par_mesh_; + std::unique_ptr h1_elems_; + std::unique_ptr par_vector_space_; + std::unique_ptr orig_; + std::unique_ptr final_; + std::unique_ptr redecomp_mesh_; + std::unique_ptr redecomp_vector_space_; + std::unique_ptr xfer_; + + void SetUp() override + { + // Create a simple mesh + mfem::Mesh serial_mesh = mfem::Mesh::MakeCartesian2D( 4, 4, mfem::Element::QUADRILATERAL ); + par_mesh_ = mfem::ParMesh( MPI_COMM_WORLD, serial_mesh ); + + // Setup fields + h1_elems_ = std::make_unique( 1, 2 ); + par_vector_space_ = std::make_unique( &par_mesh_, h1_elems_.get(), 2 ); + orig_ = std::make_unique( par_vector_space_.get() ); + final_ = std::make_unique( par_vector_space_.get() ); + + // Initialize orig with coordinates + par_mesh_.GetNodes( *orig_ ); + + // Move to device (important!) + if ( mfem::Device::Allows( mfem::Backend::CUDA_MASK | mfem::Backend::HIP_MASK ) ) { + orig_->UseDevice( true ); + orig_->Read( true ); // Allocate/Move to device + } + + redecomp_mesh_ = std::make_unique( par_mesh_ ); + redecomp_vector_space_ = std::make_unique( redecomp_mesh_.get(), h1_elems_.get(), 2 ); + xfer_ = std::make_unique( redecomp_vector_space_.get() ); + + // xfer needs to be on device too + if ( mfem::Device::Allows( mfem::Backend::CUDA_MASK | mfem::Backend::HIP_MASK ) ) { + xfer_->UseDevice( true ); + } + } +}; + +TEST_F( GpuMpiTest, VerifyTransfer ) +{ + if ( !mfem::Device::GetGPUAwareMPI() ) { + // If not enabled (maybe runtime doesn't support it), skip + GTEST_SKIP() << "GPU Aware MPI not enabled or supported."; + } + + // Double check that we are actually on device if expected +#if defined( TRIBOL_USE_CUDA ) || defined( TRIBOL_USE_HIP ) + ASSERT_TRUE( orig_->GetMemory().UseDevice() ); +#endif + + auto transfer_map = RedecompTransfer(); + + // Transfer to Serial (Redecomp) + // orig_ is on device. + transfer_map.TransferToSerial( *orig_, *xfer_ ); + + // Transfer back + transfer_map.TransferToParallel( *xfer_, *final_ ); + + // Check error + if ( mfem::Device::Allows( mfem::Backend::CUDA_MASK | mfem::Backend::HIP_MASK ) ) { + final_->UseDevice( true ); + } + + // Verify values (L2 error) + *final_ -= *orig_; + double err = final_->Norml2() / orig_->Norml2(); + EXPECT_LT( err, 1e-12 ); +} + +TEST_F( GpuMpiTest, VerifyDeviceTransferStrict ) +{ + if ( !mfem::Device::GetGPUAwareMPI() ) { + GTEST_SKIP() << "GPU Aware MPI not enabled or supported."; + } + + int rank; + MPI_Comm_rank( MPI_COMM_WORLD, &rank ); + int size; + MPI_Comm_size( MPI_COMM_WORLD, &size ); + if ( size < 2 ) GTEST_SKIP(); + + const auto& mpi = redecomp_mesh_->getMPIUtility(); + + int N = 100; + mfem::Vector send_vec( N ); + send_vec.UseDevice( true ); + send_vec = 1.0; // Device = 1.0 + + // Pollute Host to verify we are reading from Device + { + const double* h_read = send_vec.HostRead(); // Syncs to host (1.0) + double* h_ptr = const_cast( h_read ); + for ( int i = 0; i < N; ++i ) h_ptr[i] = 2.0; // Host = 2.0 + } + // send_vec internal state: ValidDevice=true, ValidHost=true (but we hacked it). + + mfem::Vector recv_vec( N ); + + // Transfer rank 0 -> rank 1 + if ( rank == 0 ) { + mpi.Send( send_vec, 1, 999 ); + } else if ( rank == 1 ) { + recv_vec = mpi.Recv( type(), 0, 999, true ); + } + + MPI_Barrier( MPI_COMM_WORLD ); + + if ( rank == 1 ) { + // Check if we received 1.0 (from Device) or 2.0 (from Host) + const double* res = recv_vec.HostRead(); + EXPECT_DOUBLE_EQ( res[0], 1.0 ); + } +} + +#endif // TRIBOL_USE_GPU_MPI + +} // namespace redecomp + +// Main +#include "axom/slic/core/SimpleLogger.hpp" + +int main( int argc, char* argv[] ) +{ + MPI_Init( &argc, &argv ); + + ::testing::InitGoogleTest( &argc, argv ); + +#if defined( TRIBOL_USE_CUDA ) + mfem::Device device( "cuda" ); + // Enable GPU Aware MPI + mfem::Device::SetGPUAwareMPI( true ); + device.Print(); +#elif defined( TRIBOL_USE_HIP ) + mfem::Device device( "hip" ); + // Enable GPU Aware MPI + mfem::Device::SetGPUAwareMPI( true ); + device.Print(); +#endif + + axom::slic::SimpleLogger logger; + + int result = RUN_ALL_TESTS(); + + MPI_Finalize(); + + return result; +} diff --git a/src/tests/redecomp_multitransfer.cpp b/src/tests/redecomp_multitransfer.cpp index e8baf9db..18692ecf 100644 --- a/src/tests/redecomp_multitransfer.cpp +++ b/src/tests/redecomp_multitransfer.cpp @@ -264,9 +264,19 @@ int main( int argc, char* argv[] ) ::testing::InitGoogleTest( &argc, argv ); +#ifdef TRIBOL_USE_UMPIRE + umpire::ResourceManager::getInstance(); // initialize umpire's ResouceManager +#endif + axom::slic::SimpleLogger logger; // create & initialize test logger, finalized when // exiting main scope +#ifdef TRIBOL_USE_CUDA + mfem::Device device( "cuda" ); +#elif defined( TRIBOL_USE_HIP ) + mfem::Device device( "hip" ); +#endif + result = RUN_ALL_TESTS(); MPI_Finalize(); diff --git a/src/tests/redecomp_transfer.cpp b/src/tests/redecomp_transfer.cpp index f7ef71a4..26f5c170 100644 --- a/src/tests/redecomp_transfer.cpp +++ b/src/tests/redecomp_transfer.cpp @@ -3,6 +3,8 @@ // // SPDX-License-Identifier: (MIT) +#include "tribol/config.hpp" + #include #include "mfem.hpp" @@ -12,8 +14,6 @@ #include "umpire/ResourceManager.hpp" #endif -#include "tribol/config.hpp" - #include "redecomp/redecomp.hpp" namespace redecomp { @@ -138,8 +138,10 @@ int main( int argc, char* argv[] ) axom::slic::SimpleLogger logger; // create & initialize test logger, finalized when // exiting main scope -#ifdef TRIBOL_ENABLE_CUDA +#ifdef TRIBOL_USE_CUDA mfem::Device device( "cuda" ); +#elif defined( TRIBOL_USE_HIP ) + mfem::Device device( "hip" ); #endif result = RUN_ALL_TESTS(); diff --git a/src/tests/tribol_common_plane_gpu.cpp b/src/tests/tribol_common_plane_gpu.cpp index 848dd4fe..3332ac23 100644 --- a/src/tests/tribol_common_plane_gpu.cpp +++ b/src/tests/tribol_common_plane_gpu.cpp @@ -16,7 +16,7 @@ #include "axom/slic.hpp" -#include "tribol/common/ExecModel.hpp" +#include "shared/ExecModel.hpp" #include "tribol/interface/tribol.hpp" namespace tribol { diff --git a/src/tests/tribol_mfem_mortar_lm.cpp b/src/tests/tribol_mfem_mortar_lm.cpp index 3c193e64..98db248b 100644 --- a/src/tests/tribol_mfem_mortar_lm.cpp +++ b/src/tests/tribol_mfem_mortar_lm.cpp @@ -136,7 +136,6 @@ class MfemMortarTest : public testing::TestWithParam binning_methods_{ tribol::BINNING_CARTESIAN_PRODUCT, tribol::BINNING_GRID, tribol::BINNING_BVH }; + /** + * @brief Execution mode for the test + */ + tribol::ExecutionMode exec_mode_; + + void SetUp() override + { +#if defined( TRIBOL_USE_CUDA ) + exec_mode_ = tribol::ExecutionMode::Cuda; +#elif defined( TRIBOL_USE_HIP ) + exec_mode_ = tribol::ExecutionMode::Hip; +#elif defined( TRIBOL_USE_OPENMP ) + exec_mode_ = tribol::ExecutionMode::OpenMP; +#else + exec_mode_ = tribol::ExecutionMode::Sequential; +#endif + } + void UpdateTribol( shared::ParMeshBuilder& mesh, int coupling_scheme_id ) { tribol::updateMfemParallelDecomposition(); @@ -70,7 +88,8 @@ class ProximityTest : public testing::TestWithParam( mesh ), std::get<0>( mesh ).getNodes(), std::get<1>( mesh ), std::get<2>( mesh ), tribol::SURFACE_TO_SURFACE, tribol::NO_CASE, tribol::SINGLE_MORTAR, - tribol::FRICTIONLESS, tribol::LAGRANGE_MULTIPLIER, binning_method ); + tribol::FRICTIONLESS, tribol::LAGRANGE_MULTIPLIER, binning_method, exec_mode_ ); tribol::setLagrangeMultiplierOptions( coupling_scheme_id, tribol::ImplicitEvalMode::MORTAR_RESIDUAL ); tribol::getMfemPressure( coupling_scheme_id ) = 1.0; tribol::setBinningProximityScale( coupling_scheme_id, binning_proximity ); @@ -231,6 +250,8 @@ TEST_P( ProximityTest, CheckForceValues3DCommonPlane ) MPI_Barrier( MPI_COMM_WORLD ); } +#ifdef TRIBOL_USE_HOST +// NOTE: Mortar doesn't work on device TEST_P( ProximityTest, CheckForceValues3DMortar ) { auto should_have_force = std::get<3>( GetParam() ); @@ -250,6 +271,7 @@ TEST_P( ProximityTest, CheckForceValues3DMortar ) MPI_Barrier( MPI_COMM_WORLD ); } +#endif // The parameters for the tuple are: finite element order (int), binning proximity parameter (multiplier of element // length), amount of element interpenetration, and whether or not we expect Tribol to consider the interface pair in @@ -283,6 +305,19 @@ int main( int argc, char* argv[] ) umpire::ResourceManager::getInstance(); // initialize umpire's ResouceManager #endif +#if defined( TRIBOL_USE_CUDA ) + std::string device_str( "cuda" ); +#elif defined( TRIBOL_USE_HIP ) + std::string device_str( "hip" ); +#elif defined( TRIBOL_USE_OPENMP ) + std::string device_str( "omp" ); +#else + std::string device_str( "cpu" ); +#endif + + mfem::Device device( device_str ); + device.Print(); + axom::slic::SimpleLogger logger; // create & initialize test logger, finalized when // exiting main scope diff --git a/src/tribol/CMakeLists.txt b/src/tribol/CMakeLists.txt index 01caae75..b73eb2da 100644 --- a/src/tribol/CMakeLists.txt +++ b/src/tribol/CMakeLists.txt @@ -12,11 +12,8 @@ endif() set(tribol_headers common/ArrayTypes.hpp - common/BasicTypes.hpp common/Containers.hpp common/Enzyme.hpp - common/ExecModel.hpp - common/LoopExec.hpp common/Parameters.hpp geom/CompGeom.hpp diff --git a/src/tribol/common/ArrayTypes.hpp b/src/tribol/common/ArrayTypes.hpp index b36c9549..7d191feb 100644 --- a/src/tribol/common/ArrayTypes.hpp +++ b/src/tribol/common/ArrayTypes.hpp @@ -6,12 +6,13 @@ #ifndef SRC_TRIBOL_COMMON_ARRAYTYPES_HPP_ #define SRC_TRIBOL_COMMON_ARRAYTYPES_HPP_ -// Tribol includes -#include "tribol/common/ExecModel.hpp" - +// Axom includes #include "axom/core/Array.hpp" #include "axom/core/ArrayView.hpp" +// Shared includes +#include "shared/ExecModel.hpp" + namespace tribol { /** diff --git a/src/tribol/common/Containers.hpp b/src/tribol/common/Containers.hpp index e0af5f98..072d57e7 100644 --- a/src/tribol/common/Containers.hpp +++ b/src/tribol/common/Containers.hpp @@ -9,8 +9,8 @@ // C++ includes #include -// Tribol includes -#include "tribol/common/BasicTypes.hpp" +// Shared includes +#include "shared/BasicTypes.hpp" namespace tribol { diff --git a/src/tribol/common/Enzyme.hpp b/src/tribol/common/Enzyme.hpp index a384d9f5..3d4d4593 100644 --- a/src/tribol/common/Enzyme.hpp +++ b/src/tribol/common/Enzyme.hpp @@ -6,9 +6,11 @@ #ifndef SRC_TRIBOL_COMMON_ENZYME_HPP_ #define SRC_TRIBOL_COMMON_ENZYME_HPP_ +// Tribol config include #include "tribol/config.hpp" -#include "tribol/common/BasicTypes.hpp" +// Shared includes +#include "shared/BasicTypes.hpp" #ifdef TRIBOL_USE_ENZYME #include "mfem/general/enzyme.hpp" diff --git a/src/tribol/common/Parameters.hpp b/src/tribol/common/Parameters.hpp index c944a135..42ea7ff7 100644 --- a/src/tribol/common/Parameters.hpp +++ b/src/tribol/common/Parameters.hpp @@ -6,10 +6,8 @@ #ifndef SRC_TRIBOL_COMMON_PARAMETERS_HPP_ #define SRC_TRIBOL_COMMON_PARAMETERS_HPP_ -// Tribol includes -#include "tribol/common/BasicTypes.hpp" - -#include +// Shared includes +#include "shared/BasicTypes.hpp" namespace tribol { diff --git a/src/tribol/config.hpp.in b/src/tribol/config.hpp.in index 8b5c36e8..c0dcb094 100644 --- a/src/tribol/config.hpp.in +++ b/src/tribol/config.hpp.in @@ -6,6 +6,8 @@ #ifndef SRC_TRIBOL_CONFIG_HPP_ #define SRC_TRIBOL_CONFIG_HPP_ +#include "shared/config.hpp" + /* * Note: Use only C-style comments in this file since it might be included from a C file */ @@ -30,14 +32,6 @@ /* * Build information */ -#cmakedefine TRIBOL_USE_SINGLE_PRECISION -#cmakedefine TRIBOL_USE_MPI -#cmakedefine TRIBOL_USE_UMPIRE -#cmakedefine TRIBOL_USE_RAJA -#cmakedefine TRIBOL_USE_ENZYME -#cmakedefine TRIBOL_USE_CUDA -#cmakedefine TRIBOL_USE_HIP -#cmakedefine TRIBOL_USE_OPENMP #cmakedefine BUILD_REDECOMP #endif /* SRC_TRIBOL_CONFIG_HPP_ */ diff --git a/src/tribol/geom/ElementNormal.hpp b/src/tribol/geom/ElementNormal.hpp index 0e9c340d..04d336f4 100644 --- a/src/tribol/geom/ElementNormal.hpp +++ b/src/tribol/geom/ElementNormal.hpp @@ -6,7 +6,8 @@ #ifndef SRC_TRIBOL_GEOM_ELEMENTNORMAL_HPP_ #define SRC_TRIBOL_GEOM_ELEMENTNORMAL_HPP_ -#include "tribol/common/BasicTypes.hpp" +// Shared includes +#include "shared/BasicTypes.hpp" namespace tribol { diff --git a/src/tribol/geom/Hyperplane.hpp b/src/tribol/geom/Hyperplane.hpp index 1f3e016d..86f76d76 100644 --- a/src/tribol/geom/Hyperplane.hpp +++ b/src/tribol/geom/Hyperplane.hpp @@ -6,15 +6,11 @@ #ifndef SRC_TRIBOL_GEOM_HYPERPLANE_HPP_ #define SRC_TRIBOL_GEOM_HYPERPLANE_HPP_ -// Tribol config include -#include "tribol/config.hpp" - // Axom includes #include "axom/primal.hpp" -// Tribol includes -#include "tribol/common/BasicTypes.hpp" -#include "tribol/geom/Vector.hpp" +// Shared includes +#include "shared/BasicTypes.hpp" namespace tribol { diff --git a/src/tribol/interface/mfem_tribol.cpp b/src/tribol/interface/mfem_tribol.cpp index e8d0a842..f13604d3 100644 --- a/src/tribol/interface/mfem_tribol.cpp +++ b/src/tribol/interface/mfem_tribol.cpp @@ -92,7 +92,8 @@ void registerMfemCouplingScheme( IndexT cs_id, int mesh_id_1, int mesh_id_2, con // create pressure field on the parent-linked boundary submesh and // transfer operators to the redecomp level cs.setMfemSubmeshData( std::make_unique( mfem_data->GetSubmesh(), mfem_data->GetLORMesh(), - std::move( pressure_fec ), pressure_vdim ) ); + std::move( pressure_fec ), pressure_vdim, + isOnDevice( exec_mode ) ) ); // set up Jacobian transfer if the coupling scheme requires it auto lm_options = cs.getEnforcementOptions().lm_implicit_options; if ( lm_options.enforcement_option_set && ( lm_options.eval_mode == ImplicitEvalMode::MORTAR_JACOBIAN || diff --git a/src/tribol/interface/mfem_tribol.hpp b/src/tribol/interface/mfem_tribol.hpp index 2a142962..a18d311e 100644 --- a/src/tribol/interface/mfem_tribol.hpp +++ b/src/tribol/interface/mfem_tribol.hpp @@ -13,7 +13,7 @@ #include "mfem.hpp" #include "tribol/common/Parameters.hpp" -#include "tribol/common/ExecModel.hpp" +#include "shared/ExecModel.hpp" namespace tribol { diff --git a/src/tribol/interface/simple_tribol.cpp b/src/tribol/interface/simple_tribol.cpp index 88ddac14..cd1ad2ac 100644 --- a/src/tribol/interface/simple_tribol.cpp +++ b/src/tribol/interface/simple_tribol.cpp @@ -3,21 +3,20 @@ // // SPDX-License-Identifier: (MIT) -#include "tribol.hpp" - -// Tribol includes -#include "tribol/common/ExecModel.hpp" -#include "tribol/common/Parameters.hpp" #include "tribol/interface/simple_tribol.hpp" +// C/C++ includes +#include + // Axom includes -#include "axom/core.hpp" #include "axom/slic.hpp" -// C/C++ includes -#include -#include -#include +// Shared includes +#include "shared/ExecModel.hpp" + +// Tribol includes +#include "tribol.hpp" +#include "tribol/common/Parameters.hpp" //------------------------------------------------------------------------------ // Interface Implementation diff --git a/src/tribol/interface/tribol.hpp b/src/tribol/interface/tribol.hpp index 4ac9e3be..500fad9e 100644 --- a/src/tribol/interface/tribol.hpp +++ b/src/tribol/interface/tribol.hpp @@ -6,15 +6,19 @@ #ifndef SRC_TRIBOL_INTERFACE_TRIBOL_HPP_ #define SRC_TRIBOL_INTERFACE_TRIBOL_HPP_ -#include "tribol/common/ExecModel.hpp" -#include "tribol/common/ArrayTypes.hpp" -#include "tribol/common/Parameters.hpp" - +// C++ includes #include // MFEM includes #include "mfem.hpp" +// Shared includes +#include "shared/ExecModel.hpp" + +// Tribol includes +#include "tribol/common/ArrayTypes.hpp" +#include "tribol/common/Parameters.hpp" + namespace tribol { /// \name Contact Library Initialization methods diff --git a/src/tribol/mesh/CouplingScheme.cpp b/src/tribol/mesh/CouplingScheme.cpp index bcce23a5..b2abf86d 100644 --- a/src/tribol/mesh/CouplingScheme.cpp +++ b/src/tribol/mesh/CouplingScheme.cpp @@ -5,8 +5,20 @@ #include "tribol/mesh/CouplingScheme.hpp" +// C++ includes +#include + +// MFEM includes +#include "mfem.hpp" + +// Axom includes +#include "axom/slic.hpp" + +// Shared includes +#include "shared/ExecModel.hpp" +#include "shared/LoopExec.hpp" + // Tribol includes -#include "tribol/common/ExecModel.hpp" #include "tribol/geom/ElementNormal.hpp" #include "tribol/geom/GeomUtilities.hpp" #include "tribol/mesh/MethodCouplingData.hpp" @@ -18,15 +30,6 @@ #include "tribol/physics/Physics.hpp" #include "tribol/integ/FE.hpp" -// Axom includes -#include "axom/slic.hpp" - -// MFEM includes -#include "mfem.hpp" - -// C++ includes -#include - namespace tribol { //------------------------------------------------------------------------------ diff --git a/src/tribol/mesh/CouplingScheme.hpp b/src/tribol/mesh/CouplingScheme.hpp index 35b2be63..f45bd2fa 100644 --- a/src/tribol/mesh/CouplingScheme.hpp +++ b/src/tribol/mesh/CouplingScheme.hpp @@ -5,9 +5,14 @@ #ifndef SRC_TRIBOL_MESH_COUPLINGSCHEME_HPP_ #define SRC_TRIBOL_MESH_COUPLINGSCHEME_HPP_ +// Tribol config include +#include "tribol/config.hpp" + +// Shared includes +#include "shared/BasicTypes.hpp" +#include "shared/ExecModel.hpp" + // Tribol includes -#include "tribol/common/BasicTypes.hpp" -#include "tribol/common/ExecModel.hpp" #include "tribol/common/Parameters.hpp" #include "tribol/mesh/MeshData.hpp" #include "tribol/mesh/MethodCouplingData.hpp" @@ -15,15 +20,11 @@ #include "tribol/utils/DataManager.hpp" #include "tribol/mesh/InterfacePairs.hpp" #include "tribol/geom/CompGeom.hpp" -#include "tribol/geom/ElementNormal.hpp" #ifdef TRIBOL_USE_ENZYME #include "tribol/geom/NodalNormal.hpp" #endif -// Axom includes -#include "axom/core.hpp" - namespace tribol { // Struct to hold on-rank coupling scheme face-pair reporting data // generated from computational geometry issues diff --git a/src/tribol/mesh/InterfacePairs.hpp b/src/tribol/mesh/InterfacePairs.hpp index 6c7aa4e2..aef0652a 100644 --- a/src/tribol/mesh/InterfacePairs.hpp +++ b/src/tribol/mesh/InterfacePairs.hpp @@ -6,7 +6,8 @@ #ifndef SRC_TRIBOL_MESH_INTERFACE_PAIRS_HPP_ #define SRC_TRIBOL_MESH_INTERFACE_PAIRS_HPP_ -#include "tribol/common/BasicTypes.hpp" +// Shared includes +#include "shared/BasicTypes.hpp" namespace tribol { diff --git a/src/tribol/mesh/MeshData.cpp b/src/tribol/mesh/MeshData.cpp index a93bbc9f..ed12b61b 100644 --- a/src/tribol/mesh/MeshData.cpp +++ b/src/tribol/mesh/MeshData.cpp @@ -3,20 +3,24 @@ // // SPDX-License-Identifier: (MIT) +#include "tribol/mesh/MeshData.hpp" + +// C++ includes #include #include -#include -#include -#include - -#include "tribol/mesh/MeshData.hpp" -#include "tribol/common/ExecModel.hpp" -#include "tribol/geom/ElementNormal.hpp" -#include "tribol/utils/Math.hpp" +// Axom includes #include "axom/slic.hpp" #include "axom/fmt.hpp" +// Shared includes +#include "shared/ExecModel.hpp" +#include "shared/LoopExec.hpp" + +// Tribol includes +#include "tribol/geom/ElementNormal.hpp" +#include "tribol/utils/Math.hpp" + namespace tribol { //------------------------------------------------------------------------------ diff --git a/src/tribol/mesh/MeshData.hpp b/src/tribol/mesh/MeshData.hpp index 6992d9d6..deecd002 100644 --- a/src/tribol/mesh/MeshData.hpp +++ b/src/tribol/mesh/MeshData.hpp @@ -6,12 +6,15 @@ #ifndef SRC_TRIBOL_MESH_MESHDATA_HPP_ #define SRC_TRIBOL_MESH_MESHDATA_HPP_ +// C++ includes #include +// Shared includes +#include "shared/ExecModel.hpp" + +// Tribol includes #include "tribol/common/ArrayTypes.hpp" -#include "tribol/common/LoopExec.hpp" #include "tribol/common/Parameters.hpp" -#include "tribol/geom/ElementNormal.hpp" #include "tribol/utils/DataManager.hpp" namespace tribol { diff --git a/src/tribol/mesh/MfemData.cpp b/src/tribol/mesh/MfemData.cpp index 47671456..8f315f7c 100644 --- a/src/tribol/mesh/MfemData.cpp +++ b/src/tribol/mesh/MfemData.cpp @@ -9,9 +9,10 @@ #ifdef BUILD_REDECOMP -#include "axom/slic.hpp" +#include "axom/slic/interface/slic_macros.hpp" #include "shared/infrastructure/Profiling.hpp" +#include "shared/LoopExec.hpp" #include "redecomp/utils/ArrayUtility.hpp" @@ -33,8 +34,6 @@ void SubmeshLORTransfer::TransferToLORGridFn( const mfem::ParGridFunction& subme void SubmeshLORTransfer::TransferFromLORVector( mfem::Vector& submesh_dst ) const { // make sure host data is up to date. this transfer needs to be on the host until submesh supports device transfer - lor_gridfn_->HostRead(); - submesh_dst.HostWrite(); lor_xfer_.ForwardOperator().MultTranspose( *lor_gridfn_, submesh_dst ); } @@ -42,8 +41,6 @@ void SubmeshLORTransfer::SubmeshToLOR( const mfem::ParGridFunction& submesh_src, { TRIBOL_MARK_FUNCTION; // make sure host data is up to date. this transfer needs to be on the host until submesh supports device transfer - submesh_src.HostRead(); - lor_dst.HostWrite(); lor_xfer_.ForwardOperator().Mult( submesh_src, lor_dst ); } @@ -53,9 +50,6 @@ std::unique_ptr SubmeshLORTransfer::CreateLORGridFunction auto lor_gridfn = std::make_unique( new mfem::ParFiniteElementSpace( &lor_mesh, lor_fec.get(), vdim, mfem::Ordering::byNODES ) ); lor_gridfn->MakeOwner( lor_fec.release() ); - // NOTE: This needs to be false until submesh supports device transfer. Otherwise, there will be extra copies to/from - // device. - lor_gridfn->UseDevice( false ); return lor_gridfn; } @@ -114,19 +108,16 @@ void SubmeshRedecompTransfer::RedecompToSubmesh( const mfem::GridFunction& redec // P_I is the row index vector on the MFEM prolongation matrix. If there are no column entries for the row, then the // DOF is owned by another rank. - auto dst_data = dst_ptr->HostWrite(); - auto P_I = - mfem::Read( dst_fespace_ptr->Dof_TrueDof_Matrix()->GetDiagMemoryI(), dst_fespace_ptr->GetVSize() + 1, false ); - HYPRE_Int tdof_ct{ 0 }; - // TODO: Convert to mfem::forall() once submesh transfers on device and once GPU-enabled MPI is in redecomp (dst_data - // is always on host now so not needed yet) - for ( int i{ 0 }; i < dst_fespace_ptr->GetVSize(); ++i ) { - if ( P_I[i + 1] != tdof_ct ) { - ++tdof_ct; - } else { + auto dst_data = dst_ptr->ReadWrite( dst_ptr->UseDevice() ); + auto P_I = mfem::Read( dst_fespace_ptr->Dof_TrueDof_Matrix()->GetDiagMemoryI(), dst_fespace_ptr->GetVSize() + 1, + dst_ptr->UseDevice() ); + // set non-owned DOF values to zero. + // P_I[i+1] == P_I[i] implies no diagonal entry, so the DOF is not owned. + mfem::forall_switch( dst_ptr->UseDevice(), dst_fespace_ptr->GetVSize(), [=] MFEM_HOST_DEVICE( int i ) { + if ( P_I[i + 1] == P_I[i] ) { dst_data[i] = 0.0; } - } + } ); // if using LOR, transfer data from LOR mesh to submesh if ( submesh_lor_xfer_ ) { submesh_lor_xfer_->TransferFromLORVector( submesh_dst ); @@ -287,7 +278,7 @@ PressureField::UpdateData::UpdateData( SubmeshRedecompTransfer& submesh_redecomp const mfem::ParGridFunction& submesh_gridfn ) : submesh_redecomp_xfer_{ submesh_redecomp_xfer }, redecomp_gridfn_{ &submesh_redecomp_xfer.GetRedecompFESpace() } { - // keep on host since tribol does computations there + // keep on host since tribol always does mortar computations there (update when mortar is on gpu) redecomp_gridfn_.UseDevice( false ); redecomp_gridfn_ = 0.0; submesh_redecomp_xfer_.SubmeshToRedecomp( submesh_gridfn, redecomp_gridfn_ ); @@ -318,9 +309,6 @@ MfemMeshData::MfemMeshData( IndexT mesh_id_1, IndexT mesh_id_2, const mfem::ParM submesh_xfer_gridfn_.SetSpace( new mfem::ParFiniteElementSpace( &submesh_, submesh_fec.get(), current_coords.ParFESpace()->GetVDim(), mfem::Ordering::byNODES ) ); submesh_xfer_gridfn_.MakeOwner( submesh_fec.release() ); - // NOTE: This needs to be on host until the submesh transfer supports device. Otherwise, there will be extra - // transfers to/from device. - submesh_xfer_gridfn_.UseDevice( false ); // build LOR submesh if ( current_coords.FESpace()->FEColl()->GetOrder() > 1 ) { @@ -356,12 +344,12 @@ bool MfemMeshData::UpdateMfemMeshData( RealT binning_proximity_scale, int n_rank // compute max displacement change auto& current_coords_gf = coords_.GetParentGridFn(); // Use inf-norm of coordinate differences as a proxy for max displacement change. - const RealT* d_curr = current_coords_gf.Read(); - const RealT* d_last = coords_at_last_redecomp_.Read(); + const RealT* d_curr = current_coords_gf.Read( use_device_ ); + const RealT* d_last = coords_at_last_redecomp_.Read( use_device_ ); mfem::Vector max_diff( 1 ); max_diff.UseDevice( use_device_ ); max_diff = 0.0; - RealT* d_max_diff = max_diff.Write(); + RealT* d_max_diff = max_diff.Write( use_device_ ); forAllExec( exec_mode_, current_coords_gf.Size(), [d_curr, d_last, d_max_diff] TRIBOL_HOST_DEVICE( int i ) { #ifdef TRIBOL_USE_RAJA RAJA::atomicMax( d_max_diff, std::abs( d_curr[i] - d_last[i] ) ); @@ -784,11 +772,13 @@ void MfemMeshData::UpdateData::SetElementData() } MfemSubmeshData::MfemSubmeshData( mfem::ParSubMesh& submesh, mfem::ParMesh* lor_mesh, - std::unique_ptr pressure_fec, int pressure_vdim ) + std::unique_ptr pressure_fec, int pressure_vdim, + bool use_device ) : submesh_pressure_{ new mfem::ParFiniteElementSpace( &submesh, pressure_fec.get(), pressure_vdim ) }, pressure_{ submesh_pressure_ }, submesh_lor_xfer_{ lor_mesh ? std::make_unique( *submesh_pressure_.ParFESpace(), *lor_mesh ) - : nullptr } + : nullptr }, + use_device_{ use_device } { submesh_pressure_.MakeOwner( pressure_fec.release() ); submesh_pressure_ = 0.0; @@ -802,6 +792,7 @@ void MfemSubmeshData::UpdateMfemSubmeshData( redecomp::RedecompMesh& redecomp_me } pressure_.UpdateField( update_data_->pressure_xfer_ ); redecomp_gap_.SetSpace( pressure_.GetRedecompGridFn().FESpace() ); + redecomp_gap_.UseDevice( use_device_ ); redecomp_gap_ = 0.0; } @@ -986,12 +977,9 @@ std::unique_ptr MfemJacobianData::GetMfemBlockJacobian( con auto J_idx = redecomp::MPIArray( &mpi ); auto est_num_J = submesh_J.NumNonZeroElems() / mpi.NRanks(); for ( int r{}; r < mpi.NRanks(); ++r ) { - if ( r == mpi.MyRank() ) { - send_J_by_rank[r].shrink(); - J_idx[r].shrink(); - } else { - send_J_by_rank[r].reserve( est_num_J ); - J_idx[r].reserve( est_num_J ); + if ( r != mpi.MyRank() ) { + send_J_by_rank[r].Reserve( est_num_J ); + J_idx[r].Reserve( est_num_J ); } } for ( int j{}; j < submesh_J.NumNonZeroElems(); ++j ) { @@ -1019,7 +1007,7 @@ std::unique_ptr MfemJacobianData::GetMfemBlockJacobian( con // step 4) send the updated parent J values back and update the J vector send_J_by_rank.SendRecvArrayEach( recv_J_by_rank ); for ( int r{}; r < mpi.NRanks(); ++r ) { - for ( int j{}; j < send_J_by_rank[r].size(); ++j ) { + for ( int j{}; j < send_J_by_rank[r].Size(); ++j ) { J[J_idx[r][j]] = send_J_by_rank[r][j]; } } diff --git a/src/tribol/mesh/MfemData.hpp b/src/tribol/mesh/MfemData.hpp index de2846d8..f6623411 100644 --- a/src/tribol/mesh/MfemData.hpp +++ b/src/tribol/mesh/MfemData.hpp @@ -11,15 +11,23 @@ #ifdef BUILD_REDECOMP +// C++ includes #include #include +// MFEM includes #include "mfem.hpp" +// Axom includes #include "axom/core.hpp" + +// Shared includes +#include "shared/BasicTypes.hpp" + +// Redecomp includes #include "redecomp/redecomp.hpp" -#include "tribol/common/BasicTypes.hpp" +// Tribol includes #include "tribol/common/Parameters.hpp" #include "tribol/mesh/MethodCouplingData.hpp" @@ -1472,9 +1480,10 @@ class MfemSubmeshData { * otherwise) * @param pressure_fec Finite element collection of the pressure field * @param pressure_vdim Vector dimension of the pressure field + * @param use_device Whether to use device memory */ MfemSubmeshData( mfem::ParSubMesh& submesh, mfem::ParMesh* lor_mesh, - std::unique_ptr pressure_fec, int pressure_vdim ); + std::unique_ptr pressure_fec, int pressure_vdim, bool use_device ); /** * @brief Build a new transfer operator and update redecomp-level grid @@ -1627,6 +1636,11 @@ class MfemSubmeshData { * @brief UpdateData object created upon call to UpdateSubmeshData() */ std::unique_ptr update_data_; + + /** + * @brief Whether to use device memory for MFEM data + */ + bool use_device_; }; /** diff --git a/src/tribol/physics/CommonPlane.cpp b/src/tribol/physics/CommonPlane.cpp index 1aaf5b59..cff698b4 100644 --- a/src/tribol/physics/CommonPlane.cpp +++ b/src/tribol/physics/CommonPlane.cpp @@ -5,15 +5,14 @@ #include "CommonPlane.hpp" +#include "shared/LoopExec.hpp" + #include "tribol/mesh/MethodCouplingData.hpp" -#include "tribol/mesh/InterfacePairs.hpp" #include "tribol/mesh/CouplingScheme.hpp" #include "tribol/geom/CompGeom.hpp" -#include "tribol/geom/GeomUtilities.hpp" #include "tribol/common/Parameters.hpp" #include "tribol/integ/Integration.hpp" #include "tribol/integ/FE.hpp" -#include "tribol/utils/ContactPlaneOutput.hpp" #include "tribol/utils/Math.hpp" namespace tribol { diff --git a/src/tribol/search/InterfacePairFinder.cpp b/src/tribol/search/InterfacePairFinder.cpp index d2476e01..35fde2b7 100644 --- a/src/tribol/search/InterfacePairFinder.cpp +++ b/src/tribol/search/InterfacePairFinder.cpp @@ -5,19 +5,23 @@ #include "tribol/search/InterfacePairFinder.hpp" -#include "tribol/common/ExecModel.hpp" +// Axom includes +#include "axom/slic.hpp" +#include "axom/primal.hpp" +#include "axom/spin.hpp" + +// Shared includes +#include "shared/ExecModel.hpp" +#include "shared/LoopExec.hpp" + +// Tribol includes #include "tribol/common/Parameters.hpp" -#include "tribol/geom/GeomUtilities.hpp" #include "tribol/mesh/CouplingScheme.hpp" #include "tribol/mesh/MeshData.hpp" #include "tribol/mesh/InterfacePairs.hpp" #include "tribol/utils/Algorithm.hpp" #include "tribol/utils/Math.hpp" -#include "axom/slic.hpp" -#include "axom/primal.hpp" -#include "axom/spin.hpp" - // Define some namespace aliases to help with axom usage namespace primal = axom::primal; namespace spin = axom::spin; diff --git a/src/tribol/utils/ContactPlaneOutput.hpp b/src/tribol/utils/ContactPlaneOutput.hpp index 18982d33..1729a036 100644 --- a/src/tribol/utils/ContactPlaneOutput.hpp +++ b/src/tribol/utils/ContactPlaneOutput.hpp @@ -6,12 +6,15 @@ #ifndef SRC_TRIBOL_UTILS_CONTACTPLANEOUTPUT_HPP_ #define SRC_TRIBOL_UTILS_CONTACTPLANEOUTPUT_HPP_ -#include "tribol/common/BasicTypes.hpp" -#include "tribol/common/Parameters.hpp" - // C++ includes #include +// Shared includes +#include "shared/BasicTypes.hpp" + +// Tribol includes +#include "tribol/common/Parameters.hpp" + namespace tribol { // forward declarations diff --git a/src/tribol/utils/DataManager.hpp b/src/tribol/utils/DataManager.hpp index 095c5270..795d9a0c 100644 --- a/src/tribol/utils/DataManager.hpp +++ b/src/tribol/utils/DataManager.hpp @@ -6,11 +6,11 @@ #ifndef SRC_TRIBOL_UTILS_DATAMANAGER_HPP_ #define SRC_TRIBOL_UTILS_DATAMANAGER_HPP_ -// C/C++ includes +// C++ includes #include -// Tribol includes -#include "tribol/common/BasicTypes.hpp" +// Shared includes +#include "shared/BasicTypes.hpp" // Axom includes #include "axom/fmt.hpp" diff --git a/src/tribol/utils/Math.cpp b/src/tribol/utils/Math.cpp index 6754b4d2..88811f66 100644 --- a/src/tribol/utils/Math.cpp +++ b/src/tribol/utils/Math.cpp @@ -5,11 +5,17 @@ #include "tribol/utils/Math.hpp" +// C++ includes +#include + +// Axom includes +#include "axom/slic.hpp" + namespace tribol { TRIBOL_HOST_DEVICE RealT magnitude( RealT const vx, RealT const vy, RealT const vz ) { - return sqrt( vx * vx + vy * vy + vz * vz ); + return std::sqrt( vx * vx + vy * vy + vz * vz ); } //------------------------------------------------------------------------------ @@ -19,7 +25,7 @@ RealT magnitude( RealT const* const v, int const dim ) for ( int i = 0; i < dim; ++i ) { mag += v[i] * v[i]; } - return sqrt( mag ); + return std::sqrt( mag ); } //------------------------------------------------------------------------------ @@ -93,7 +99,7 @@ int binary_search( const int* const array, const int n, const int val ) template void swap_val( T* xp, T* yp ) { - int temp = *xp; + T temp = *xp; *xp = *yp; *yp = temp; } diff --git a/src/tribol/utils/Math.hpp b/src/tribol/utils/Math.hpp index ffb0d799..e91c768f 100644 --- a/src/tribol/utils/Math.hpp +++ b/src/tribol/utils/Math.hpp @@ -9,9 +9,11 @@ // C++ includes #include +// Axom includes #include "axom/slic.hpp" -#include "tribol/common/BasicTypes.hpp" +// Shared includes +#include "shared/BasicTypes.hpp" namespace tribol { @@ -26,7 +28,7 @@ TRIBOL_HOST_DEVICE inline RealT magnitude( RealT const vx, ///< [in] x-componen RealT const vy ///< [in] y-component of the input vector ) { - return sqrt( vx * vx + vy * vy ); + return std::sqrt( vx * vx + vy * vy ); } /// returns the dot product of two vectors @@ -103,10 +105,10 @@ void allocArray( T** arr, int length, T init_val ); /// allocate and initialize an array of booleans void allocBoolArray( bool** arr, int length, bool init_val ); -/// initialize a array of reals +/// initialize an array of reals TRIBOL_HOST_DEVICE inline void initRealArray( RealT* arr, int length, RealT init_val ) { -#if defined( TRIBOL_USE_HOST ) && !defined( TRIBOL_USE_ENZYME ) +#if !defined( TRIBOL_DEVICE_CODE ) && !defined( TRIBOL_USE_ENZYME ) SLIC_ERROR_IF( arr == nullptr, "initRealArray(): " << "input pointer to array is null." ); #endif @@ -118,7 +120,7 @@ TRIBOL_HOST_DEVICE inline void initRealArray( RealT* arr, int length, RealT init /// initialize a array of integers TRIBOL_HOST_DEVICE inline void initIntArray( int* arr, int length, int init_val ) { -#if defined( TRIBOL_USE_HOST ) && !defined( TRIBOL_USE_ENZYME ) +#if !defined( TRIBOL_DEVICE_CODE ) && !defined( TRIBOL_USE_ENZYME ) SLIC_ERROR_IF( arr == nullptr, "initIntArray(): " << "input pointer to array is null." ); #endif for ( int i = 0; i < length; ++i ) { @@ -133,7 +135,7 @@ TRIBOL_HOST_DEVICE void initArray( T* arr, int length, T init_val ); /// initialize a array of booleans TRIBOL_HOST_DEVICE inline void initBoolArray( bool* arr, int length, bool init_val ) { -#if defined( TRIBOL_USE_HOST ) && !defined( TRIBOL_USE_ENZYME ) +#if !defined( TRIBOL_DEVICE_CODE ) && !defined( TRIBOL_USE_ENZYME ) SLIC_ERROR_IF( arr == nullptr, "initBoolArray(): " << "input pointer to array is null." ); #endif for ( int i = 0; i < length; ++i ) {