From ee36032549282a7e2368dc6a4de1d2301bd09fab Mon Sep 17 00:00:00 2001 From: Rohit Kakodkar Date: Mon, 17 Feb 2025 12:32:28 -0500 Subject: [PATCH 1/9] Updated iglob ordering --- src/compute/compute_mesh.cpp | 93 ++++++++++++++++++++++-------------- 1 file changed, 58 insertions(+), 35 deletions(-) diff --git a/src/compute/compute_mesh.cpp b/src/compute/compute_mesh.cpp index 1885333b..6fefcd81 100644 --- a/src/compute/compute_mesh.cpp +++ b/src/compute/compute_mesh.cpp @@ -50,13 +50,32 @@ assign_numbering(specfem::kokkos::HostView4d global_coordinates) { std::vector cart_cord(nspec * ngllxz); - for (int ispec = 0; ispec < nspec; ispec++) { - for (int iz = 0; iz < ngll; iz++) { - for (int ix = 0; ix < ngll; ix++) { - int iloc = ix * nspec * ngll + iz * nspec + ispec; - cart_cord[iloc].x = global_coordinates(ispec, iz, ix, 0); - cart_cord[iloc].z = global_coordinates(ispec, iz, ix, 1); - cart_cord[iloc].iloc = iloc; + using simd = specfem::datatype::simd; + +#ifdef KOKKOS_ENABLE_CUDA + constexpr int chunk_size = 32 * simd::size(); +#endif + +#ifdef KOKKOS_ENABLE_OPENMP + constexpr int chunk_size = 1 * simd::size(); +#endif + +#ifdef KOKKOS_ENABLE_SERIAL + constexpr int chunk_size = 1 * simd::size(); +#endif + + int nchunks = nspec / chunk_size; + int iloc = 0; + for (int ichunk = 0; ichunk < nchunks; ichunk++) { + for (int ix = 0; ix < ngll; ix++) { + for (int iz = 0; iz < ngll; iz++) { + for (int ielement = 0; ielement < chunk_size; ielement++) { + int ispec = ichunk * chunk_size + ielement; + cart_cord[iloc].x = global_coordinates(ispec, iz, ix, 0); + cart_cord[iloc].z = global_coordinates(ispec, iz, ix, 1); + cart_cord[iloc].iloc = iloc; + iloc++; + } } } } @@ -101,40 +120,44 @@ assign_numbering(specfem::kokkos::HostView4d global_coordinates) { // Assign numbering to corresponding ispec, iz, ix std::vector iglob_counted(nglob, -1); - int iloc = 0; + iloc = 0; int inum = 0; type_real xmin = std::numeric_limits::max(); type_real xmax = std::numeric_limits::min(); type_real zmin = std::numeric_limits::max(); type_real zmax = std::numeric_limits::min(); - for (int ix = 0; ix < ngll; ix++) { - for (int iz = 0; iz < ngll; iz++) { - for (int ispec = 0; ispec < nspec; ispec++) { - if (iglob_counted[copy_cart_cord[iloc].iglob] == -1) { - - const type_real x_cor = copy_cart_cord[iloc].x; - const type_real z_cor = copy_cart_cord[iloc].z; - if (xmin > x_cor) - xmin = x_cor; - if (zmin > z_cor) - zmin = z_cor; - if (xmax < x_cor) - xmax = x_cor; - if (zmax < z_cor) - zmax = z_cor; - - iglob_counted[copy_cart_cord[iloc].iglob] = inum; - points.h_index_mapping(ispec, iz, ix) = inum; - points.h_coord(0, ispec, iz, ix) = x_cor; - points.h_coord(1, ispec, iz, ix) = z_cor; - inum++; - } else { - points.h_index_mapping(ispec, iz, ix) = - iglob_counted[copy_cart_cord[iloc].iglob]; - points.h_coord(0, ispec, iz, ix) = copy_cart_cord[iloc].x; - points.h_coord(1, ispec, iz, ix) = copy_cart_cord[iloc].z; + + for (int ichunk = 0; ichunk < nchunks; ichunk++) { + for (int ix = 0; ix < ngll; ix++) { + for (int iz = 0; iz < ngll; iz++) { + for (int ielement = 0; ielement < chunk_size; ielement++) { + int ispec = ichunk * chunk_size + ielement; + if (iglob_counted[copy_cart_cord[iloc].iglob] == -1) { + + const type_real x_cor = copy_cart_cord[iloc].x; + const type_real z_cor = copy_cart_cord[iloc].z; + if (xmin > x_cor) + xmin = x_cor; + if (zmin > z_cor) + zmin = z_cor; + if (xmax < x_cor) + xmax = x_cor; + if (zmax < z_cor) + zmax = z_cor; + + iglob_counted[copy_cart_cord[iloc].iglob] = inum; + points.h_index_mapping(ispec, iz, ix) = inum; + points.h_coord(0, ispec, iz, ix) = x_cor; + points.h_coord(1, ispec, iz, ix) = z_cor; + inum++; + } else { + points.h_index_mapping(ispec, iz, ix) = + iglob_counted[copy_cart_cord[iloc].iglob]; + points.h_coord(0, ispec, iz, ix) = copy_cart_cord[iloc].x; + points.h_coord(1, ispec, iz, ix) = copy_cart_cord[iloc].z; + } + iloc++; } - iloc++; } } } From 2fc7e46083fe04900c63846c8514cf94b3b667a6 Mon Sep 17 00:00:00 2001 From: Rohit Kakodkar Date: Mon, 17 Feb 2025 16:56:39 -0500 Subject: [PATCH 2/9] Fixed global numbering scheme. --- include/compute/fields/data_access.tpp | 4 +- include/compute/fields/impl/field_impl.tpp | 47 +++++++++++++++------- 2 files changed, 35 insertions(+), 16 deletions(-) diff --git a/include/compute/fields/data_access.tpp b/include/compute/fields/data_access.tpp index 1fc1cae9..1b976849 100644 --- a/include/compute/fields/data_access.tpp +++ b/include/compute/fields/data_access.tpp @@ -1902,7 +1902,7 @@ template < typename ViewType, typename std::enable_if_t< ViewType::isChunkFieldType && !ViewType::simd::using_simd, int> = 0> -KOKKOS_FORCEINLINE_FUNCTION void +NOINLINE KOKKOS_FUNCTION void impl_load_on_device(const MemberType &team, const IteratorType &iterator, const WavefieldType &field, ViewType &chunk_field) { @@ -1980,7 +1980,7 @@ template < typename ViewType, typename std::enable_if_t< ViewType::isChunkFieldType && ViewType::simd::using_simd, int> = 0> -KOKKOS_FORCEINLINE_FUNCTION void +NOINLINE KOKKOS_FUNCTION void impl_load_on_device(const MemberType &team, const IteratorType &iterator, const WavefieldType &field, ViewType &chunk_field) { diff --git a/include/compute/fields/impl/field_impl.tpp b/include/compute/fields/impl/field_impl.tpp index ebb9db43..c7d4e8ea 100644 --- a/include/compute/fields/impl/field_impl.tpp +++ b/include/compute/fields/impl/field_impl.tpp @@ -1,8 +1,8 @@ #ifndef _COMPUTE_FIELDS_IMPL_FIELD_IMPL_TPP_ #define _COMPUTE_FIELDS_IMPL_FIELD_IMPL_TPP_ -#include "compute/fields/impl/field_impl.hpp" #include "compute/element_types/element_types.hpp" +#include "compute/fields/impl/field_impl.hpp" #include "kokkos_abstractions.h" #include @@ -35,19 +35,38 @@ specfem::compute::impl::field_impl::field_impl( // Count the total number of distinct global indices for the medium int count = 0; - - for (int ix = 0; ix < ngllx; ++ix) { - for (int iz = 0; iz < ngllz; ++iz) { - for (int ispec = 0; ispec < nspec; ++ispec) { - const auto medium = element_types.get_medium_tag(ispec); - if (medium == MediumTag) { - const int index = index_mapping(ispec, iz, ix); // get global index - // increase the count only if the global index is not already counted - /// static_cast(medium::value) is the index of the medium in the - /// enum class - if (assembly_index_mapping(index) == -1) { - assembly_index_mapping(index) = count; - count++; + using simd = specfem::datatype::simd; + +#ifdef KOKKOS_ENABLE_CUDA + constexpr int chunk_size = 32 * simd::size(); +#endif + +#ifdef KOKKOS_ENABLE_OPENMP + constexpr int chunk_size = 1 * simd::size(); +#endif + +#ifdef KOKKOS_ENABLE_SERIAL + constexpr int chunk_size = 1 * simd::size(); +#endif + + int nchunks = nspec / chunk_size; + int iloc = 0; + for (int ichunk = 0; ichunk < nchunks; ichunk++) { + for (int iz = 0; iz < ngllz; iz++) { + for (int ix = 0; ix < ngllx; ix++) { + for (int ielement = 0; ielement < chunk_size; ielement++) { + int ispec = ichunk * chunk_size + ielement; + const auto medium = element_types.get_medium_tag(ispec); + if (medium == MediumTag) { + const int index = index_mapping(ispec, iz, ix); // get global index + // increase the count only if the global index is not already + // counted + /// static_cast(medium::value) is the index of the medium in + /// the enum class + if (assembly_index_mapping(index) == -1) { + assembly_index_mapping(index) = count; + count++; + } } } } From 9f8e4a7eecace3f88c731540cb621ccc566022de Mon Sep 17 00:00:00 2001 From: Rohit Kakodkar Date: Mon, 17 Feb 2025 16:57:43 -0500 Subject: [PATCH 3/9] Fixed global numbering scheme --- src/compute/compute_mesh.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/compute/compute_mesh.cpp b/src/compute/compute_mesh.cpp index 6fefcd81..3631aa79 100644 --- a/src/compute/compute_mesh.cpp +++ b/src/compute/compute_mesh.cpp @@ -67,8 +67,8 @@ assign_numbering(specfem::kokkos::HostView4d global_coordinates) { int nchunks = nspec / chunk_size; int iloc = 0; for (int ichunk = 0; ichunk < nchunks; ichunk++) { - for (int ix = 0; ix < ngll; ix++) { - for (int iz = 0; iz < ngll; iz++) { + for (int iz = 0; iz < ngll; iz++) { + for (int ix = 0; ix < ngll; ix++) { for (int ielement = 0; ielement < chunk_size; ielement++) { int ispec = ichunk * chunk_size + ielement; cart_cord[iloc].x = global_coordinates(ispec, iz, ix, 0); @@ -128,8 +128,8 @@ assign_numbering(specfem::kokkos::HostView4d global_coordinates) { type_real zmax = std::numeric_limits::min(); for (int ichunk = 0; ichunk < nchunks; ichunk++) { - for (int ix = 0; ix < ngll; ix++) { - for (int iz = 0; iz < ngll; iz++) { + for (int iz = 0; iz < ngll; iz++) { + for (int ix = 0; ix < ngll; ix++) { for (int ielement = 0; ielement < chunk_size; ielement++) { int ispec = ichunk * chunk_size + ielement; if (iglob_counted[copy_cart_cord[iloc].iglob] == -1) { From 4a48af595df0fe74a0f0f2fb00544526607abac7 Mon Sep 17 00:00:00 2001 From: Rohit Kakodkar Date: Tue, 18 Feb 2025 08:08:34 -0500 Subject: [PATCH 4/9] Added no inline to gradient and divergence --- include/algorithms/divergence.hpp | 2 +- include/algorithms/gradient.hpp | 2 +- include/compute/fields/data_access.tpp | 4 +- .../impl/compute_stiffness_interaction.tpp | 278 +++++++++--------- 4 files changed, 144 insertions(+), 142 deletions(-) diff --git a/include/algorithms/divergence.hpp b/include/algorithms/divergence.hpp index 8fa92293..2ecc641e 100644 --- a/include/algorithms/divergence.hpp +++ b/include/algorithms/divergence.hpp @@ -41,7 +41,7 @@ namespace algorithms { template = 0> -KOKKOS_FORCEINLINE_FUNCTION void divergence( +NOINLINE KOKKOS_FUNCTION void divergence( const MemberType &team, const IteratorType &iterator, const specfem::compute::partial_derivatives &partial_derivatives, const Kokkos::View = 0> -KOKKOS_FORCEINLINE_FUNCTION void +NOINLINE KOKKOS_FUNCTION void gradient(const MemberType &team, const IteratorType &iterator, const specfem::compute::partial_derivatives &partial_derivatives, const QuadratureType &quadrature, const ViewType &f, const ViewType &g, diff --git a/include/compute/fields/data_access.tpp b/include/compute/fields/data_access.tpp index 1b976849..051c0764 100644 --- a/include/compute/fields/data_access.tpp +++ b/include/compute/fields/data_access.tpp @@ -1690,7 +1690,7 @@ template < typename WavefieldType, typename ViewType, typename std::enable_if_t< ViewType::isPointFieldType && !ViewType::simd::using_simd, int> = 0> -KOKKOS_FORCEINLINE_FUNCTION void impl_atomic_add_on_device( +NOINLINE KOKKOS_FUNCTION void impl_atomic_add_on_device( const specfem::point::index &index, const ViewType &point_field, const WavefieldType &field) { @@ -1707,7 +1707,7 @@ template < typename WavefieldType, typename ViewType, typename std::enable_if_t< ViewType::isPointFieldType && ViewType::simd::using_simd, int> = 0> -KOKKOS_FORCEINLINE_FUNCTION void impl_atomic_add_on_device( +NOINLINE KOKKOS_FUNCTION void impl_atomic_add_on_device( const specfem::point::simd_index &index, const ViewType &point_field, const WavefieldType &field) { diff --git a/include/kokkos_kernels/impl/compute_stiffness_interaction.tpp b/include/kokkos_kernels/impl/compute_stiffness_interaction.tpp index 29f927d2..18c357a4 100644 --- a/include/kokkos_kernels/impl/compute_stiffness_interaction.tpp +++ b/include/kokkos_kernels/impl/compute_stiffness_interaction.tpp @@ -105,151 +105,153 @@ void specfem::kokkos_kernels::impl::compute_stiffness_interaction( constexpr int simd_size = simd::size(); if constexpr (BoundaryTag == specfem::element::boundary_tag::stacey && - WavefieldType == specfem::wavefield::simulation_field::backward) { - - Kokkos::parallel_for( - "specfem::domain::impl::kernels::elements::compute_stiffness_" - "interaction", - static_cast(chunk_policy), - KOKKOS_LAMBDA(const typename ChunkPolicyType::member_type &team) { - for (int tile = 0; tile < ChunkPolicyType::tile_size * simd_size; - tile += ChunkPolicyType::chunk_size * simd_size) { - const int starting_element_index = - team.league_rank() * ChunkPolicyType::tile_size * simd_size + - tile; - - if (starting_element_index >= nelements) { - break; + WavefieldType == + specfem::wavefield::simulation_field::backward) { + + Kokkos::parallel_for( + "specfem::domain::impl::kernels::elements::compute_stiffness_" + "interaction", + static_cast( + chunk_policy), + KOKKOS_LAMBDA(const typename ChunkPolicyType::member_type &team) { + for (int tile = 0; tile < ChunkPolicyType::tile_size * simd_size; + tile += ChunkPolicyType::chunk_size * simd_size) { + const int starting_element_index = + team.league_rank() * ChunkPolicyType::tile_size * simd_size + + tile; + + if (starting_element_index >= nelements) { + break; + } + + const auto iterator = + chunk_policy.league_iterator(starting_element_index); + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, iterator.chunk_size()), + [&](const int i) { + const auto iterator_index = iterator(i); + const auto index = iterator_index.index; + + PointAccelerationType acceleration; + specfem::compute::load_on_device( + istep, index, boundary_values, acceleration); + + specfem::compute::atomic_add_on_device(index, acceleration, + field); + }); } + }); + } else { + + Kokkos::parallel_for( + "specfem::kernels::impl::domain_kernels::compute_stiffness_interaction", + chunk_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_size)), + KOKKOS_LAMBDA(const typename ChunkPolicyType::member_type &team) { + ChunkElementFieldType element_field(team); + ElementQuadratureType element_quadrature(team); + ChunkStressIntegrandType stress_integrand(team); + + specfem::compute::load_on_device(team, quadrature, + element_quadrature); + for (int tile = 0; tile < ChunkPolicyType::tile_size * simd_size; + tile += ChunkPolicyType::chunk_size * simd_size) { + const int starting_element_index = + team.league_rank() * ChunkPolicyType::tile_size * simd_size + + tile; + + if (starting_element_index >= nelements) { + break; + } + + const auto iterator = + chunk_policy.league_iterator(starting_element_index); + specfem::compute::load_on_device(team, iterator, field, + element_field); + + team.team_barrier(); + + specfem::algorithms::gradient( + team, iterator, partial_derivatives, + element_quadrature.hprime_gll, element_field.displacement, + // Compute stresses using the gradients + [&](const typename ChunkPolicyType::iterator_type::index_type + &iterator_index, + const typename PointFieldDerivativesType::ViewType &du) { + const auto &index = iterator_index.index; + + PointPartialDerivativesType point_partial_derivatives; + specfem::compute::load_on_device(index, partial_derivatives, + point_partial_derivatives); + + PointPropertyType point_property; + specfem::compute::load_on_device(index, properties, + point_property); + + PointFieldDerivativesType field_derivatives(du); + + const auto point_stress = specfem::medium::compute_stress( + point_property, field_derivatives); + + const auto F = point_stress * point_partial_derivatives; + + const int &ielement = iterator_index.ielement; + + for (int icomponent = 0; icomponent < components; + ++icomponent) { + for (int idim = 0; idim < num_dimensions; ++idim) { + stress_integrand.F(ielement, index.iz, index.ix, idim, + icomponent) = F(idim, icomponent); + } + } + }); + + team.team_barrier(); + + specfem::algorithms::divergence( + team, iterator, partial_derivatives, wgll, + element_quadrature.hprime_wgll, stress_integrand.F, + [&, istep = istep]( + const typename ChunkPolicyType::iterator_type::index_type + &iterator_index, + const typename PointAccelerationType::ViewType &result) { + const auto &index = iterator_index.index; + PointAccelerationType acceleration(result); + + for (int icomponent = 0; icomponent < components; + ++icomponent) { + acceleration.acceleration(icomponent) *= + static_cast(-1.0); + } - const auto iterator = - chunk_policy.league_iterator(starting_element_index); - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, iterator.chunk_size()), - [&](const int i) { - const auto iterator_index = iterator(i); - const auto index = iterator_index.index; - - PointAccelerationType acceleration; - specfem::compute::load_on_device( - istep, index, boundary_values, acceleration); - - specfem::compute::atomic_add_on_device(index, acceleration, - field); - }); - } - }); - } - else { - - Kokkos::parallel_for( - "specfem::kernels::impl::domain_kernels::compute_stiffness_interaction", - chunk_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_size)), - KOKKOS_LAMBDA(const typename ChunkPolicyType::member_type &team) { - ChunkElementFieldType element_field(team); - ElementQuadratureType element_quadrature(team); - ChunkStressIntegrandType stress_integrand(team); - - specfem::compute::load_on_device(team, quadrature, element_quadrature); - for (int tile = 0; tile < ChunkPolicyType::tile_size * simd_size; - tile += ChunkPolicyType::chunk_size * simd_size) { - const int starting_element_index = - team.league_rank() * ChunkPolicyType::tile_size * simd_size + - tile; - - if (starting_element_index >= nelements) { - break; - } - - const auto iterator = - chunk_policy.league_iterator(starting_element_index); - specfem::compute::load_on_device(team, iterator, field, - element_field); - - team.team_barrier(); - - specfem::algorithms::gradient( - team, iterator, partial_derivatives, - element_quadrature.hprime_gll, element_field.displacement, - // Compute stresses using the gradients - [&](const typename ChunkPolicyType::iterator_type::index_type - &iterator_index, - const typename PointFieldDerivativesType::ViewType &du) { - const auto &index = iterator_index.index; - - PointPartialDerivativesType point_partial_derivatives; - specfem::compute::load_on_device(index, partial_derivatives, - point_partial_derivatives); - - PointPropertyType point_property; - specfem::compute::load_on_device(index, properties, - point_property); - - PointFieldDerivativesType field_derivatives(du); + PointPropertyType point_property; + specfem::compute::load_on_device(index, properties, + point_property); - const auto point_stress = specfem::medium::compute_stress( - point_property, field_derivatives); + PointVelocityType velocity; + specfem::compute::load_on_device(index, field, velocity); - const auto F = point_stress * point_partial_derivatives; + PointBoundaryType point_boundary; + specfem::compute::load_on_device(index, boundaries, + point_boundary); - const int &ielement = iterator_index.ielement; + specfem::boundary_conditions::apply_boundary_conditions( + point_boundary, point_property, velocity, acceleration); - for (int icomponent = 0; icomponent < components; - ++icomponent) { - for (int idim = 0; idim < num_dimensions; ++idim) { - stress_integrand.F(ielement, index.iz, index.ix, idim, - icomponent) = F(idim, icomponent); + // Store forward boundary values for reconstruction during + // adjoint simulations. The function does nothing if the + // boundary tag is not stacey + if (wavefield == + specfem::wavefield::simulation_field::forward) { + specfem::compute::store_on_device( + istep, index, acceleration, boundary_values); } - } - }); - - team.team_barrier(); - - specfem::algorithms::divergence( - team, iterator, partial_derivatives, wgll, - element_quadrature.hprime_wgll, stress_integrand.F, - [&, istep = istep]( - const typename ChunkPolicyType::iterator_type::index_type - &iterator_index, - const typename PointAccelerationType::ViewType &result) { - const auto &index = iterator_index.index; - PointAccelerationType acceleration(result); - - for (int icomponent = 0; icomponent < components; - ++icomponent) { - acceleration.acceleration(icomponent) *= - static_cast(-1.0); - } - - PointPropertyType point_property; - specfem::compute::load_on_device(index, properties, - point_property); - - PointVelocityType velocity; - specfem::compute::load_on_device(index, field, velocity); - - PointBoundaryType point_boundary; - specfem::compute::load_on_device(index, boundaries, - point_boundary); - - specfem::boundary_conditions::apply_boundary_conditions( - point_boundary, point_property, velocity, acceleration); - - // Store forward boundary values for reconstruction during - // adjoint simulations. The function does nothing if the - // boundary tag is not stacey - if (wavefield == - specfem::wavefield::simulation_field::forward) { - specfem::compute::store_on_device(istep, index, acceleration, - boundary_values); - } - - specfem::compute::atomic_add_on_device(index, acceleration, - field); - }); - } - }); + + specfem::compute::atomic_add_on_device(index, acceleration, + field); + }); + } + }); } Kokkos::fence(); From 8f88ffffbbe3920a131dece8ceb26d7bb2754a96 Mon Sep 17 00:00:00 2001 From: Rohit Kakodkar Date: Tue, 18 Feb 2025 08:21:44 -0500 Subject: [PATCH 5/9] Added no inline to gradient function --- include/algorithms/gradient.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/algorithms/gradient.hpp b/include/algorithms/gradient.hpp index 6569b3d4..dcf57285 100644 --- a/include/algorithms/gradient.hpp +++ b/include/algorithms/gradient.hpp @@ -38,7 +38,7 @@ namespace algorithms { template = 0> -KOKKOS_FORCEINLINE_FUNCTION void +NOINLINE KOKKOS_FUNCTION void gradient(const MemberType &team, const IteratorType &iterator, const specfem::compute::partial_derivatives &partial_derivatives, const QuadratureType &quadrature, const ViewType &f, From 83b75d110b194bdc98ccca5ec2a55afd5dce8e7b Mon Sep 17 00:00:00 2001 From: Rohit Kakodkar Date: Tue, 18 Feb 2025 13:36:54 -0500 Subject: [PATCH 6/9] Chunk size fix for GPUs --- include/compute/fields/impl/field_impl.tpp | 10 +++------- src/compute/compute_mesh.cpp | 10 +++------- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/include/compute/fields/impl/field_impl.tpp b/include/compute/fields/impl/field_impl.tpp index c7d4e8ea..48abaf74 100644 --- a/include/compute/fields/impl/field_impl.tpp +++ b/include/compute/fields/impl/field_impl.tpp @@ -39,13 +39,9 @@ specfem::compute::impl::field_impl::field_impl( #ifdef KOKKOS_ENABLE_CUDA constexpr int chunk_size = 32 * simd::size(); -#endif - -#ifdef KOKKOS_ENABLE_OPENMP - constexpr int chunk_size = 1 * simd::size(); -#endif - -#ifdef KOKKOS_ENABLE_SERIAL +#elif KOKKOS_ENABLE_OPENMP + constexpr int chunk_size = 1; +#else constexpr int chunk_size = 1 * simd::size(); #endif diff --git a/src/compute/compute_mesh.cpp b/src/compute/compute_mesh.cpp index 3631aa79..e210ad05 100644 --- a/src/compute/compute_mesh.cpp +++ b/src/compute/compute_mesh.cpp @@ -54,13 +54,9 @@ assign_numbering(specfem::kokkos::HostView4d global_coordinates) { #ifdef KOKKOS_ENABLE_CUDA constexpr int chunk_size = 32 * simd::size(); -#endif - -#ifdef KOKKOS_ENABLE_OPENMP - constexpr int chunk_size = 1 * simd::size(); -#endif - -#ifdef KOKKOS_ENABLE_SERIAL +#elif KOKKOS_ENABLE_OPENMP + constexpr int chunk_size = 1; +#else constexpr int chunk_size = 1 * simd::size(); #endif From 2939aac6809a4b40a636cf9d8765dd3fadc12206 Mon Sep 17 00:00:00 2001 From: Rohit Kakodkar Date: Tue, 18 Feb 2025 14:40:14 -0500 Subject: [PATCH 7/9] Added storage chunk config to specfem::parallel_configuration namespace --- include/compute/fields/impl/field_impl.tpp | 10 ++-------- .../parallel_configuration/chunk_config.hpp | 18 ++++++++++++++++-- src/compute/compute_mesh.cpp | 9 ++------- 3 files changed, 20 insertions(+), 17 deletions(-) diff --git a/include/compute/fields/impl/field_impl.tpp b/include/compute/fields/impl/field_impl.tpp index 48abaf74..39ede278 100644 --- a/include/compute/fields/impl/field_impl.tpp +++ b/include/compute/fields/impl/field_impl.tpp @@ -3,6 +3,7 @@ #include "compute/element_types/element_types.hpp" #include "compute/fields/impl/field_impl.hpp" +#include "parallel_configuration/chunk_config.hpp" #include "kokkos_abstractions.h" #include @@ -37,14 +38,7 @@ specfem::compute::impl::field_impl::field_impl( int count = 0; using simd = specfem::datatype::simd; -#ifdef KOKKOS_ENABLE_CUDA - constexpr int chunk_size = 32 * simd::size(); -#elif KOKKOS_ENABLE_OPENMP - constexpr int chunk_size = 1; -#else - constexpr int chunk_size = 1 * simd::size(); -#endif - + constexpr int chunk_size = specfem::parallel_config::storage_chunk_size; int nchunks = nspec / chunk_size; int iloc = 0; for (int ichunk = 0; ichunk < nchunks; ichunk++) { diff --git a/include/parallel_configuration/chunk_config.hpp b/include/parallel_configuration/chunk_config.hpp index 41d151ba..161eb90c 100644 --- a/include/parallel_configuration/chunk_config.hpp +++ b/include/parallel_configuration/chunk_config.hpp @@ -6,6 +6,20 @@ namespace specfem { namespace parallel_config { + +#ifdef KOKKOS_ENABLE_CUDA +constexpr int chunk_size = 32; +constexpr int storage_chunk_size = chunk_size; +#elif KOKKOS_ENABLE_OPENMP +constexpr int chunk_size = 1; +constexpr int simd_size = specfem::datatypes::simd::size(); +constexpr int storage_chunk_size = chunk_size * simd_size; +#else +constexpr int chunk_size = 1; +constexpr int simd_size = specfem::datatypes::simd::size(); +constexpr int storage_chunk_size = chunk_size * simd_size; +#endif + /** * @brief Parallel configuration for chunk policy. * @@ -48,8 +62,8 @@ struct default_chunk_config; #ifdef KOKKOS_ENABLE_CUDA template struct default_chunk_config - : chunk_config {}; + : chunk_config {}; #endif #ifdef KOKKOS_ENABLE_OPENMP diff --git a/src/compute/compute_mesh.cpp b/src/compute/compute_mesh.cpp index e210ad05..55f6bf0a 100644 --- a/src/compute/compute_mesh.cpp +++ b/src/compute/compute_mesh.cpp @@ -3,6 +3,7 @@ #include "enumerations/material_definitions.hpp" #include "jacobian/interface.hpp" #include "kokkos_abstractions.h" +#include "parallel_configuration/chunk_config.hpp" #include "quadrature/interface.hpp" #include "specfem_setup.hpp" #include @@ -52,13 +53,7 @@ assign_numbering(specfem::kokkos::HostView4d global_coordinates) { using simd = specfem::datatype::simd; -#ifdef KOKKOS_ENABLE_CUDA - constexpr int chunk_size = 32 * simd::size(); -#elif KOKKOS_ENABLE_OPENMP - constexpr int chunk_size = 1; -#else - constexpr int chunk_size = 1 * simd::size(); -#endif + constexpr int chunk_size = specfem::parallel_config::storage_chunk_size; int nchunks = nspec / chunk_size; int iloc = 0; From e344fcedef97dbe99f5eb293727b62ad9e7c1be2 Mon Sep 17 00:00:00 2001 From: Rohit Kakodkar Date: Tue, 18 Feb 2025 14:44:44 -0500 Subject: [PATCH 8/9] Added chunk sizes to impl namespace --- .../parallel_configuration/chunk_config.hpp | 27 ++++++++++--------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/include/parallel_configuration/chunk_config.hpp b/include/parallel_configuration/chunk_config.hpp index 161eb90c..1a53760d 100644 --- a/include/parallel_configuration/chunk_config.hpp +++ b/include/parallel_configuration/chunk_config.hpp @@ -7,17 +7,20 @@ namespace specfem { namespace parallel_config { +namespace impl { +constexpr int cuda_chunk_size = 32; +constexpr int openmp_chunk_size = 1; +constexpr int serial_chunk_size = 1; +} // namespace impl + #ifdef KOKKOS_ENABLE_CUDA -constexpr int chunk_size = 32; -constexpr int storage_chunk_size = chunk_size; +constexpr int storage_chunk_size = impl::cuda_chunk_size; #elif KOKKOS_ENABLE_OPENMP -constexpr int chunk_size = 1; constexpr int simd_size = specfem::datatypes::simd::size(); -constexpr int storage_chunk_size = chunk_size * simd_size; +constexpr int storage_chunk_size = impl::openmp_chunk_size * simd_size; #else -constexpr int chunk_size = 1; constexpr int simd_size = specfem::datatypes::simd::size(); -constexpr int storage_chunk_size = chunk_size * simd_size; +constexpr int storage_chunk_size = impl::serial_chunk_size * simd_size; #endif /** @@ -62,24 +65,24 @@ struct default_chunk_config; #ifdef KOKKOS_ENABLE_CUDA template struct default_chunk_config - : chunk_config {}; + : chunk_config {}; #endif #ifdef KOKKOS_ENABLE_OPENMP template struct default_chunk_config - : chunk_config {}; + : chunk_config {}; #endif #ifdef KOKKOS_ENABLE_SERIAL template struct default_chunk_config - : chunk_config {}; + : chunk_config {}; #endif } // namespace parallel_config } // namespace specfem From 1ff5ffbdd31538dee539394af29a9d96054a8cd4 Mon Sep 17 00:00:00 2001 From: Rohit Kakodkar Date: Tue, 18 Feb 2025 15:25:10 -0500 Subject: [PATCH 9/9] Compilation fix --- include/parallel_configuration/chunk_config.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/parallel_configuration/chunk_config.hpp b/include/parallel_configuration/chunk_config.hpp index 1a53760d..51fd35b3 100644 --- a/include/parallel_configuration/chunk_config.hpp +++ b/include/parallel_configuration/chunk_config.hpp @@ -1,6 +1,7 @@ #pragma once #include "constants.hpp" +#include "datatypes/simd.hpp" #include "enumerations/dimension.hpp" #include @@ -16,10 +17,10 @@ constexpr int serial_chunk_size = 1; #ifdef KOKKOS_ENABLE_CUDA constexpr int storage_chunk_size = impl::cuda_chunk_size; #elif KOKKOS_ENABLE_OPENMP -constexpr int simd_size = specfem::datatypes::simd::size(); +constexpr int simd_size = specfem::datatype::simd::size(); constexpr int storage_chunk_size = impl::openmp_chunk_size * simd_size; #else -constexpr int simd_size = specfem::datatypes::simd::size(); +constexpr int simd_size = specfem::datatype::simd::size(); constexpr int storage_chunk_size = impl::serial_chunk_size * simd_size; #endif