diff --git a/include/compute/fields/impl/field_impl.tpp b/include/compute/fields/impl/field_impl.tpp index ebb9db43a..39ede2786 100644 --- a/include/compute/fields/impl/field_impl.tpp +++ b/include/compute/fields/impl/field_impl.tpp @@ -1,8 +1,9 @@ #ifndef _COMPUTE_FIELDS_IMPL_FIELD_IMPL_TPP_ #define _COMPUTE_FIELDS_IMPL_FIELD_IMPL_TPP_ -#include "compute/fields/impl/field_impl.hpp" #include "compute/element_types/element_types.hpp" +#include "compute/fields/impl/field_impl.hpp" +#include "parallel_configuration/chunk_config.hpp" #include "kokkos_abstractions.h" #include @@ -35,19 +36,27 @@ specfem::compute::impl::field_impl::field_impl( // Count the total number of distinct global indices for the medium int count = 0; - - for (int ix = 0; ix < ngllx; ++ix) { - for (int iz = 0; iz < ngllz; ++iz) { - for (int ispec = 0; ispec < nspec; ++ispec) { - const auto medium = element_types.get_medium_tag(ispec); - if (medium == MediumTag) { - const int index = index_mapping(ispec, iz, ix); // get global index - // increase the count only if the global index is not already counted - /// static_cast(medium::value) is the index of the medium in the - /// enum class - if (assembly_index_mapping(index) == -1) { - assembly_index_mapping(index) = count; - count++; + using simd = specfem::datatype::simd; + + constexpr int chunk_size = specfem::parallel_config::storage_chunk_size; + int nchunks = nspec / chunk_size; + int iloc = 0; + for (int ichunk = 0; ichunk < nchunks; ichunk++) { + for (int iz = 0; iz < ngllz; iz++) { + for (int ix = 0; ix < ngllx; ix++) { + for (int ielement = 0; ielement < chunk_size; ielement++) { + int ispec = ichunk * chunk_size + ielement; + const auto medium = element_types.get_medium_tag(ispec); + if (medium == MediumTag) { + const int index = index_mapping(ispec, iz, ix); // get global index + // increase the count only if the global index is not already + // counted + /// static_cast(medium::value) is the index of the medium in + /// the enum class + if (assembly_index_mapping(index) == -1) { + assembly_index_mapping(index) = count; + count++; + } } } } diff --git a/include/kokkos_kernels/impl/compute_stiffness_interaction.tpp b/include/kokkos_kernels/impl/compute_stiffness_interaction.tpp index 29f927d20..18c357a44 100644 --- a/include/kokkos_kernels/impl/compute_stiffness_interaction.tpp +++ b/include/kokkos_kernels/impl/compute_stiffness_interaction.tpp @@ -105,151 +105,153 @@ void specfem::kokkos_kernels::impl::compute_stiffness_interaction( constexpr int simd_size = simd::size(); if constexpr (BoundaryTag == specfem::element::boundary_tag::stacey && - WavefieldType == specfem::wavefield::simulation_field::backward) { - - Kokkos::parallel_for( - "specfem::domain::impl::kernels::elements::compute_stiffness_" - "interaction", - static_cast(chunk_policy), - KOKKOS_LAMBDA(const typename ChunkPolicyType::member_type &team) { - for (int tile = 0; tile < ChunkPolicyType::tile_size * simd_size; - tile += ChunkPolicyType::chunk_size * simd_size) { - const int starting_element_index = - team.league_rank() * ChunkPolicyType::tile_size * simd_size + - tile; - - if (starting_element_index >= nelements) { - break; + WavefieldType == + specfem::wavefield::simulation_field::backward) { + + Kokkos::parallel_for( + "specfem::domain::impl::kernels::elements::compute_stiffness_" + "interaction", + static_cast( + chunk_policy), + KOKKOS_LAMBDA(const typename ChunkPolicyType::member_type &team) { + for (int tile = 0; tile < ChunkPolicyType::tile_size * simd_size; + tile += ChunkPolicyType::chunk_size * simd_size) { + const int starting_element_index = + team.league_rank() * ChunkPolicyType::tile_size * simd_size + + tile; + + if (starting_element_index >= nelements) { + break; + } + + const auto iterator = + chunk_policy.league_iterator(starting_element_index); + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, iterator.chunk_size()), + [&](const int i) { + const auto iterator_index = iterator(i); + const auto index = iterator_index.index; + + PointAccelerationType acceleration; + specfem::compute::load_on_device( + istep, index, boundary_values, acceleration); + + specfem::compute::atomic_add_on_device(index, acceleration, + field); + }); } + }); + } else { + + Kokkos::parallel_for( + "specfem::kernels::impl::domain_kernels::compute_stiffness_interaction", + chunk_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_size)), + KOKKOS_LAMBDA(const typename ChunkPolicyType::member_type &team) { + ChunkElementFieldType element_field(team); + ElementQuadratureType element_quadrature(team); + ChunkStressIntegrandType stress_integrand(team); + + specfem::compute::load_on_device(team, quadrature, + element_quadrature); + for (int tile = 0; tile < ChunkPolicyType::tile_size * simd_size; + tile += ChunkPolicyType::chunk_size * simd_size) { + const int starting_element_index = + team.league_rank() * ChunkPolicyType::tile_size * simd_size + + tile; + + if (starting_element_index >= nelements) { + break; + } + + const auto iterator = + chunk_policy.league_iterator(starting_element_index); + specfem::compute::load_on_device(team, iterator, field, + element_field); + + team.team_barrier(); + + specfem::algorithms::gradient( + team, iterator, partial_derivatives, + element_quadrature.hprime_gll, element_field.displacement, + // Compute stresses using the gradients + [&](const typename ChunkPolicyType::iterator_type::index_type + &iterator_index, + const typename PointFieldDerivativesType::ViewType &du) { + const auto &index = iterator_index.index; + + PointPartialDerivativesType point_partial_derivatives; + specfem::compute::load_on_device(index, partial_derivatives, + point_partial_derivatives); + + PointPropertyType point_property; + specfem::compute::load_on_device(index, properties, + point_property); + + PointFieldDerivativesType field_derivatives(du); + + const auto point_stress = specfem::medium::compute_stress( + point_property, field_derivatives); + + const auto F = point_stress * point_partial_derivatives; + + const int &ielement = iterator_index.ielement; + + for (int icomponent = 0; icomponent < components; + ++icomponent) { + for (int idim = 0; idim < num_dimensions; ++idim) { + stress_integrand.F(ielement, index.iz, index.ix, idim, + icomponent) = F(idim, icomponent); + } + } + }); + + team.team_barrier(); + + specfem::algorithms::divergence( + team, iterator, partial_derivatives, wgll, + element_quadrature.hprime_wgll, stress_integrand.F, + [&, istep = istep]( + const typename ChunkPolicyType::iterator_type::index_type + &iterator_index, + const typename PointAccelerationType::ViewType &result) { + const auto &index = iterator_index.index; + PointAccelerationType acceleration(result); + + for (int icomponent = 0; icomponent < components; + ++icomponent) { + acceleration.acceleration(icomponent) *= + static_cast(-1.0); + } - const auto iterator = - chunk_policy.league_iterator(starting_element_index); - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, iterator.chunk_size()), - [&](const int i) { - const auto iterator_index = iterator(i); - const auto index = iterator_index.index; - - PointAccelerationType acceleration; - specfem::compute::load_on_device( - istep, index, boundary_values, acceleration); - - specfem::compute::atomic_add_on_device(index, acceleration, - field); - }); - } - }); - } - else { - - Kokkos::parallel_for( - "specfem::kernels::impl::domain_kernels::compute_stiffness_interaction", - chunk_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_size)), - KOKKOS_LAMBDA(const typename ChunkPolicyType::member_type &team) { - ChunkElementFieldType element_field(team); - ElementQuadratureType element_quadrature(team); - ChunkStressIntegrandType stress_integrand(team); - - specfem::compute::load_on_device(team, quadrature, element_quadrature); - for (int tile = 0; tile < ChunkPolicyType::tile_size * simd_size; - tile += ChunkPolicyType::chunk_size * simd_size) { - const int starting_element_index = - team.league_rank() * ChunkPolicyType::tile_size * simd_size + - tile; - - if (starting_element_index >= nelements) { - break; - } - - const auto iterator = - chunk_policy.league_iterator(starting_element_index); - specfem::compute::load_on_device(team, iterator, field, - element_field); - - team.team_barrier(); - - specfem::algorithms::gradient( - team, iterator, partial_derivatives, - element_quadrature.hprime_gll, element_field.displacement, - // Compute stresses using the gradients - [&](const typename ChunkPolicyType::iterator_type::index_type - &iterator_index, - const typename PointFieldDerivativesType::ViewType &du) { - const auto &index = iterator_index.index; - - PointPartialDerivativesType point_partial_derivatives; - specfem::compute::load_on_device(index, partial_derivatives, - point_partial_derivatives); - - PointPropertyType point_property; - specfem::compute::load_on_device(index, properties, - point_property); - - PointFieldDerivativesType field_derivatives(du); + PointPropertyType point_property; + specfem::compute::load_on_device(index, properties, + point_property); - const auto point_stress = specfem::medium::compute_stress( - point_property, field_derivatives); + PointVelocityType velocity; + specfem::compute::load_on_device(index, field, velocity); - const auto F = point_stress * point_partial_derivatives; + PointBoundaryType point_boundary; + specfem::compute::load_on_device(index, boundaries, + point_boundary); - const int &ielement = iterator_index.ielement; + specfem::boundary_conditions::apply_boundary_conditions( + point_boundary, point_property, velocity, acceleration); - for (int icomponent = 0; icomponent < components; - ++icomponent) { - for (int idim = 0; idim < num_dimensions; ++idim) { - stress_integrand.F(ielement, index.iz, index.ix, idim, - icomponent) = F(idim, icomponent); + // Store forward boundary values for reconstruction during + // adjoint simulations. The function does nothing if the + // boundary tag is not stacey + if (wavefield == + specfem::wavefield::simulation_field::forward) { + specfem::compute::store_on_device( + istep, index, acceleration, boundary_values); } - } - }); - - team.team_barrier(); - - specfem::algorithms::divergence( - team, iterator, partial_derivatives, wgll, - element_quadrature.hprime_wgll, stress_integrand.F, - [&, istep = istep]( - const typename ChunkPolicyType::iterator_type::index_type - &iterator_index, - const typename PointAccelerationType::ViewType &result) { - const auto &index = iterator_index.index; - PointAccelerationType acceleration(result); - - for (int icomponent = 0; icomponent < components; - ++icomponent) { - acceleration.acceleration(icomponent) *= - static_cast(-1.0); - } - - PointPropertyType point_property; - specfem::compute::load_on_device(index, properties, - point_property); - - PointVelocityType velocity; - specfem::compute::load_on_device(index, field, velocity); - - PointBoundaryType point_boundary; - specfem::compute::load_on_device(index, boundaries, - point_boundary); - - specfem::boundary_conditions::apply_boundary_conditions( - point_boundary, point_property, velocity, acceleration); - - // Store forward boundary values for reconstruction during - // adjoint simulations. The function does nothing if the - // boundary tag is not stacey - if (wavefield == - specfem::wavefield::simulation_field::forward) { - specfem::compute::store_on_device(istep, index, acceleration, - boundary_values); - } - - specfem::compute::atomic_add_on_device(index, acceleration, - field); - }); - } - }); + + specfem::compute::atomic_add_on_device(index, acceleration, + field); + }); + } + }); } Kokkos::fence(); diff --git a/include/parallel_configuration/chunk_config.hpp b/include/parallel_configuration/chunk_config.hpp index 41d151ba1..51fd35b32 100644 --- a/include/parallel_configuration/chunk_config.hpp +++ b/include/parallel_configuration/chunk_config.hpp @@ -1,11 +1,29 @@ #pragma once #include "constants.hpp" +#include "datatypes/simd.hpp" #include "enumerations/dimension.hpp" #include namespace specfem { namespace parallel_config { + +namespace impl { +constexpr int cuda_chunk_size = 32; +constexpr int openmp_chunk_size = 1; +constexpr int serial_chunk_size = 1; +} // namespace impl + +#ifdef KOKKOS_ENABLE_CUDA +constexpr int storage_chunk_size = impl::cuda_chunk_size; +#elif KOKKOS_ENABLE_OPENMP +constexpr int simd_size = specfem::datatype::simd::size(); +constexpr int storage_chunk_size = impl::openmp_chunk_size * simd_size; +#else +constexpr int simd_size = specfem::datatype::simd::size(); +constexpr int storage_chunk_size = impl::serial_chunk_size * simd_size; +#endif + /** * @brief Parallel configuration for chunk policy. * @@ -48,24 +66,24 @@ struct default_chunk_config; #ifdef KOKKOS_ENABLE_CUDA template struct default_chunk_config - : chunk_config {}; + : chunk_config {}; #endif #ifdef KOKKOS_ENABLE_OPENMP template struct default_chunk_config - : chunk_config {}; + : chunk_config {}; #endif #ifdef KOKKOS_ENABLE_SERIAL template struct default_chunk_config - : chunk_config {}; + : chunk_config {}; #endif } // namespace parallel_config } // namespace specfem diff --git a/src/compute/compute_mesh.cpp b/src/compute/compute_mesh.cpp index 1885333b4..55f6bf0ac 100644 --- a/src/compute/compute_mesh.cpp +++ b/src/compute/compute_mesh.cpp @@ -3,6 +3,7 @@ #include "enumerations/material_definitions.hpp" #include "jacobian/interface.hpp" #include "kokkos_abstractions.h" +#include "parallel_configuration/chunk_config.hpp" #include "quadrature/interface.hpp" #include "specfem_setup.hpp" #include @@ -50,13 +51,22 @@ assign_numbering(specfem::kokkos::HostView4d global_coordinates) { std::vector cart_cord(nspec * ngllxz); - for (int ispec = 0; ispec < nspec; ispec++) { + using simd = specfem::datatype::simd; + + constexpr int chunk_size = specfem::parallel_config::storage_chunk_size; + + int nchunks = nspec / chunk_size; + int iloc = 0; + for (int ichunk = 0; ichunk < nchunks; ichunk++) { for (int iz = 0; iz < ngll; iz++) { for (int ix = 0; ix < ngll; ix++) { - int iloc = ix * nspec * ngll + iz * nspec + ispec; - cart_cord[iloc].x = global_coordinates(ispec, iz, ix, 0); - cart_cord[iloc].z = global_coordinates(ispec, iz, ix, 1); - cart_cord[iloc].iloc = iloc; + for (int ielement = 0; ielement < chunk_size; ielement++) { + int ispec = ichunk * chunk_size + ielement; + cart_cord[iloc].x = global_coordinates(ispec, iz, ix, 0); + cart_cord[iloc].z = global_coordinates(ispec, iz, ix, 1); + cart_cord[iloc].iloc = iloc; + iloc++; + } } } } @@ -101,40 +111,44 @@ assign_numbering(specfem::kokkos::HostView4d global_coordinates) { // Assign numbering to corresponding ispec, iz, ix std::vector iglob_counted(nglob, -1); - int iloc = 0; + iloc = 0; int inum = 0; type_real xmin = std::numeric_limits::max(); type_real xmax = std::numeric_limits::min(); type_real zmin = std::numeric_limits::max(); type_real zmax = std::numeric_limits::min(); - for (int ix = 0; ix < ngll; ix++) { + + for (int ichunk = 0; ichunk < nchunks; ichunk++) { for (int iz = 0; iz < ngll; iz++) { - for (int ispec = 0; ispec < nspec; ispec++) { - if (iglob_counted[copy_cart_cord[iloc].iglob] == -1) { - - const type_real x_cor = copy_cart_cord[iloc].x; - const type_real z_cor = copy_cart_cord[iloc].z; - if (xmin > x_cor) - xmin = x_cor; - if (zmin > z_cor) - zmin = z_cor; - if (xmax < x_cor) - xmax = x_cor; - if (zmax < z_cor) - zmax = z_cor; - - iglob_counted[copy_cart_cord[iloc].iglob] = inum; - points.h_index_mapping(ispec, iz, ix) = inum; - points.h_coord(0, ispec, iz, ix) = x_cor; - points.h_coord(1, ispec, iz, ix) = z_cor; - inum++; - } else { - points.h_index_mapping(ispec, iz, ix) = - iglob_counted[copy_cart_cord[iloc].iglob]; - points.h_coord(0, ispec, iz, ix) = copy_cart_cord[iloc].x; - points.h_coord(1, ispec, iz, ix) = copy_cart_cord[iloc].z; + for (int ix = 0; ix < ngll; ix++) { + for (int ielement = 0; ielement < chunk_size; ielement++) { + int ispec = ichunk * chunk_size + ielement; + if (iglob_counted[copy_cart_cord[iloc].iglob] == -1) { + + const type_real x_cor = copy_cart_cord[iloc].x; + const type_real z_cor = copy_cart_cord[iloc].z; + if (xmin > x_cor) + xmin = x_cor; + if (zmin > z_cor) + zmin = z_cor; + if (xmax < x_cor) + xmax = x_cor; + if (zmax < z_cor) + zmax = z_cor; + + iglob_counted[copy_cart_cord[iloc].iglob] = inum; + points.h_index_mapping(ispec, iz, ix) = inum; + points.h_coord(0, ispec, iz, ix) = x_cor; + points.h_coord(1, ispec, iz, ix) = z_cor; + inum++; + } else { + points.h_index_mapping(ispec, iz, ix) = + iglob_counted[copy_cart_cord[iloc].iglob]; + points.h_coord(0, ispec, iz, ix) = copy_cart_cord[iloc].x; + points.h_coord(1, ispec, iz, ix) = copy_cart_cord[iloc].z; + } + iloc++; } - iloc++; } } }