From 3b743429b165dbf8a6fd12c9c2bec6a6115a9a1f Mon Sep 17 00:00:00 2001 From: Thomas Padioleau Date: Mon, 4 Sep 2023 14:37:47 +0200 Subject: [PATCH] Generalize serial_host to serial --- examples/heat_equation.cpp | 21 ++- examples/heat_equation_spectral.cpp | 17 ++- include/ddc/deepcopy.hpp | 54 +++++++- include/ddc/fill.hpp | 46 ++++++- include/ddc/for_each.hpp | 70 +++++----- include/ddc/transform_reduce.hpp | 26 ++-- tests/CMakeLists.txt | 1 + tests/fft.hpp | 200 ++++++++++++++++++++++++++++ tests/for_each.cpp | 8 +- tests/nested_algorithms.cpp | 175 ++++++++++++++++++++++++ tests/relocatable_device_code.cpp | 2 +- tests/transform_reduce.cpp | 4 +- 12 files changed, 555 insertions(+), 69 deletions(-) create mode 100644 tests/fft.hpp create mode 100644 tests/nested_algorithms.cpp diff --git a/examples/heat_equation.cpp b/examples/heat_equation.cpp index 8aa8b7960..300f8ed6a 100644 --- a/examples/heat_equation.cpp +++ b/examples/heat_equation.cpp @@ -59,7 +59,7 @@ void display(double time, ChunkType temp) std::cout << " * temperature[y:" << ddc::get_domain(temp).size() / 2 << "] = {"; ddc::for_each( - ddc::policies::serial_host, + ddc::policies::serial, ddc::get_domain(temp), [=](ddc::DiscreteElement const ix) { std::cout << std::setw(6) << temp_slice(ix); @@ -239,7 +239,10 @@ int main(int argc, char** argv) //! [initial output] // display the initial data - ddc::deepcopy(ghosted_temp, ghosted_last_temp); + ddc::deepcopy( + ddc::policies::parallel_device, + ghosted_temp, + ghosted_last_temp); display(ddc::coordinate(time_domain.front()), ghosted_temp[x_domain][y_domain]); // time of the iteration where the last output happened @@ -254,15 +257,19 @@ int main(int argc, char** argv) //! [boundary conditions] // Periodic boundary conditions ddc::deepcopy( + ddc::policies::parallel_device, ghosted_last_temp[x_pre_ghost][y_domain], ghosted_last_temp[y_domain][x_domain_end]); ddc::deepcopy( + ddc::policies::parallel_device, ghosted_last_temp[y_domain][x_post_ghost], ghosted_last_temp[y_domain][x_domain_begin]); ddc::deepcopy( + ddc::policies::parallel_device, ghosted_last_temp[x_domain][y_pre_ghost], ghosted_last_temp[x_domain][y_domain_end]); ddc::deepcopy( + ddc::policies::parallel_device, ghosted_last_temp[x_domain][y_post_ghost], ghosted_last_temp[x_domain][y_domain_begin]); //! [boundary conditions] @@ -312,7 +319,10 @@ int main(int argc, char** argv) //! [output] if (iter - last_output >= t_output_period) { last_output = iter; - ddc::deepcopy(ghosted_temp, ghosted_last_temp); + ddc::deepcopy( + ddc::policies::parallel_device, + ghosted_temp, + ghosted_last_temp); display(ddc::coordinate(iter), ghosted_temp[x_domain][y_domain]); } @@ -326,7 +336,10 @@ int main(int argc, char** argv) //! [final output] if (last_output < time_domain.back()) { - ddc::deepcopy(ghosted_temp, ghosted_last_temp); + ddc::deepcopy( + ddc::policies::parallel_device, + ghosted_temp, + ghosted_last_temp); display(ddc::coordinate(time_domain.back()), ghosted_temp[x_domain][y_domain]); } diff --git a/examples/heat_equation_spectral.cpp b/examples/heat_equation_spectral.cpp index eb0517e66..c3398228d 100644 --- a/examples/heat_equation_spectral.cpp +++ b/examples/heat_equation_spectral.cpp @@ -60,7 +60,7 @@ void display(double time, ChunkType temp) std::cout << " * temperature[y:" << ddc::get_domain(temp).size() / 2 << "] = {"; ddc::for_each( - ddc::policies::serial_host, + ddc::policies::serial, ddc::get_domain(temp), [=](ddc::DiscreteElement const ix) { std::cout << std::setw(6) << temp_slice(ix); @@ -203,7 +203,10 @@ int main(int argc, char** argv) //! [initial output] // display the initial data - ddc::deepcopy(_host_temp, _last_temp); + ddc::deepcopy( + ddc::policies::parallel_device, + _host_temp, + _last_temp); display(ddc::coordinate(time_domain.front()), _host_temp[x_domain][y_domain]); // time of the iteration where the last output happened @@ -277,7 +280,10 @@ int main(int argc, char** argv) //! [output] if (iter - last_output >= t_output_period) { last_output = iter; - ddc::deepcopy(_host_temp, _last_temp); + ddc::deepcopy( + ddc::policies::parallel_device, + _host_temp, + _last_temp); display(ddc::coordinate(iter), _host_temp[x_domain][y_domain]); } @@ -291,7 +297,10 @@ int main(int argc, char** argv) //! [final output] if (last_output < time_domain.back()) { - ddc::deepcopy(_host_temp, _last_temp); + ddc::deepcopy( + ddc::policies::parallel_device, + _host_temp, + _last_temp); display(ddc::coordinate(time_domain.back()), _host_temp[x_domain][y_domain]); } diff --git a/include/ddc/deepcopy.hpp b/include/ddc/deepcopy.hpp index d4e468d22..ba898c051 100644 --- a/include/ddc/deepcopy.hpp +++ b/include/ddc/deepcopy.hpp @@ -7,6 +7,7 @@ #include #include "ddc/chunk_span.hpp" +#include "ddc/for_each.hpp" namespace ddc { @@ -16,7 +17,7 @@ namespace ddc { * @return dst as a ChunkSpan */ template -auto deepcopy(ChunkDst&& dst, ChunkSrc&& src) +auto deepcopy(parallel_host_policy, ChunkDst&& dst, ChunkSrc&& src) { static_assert(is_borrowed_chunk_v); static_assert(is_borrowed_chunk_v); @@ -28,4 +29,55 @@ auto deepcopy(ChunkDst&& dst, ChunkSrc&& src) return dst.span_view(); } +/** Copy the content of a borrowed chunk into another + * @param[out] dst the borrowed chunk in which to copy + * @param[in] src the borrowed chunk from which to copy + * @return dst as a ChunkSpan +*/ +template +auto deepcopy(parallel_device_policy, ChunkDst&& dst, ChunkSrc&& src) +{ + return deepcopy( + policies::parallel_host, + std::forward(dst), + std::forward(src)); +} + +/** Copy the content of a borrowed chunk into another + * @param[out] dst the borrowed chunk in which to copy + * @param[in] src the borrowed chunk from which to copy + * @return dst as a ChunkSpan +*/ +template +constexpr auto deepcopy(serial_policy, ChunkDst&& dst, ChunkSrc&& src) +{ + static_assert(is_borrowed_chunk_v); + static_assert(is_borrowed_chunk_v); + static_assert( + std::is_assignable_v, chunk_reference_t>, + "Not assignable"); + assert(dst.domain().extents() == src.domain().extents()); + KOKKOS_ENSURES((Kokkos::SpaceAccessibility< + DDC_CURRENT_KOKKOS_SPACE, + typename std::remove_cv_t>::memory_space>:: + accessible)); + KOKKOS_ENSURES((Kokkos::SpaceAccessibility< + DDC_CURRENT_KOKKOS_SPACE, + typename std::remove_cv_t>::memory_space>:: + accessible)); + for_each(policies::serial, dst.domain(), [&](auto elem) { dst(elem) = src(elem); }); + return dst.span_view(); +} + +/** Copy the content of a borrowed chunk into another + * @param[out] dst the borrowed chunk in which to copy + * @param[in] src the borrowed chunk from which to copy + * @return dst as a ChunkSpan +*/ +template +constexpr auto deepcopy(ChunkDst&& dst, ChunkSrc&& src) +{ + return deepcopy(policies::serial, std::forward(dst), std::forward(src)); +} + } // namespace ddc diff --git a/include/ddc/fill.hpp b/include/ddc/fill.hpp index 513392d28..a92809855 100644 --- a/include/ddc/fill.hpp +++ b/include/ddc/fill.hpp @@ -3,10 +3,14 @@ #pragma once #include +#include +#include #include #include "ddc/chunk_span.hpp" +#include "ddc/detail/macros.hpp" +#include "ddc/for_each.hpp" namespace ddc { @@ -16,7 +20,7 @@ namespace ddc { * @return dst as a ChunkSpan */ template -auto fill(ChunkDst&& dst, T const& value) +auto fill(parallel_host_policy, ChunkDst&& dst, T const& value) { static_assert(is_borrowed_chunk_v); static_assert(std::is_assignable_v, T>, "Not assignable"); @@ -24,4 +28,44 @@ auto fill(ChunkDst&& dst, T const& value) return dst.span_view(); } +/** Fill a borrowed chunk with a given value + * @param[out] dst the borrowed chunk in which to copy + * @param[in] value the value to fill `dst` + * @return dst as a ChunkSpan + */ +template +auto fill(parallel_device_policy, ChunkDst&& dst, T const& value) +{ + return fill(ddc::policies::parallel_host, std::forward(dst), value); +} + +/** Fill a borrowed chunk with a given value + * @param[out] dst the borrowed chunk in which to copy + * @param[in] value the value to fill `dst` + * @return dst as a ChunkSpan + */ +template +constexpr auto fill(serial_policy, ChunkDst&& dst, T const& value) +{ + static_assert(is_borrowed_chunk_v); + static_assert(std::is_assignable_v, T>, "Not assignable"); + KOKKOS_ENSURES((Kokkos::SpaceAccessibility< + DDC_CURRENT_KOKKOS_SPACE, + typename std::remove_cv_t>::memory_space>:: + accessible)); + for_each(policies::serial, dst.domain(), [&](auto elem) { dst(elem) = value; }); + return dst.span_view(); +} + +/** Fill a borrowed chunk with a given value + * @param[out] dst the borrowed chunk in which to copy + * @param[in] value the value to fill `dst` + * @return dst as a ChunkSpan + */ +template +constexpr auto fill(ChunkDst&& dst, T const& value) +{ + return fill(policies::serial, std::forward(dst), value); +} + } // namespace ddc diff --git a/include/ddc/for_each.hpp b/include/ddc/for_each.hpp index cf27f4352..c527644ee 100644 --- a/include/ddc/for_each.hpp +++ b/include/ddc/for_each.hpp @@ -20,7 +20,7 @@ namespace ddc { namespace detail { template -class ForEachKokkosLambdaAdapter +class ForEachKokkosAdapter { template using index_type = std::size_t; @@ -28,7 +28,7 @@ class ForEachKokkosLambdaAdapter F m_f; public: - ForEachKokkosLambdaAdapter(F const& f) : m_f(f) {} + explicit ForEachKokkosAdapter(F const& f) : m_f(f) {} template = true> KOKKOS_IMPL_FORCEINLINE void operator()([[maybe_unused]] index_type unused_id) const @@ -75,7 +75,7 @@ inline void for_each_kokkos( } template -inline void for_each_kokkos(DiscreteDomain const& domain, Functor const& f) noexcept +void for_each_kokkos(DiscreteDomain const& domain, Functor const& f) noexcept { DiscreteElement const ddc_begin = domain.front(); DiscreteElement const ddc_end = domain.front() + domain.extents(); @@ -84,18 +84,16 @@ inline void for_each_kokkos(DiscreteDomain const& domain, Functor const& if constexpr (need_annotated_operator()) { Kokkos::parallel_for( Kokkos::RangePolicy(begin, end), - ForEachKokkosLambdaAdapter(f)); + ForEachKokkosAdapter(f)); } else { Kokkos::parallel_for( Kokkos::RangePolicy(begin, end), - ForEachKokkosLambdaAdapter(f)); + ForEachKokkosAdapter(f)); } } template -inline void for_each_kokkos( - DiscreteDomain const& domain, - Functor&& f) noexcept +void for_each_kokkos(DiscreteDomain const& domain, Functor&& f) noexcept { DiscreteElement const ddc_begin = domain.front(); DiscreteElement const ddc_end = domain.front() + domain.extents(); @@ -114,7 +112,7 @@ inline void for_each_kokkos( Kokkos::Iterate::Right, Kokkos::Iterate::Right>, use_annotated_operator>(begin, end), - ForEachKokkosLambdaAdapter(f)); + ForEachKokkosAdapter(f)); } else { Kokkos::parallel_for( Kokkos::MDRangePolicy< @@ -123,18 +121,18 @@ inline void for_each_kokkos( 2 + sizeof...(DDims), Kokkos::Iterate::Right, Kokkos::Iterate::Right>>(begin, end), - ForEachKokkosLambdaAdapter(f)); + ForEachKokkosAdapter(f)); } } template -inline void for_each_serial( +constexpr KOKKOS_IMPL_FORCEINLINE void for_each_serial( std::array const& begin, std::array const& end, Functor const& f, Is const&... is) noexcept { - static constexpr std::size_t I = sizeof...(Is); + constexpr std::size_t I = sizeof...(Is); if constexpr (I == N) { f(RetType(is...)); } else { @@ -147,7 +145,7 @@ inline void for_each_serial( } // namespace detail /// Serial execution on the host -struct serial_host_policy +struct serial_policy { }; @@ -156,10 +154,7 @@ struct serial_host_policy * @param[in] f a functor taking an index as parameter */ template -inline void for_each( - serial_host_policy, - DiscreteDomain const& domain, - Functor&& f) noexcept +constexpr void for_each(serial_policy, DiscreteDomain const& domain, Functor&& f) noexcept { DiscreteElement const ddc_begin = domain.front(); DiscreteElement const ddc_end = domain.front() + domain.extents(); @@ -173,8 +168,8 @@ inline void for_each( * @param[in] f a functor taking an index as parameter */ template -inline void for_each_n( - serial_host_policy, +constexpr void for_each_n( + serial_policy, DiscreteVector const& extent, Functor&& f) noexcept { @@ -195,10 +190,7 @@ struct parallel_host_policy * @param[in] f a functor taking an index as parameter */ template -inline void for_each( - parallel_host_policy, - DiscreteDomain const& domain, - Functor&& f) noexcept +void for_each(parallel_host_policy, DiscreteDomain const& domain, Functor&& f) noexcept { detail::for_each_kokkos(domain, std::forward(f)); } @@ -213,19 +205,16 @@ struct parallel_device_policy * @param[in] f a functor taking an index as parameter */ template -inline void for_each( - parallel_device_policy, - DiscreteDomain const& domain, - Functor&& f) noexcept +void for_each(parallel_device_policy, DiscreteDomain const& domain, Functor&& f) noexcept { detail::for_each_kokkos(domain, std::forward(f)); } -using default_policy = serial_host_policy; +using default_policy = serial_policy; namespace policies { -inline constexpr serial_host_policy serial_host; +inline constexpr serial_policy serial; inline constexpr parallel_host_policy parallel_host; inline constexpr parallel_device_policy parallel_device; @@ -252,9 +241,9 @@ constexpr auto policy([[maybe_unused]] ExecSpace exec_space) * @param[in] f a functor taking an index as parameter */ template -inline void for_each(DiscreteDomain const& domain, Functor&& f) noexcept +constexpr void for_each(DiscreteDomain const& domain, Functor&& f) noexcept { - for_each(default_policy(), domain, std::forward(f)); + for_each(policies::serial, domain, std::forward(f)); } /** iterates over a nD extent using the default execution policy @@ -262,9 +251,9 @@ inline void for_each(DiscreteDomain const& domain, Functor&& f) noexce * @param[in] f a functor taking an index as parameter */ template -inline void for_each_n(DiscreteVector const& extent, Functor&& f) noexcept +constexpr void for_each_n(DiscreteVector const& extent, Functor&& f) noexcept { - for_each_n(default_policy(), extent, std::forward(f)); + for_each_n(policies::serial, extent, std::forward(f)); } template < @@ -272,18 +261,19 @@ template < class ElementType, class... DDims, class LayoutPolicy, + class MemorySpace, class Functor> -inline void for_each_elem( - ExecutionPolicy&& policy, - ChunkSpan, LayoutPolicy> chunk_span, +constexpr void for_each_elem( + ExecutionPolicy policy, + ChunkSpan, LayoutPolicy, MemorySpace> chunk_span, Functor&& f) noexcept { - for_each(std::forward(policy), chunk_span.domain(), std::forward(f)); + for_each(policy, chunk_span.domain(), std::forward(f)); } -template -inline void for_each_elem( - ChunkSpan, LayoutPolicy> chunk_span, +template +constexpr void for_each_elem( + ChunkSpan, LayoutPolicy, MemorySpace> chunk_span, Functor&& f) noexcept { for_each(chunk_span.domain(), std::forward(f)); diff --git a/include/ddc/transform_reduce.hpp b/include/ddc/transform_reduce.hpp index 3b0d358cd..fbdb3e46a 100644 --- a/include/ddc/transform_reduce.hpp +++ b/include/ddc/transform_reduce.hpp @@ -99,7 +99,7 @@ template < class BinaryReductionOp, class UnaryTransformOp, class... DCoords> -inline T transform_reduce_serial( +constexpr T transform_reduce_serial( DiscreteDomain const& domain, [[maybe_unused]] T const neutral, BinaryReductionOp const& reduce, @@ -160,7 +160,7 @@ class TransformReducerKokkosLambdaAdapter index_type... ids, typename Reducer::value_type& a) const { - a = reducer(a, functor(DiscreteElement(ids...))); + a = recuder(a, functor(DiscreteElement(ids...))); } @@ -226,9 +226,9 @@ inline T transform_reduce_kokkos( T result = neutral; if constexpr (need_annotated_operator()) { Kokkos::parallel_reduce( - Kokkos::RangePolicy< - ExecSpace, - use_annotated_operator>(domain.front().uid(), domain.back().uid() + 1), + Kokkos::RangePolicy( + select(domain).front().uid(), + select(domain).back().uid() + 1), TransformReducerKokkosLambdaAdapter< BinaryReductionOp, UnaryTransformOp, @@ -236,7 +236,9 @@ inline T transform_reduce_kokkos( ddc_to_kokkos_reducer_t(result)); } else { Kokkos::parallel_reduce( - Kokkos::RangePolicy(domain.front().uid(), domain.back().uid() + 1), + Kokkos::RangePolicy( + select(domain).front().uid(), + select(domain).back().uid() + 1), TransformReducerKokkosLambdaAdapter< BinaryReductionOp, UnaryTransformOp, @@ -316,8 +318,8 @@ inline T transform_reduce_kokkos( * range. The return type must be acceptable as input to reduce */ template -inline T transform_reduce( - [[maybe_unused]] serial_host_policy policy, +constexpr T transform_reduce( + [[maybe_unused]] serial_policy policy, DiscreteDomain const& domain, T neutral, BinaryReductionOp&& reduce, @@ -340,7 +342,7 @@ inline T transform_reduce( * range. The return type must be acceptable as input to reduce */ template -inline T transform_reduce( +T transform_reduce( [[maybe_unused]] parallel_host_policy policy, DiscreteDomain const& domain, T neutral, @@ -364,7 +366,7 @@ inline T transform_reduce( * range. The return type must be acceptable as input to reduce */ template -inline T transform_reduce( +T transform_reduce( [[maybe_unused]] parallel_device_policy policy, DiscreteDomain const& domain, T neutral, @@ -387,14 +389,14 @@ inline T transform_reduce( * range. The return type must be acceptable as input to reduce */ template -inline T transform_reduce( +constexpr T transform_reduce( DiscreteDomain const& domain, T neutral, BinaryReductionOp&& reduce, UnaryTransformOp&& transform) noexcept { return transform_reduce( - default_policy(), + policies::serial, domain, neutral, std::forward(reduce), diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index be6fb2e72..54cf25b34 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -22,6 +22,7 @@ add_executable(ddc_tests fill.cpp discrete_element.cpp discrete_vector.cpp + nested_algorithms.cpp ) target_compile_features(ddc_tests PUBLIC cxx_std_17) target_link_libraries(ddc_tests diff --git a/tests/fft.hpp b/tests/fft.hpp new file mode 100644 index 000000000..6ea634b30 --- /dev/null +++ b/tests/fft.hpp @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: MIT + +#include +#include + +#include + +template +using DDim = ddc::UniformPointSampling; + +template +using DElem = ddc::DiscreteElement; + +template +using DVect = ddc::DiscreteVector; + +template +using DDom = ddc::DiscreteDomain; + +template +using DFDim = ddc::PeriodicSampling; + +template +constexpr auto policy = [] { + if constexpr (std::is_same_v) { + return ddc::policies::serial; + } +#if fftw_omp_AVAIL + else if constexpr (std::is_same_v) { + return ddc::policies::parallel_host; + } +#endif + else { + return ddc::policies::parallel_device; + } +}; + +// TODO: +// - FFT multidim but according to a subset of dimensions +template +static void test_fft() +{ + constexpr bool full_fft + = ddc::detail::fft::is_complex_v && ddc::detail::fft::is_complex_v; + const double a = -10; + const double b = 10; + const std::size_t Nx = 64; // Optimal value is (b-a)^2/(2*pi) + + DDom...> const x_mesh( + ddc::init_discrete_space(DDim:: + init(ddc::Coordinate(a + (b - a) / Nx / 2), + ddc::Coordinate(b - (b - a) / Nx / 2), + DVect>(Nx)))...); + ddc::init_fourier_space(x_mesh); + DDom>...> const k_mesh = ddc::FourierMesh(x_mesh, full_fft); + + ddc::Chunk _f(x_mesh, ddc::KokkosAllocator()); + ddc::ChunkSpan f = _f.span_view(); + ddc::for_each( + policy(), + f.domain(), + DDC_LAMBDA(DElem...> const e) { + ddc::Real const xn2 + = (Kokkos::pow(ddc::coordinate(ddc::select>(e)), 2) + ...); + f(e) = Kokkos::exp(-xn2 / 2); + }); + ddc::Chunk f_bis_alloc(f.domain(), ddc::KokkosAllocator()); + ddc::ChunkSpan f_bis = f_bis_alloc.span_view(); + ddc::deepcopy(ddc::policies::parallel_device, f_bis, f); + + ddc::Chunk Ff_alloc(k_mesh, ddc::KokkosAllocator()); + ddc::ChunkSpan Ff = Ff_alloc.span_view(); + ddc::fft(ExecSpace(), Ff, f_bis, {ddc::FFT_Normalization::FULL}); + Kokkos::fence(); + + // deepcopy of Ff because FFT C2R overwrites the input + ddc::Chunk Ff_bis_alloc(Ff.domain(), ddc::KokkosAllocator()); + ddc::ChunkSpan Ff_bis = Ff_bis_alloc.span_view(); + ddc::deepcopy(ddc::policies::parallel_device, Ff_bis, Ff); + + ddc::Chunk FFf_alloc(f.domain(), ddc::KokkosAllocator()); + ddc::ChunkSpan FFf = FFf_alloc.span_view(); + ddc::ifft(ExecSpace(), FFf, Ff_bis, {ddc::FFT_Normalization::FULL}); + + ddc::Chunk f_host_alloc(f.domain(), ddc::HostAllocator()); + ddc::ChunkSpan f_host = f_host_alloc.span_view(); + ddc::deepcopy(ddc::policies::parallel_device, f_host, f); + + ddc::Chunk Ff_host_alloc(Ff.domain(), ddc::HostAllocator()); + ddc::ChunkSpan Ff_host = Ff_host_alloc.span_view(); + ddc::deepcopy(ddc::policies::parallel_device, Ff_host, Ff); + + ddc::Chunk FFf_host_alloc(FFf.domain(), ddc::HostAllocator()); + ddc::ChunkSpan FFf_host = FFf_host_alloc.span_view(); + ddc::deepcopy(ddc::policies::parallel_device, FFf_host, FFf); + + auto const pow2 = DDC_LAMBDA(double x) + { + return x * x; + }; + + double criterion = Kokkos::sqrt(ddc::transform_reduce( + Ff_host.domain(), + 0., + ddc::reducer::sum(), + [=](DElem>...> const e) { + double const xn2 + = (pow2(ddc::coordinate(ddc::select>>(e))) + ...); + double const diff = Kokkos::abs(Ff_host(e)) - Kokkos::exp(-xn2 / 2); + std::size_t const denom + = (ddc::detail::fft::LastSelector(Nx / 2, Nx) * ...); + return pow2(diff) / denom; + })); + + double criterion2 = Kokkos::sqrt(ddc::transform_reduce( + FFf_host.domain(), + 0., + ddc::reducer::sum(), + [=](DElem...> const e) { + double const diff = Kokkos::abs(FFf_host(e)) - Kokkos::abs(f_host(e)); + return pow2(diff) / Kokkos::pow(Nx, sizeof...(X)); + })); + double epsilon = std::is_same_v, double> ? 1e-15 : 1e-7; + EXPECT_LE(criterion, epsilon) + << "Distance between analytical prediction and numerical result : " << criterion; + EXPECT_LE(criterion2, epsilon) + << "Distance between input and iFFT(FFT(input)) : " << criterion2; +} + +template +static void test_fft_norm(ddc::FFT_Normalization const norm) +{ + constexpr bool full_fft + = ddc::detail::fft::is_complex_v && ddc::detail::fft::is_complex_v; + + DDom> const x_mesh = ddc::init_discrete_space(DDim:: + init(ddc::Coordinate(-1. / 4), + ddc::Coordinate(1. / 4), + DVect>(2))); + ddc::init_fourier_space(x_mesh); + DDom>> const k_mesh = ddc::FourierMesh(x_mesh, full_fft); + + ddc::Chunk f_alloc = ddc::Chunk(x_mesh, ddc::KokkosAllocator()); + ddc::ChunkSpan f = f_alloc.span_view(); + ddc::for_each( + policy(), + f.domain(), + DDC_LAMBDA(DElem> const e) { f(e) = static_cast(1); }); + + + ddc::Chunk f_bis_alloc = ddc::Chunk(f.domain(), ddc::KokkosAllocator()); + ddc::ChunkSpan f_bis = f_bis_alloc.span_view(); + ddc::deepcopy(ddc::policies::parallel_device, f_bis, f); + + ddc::Chunk Ff_alloc = ddc::Chunk(k_mesh, ddc::KokkosAllocator()); + ddc::ChunkSpan Ff = Ff_alloc.span_view(); + ddc::fft(ExecSpace(), Ff, f_bis, {norm}); + Kokkos::fence(); + + // deepcopy of Ff because FFT C2R overwrites the input + ddc::Chunk Ff_bis_alloc = ddc::Chunk(Ff.domain(), ddc::KokkosAllocator()); + ddc::ChunkSpan Ff_bis = Ff_bis_alloc.span_view(); + ddc::deepcopy(ddc::policies::parallel_device, Ff_bis, Ff); + + ddc::Chunk FFf_alloc = ddc::Chunk(x_mesh, ddc::KokkosAllocator()); + ddc::ChunkSpan FFf = FFf_alloc.span_view(); + ddc::ifft(ExecSpace(), FFf, Ff_bis, {norm}); + + double const f_sum = ddc::transform_reduce(f.domain(), 0., ddc::reducer::sum(), f); + + double Ff0_expected; + double FFf_expected; + switch (norm) { + case ddc::FFT_Normalization::OFF: + Ff0_expected = f_sum; + FFf_expected = f_sum; + break; + case ddc::FFT_Normalization::FORWARD: + Ff0_expected = 1; + FFf_expected = 1; + break; + case ddc::FFT_Normalization::BACKWARD: + Ff0_expected = f_sum; + FFf_expected = 1; + break; + case ddc::FFT_Normalization::ORTHO: + Ff0_expected = Kokkos::sqrt(f_sum); + FFf_expected = 1; + break; + case ddc::FFT_Normalization::FULL: + Ff0_expected = 1 / Kokkos::sqrt(2 * Kokkos::numbers::pi); + FFf_expected = 1; + break; + } + + double epsilon = 1e-6; + EXPECT_NEAR(Kokkos::abs(Ff(Ff.domain().front())), Ff0_expected, epsilon); + EXPECT_NEAR(FFf(FFf.domain().front()), FFf_expected, epsilon); + EXPECT_NEAR(FFf(FFf.domain().back()), FFf_expected, epsilon); +} diff --git a/tests/for_each.cpp b/tests/for_each.cpp index 7056576c9..1e4c41ad6 100644 --- a/tests/for_each.cpp +++ b/tests/for_each.cpp @@ -40,7 +40,7 @@ TEST(ForEachSerialHost, Empty) DDomX const dom(lbound_x, DVectX(0)); std::vector storage(dom.size(), 0); ddc::ChunkSpan view(storage.data(), dom); - ddc::for_each(ddc::policies::serial_host, dom, [=](DElemX const ix) { view(ix) += 1; }); + ddc::for_each(ddc::policies::serial, dom, [=](DElemX const ix) { view(ix) += 1; }); EXPECT_EQ(std::count(storage.begin(), storage.end(), 1), dom.size()) << std::count(storage.begin(), storage.end(), 1) << std::endl; } @@ -50,7 +50,7 @@ TEST(ForEachSerialHost, ZeroDimension) DDom0D const dom; int storage = 0; ddc::ChunkSpan view(&storage, dom); - ddc::for_each(ddc::policies::serial_host, dom, [=](DElem0D const ii) { view(ii) += 1; }); + ddc::for_each(ddc::policies::serial, dom, [=](DElem0D const ii) { view(ii) += 1; }); EXPECT_EQ(storage, 1) << storage << std::endl; } @@ -59,7 +59,7 @@ TEST(ForEachSerialHost, OneDimension) DDomX const dom(lbound_x, nelems_x); std::vector storage(dom.size(), 0); ddc::ChunkSpan view(storage.data(), dom); - ddc::for_each(ddc::policies::serial_host, dom, [=](DElemX const ix) { view(ix) += 1; }); + ddc::for_each(ddc::policies::serial, dom, [=](DElemX const ix) { view(ix) += 1; }); EXPECT_EQ(std::count(storage.begin(), storage.end(), 1), dom.size()); } @@ -68,7 +68,7 @@ TEST(ForEachSerialHost, TwoDimensions) DDomXY const dom(lbound_x_y, nelems_x_y); std::vector storage(dom.size(), 0); ddc::ChunkSpan view(storage.data(), dom); - ddc::for_each(ddc::policies::serial_host, dom, [=](DElemXY const ixy) { view(ixy) += 1; }); + ddc::for_each(ddc::policies::serial, dom, [=](DElemXY const ixy) { view(ixy) += 1; }); EXPECT_EQ(std::count(storage.begin(), storage.end(), 1), dom.size()); } diff --git a/tests/nested_algorithms.cpp b/tests/nested_algorithms.cpp new file mode 100644 index 000000000..f0793d36c --- /dev/null +++ b/tests/nested_algorithms.cpp @@ -0,0 +1,175 @@ +#include + +#include + +namespace { + +struct DDimX; +using DElemX = ddc::DiscreteElement; +using DVectX = ddc::DiscreteVector; +using DDomX = ddc::DiscreteDomain; + +static DElemX constexpr lbound_x(0); +static DVectX constexpr nelems_x(10); + +} // namespace + +void test_nested_algorithms_for_each() +{ + DDomX ddom(lbound_x, nelems_x); + ddc::Chunk chk(ddom, ddc::DeviceAllocator()); + ddc::ChunkSpan chks = chk.span_view(); + ddc::for_each( + ddc::policies::parallel_device, + ddom.take_first(DVectX(1)), + DDC_LAMBDA(DElemX) { + ddc::for_each(ddc::policies::serial, chks.domain(), [&](DElemX elem) { + chks(elem) = 10; + }); + }); + int res = ddc::transform_reduce( + ddc::policies::parallel_device, + ddom, + 0, + ddc::reducer::sum(), + chks); + EXPECT_EQ(res, 10 * ddom.size()); +} + +TEST(NestedAlgorithms, ForEach) +{ + test_nested_algorithms_for_each(); +} + +void test_nested_algorithms_for_each_n() +{ + DDomX ddom(lbound_x, nelems_x); + ddc::Chunk chk(ddom, ddc::DeviceAllocator()); + ddc::ChunkSpan chks = chk.span_view(); + ddc::for_each( + ddc::policies::parallel_device, + ddom.take_first(DVectX(1)), + DDC_LAMBDA(DElemX elem) { + ddc::for_each_n(ddc::policies::serial, ddom.extents(), [&](DVectX vect_x) { + chks(elem + vect_x) = 10; + }); + }); + int res = ddc::transform_reduce( + ddc::policies::parallel_device, + ddom, + 0, + ddc::reducer::sum(), + chks); + EXPECT_EQ(res, 10 * ddom.size()); +} + +TEST(NestedAlgorithms, ForEachN) +{ + test_nested_algorithms_for_each_n(); +} + +void test_nested_algorithms_for_each_elem() +{ + DDomX ddom(lbound_x, nelems_x); + ddc::Chunk chk(ddom, ddc::DeviceAllocator()); + ddc::ChunkSpan chks = chk.span_view(); + ddc::for_each( + ddc::policies::parallel_device, + ddom.take_first(DVectX(1)), + DDC_LAMBDA(DElemX) { + ddc::for_each_elem(ddc::policies::serial, chks, [&](DElemX elem) { + chks(elem) = 10; + }); + }); + int res = ddc::transform_reduce( + ddc::policies::parallel_device, + ddom, + 0, + ddc::reducer::sum(), + chks); + EXPECT_EQ(res, 10 * ddom.size()); +} + +TEST(NestedAlgorithms, ForEachElem) +{ + test_nested_algorithms_for_each_elem(); +} + +void test_nested_algorithms_transform_reduce() +{ + DDomX ddom(lbound_x, nelems_x); + ddc::Chunk chk(ddom, ddc::DeviceAllocator()); + ddc::ChunkSpan chks = chk.span_view(); + ddc::for_each( + ddc::policies::parallel_device, + ddom, + DDC_LAMBDA(DElemX elem) { + chks(elem) = ddc::transform_reduce( + ddc::policies::serial, + DDomX(lbound_x, DVectX(10)), + 0, + ddc::reducer::sum(), + [&](DElemX) { return 1; }); + }); + int res = ddc::transform_reduce( + ddc::policies::parallel_device, + ddom, + 0, + ddc::reducer::sum(), + chks); + EXPECT_EQ(res, 10 * ddom.size()); +} + +TEST(NestedAlgorithms, TransformReduce) +{ + test_nested_algorithms_transform_reduce(); +} + +void test_nested_algorithms_fill() +{ + DDomX ddom(lbound_x, nelems_x); + ddc::Chunk chk(ddom, ddc::DeviceAllocator()); + ddc::ChunkSpan chks = chk.span_view(); + ddc::for_each( + ddc::policies::parallel_device, + ddom.take_first(DVectX(1)), + DDC_LAMBDA(DElemX) { ddc::fill(ddc::policies::serial, chks, 10); }); + int res = ddc::transform_reduce( + ddc::policies::parallel_device, + ddom, + 0, + ddc::reducer::sum(), + chks); + EXPECT_EQ(res, 10 * ddom.size()); +} + +TEST(NestedAlgorithms, Fill) +{ + test_nested_algorithms_fill(); +} + +void test_nested_algorithms_deepcopy() +{ + DDomX ddom(lbound_x, nelems_x); + ddc::Chunk chk(ddom, ddc::DeviceAllocator()); + ddc::ChunkSpan chks = chk.span_view(); + ddc::fill(ddc::policies::parallel_device, chks, 10); + ddc::Chunk chk2(ddom, ddc::DeviceAllocator()); + ddc::ChunkSpan chk2s = chk2.span_view(); + ddc::for_each( + ddc::policies::parallel_device, + ddom.take_first(DVectX(1)), + DDC_LAMBDA(DElemX) { ddc::deepcopy(ddc::policies::serial, chk2s, chks); }); + int res = ddc::transform_reduce( + ddc::policies::parallel_device, + ddom, + 0, + ddc::reducer::sum(), + chk2s); + EXPECT_EQ(res, 10 * ddom.size()); +} + +TEST(NestedAlgorithms, Deepcopy) +{ + test_nested_algorithms_deepcopy(); +} diff --git a/tests/relocatable_device_code.cpp b/tests/relocatable_device_code.cpp index d3e13110e..dc67ef88f 100644 --- a/tests/relocatable_device_code.cpp +++ b/tests/relocatable_device_code.cpp @@ -19,7 +19,7 @@ std::pair, ddc::Coordinate> read_from_devi dom_x.take_last(rdc::DVectX(1)), DDC_LAMBDA(rdc::DElemX const ix) { array(ix) = ddc::step(); }); ddc::Chunk allocation_h(dom_x, ddc::HostAllocator()); - ddc::deepcopy(allocation_h, allocation_d); + ddc::deepcopy(ddc::policies::parallel_device, allocation_h, allocation_d); return std::pair< ddc::Coordinate, ddc::Coordinate>(allocation_h(rdc::DElemX(0)), allocation_h(rdc::DElemX(1))); diff --git a/tests/transform_reduce.cpp b/tests/transform_reduce.cpp index 900f6d4e0..bf46063f9 100644 --- a/tests/transform_reduce.cpp +++ b/tests/transform_reduce.cpp @@ -59,7 +59,7 @@ TEST(TransformReduceSerialHost, OneDimension) ddc::for_each(dom, [&](DElemX const ix) { chunk(ix) = count++; }); EXPECT_EQ( ddc::transform_reduce( - ddc::policies::serial_host, + ddc::policies::serial, dom, 0, ddc::reducer::sum(), @@ -76,7 +76,7 @@ TEST(TransformReduceSerialHost, TwoDimensions) ddc::for_each(dom, [&](DElemXY const ixy) { chunk(ixy) = count++; }); EXPECT_EQ( ddc::transform_reduce( - ddc::policies::serial_host, + ddc::policies::serial, dom, 0, ddc::reducer::sum(),