From 169efae0e236760724d7afc1f15b018a70bddf30 Mon Sep 17 00:00:00 2001 From: blegouix Date: Tue, 18 Jun 2024 10:55:27 +0200 Subject: [PATCH 01/36] align reference preconditioner size on default --- benchmarks/splines.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/splines.cpp b/benchmarks/splines.cpp index f6bda61f0..3228c7105 100644 --- a/benchmarks/splines.cpp +++ b/benchmarks/splines.cpp @@ -191,7 +191,7 @@ unsigned int preconditionner_max_block_size_ref = 1u; #elif defined(KOKKOS_ENABLE_OPENMP) std::string chip = "cpu"; std::size_t cols_per_chunk_ref = 8192; -unsigned int preconditionner_max_block_size_ref = 32u; +unsigned int preconditionner_max_block_size_ref = 1u; #elif defined(KOKKOS_ENABLE_SERIAL) std::string chip = "cpu"; std::size_t cols_per_chunk_ref = 8192; From 20ce63c6dfb6333483ec9aab766e9efec8990096 Mon Sep 17 00:00:00 2001 From: blegouix Date: Tue, 18 Jun 2024 13:01:28 +0200 Subject: [PATCH 02/36] wip --- benchmarks/splines.cpp | 64 ++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 28 deletions(-) diff --git a/benchmarks/splines.cpp b/benchmarks/splines.cpp index 3228c7105..e45d1b22e 100644 --- a/benchmarks/splines.cpp +++ b/benchmarks/splines.cpp @@ -16,21 +16,22 @@ namespace DDC_HIP_5_7_ANONYMOUS_NAMESPACE_WORKAROUND(SPLINES_CPP) { - static constexpr std::size_t s_degree_x = 3; - struct X { static constexpr bool PERIODIC = true; }; + template struct BSplinesX : ddc::UniformBSplines { }; + template using GrevillePoints = ddc::GrevilleInterpolationPoints< - BSplinesX, + BSplinesX, ddc::BoundCond::PERIODIC, ddc::BoundCond::PERIODIC>; - struct DDimX : GrevillePoints::interpolation_mesh_type + template + struct DDimX : GrevillePoints::interpolation_mesh_type { }; @@ -59,6 +60,7 @@ void monitorMemoryAsync(std::mutex& mutex, bool& monitorFlag, size_t& maxUsedMem } } +template static void characteristics_advection(benchmark::State& state) { size_t freeMem = 0; @@ -81,31 +83,32 @@ static void characteristics_advection(benchmark::State& state) std::ref(maxUsedMem)); ddc::init_discrete_space< - BSplinesX>(ddc::Coordinate(-1.), ddc::Coordinate(1.), state.range(0)); - ddc::init_discrete_space(ddc::GrevilleInterpolationPoints< - BSplinesX, - ddc::BoundCond::PERIODIC, - ddc::BoundCond::PERIODIC>::get_sampling()); + BSplinesX>(ddc::Coordinate(-1.), ddc::Coordinate(1.), state.range(0)); + ddc::init_discrete_space>( + ddc::GrevilleInterpolationPoints< + BSplinesX, + ddc::BoundCond::PERIODIC, + ddc::BoundCond::PERIODIC>::template get_sampling>()); ddc::DiscreteDomain y_domain = ddc::init_discrete_space(DDimY::init( ddc::Coordinate(-1.), ddc::Coordinate(1.), ddc::DiscreteVector(state.range(1)))); auto const x_domain = ddc::GrevilleInterpolationPoints< - BSplinesX, + BSplinesX, ddc::BoundCond::PERIODIC, - ddc::BoundCond::PERIODIC>::get_domain(); + ddc::BoundCond::PERIODIC>::template get_domain>(); ddc::Chunk density_alloc( - ddc::DiscreteDomain(x_domain, y_domain), + ddc::DiscreteDomain, DDimY>(x_domain, y_domain), ddc::DeviceAllocator()); ddc::ChunkSpan const density = density_alloc.span_view(); // Initialize the density on the main domain - ddc::DiscreteDomain x_mesh - = ddc::DiscreteDomain(x_domain, y_domain); + ddc::DiscreteDomain, DDimY> x_mesh + = ddc::DiscreteDomain, DDimY>(x_domain, y_domain); ddc::parallel_for_each( x_mesh, - KOKKOS_LAMBDA(ddc::DiscreteElement const ixy) { - double const x = ddc::coordinate(ddc::select(ixy)); + KOKKOS_LAMBDA(ddc::DiscreteElement, DDimY> const ixy) { + double const x = ddc::coordinate(ddc::select>(ixy)); double const y = ddc::coordinate(ddc::select(ixy)); density(ixy) = 9.999 * Kokkos::exp(-(x * x + y * y) / 0.1 / 2); // initial_density(ixy) = 9.999 * ((x * x + y * y) < 0.25); @@ -113,23 +116,23 @@ static void characteristics_advection(benchmark::State& state) ddc::SplineBuilder< Kokkos::DefaultExecutionSpace, Kokkos::DefaultExecutionSpace::memory_space, - BSplinesX, - DDimX, + BSplinesX, + DDimX, ddc::BoundCond::PERIODIC, ddc::BoundCond::PERIODIC, ddc::SplineSolver::GINKGO, - DDimX, + DDimX, DDimY> spline_builder(x_mesh, state.range(2), state.range(3)); ddc::PeriodicExtrapolationRule periodic_extrapolation; ddc::SplineEvaluator< Kokkos::DefaultExecutionSpace, Kokkos::DefaultExecutionSpace::memory_space, - BSplinesX, - DDimX, + BSplinesX, + DDimX, ddc::PeriodicExtrapolationRule, ddc::PeriodicExtrapolationRule, - DDimX, + DDimX, DDimY> spline_evaluator(periodic_extrapolation, periodic_extrapolation); ddc::Chunk coef_alloc( @@ -147,9 +150,9 @@ static void characteristics_advection(benchmark::State& state) Kokkos::Profiling::pushRegion("FeetCharacteristics"); ddc::parallel_for_each( feet_coords.domain(), - KOKKOS_LAMBDA(ddc::DiscreteElement const e) { + KOKKOS_LAMBDA(ddc::DiscreteElement, DDimY> const e) { feet_coords(e) = ddc::Coordinate( - ddc::coordinate(ddc::select(e)) + ddc::coordinate(ddc::select>(e)) - ddc::Coordinate(0.0176429863), ddc::coordinate(ddc::select(e))); }); @@ -175,13 +178,18 @@ static void characteristics_advection(benchmark::State& state) /// The reason is it acts on underlying global /// /// variables, which is always a bad idea. /// //////////////////////////////////////////////////// - ddc::detail::g_discrete_space_dual.reset(); - ddc::detail::g_discrete_space_dual>.reset(); - ddc::detail::g_discrete_space_dual.reset(); + ddc::detail::g_discrete_space_dual>.reset(); + ddc::detail::g_discrete_space_dual>>.reset(); + ddc::detail::g_discrete_space_dual>.reset(); ddc::detail::g_discrete_space_dual.reset(); //////////////////////////////////////////////////// } +static void run(benchmark::State& state) +{ + characteristics_advection<3>(state); +} + // Tuning : 512 cols and 8 precond on CPU, 16384 cols and 1 precond on GPU #ifdef KOKKOS_ENABLE_CUDA @@ -198,7 +206,7 @@ std::size_t cols_per_chunk_ref = 8192; unsigned int preconditionner_max_block_size_ref = 32u; #endif -BENCHMARK(characteristics_advection) +BENCHMARK(run) ->RangeMultiplier(2) ->Ranges( {{64, 1024}, From c4983063df390f905b68b325d1533246d16673e8 Mon Sep 17 00:00:00 2001 From: blegouix Date: Tue, 18 Jun 2024 13:13:31 +0200 Subject: [PATCH 03/36] wip --- benchmarks/splines.cpp | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/benchmarks/splines.cpp b/benchmarks/splines.cpp index e45d1b22e..b89ea9ed6 100644 --- a/benchmarks/splines.cpp +++ b/benchmarks/splines.cpp @@ -83,7 +83,7 @@ static void characteristics_advection(benchmark::State& state) std::ref(maxUsedMem)); ddc::init_discrete_space< - BSplinesX>(ddc::Coordinate(-1.), ddc::Coordinate(1.), state.range(0)); + BSplinesX>(ddc::Coordinate(-1.), ddc::Coordinate(1.), state.range(1)); ddc::init_discrete_space>( ddc::GrevilleInterpolationPoints< BSplinesX, @@ -92,7 +92,7 @@ static void characteristics_advection(benchmark::State& state) ddc::DiscreteDomain y_domain = ddc::init_discrete_space(DDimY::init( ddc::Coordinate(-1.), ddc::Coordinate(1.), - ddc::DiscreteVector(state.range(1)))); + ddc::DiscreteVector(state.range(2)))); auto const x_domain = ddc::GrevilleInterpolationPoints< BSplinesX, @@ -123,7 +123,7 @@ static void characteristics_advection(benchmark::State& state) ddc::SplineSolver::GINKGO, DDimX, DDimY> - spline_builder(x_mesh, state.range(2), state.range(3)); + spline_builder(x_mesh, state.range(3), state.range(4)); ddc::PeriodicExtrapolationRule periodic_extrapolation; ddc::SplineEvaluator< Kokkos::DefaultExecutionSpace, @@ -168,7 +168,7 @@ static void characteristics_advection(benchmark::State& state) monitorThread.join(); state.SetBytesProcessed( int64_t(state.iterations()) - * int64_t(state.range(0) * state.range(1) * sizeof(double))); + * int64_t(state.range(1) * state.range(2) * sizeof(double))); state.counters["gpu_mem_occupancy"] = maxUsedMem - initUsedMem; //////////////////////////////////////////////////// /// --------------- HUGE WARNING --------------- /// @@ -187,7 +187,8 @@ static void characteristics_advection(benchmark::State& state) static void run(benchmark::State& state) { - characteristics_advection<3>(state); + static std::function benchs = characteristics_advection<3>; + benchs(state); } // Tuning : 512 cols and 8 precond on CPU, 16384 cols and 1 precond on GPU @@ -206,6 +207,17 @@ std::size_t cols_per_chunk_ref = 8192; unsigned int preconditionner_max_block_size_ref = 32u; #endif +BENCHMARK(run) + ->RangeMultiplier(2) + ->Ranges( + {{3, 5}, + {64, 1024}, + {100000, 100000}, + {cols_per_chunk_ref, cols_per_chunk_ref}, + {preconditionner_max_block_size_ref, preconditionner_max_block_size_ref}}) + ->MinTime(3) + ->UseRealTime(); +/* BENCHMARK(run) ->RangeMultiplier(2) ->Ranges( @@ -215,6 +227,7 @@ BENCHMARK(run) {preconditionner_max_block_size_ref, preconditionner_max_block_size_ref}}) ->MinTime(3) ->UseRealTime(); +*/ /* BENCHMARK(characteristics_advection) ->RangeMultiplier(2) From cce3cfdee84f023fc6b179c92b6e49ce761b88d9 Mon Sep 17 00:00:00 2001 From: blegouix Date: Tue, 18 Jun 2024 13:16:11 +0200 Subject: [PATCH 04/36] wip --- benchmarks/splines.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarks/splines.cpp b/benchmarks/splines.cpp index b89ea9ed6..3dce17383 100644 --- a/benchmarks/splines.cpp +++ b/benchmarks/splines.cpp @@ -187,8 +187,9 @@ static void characteristics_advection(benchmark::State& state) static void run(benchmark::State& state) { - static std::function benchs = characteristics_advection<3>; - benchs(state); + std::array, 1> benchs; + benchs[0] = (characteristics_advection<3>); + benchs[0](state); } // Tuning : 512 cols and 8 precond on CPU, 16384 cols and 1 precond on GPU From 8cc0125540288f91d938562b5a8808e3a95ad2fa Mon Sep 17 00:00:00 2001 From: blegouix Date: Tue, 18 Jun 2024 13:19:16 +0200 Subject: [PATCH 05/36] sweep on splines degree --- benchmarks/splines.cpp | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/benchmarks/splines.cpp b/benchmarks/splines.cpp index 3dce17383..a88117f9b 100644 --- a/benchmarks/splines.cpp +++ b/benchmarks/splines.cpp @@ -187,9 +187,14 @@ static void characteristics_advection(benchmark::State& state) static void run(benchmark::State& state) { - std::array, 1> benchs; - benchs[0] = (characteristics_advection<3>); - benchs[0](state); + // Preallocate 3 benchs for each spline degree to benchmark (determined at compile-time, that's why we need to build explicitely 3 variants of the bench) + std::array, 3> benchs; + benchs[0] = (characteristics_advection<3>); + benchs[1] = (characteristics_advection<4>); + benchs[2] = (characteristics_advection<5>); + + // Run the required bench + benchs[state.range(0) - 3](state); } // Tuning : 512 cols and 8 precond on CPU, 16384 cols and 1 precond on GPU @@ -212,8 +217,8 @@ BENCHMARK(run) ->RangeMultiplier(2) ->Ranges( {{3, 5}, - {64, 1024}, - {100000, 100000}, + {64, 1024}, + {100000, 100000}, {cols_per_chunk_ref, cols_per_chunk_ref}, {preconditionner_max_block_size_ref, preconditionner_max_block_size_ref}}) ->MinTime(3) From 76c146cd089cb6c189874fc6bc06bd3f3cd5dad9 Mon Sep 17 00:00:00 2001 From: blegouix Date: Tue, 18 Jun 2024 13:56:09 +0200 Subject: [PATCH 06/36] wip --- benchmarks/splines.cpp | 100 ++++++++++++++++++++++++----------------- 1 file changed, 59 insertions(+), 41 deletions(-) diff --git a/benchmarks/splines.cpp b/benchmarks/splines.cpp index a88117f9b..c9dfc36bd 100644 --- a/benchmarks/splines.cpp +++ b/benchmarks/splines.cpp @@ -21,17 +21,21 @@ namespace DDC_HIP_5_7_ANONYMOUS_NAMESPACE_WORKAROUND(SPLINES_CPP) static constexpr bool PERIODIC = true; }; - template - struct BSplinesX : ddc::UniformBSplines + template + struct BSplinesX + : std::conditional_t< + NonUniform::value, + ddc::NonUniformBSplines, + ddc::UniformBSplines> { }; - template + template using GrevillePoints = ddc::GrevilleInterpolationPoints< - BSplinesX, + BSplinesX, ddc::BoundCond::PERIODIC, ddc::BoundCond::PERIODIC>; - template - struct DDimX : GrevillePoints::interpolation_mesh_type + template + struct DDimX : GrevillePoints::interpolation_mesh_type { }; @@ -60,8 +64,8 @@ void monitorMemoryAsync(std::mutex& mutex, bool& monitorFlag, size_t& maxUsedMem } } -template -static void characteristics_advection(benchmark::State& state) +template +static void characteristics_advection_unitary(benchmark::State& state) { size_t freeMem = 0; size_t totalMem = 0; @@ -82,33 +86,43 @@ static void characteristics_advection(benchmark::State& state) std::ref(monitorFlag), std::ref(maxUsedMem)); - ddc::init_discrete_space< - BSplinesX>(ddc::Coordinate(-1.), ddc::Coordinate(1.), state.range(1)); - ddc::init_discrete_space>( + if constexpr (!NonUniform::value) { + ddc::init_discrete_space>(ddc::Coordinate(0.), ddc::Coordinate(1.), state.range(1)); + } else { + std::vector> breaks(state.range(1) + 1); + for (std::size_t i(0); i < state.range(1) + 1; ++i) { + breaks[i] = ddc::Coordinate(static_cast(i) / state.range(1)); + } + ddc::init_discrete_space>(breaks); + } + ddc::init_discrete_space>( ddc::GrevilleInterpolationPoints< - BSplinesX, + BSplinesX, ddc::BoundCond::PERIODIC, - ddc::BoundCond::PERIODIC>::template get_sampling>()); + ddc::BoundCond::PERIODIC>:: + template get_sampling>()); ddc::DiscreteDomain y_domain = ddc::init_discrete_space(DDimY::init( ddc::Coordinate(-1.), ddc::Coordinate(1.), ddc::DiscreteVector(state.range(2)))); auto const x_domain = ddc::GrevilleInterpolationPoints< - BSplinesX, + BSplinesX, ddc::BoundCond::PERIODIC, - ddc::BoundCond::PERIODIC>::template get_domain>(); + ddc::BoundCond::PERIODIC>::template get_domain>(); ddc::Chunk density_alloc( - ddc::DiscreteDomain, DDimY>(x_domain, y_domain), + ddc::DiscreteDomain, DDimY>(x_domain, y_domain), ddc::DeviceAllocator()); ddc::ChunkSpan const density = density_alloc.span_view(); // Initialize the density on the main domain - ddc::DiscreteDomain, DDimY> x_mesh - = ddc::DiscreteDomain, DDimY>(x_domain, y_domain); + ddc::DiscreteDomain, DDimY> x_mesh + = ddc::DiscreteDomain, DDimY>(x_domain, y_domain); ddc::parallel_for_each( x_mesh, - KOKKOS_LAMBDA(ddc::DiscreteElement, DDimY> const ixy) { - double const x = ddc::coordinate(ddc::select>(ixy)); + KOKKOS_LAMBDA(ddc::DiscreteElement, DDimY> const ixy) { + double const x = ddc::coordinate(ddc::select>(ixy)); double const y = ddc::coordinate(ddc::select(ixy)); density(ixy) = 9.999 * Kokkos::exp(-(x * x + y * y) / 0.1 / 2); // initial_density(ixy) = 9.999 * ((x * x + y * y) < 0.25); @@ -116,23 +130,23 @@ static void characteristics_advection(benchmark::State& state) ddc::SplineBuilder< Kokkos::DefaultExecutionSpace, Kokkos::DefaultExecutionSpace::memory_space, - BSplinesX, - DDimX, + BSplinesX, + DDimX, ddc::BoundCond::PERIODIC, ddc::BoundCond::PERIODIC, ddc::SplineSolver::GINKGO, - DDimX, + DDimX, DDimY> spline_builder(x_mesh, state.range(3), state.range(4)); ddc::PeriodicExtrapolationRule periodic_extrapolation; ddc::SplineEvaluator< Kokkos::DefaultExecutionSpace, Kokkos::DefaultExecutionSpace::memory_space, - BSplinesX, - DDimX, + BSplinesX, + DDimX, ddc::PeriodicExtrapolationRule, ddc::PeriodicExtrapolationRule, - DDimX, + DDimX, DDimY> spline_evaluator(periodic_extrapolation, periodic_extrapolation); ddc::Chunk coef_alloc( @@ -150,9 +164,9 @@ static void characteristics_advection(benchmark::State& state) Kokkos::Profiling::pushRegion("FeetCharacteristics"); ddc::parallel_for_each( feet_coords.domain(), - KOKKOS_LAMBDA(ddc::DiscreteElement, DDimY> const e) { + KOKKOS_LAMBDA(ddc::DiscreteElement, DDimY> const e) { feet_coords(e) = ddc::Coordinate( - ddc::coordinate(ddc::select>(e)) + ddc::coordinate(ddc::select>(e)) - ddc::Coordinate(0.0176429863), ddc::coordinate(ddc::select(e))); }); @@ -178,23 +192,26 @@ static void characteristics_advection(benchmark::State& state) /// The reason is it acts on underlying global /// /// variables, which is always a bad idea. /// //////////////////////////////////////////////////// - ddc::detail::g_discrete_space_dual>.reset(); - ddc::detail::g_discrete_space_dual>>.reset(); - ddc::detail::g_discrete_space_dual>.reset(); + ddc::detail::g_discrete_space_dual>.reset(); + ddc::detail::g_discrete_space_dual>>.reset(); + ddc::detail::g_discrete_space_dual>.reset(); ddc::detail::g_discrete_space_dual.reset(); //////////////////////////////////////////////////// } -static void run(benchmark::State& state) +static void characteristics_advection(benchmark::State& state) { - // Preallocate 3 benchs for each spline degree to benchmark (determined at compile-time, that's why we need to build explicitely 3 variants of the bench) - std::array, 3> benchs; - benchs[0] = (characteristics_advection<3>); - benchs[1] = (characteristics_advection<4>); - benchs[2] = (characteristics_advection<5>); + // Preallocate 6 unitary benchs for each combination of uniform/non-uniform and spline degree we may want to benchmark (those are determined at compile-time, that's why we need to build explicitely 6 variants of the bench even if we call only one of them) + std::array, 6> benchs; + benchs[0] = characteristics_advection_unitary; + benchs[1] = characteristics_advection_unitary; + benchs[2] = characteristics_advection_unitary; + benchs[3] = characteristics_advection_unitary; + benchs[4] = characteristics_advection_unitary; + benchs[5] = characteristics_advection_unitary; - // Run the required bench - benchs[state.range(0) - 3](state); + // Run the desired bench + benchs[state.range(0) * 3 + state.range(1) - 3](state); } // Tuning : 512 cols and 8 precond on CPU, 16384 cols and 1 precond on GPU @@ -213,10 +230,11 @@ std::size_t cols_per_chunk_ref = 8192; unsigned int preconditionner_max_block_size_ref = 32u; #endif -BENCHMARK(run) +BENCHMARK(characteristics_advection) ->RangeMultiplier(2) ->Ranges( - {{3, 5}, + {{0, 1}, + {3, 5}, {64, 1024}, {100000, 100000}, {cols_per_chunk_ref, cols_per_chunk_ref}, From b0e0dde6a7af9f52f1de1eab264b270836ac44fc Mon Sep 17 00:00:00 2001 From: blegouix Date: Tue, 18 Jun 2024 14:13:29 +0200 Subject: [PATCH 07/36] non-uniform/uniform --- benchmarks/splines.cpp | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/benchmarks/splines.cpp b/benchmarks/splines.cpp index c9dfc36bd..24b4395fc 100644 --- a/benchmarks/splines.cpp +++ b/benchmarks/splines.cpp @@ -67,6 +67,11 @@ void monitorMemoryAsync(std::mutex& mutex, bool& monitorFlag, size_t& maxUsedMem template static void characteristics_advection_unitary(benchmark::State& state) { + std::size_t nx = state.range(2); + std::size_t ny = state.range(3); + int cols_per_chunk = state.range(4); + int preconditionner_max_block_size = state.range(5); + size_t freeMem = 0; size_t totalMem = 0; #if defined(__CUDACC__) @@ -89,11 +94,11 @@ static void characteristics_advection_unitary(benchmark::State& state) if constexpr (!NonUniform::value) { ddc::init_discrete_space>(ddc::Coordinate(0.), ddc::Coordinate(1.), state.range(1)); + s_degree_x>>(ddc::Coordinate(0.), ddc::Coordinate(1.), nx); } else { - std::vector> breaks(state.range(1) + 1); - for (std::size_t i(0); i < state.range(1) + 1; ++i) { - breaks[i] = ddc::Coordinate(static_cast(i) / state.range(1)); + std::vector> breaks(nx + 1); + for (std::size_t i(0); i < nx + 1; ++i) { + breaks[i] = ddc::Coordinate(static_cast(i) / nx); } ddc::init_discrete_space>(breaks); } @@ -106,7 +111,7 @@ static void characteristics_advection_unitary(benchmark::State& state) ddc::DiscreteDomain y_domain = ddc::init_discrete_space(DDimY::init( ddc::Coordinate(-1.), ddc::Coordinate(1.), - ddc::DiscreteVector(state.range(2)))); + ddc::DiscreteVector(ny))); auto const x_domain = ddc::GrevilleInterpolationPoints< BSplinesX, @@ -137,7 +142,7 @@ static void characteristics_advection_unitary(benchmark::State& state) ddc::SplineSolver::GINKGO, DDimX, DDimY> - spline_builder(x_mesh, state.range(3), state.range(4)); + spline_builder(x_mesh, cols_per_chunk, preconditionner_max_block_size); ddc::PeriodicExtrapolationRule periodic_extrapolation; ddc::SplineEvaluator< Kokkos::DefaultExecutionSpace, @@ -180,9 +185,7 @@ static void characteristics_advection_unitary(benchmark::State& state) } monitorFlag = false; monitorThread.join(); - state.SetBytesProcessed( - int64_t(state.iterations()) - * int64_t(state.range(1) * state.range(2) * sizeof(double))); + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(nx * ny * sizeof(double))); state.counters["gpu_mem_occupancy"] = maxUsedMem - initUsedMem; //////////////////////////////////////////////////// /// --------------- HUGE WARNING --------------- /// @@ -193,7 +196,11 @@ static void characteristics_advection_unitary(benchmark::State& state) /// variables, which is always a bad idea. /// //////////////////////////////////////////////////// ddc::detail::g_discrete_space_dual>.reset(); - ddc::detail::g_discrete_space_dual>>.reset(); + if constexpr (!NonUniform::value) { + ddc::detail::g_discrete_space_dual>>.reset(); + } else { + ddc::detail::g_discrete_space_dual>>.reset(); + } ddc::detail::g_discrete_space_dual>.reset(); ddc::detail::g_discrete_space_dual.reset(); //////////////////////////////////////////////////// From 2d217c70d5c7e2889d59d831b8d6464a2b4d40ab Mon Sep 17 00:00:00 2001 From: blegouix Date: Tue, 18 Jun 2024 14:58:36 +0200 Subject: [PATCH 08/36] wip --- benchmarks/splines.cpp | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/benchmarks/splines.cpp b/benchmarks/splines.cpp index 24b4395fc..6f694dda8 100644 --- a/benchmarks/splines.cpp +++ b/benchmarks/splines.cpp @@ -221,8 +221,9 @@ static void characteristics_advection(benchmark::State& state) benchs[state.range(0) * 3 + state.range(1) - 3](state); } -// Tuning : 512 cols and 8 precond on CPU, 16384 cols and 1 precond on GPU - +// Reference parameters: the benchmarks sweep on two parameters and fix all the others according to those reference parameters. +bool non_uniform_ref = false; +std::size_t degree_x_ref = 3; #ifdef KOKKOS_ENABLE_CUDA std::string chip = "gpu"; std::size_t cols_per_chunk_ref = 65535; @@ -236,23 +237,28 @@ std::string chip = "cpu"; std::size_t cols_per_chunk_ref = 8192; unsigned int preconditionner_max_block_size_ref = 32u; #endif +std::size_t ny_ref = 100000; +// Sweep on uniform/non-uniform and spline order BENCHMARK(characteristics_advection) ->RangeMultiplier(2) ->Ranges( {{0, 1}, {3, 5}, {64, 1024}, - {100000, 100000}, + {ny_ref, ny_ref}, {cols_per_chunk_ref, cols_per_chunk_ref}, {preconditionner_max_block_size_ref, preconditionner_max_block_size_ref}}) ->MinTime(3) ->UseRealTime(); /* +// Sweep on nx and ny BENCHMARK(run) ->RangeMultiplier(2) ->Ranges( - {{64, 1024}, + {{non_uniform_ref, non_uniform_ref}, + {degree_x_ref, degree_x_ref}, + {64, 1024}, {100, 200000}, {cols_per_chunk_ref, cols_per_chunk_ref}, {preconditionner_max_block_size_ref, preconditionner_max_block_size_ref}}) @@ -260,16 +266,32 @@ BENCHMARK(run) ->UseRealTime(); */ /* +// Sweep on nx and cols_per_chunk BENCHMARK(characteristics_advection) ->RangeMultiplier(2) - ->Ranges({{64, 1024}, {100000, 100000}, {64,65535}, {preconditionner_max_block_size_ref, preconditionner_max_block_size_ref}}) - ->MinTime(3)->UseRealTime(); + ->Ranges( + {{non_uniform_ref, non_uniform_ref}, + {degree_x_ref, degree_x_ref}, + {64, 1024}, + {ny_ref, ny_ref}, + {64, 65535}, + {preconditionner_max_block_size_ref, preconditionner_max_block_size_ref}}) + ->MinTime(3) + ->UseRealTime(); */ /* +// Sweep on nx and preconditionne_max_block_size BENCHMARK(characteristics_advection) ->RangeMultiplier(2) - ->Ranges({{64, 1024}, {100000, 100000}, {cols_per_chunk_ref, cols_per_chunk_ref}, {1, 32}}) - ->MinTime(3)->UseRealTime(); + ->Ranges( + {{non_uniform_ref, non_uniform_ref}, + {degree_x_ref, degree_x_ref}, + {64, 1024}, + {ny_ref, ny_ref}, + {cols_per_chunk_ref, cols_per_chunk_ref}, + {1, 32}}) + ->MinTime(3) + ->UseRealTime(); */ int main(int argc, char** argv) From 28fe503aba0f3f881309130f6180cd330de49182 Mon Sep 17 00:00:00 2001 From: blegouix Date: Tue, 18 Jun 2024 15:42:17 +0200 Subject: [PATCH 09/36] sweep on exec_space --- benchmarks/splines.cpp | 90 +++++++++++++++++++++++++++++------------- 1 file changed, 62 insertions(+), 28 deletions(-) diff --git a/benchmarks/splines.cpp b/benchmarks/splines.cpp index 6f694dda8..cbfa4f7ef 100644 --- a/benchmarks/splines.cpp +++ b/benchmarks/splines.cpp @@ -64,13 +64,13 @@ void monitorMemoryAsync(std::mutex& mutex, bool& monitorFlag, size_t& maxUsedMem } } -template +template static void characteristics_advection_unitary(benchmark::State& state) { - std::size_t nx = state.range(2); - std::size_t ny = state.range(3); - int cols_per_chunk = state.range(4); - int preconditionner_max_block_size = state.range(5); + std::size_t nx = state.range(3); + std::size_t ny = state.range(4); + int cols_per_chunk = state.range(5); + int preconditionner_max_block_size = state.range(6); size_t freeMem = 0; size_t totalMem = 0; @@ -119,12 +119,13 @@ static void characteristics_advection_unitary(benchmark::State& state) ddc::BoundCond::PERIODIC>::template get_domain>(); ddc::Chunk density_alloc( ddc::DiscreteDomain, DDimY>(x_domain, y_domain), - ddc::DeviceAllocator()); + ddc::KokkosAllocator()); ddc::ChunkSpan const density = density_alloc.span_view(); // Initialize the density on the main domain ddc::DiscreteDomain, DDimY> x_mesh = ddc::DiscreteDomain, DDimY>(x_domain, y_domain); ddc::parallel_for_each( + ExecSpace(), x_mesh, KOKKOS_LAMBDA(ddc::DiscreteElement, DDimY> const ixy) { double const x = ddc::coordinate(ddc::select>(ixy)); @@ -133,8 +134,8 @@ static void characteristics_advection_unitary(benchmark::State& state) // initial_density(ixy) = 9.999 * ((x * x + y * y) < 0.25); }); ddc::SplineBuilder< - Kokkos::DefaultExecutionSpace, - Kokkos::DefaultExecutionSpace::memory_space, + ExecSpace, + typename ExecSpace::memory_space, BSplinesX, DDimX, ddc::BoundCond::PERIODIC, @@ -145,8 +146,8 @@ static void characteristics_advection_unitary(benchmark::State& state) spline_builder(x_mesh, cols_per_chunk, preconditionner_max_block_size); ddc::PeriodicExtrapolationRule periodic_extrapolation; ddc::SplineEvaluator< - Kokkos::DefaultExecutionSpace, - Kokkos::DefaultExecutionSpace::memory_space, + ExecSpace, + typename ExecSpace::memory_space, BSplinesX, DDimX, ddc::PeriodicExtrapolationRule, @@ -156,18 +157,17 @@ static void characteristics_advection_unitary(benchmark::State& state) spline_evaluator(periodic_extrapolation, periodic_extrapolation); ddc::Chunk coef_alloc( spline_builder.batched_spline_domain(), - ddc::KokkosAllocator()); + ddc::KokkosAllocator()); ddc::ChunkSpan coef = coef_alloc.span_view(); ddc::Chunk feet_coords_alloc( spline_builder.batched_interpolation_domain(), - ddc::KokkosAllocator< - ddc::Coordinate, - Kokkos::DefaultExecutionSpace::memory_space>()); + ddc::KokkosAllocator, typename ExecSpace::memory_space>()); ddc::ChunkSpan feet_coords = feet_coords_alloc.span_view(); for (auto _ : state) { Kokkos::Profiling::pushRegion("FeetCharacteristics"); ddc::parallel_for_each( + ExecSpace(), feet_coords.domain(), KOKKOS_LAMBDA(ddc::DiscreteElement, DDimY> const e) { feet_coords(e) = ddc::Coordinate( @@ -208,20 +208,50 @@ static void characteristics_advection_unitary(benchmark::State& state) static void characteristics_advection(benchmark::State& state) { - // Preallocate 6 unitary benchs for each combination of uniform/non-uniform and spline degree we may want to benchmark (those are determined at compile-time, that's why we need to build explicitely 6 variants of the bench even if we call only one of them) - std::array, 6> benchs; - benchs[0] = characteristics_advection_unitary; - benchs[1] = characteristics_advection_unitary; - benchs[2] = characteristics_advection_unitary; - benchs[3] = characteristics_advection_unitary; - benchs[4] = characteristics_advection_unitary; - benchs[5] = characteristics_advection_unitary; + // Preallocate 12 unitary benchs for each combination of cpu/gpu execution space, uniform/non-uniform and spline degree we may want to benchmark (those are determined at compile-time, that's why we need to build explicitely 12 variants of the bench even if we call only one of them) + std::array, 12> benchs; + benchs[0] = characteristics_advection_unitary< + Kokkos::DefaultHostExecutionSpace, + std::false_type, + 3>; + benchs[1] = characteristics_advection_unitary< + Kokkos::DefaultHostExecutionSpace, + std::false_type, + 4>; + benchs[2] = characteristics_advection_unitary< + Kokkos::DefaultHostExecutionSpace, + std::false_type, + 5>; + benchs[3] = characteristics_advection_unitary< + Kokkos::DefaultHostExecutionSpace, + std::true_type, + 3>; + benchs[4] = characteristics_advection_unitary< + Kokkos::DefaultHostExecutionSpace, + std::true_type, + 4>; + benchs[5] = characteristics_advection_unitary< + Kokkos::DefaultHostExecutionSpace, + std::true_type, + 5>; + benchs[6] + = characteristics_advection_unitary; + benchs[7] + = characteristics_advection_unitary; + benchs[8] + = characteristics_advection_unitary; + benchs[9] = characteristics_advection_unitary; + benchs[10] + = characteristics_advection_unitary; + benchs[11] + = characteristics_advection_unitary; // Run the desired bench - benchs[state.range(0) * 3 + state.range(1) - 3](state); + benchs[state.range(0) * 6 + state.range(1) * 3 + state.range(2) - 3](state); } // Reference parameters: the benchmarks sweep on two parameters and fix all the others according to those reference parameters. +bool on_gpu_ref = true; bool non_uniform_ref = false; std::size_t degree_x_ref = 3; #ifdef KOKKOS_ENABLE_CUDA @@ -243,7 +273,8 @@ std::size_t ny_ref = 100000; BENCHMARK(characteristics_advection) ->RangeMultiplier(2) ->Ranges( - {{0, 1}, + {{false, true}, + {false, true}, {3, 5}, {64, 1024}, {ny_ref, ny_ref}, @@ -256,7 +287,8 @@ BENCHMARK(characteristics_advection) BENCHMARK(run) ->RangeMultiplier(2) ->Ranges( - {{non_uniform_ref, non_uniform_ref}, + {{false, true}, + {non_uniform_ref, non_uniform_ref}, {degree_x_ref, degree_x_ref}, {64, 1024}, {100, 200000}, @@ -270,7 +302,8 @@ BENCHMARK(run) BENCHMARK(characteristics_advection) ->RangeMultiplier(2) ->Ranges( - {{non_uniform_ref, non_uniform_ref}, + {{false, true}, + {non_uniform_ref, non_uniform_ref}, {degree_x_ref, degree_x_ref}, {64, 1024}, {ny_ref, ny_ref}, @@ -280,11 +313,12 @@ BENCHMARK(characteristics_advection) ->UseRealTime(); */ /* -// Sweep on nx and preconditionne_max_block_size +// Sweep on nx and preconditionner_max_block_size BENCHMARK(characteristics_advection) ->RangeMultiplier(2) ->Ranges( - {{non_uniform_ref, non_uniform_ref}, + {{on_gpu_ref, on_gpu_ref}, + {non_uniform_ref, non_uniform_ref}, {degree_x_ref, degree_x_ref}, {64, 1024}, {ny_ref, ny_ref}, From c741315790b42e8986efaff510c28919840239ba Mon Sep 17 00:00:00 2001 From: blegouix Date: Tue, 18 Jun 2024 18:05:54 +0200 Subject: [PATCH 10/36] wip --- benchmarks/splines.cpp | 9 +++++---- benchmarks/splines_plot.py | 30 +++++++++++++++--------------- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/benchmarks/splines.cpp b/benchmarks/splines.cpp index cbfa4f7ef..a1d9aeb79 100644 --- a/benchmarks/splines.cpp +++ b/benchmarks/splines.cpp @@ -270,6 +270,7 @@ unsigned int preconditionner_max_block_size_ref = 32u; std::size_t ny_ref = 100000; // Sweep on uniform/non-uniform and spline order +/* BENCHMARK(characteristics_advection) ->RangeMultiplier(2) ->Ranges( @@ -282,21 +283,21 @@ BENCHMARK(characteristics_advection) {preconditionner_max_block_size_ref, preconditionner_max_block_size_ref}}) ->MinTime(3) ->UseRealTime(); -/* +*/ // Sweep on nx and ny -BENCHMARK(run) +BENCHMARK(characteristics_advection) ->RangeMultiplier(2) ->Ranges( {{false, true}, {non_uniform_ref, non_uniform_ref}, {degree_x_ref, degree_x_ref}, {64, 1024}, - {100, 200000}, + // {100, 200000}, + {100, 2000}, {cols_per_chunk_ref, cols_per_chunk_ref}, {preconditionner_max_block_size_ref, preconditionner_max_block_size_ref}}) ->MinTime(3) ->UseRealTime(); -*/ /* // Sweep on nx and cols_per_chunk BENCHMARK(characteristics_advection) diff --git a/benchmarks/splines_plot.py b/benchmarks/splines_plot.py index 7f3790aa8..32c494ee6 100644 --- a/benchmarks/splines_plot.py +++ b/benchmarks/splines_plot.py @@ -20,16 +20,16 @@ data = json.load(file); # Extract the values at the end of "name" and corresponding "bytes_per_second" -nx_values = sorted(set(int(benchmark["name"].split("/")[1]) for benchmark in data["benchmarks"])) -data_groups = {nx: {"ny": [], "cols_per_chunk": [], "preconditionner_max_block_size": [], "bytes_per_second": [], "gpu_mem_occupancy": []} for nx in nx_values} +nx_values = sorted(set(int(benchmark["name"].split("/")[4]) for benchmark in data["benchmarks"])) +data_groups = {"nx": {nx: {"ny": [], "cols_per_chunk": [], "preconditionner_max_block_size": [], "bytes_per_second": [], "gpu_mem_occupancy": []} for nx in nx_values}} for benchmark in data["benchmarks"]: - nx = int(benchmark["name"].split("/")[1]) - data_groups[nx]["ny"].append(int(benchmark["name"].split("/")[2])) - data_groups[nx]["cols_per_chunk"].append(int(benchmark["name"].split("/")[3])) - data_groups[nx]["preconditionner_max_block_size"].append(int(benchmark["name"].split("/")[4])) - data_groups[nx]["bytes_per_second"].append(benchmark["bytes_per_second"]) - data_groups[nx]["gpu_mem_occupancy"].append(benchmark["gpu_mem_occupancy"]) + nx = int(benchmark["name"].split("/")[4]) + data_groups["nx"][nx]["ny"].append(int(benchmark["name"].split("/")[5])) + data_groups["nx"][nx]["cols_per_chunk"].append(int(benchmark["name"].split("/")[6])) + data_groups["nx"][nx]["preconditionner_max_block_size"].append(int(benchmark["name"].split("/")[7])) + data_groups["nx"][nx]["bytes_per_second"].append(benchmark["bytes_per_second"]) + data_groups["nx"][nx]["gpu_mem_occupancy"].append(benchmark["gpu_mem_occupancy"]) ######## ## ny ## @@ -37,13 +37,13 @@ # Plotting the data for each group plt.figure(figsize=(8, 6)) -for nx, group_data in data_groups.items(): +for nx, group_data in data_groups["nx"].items(): ny = group_data["ny"] throughput = [group_data["bytes_per_second"][i] for i in range(len(ny))] plt.plot(ny, throughput, marker='o', markersize=5, label=f'nx={nx}') x = np.linspace(min(ny), 20*min(ny)) -plt.plot(x, np.mean([data_groups[nx]["bytes_per_second"][0] for nx in nx_values])/min(ny)*x, linestyle='--', color='black', label='perfect scaling') +plt.plot(x, np.mean([data_groups["nx"][nx]["bytes_per_second"][0] for nx in nx_values])/min(ny)*x, linestyle='--', color='black', label='perfect scaling') # Plotting the data plt.grid() @@ -56,7 +56,7 @@ #gpu_mem plt.figure(figsize=(8, 6)) -for nx, group_data in data_groups.items(): +for nx, group_data in data_groups["nx"].items(): ny = [group_data["ny"][i] for i in range(len(group_data["ny"])) if group_data["ny"][i]>=8e3] gpu_mem_overhead = [(group_data["gpu_mem_occupancy"][i]-nx*group_data["ny"][i]*8)/(nx*group_data["ny"][i]*8)*100 for i in range(len(group_data["ny"])) if group_data["ny"][i]>=8e3] plt.plot(ny, gpu_mem_overhead, marker='o', markersize=5, label=f'nx={nx}') @@ -76,13 +76,13 @@ # Plotting the data for each group plt.figure(figsize=(8, 6)) -for nx, group_data in data_groups.items(): +for nx, group_data in data_groups["nx"].items(): cols_per_chunk = group_data["cols_per_chunk"] throughput = [group_data["bytes_per_second"][i] for i in range(len(cols_per_chunk))] plt.plot(cols_per_chunk, throughput, marker='o', markersize=5, label=f'nx={nx}') x = [(int)(data["context"]["cols_per_chunk_ref"]), (int)(data["context"]["cols_per_chunk_ref"])*1.001]; -plt.plot(x, [0.99*min([min(group_data["bytes_per_second"]) for nx, group_data in data_groups.items()]), 1.01*max([max(group_data["bytes_per_second"]) for nx, group_data in data_groups.items()])], linestyle='dotted', color='black', label='reference config') +plt.plot(x, [0.99*min([min(group_data["bytes_per_second"]) for nx, group_data in data_groups["nx"].items()]), 1.01*max([max(group_data["bytes_per_second"]) for nx, group_data in data_groups["nx"].items()])], linestyle='dotted', color='black', label='reference config') # Plotting the data plt.grid() @@ -99,13 +99,13 @@ # Plotting the data for each group plt.figure(figsize=(8, 6)) -for nx, group_data in data_groups.items(): +for nx, group_data in data_groups["nx"].items(): preconditionner_max_block_size = group_data["preconditionner_max_block_size"] throughput = [group_data["bytes_per_second"][i] for i in range(len(preconditionner_max_block_size))] plt.plot(preconditionner_max_block_size, throughput, marker='o', markersize=5, label=f'nx={nx}') x = [(int)(data["context"]["preconditionner_max_block_size_ref"]), (int)(data["context"]["preconditionner_max_block_size_ref"])*1.001]; -plt.plot(x, [0.99*min([min(group_data["bytes_per_second"]) for nx, group_data in data_groups.items()]), 1.01*max([max(group_data["bytes_per_second"]) for nx, group_data in data_groups.items()])], linestyle='dotted', color='black', label='reference config') +plt.plot(x, [0.99*min([min(group_data["bytes_per_second"]) for nx, group_data in data_groups["nx"].items()]), 1.01*max([max(group_data["bytes_per_second"]) for nx, group_data in data_groups["nx"].items()])], linestyle='dotted', color='black', label='reference config') # Plotting the data plt.grid() From 5dba37432a798b2f828c844ac41b3295fdf73a87 Mon Sep 17 00:00:00 2001 From: blegouix Date: Tue, 18 Jun 2024 19:54:07 +0200 Subject: [PATCH 11/36] wip --- benchmarks/splines_plot.py | 47 ++++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/benchmarks/splines_plot.py b/benchmarks/splines_plot.py index 32c494ee6..b70c07c5c 100644 --- a/benchmarks/splines_plot.py +++ b/benchmarks/splines_plot.py @@ -8,6 +8,7 @@ # python3 splines_plot.py /path/to/splines_bench.json import argparse +from operator import itemgetter import matplotlib.pyplot as plt import json import numpy as np @@ -31,21 +32,37 @@ data_groups["nx"][nx]["bytes_per_second"].append(benchmark["bytes_per_second"]) data_groups["nx"][nx]["gpu_mem_occupancy"].append(benchmark["gpu_mem_occupancy"]) +# +data_dict = [{ +"on_gpu": int(benchmark["name"].split("/")[1]), +"nx": int(benchmark["name"].split("/")[4]), +"ny": int(benchmark["name"].split("/")[5]), +"cols_per_chunk": int(benchmark["name"].split("/")[6]), +"preconditionner_max_block_size": int(benchmark["name"].split("/")[7]), +"bytes_per_second": benchmark["bytes_per_second"], +"gpu_mem_occupancy": benchmark["gpu_mem_occupancy"] +} for benchmark in data["benchmarks"]] + + + + +plotter = lambda plt, x_name, y_name, data_dict_sorted, filter : plt.plot([item[x_name] for item in data_dict_sorted if filter(item)], [item[y_name] for item in data_dict_sorted if filter(item)], marker='o', markersize=5, label=f'nx={nx}') + ######## ## ny ## ######## -# Plotting the data for each group +data_dict_sorted = sorted(data_dict, key=itemgetter("nx","ny")) plt.figure(figsize=(8, 6)) -for nx, group_data in data_groups["nx"].items(): - ny = group_data["ny"] - throughput = [group_data["bytes_per_second"][i] for i in range(len(ny))] - plt.plot(ny, throughput, marker='o', markersize=5, label=f'nx={nx}') + +for nx in nx_values: + filter = lambda item : item["nx"]==nx and item["on_gpu"] + plotter(plt, "ny", "bytes_per_second", data_dict_sorted, filter) -x = np.linspace(min(ny), 20*min(ny)) -plt.plot(x, np.mean([data_groups["nx"][nx]["bytes_per_second"][0] for nx in nx_values])/min(ny)*x, linestyle='--', color='black', label='perfect scaling') +ny_min = min([item["ny"] for item in data_dict_sorted if item["on_gpu"]]) +x = np.linspace(ny_min, 20*ny_min) +plt.plot(x, np.mean([item["bytes_per_second"] for item in data_dict_sorted if item["ny"]==ny_min and item["on_gpu"]])/ny_min*x, linestyle='--', color='black', label='perfect scaling') -# Plotting the data plt.grid() plt.xscale("log") plt.xlabel("ny") @@ -54,14 +71,16 @@ plt.legend() plt.savefig("throughput_ny.png") -#gpu_mem +############# +## gpu_mem ## +############# + plt.figure(figsize=(8, 6)) -for nx, group_data in data_groups["nx"].items(): - ny = [group_data["ny"][i] for i in range(len(group_data["ny"])) if group_data["ny"][i]>=8e3] - gpu_mem_overhead = [(group_data["gpu_mem_occupancy"][i]-nx*group_data["ny"][i]*8)/(nx*group_data["ny"][i]*8)*100 for i in range(len(group_data["ny"])) if group_data["ny"][i]>=8e3] - plt.plot(ny, gpu_mem_overhead, marker='o', markersize=5, label=f'nx={nx}') -# Plotting the data +for nx in nx_values: + filter = lambda item : item["nx"]==nx and item["on_gpu"] and item["ny"]>=8e3 + plt.plot([item["ny"] for item in data_dict_sorted if filter(item)], [(item["gpu_mem_occupancy"]-nx*item["ny"]*8)/(nx*item["ny"]*8)*100 for item in data_dict_sorted if filter(item)], marker='o', markersize=5, label=f'nx={nx}') + plt.grid() plt.xscale("log") plt.xlabel("ny") From 93ffde4df1ac4cc362a9dd24a74729c474c3cf28 Mon Sep 17 00:00:00 2001 From: blegouix Date: Tue, 18 Jun 2024 20:08:04 +0200 Subject: [PATCH 12/36] wip --- benchmarks/splines_plot.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/benchmarks/splines_plot.py b/benchmarks/splines_plot.py index b70c07c5c..4a42baf9f 100644 --- a/benchmarks/splines_plot.py +++ b/benchmarks/splines_plot.py @@ -53,8 +53,26 @@ ######## data_dict_sorted = sorted(data_dict, key=itemgetter("nx","ny")) -plt.figure(figsize=(8, 6)) - +plt.figure(figsize=(16, 6)) + +plt.subplot(1, 2, 1) +for nx in nx_values: + filter = lambda item : item["nx"]==nx and not item["on_gpu"] + plotter(plt, "ny", "bytes_per_second", data_dict_sorted, filter) + +ny_min = min([item["ny"] for item in data_dict_sorted if item["on_gpu"]]) +x = np.linspace(ny_min, 20*ny_min) +plt.plot(x, np.mean([item["bytes_per_second"] for item in data_dict_sorted if item["ny"]==ny_min and not item["on_gpu"]])/ny_min*x, linestyle='--', color='black', label='perfect scaling') + +plt.grid() +plt.xscale("log") +plt.xlabel("ny") +plt.ylabel("Throughput [B/s]") +plt.title("Throughput on CPU"); +plt.legend() +plt.savefig("throughput_ny.png") + +plt.subplot(1, 2, 2) for nx in nx_values: filter = lambda item : item["nx"]==nx and item["on_gpu"] plotter(plt, "ny", "bytes_per_second", data_dict_sorted, filter) @@ -67,7 +85,7 @@ plt.xscale("log") plt.xlabel("ny") plt.ylabel("Throughput [B/s]") -plt.title("Throughput on "+str.upper(data["context"]["chip"])); +plt.title("Throughput on GPU"); plt.legend() plt.savefig("throughput_ny.png") From 9310a7068e9217ae3e77d53c5e8f8103ddae0f49 Mon Sep 17 00:00:00 2001 From: blegouix Date: Tue, 18 Jun 2024 20:26:15 +0200 Subject: [PATCH 13/36] wip --- benchmarks/splines.cpp | 7 ++++--- benchmarks/splines_plot.py | 25 ++++++++++--------------- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/benchmarks/splines.cpp b/benchmarks/splines.cpp index a1d9aeb79..74a91fbe1 100644 --- a/benchmarks/splines.cpp +++ b/benchmarks/splines.cpp @@ -284,6 +284,7 @@ BENCHMARK(characteristics_advection) ->MinTime(3) ->UseRealTime(); */ +/* // Sweep on nx and ny BENCHMARK(characteristics_advection) ->RangeMultiplier(2) @@ -298,7 +299,7 @@ BENCHMARK(characteristics_advection) {preconditionner_max_block_size_ref, preconditionner_max_block_size_ref}}) ->MinTime(3) ->UseRealTime(); -/* +*/ // Sweep on nx and cols_per_chunk BENCHMARK(characteristics_advection) ->RangeMultiplier(2) @@ -308,11 +309,11 @@ BENCHMARK(characteristics_advection) {degree_x_ref, degree_x_ref}, {64, 1024}, {ny_ref, ny_ref}, - {64, 65535}, + // {64, 65535}, + {64, 200}, {preconditionner_max_block_size_ref, preconditionner_max_block_size_ref}}) ->MinTime(3) ->UseRealTime(); -*/ /* // Sweep on nx and preconditionner_max_block_size BENCHMARK(characteristics_advection) diff --git a/benchmarks/splines_plot.py b/benchmarks/splines_plot.py index 4a42baf9f..7e2f094e7 100644 --- a/benchmarks/splines_plot.py +++ b/benchmarks/splines_plot.py @@ -57,8 +57,7 @@ plt.subplot(1, 2, 1) for nx in nx_values: - filter = lambda item : item["nx"]==nx and not item["on_gpu"] - plotter(plt, "ny", "bytes_per_second", data_dict_sorted, filter) + plotter(plt, "ny", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and not item["on_gpu"]) ny_min = min([item["ny"] for item in data_dict_sorted if item["on_gpu"]]) x = np.linspace(ny_min, 20*ny_min) @@ -68,14 +67,13 @@ plt.xscale("log") plt.xlabel("ny") plt.ylabel("Throughput [B/s]") -plt.title("Throughput on CPU"); +plt.title("Throughput on CPU") plt.legend() plt.savefig("throughput_ny.png") plt.subplot(1, 2, 2) for nx in nx_values: - filter = lambda item : item["nx"]==nx and item["on_gpu"] - plotter(plt, "ny", "bytes_per_second", data_dict_sorted, filter) + plotter(plt, "ny", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["on_gpu"]) ny_min = min([item["ny"] for item in data_dict_sorted if item["on_gpu"]]) x = np.linspace(ny_min, 20*ny_min) @@ -85,7 +83,7 @@ plt.xscale("log") plt.xlabel("ny") plt.ylabel("Throughput [B/s]") -plt.title("Throughput on GPU"); +plt.title("Throughput on GPU") plt.legend() plt.savefig("throughput_ny.png") @@ -111,22 +109,19 @@ ## cols_per_chunk ## ######################## -# Plotting the data for each group -plt.figure(figsize=(8, 6)) -for nx, group_data in data_groups["nx"].items(): - cols_per_chunk = group_data["cols_per_chunk"] - throughput = [group_data["bytes_per_second"][i] for i in range(len(cols_per_chunk))] - plt.plot(cols_per_chunk, throughput, marker='o', markersize=5, label=f'nx={nx}') +data_dict_sorted = sorted(data_dict, key=itemgetter("nx","cols_per_chunk")) +plt.figure(figsize=(16, 6)) -x = [(int)(data["context"]["cols_per_chunk_ref"]), (int)(data["context"]["cols_per_chunk_ref"])*1.001]; -plt.plot(x, [0.99*min([min(group_data["bytes_per_second"]) for nx, group_data in data_groups["nx"].items()]), 1.01*max([max(group_data["bytes_per_second"]) for nx, group_data in data_groups["nx"].items()])], linestyle='dotted', color='black', label='reference config') +plt.subplot(1, 2, 1) +for nx in nx_values: + plotter(plt, "ny", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and not item["on_gpu"]) # Plotting the data plt.grid() plt.xscale("log") plt.xlabel("cols_per_chunk") plt.ylabel("Throughput [B/s]") -plt.title("Throughput on "+str.upper(data["context"]["chip"])+" (with ny=100000)"); +plt.title("Throughput on CPU (with ny="+[item["ny"] for item in data_dict_sorted if item["on_gpu"]][0]+")"); plt.legend() plt.savefig("throughput_cols.png") From a4d64b653b02d24e3831f4fa6be079b3343c9066 Mon Sep 17 00:00:00 2001 From: blegouix Date: Tue, 18 Jun 2024 20:45:21 +0200 Subject: [PATCH 14/36] wip --- benchmarks/splines_plot.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/benchmarks/splines_plot.py b/benchmarks/splines_plot.py index 7e2f094e7..289514c71 100644 --- a/benchmarks/splines_plot.py +++ b/benchmarks/splines_plot.py @@ -114,14 +114,27 @@ plt.subplot(1, 2, 1) for nx in nx_values: - plotter(plt, "ny", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and not item["on_gpu"]) + plotter(plt, "cols_per_chunk", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and not item["on_gpu"]) + +# Plotting the data +plt.grid() +plt.xscale("log") +plt.xlabel("cols_per_chunk") +plt.ylabel("Throughput [B/s]") +plt.title("Throughput on CPU (with ny="+str([item["ny"] for item in data_dict_sorted if not item["on_gpu"]][0])+")"); +plt.legend() +plt.savefig("throughput_cols.png") + +plt.subplot(1, 2, 2) +for nx in nx_values: + plotter(plt, "cols_per_chunk", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["on_gpu"]) # Plotting the data plt.grid() plt.xscale("log") plt.xlabel("cols_per_chunk") plt.ylabel("Throughput [B/s]") -plt.title("Throughput on CPU (with ny="+[item["ny"] for item in data_dict_sorted if item["on_gpu"]][0]+")"); +plt.title("Throughput on GPU (with ny="+str([item["ny"] for item in data_dict_sorted if item["on_gpu"]][0])+")"); plt.legend() plt.savefig("throughput_cols.png") From 0692da111ee2e7f4a32f1665ceea48a61227f8e4 Mon Sep 17 00:00:00 2001 From: blegouix Date: Tue, 18 Jun 2024 21:04:12 +0200 Subject: [PATCH 15/36] restructurate plotter --- benchmarks/splines.cpp | 13 ++++----- benchmarks/splines_plot.py | 60 +++++++++++++++++--------------------- 2 files changed, 33 insertions(+), 40 deletions(-) diff --git a/benchmarks/splines.cpp b/benchmarks/splines.cpp index 74a91fbe1..39bce9243 100644 --- a/benchmarks/splines.cpp +++ b/benchmarks/splines.cpp @@ -267,7 +267,8 @@ std::string chip = "cpu"; std::size_t cols_per_chunk_ref = 8192; unsigned int preconditionner_max_block_size_ref = 32u; #endif -std::size_t ny_ref = 100000; +// std::size_t ny_ref = 100000; +std::size_t ny_ref = 1000; // Sweep on uniform/non-uniform and spline order /* @@ -293,13 +294,13 @@ BENCHMARK(characteristics_advection) {non_uniform_ref, non_uniform_ref}, {degree_x_ref, degree_x_ref}, {64, 1024}, - // {100, 200000}, - {100, 2000}, + {100, 200000}, {cols_per_chunk_ref, cols_per_chunk_ref}, {preconditionner_max_block_size_ref, preconditionner_max_block_size_ref}}) ->MinTime(3) ->UseRealTime(); */ +/* // Sweep on nx and cols_per_chunk BENCHMARK(characteristics_advection) ->RangeMultiplier(2) @@ -309,12 +310,11 @@ BENCHMARK(characteristics_advection) {degree_x_ref, degree_x_ref}, {64, 1024}, {ny_ref, ny_ref}, - // {64, 65535}, - {64, 200}, + {64, 65535}, {preconditionner_max_block_size_ref, preconditionner_max_block_size_ref}}) ->MinTime(3) ->UseRealTime(); -/* +*/ // Sweep on nx and preconditionner_max_block_size BENCHMARK(characteristics_advection) ->RangeMultiplier(2) @@ -328,7 +328,6 @@ BENCHMARK(characteristics_advection) {1, 32}}) ->MinTime(3) ->UseRealTime(); -*/ int main(int argc, char** argv) { diff --git a/benchmarks/splines_plot.py b/benchmarks/splines_plot.py index 289514c71..43fea3bef 100644 --- a/benchmarks/splines_plot.py +++ b/benchmarks/splines_plot.py @@ -20,19 +20,7 @@ with open(args.json_file, 'r') as file: data = json.load(file); -# Extract the values at the end of "name" and corresponding "bytes_per_second" nx_values = sorted(set(int(benchmark["name"].split("/")[4]) for benchmark in data["benchmarks"])) -data_groups = {"nx": {nx: {"ny": [], "cols_per_chunk": [], "preconditionner_max_block_size": [], "bytes_per_second": [], "gpu_mem_occupancy": []} for nx in nx_values}} - -for benchmark in data["benchmarks"]: - nx = int(benchmark["name"].split("/")[4]) - data_groups["nx"][nx]["ny"].append(int(benchmark["name"].split("/")[5])) - data_groups["nx"][nx]["cols_per_chunk"].append(int(benchmark["name"].split("/")[6])) - data_groups["nx"][nx]["preconditionner_max_block_size"].append(int(benchmark["name"].split("/")[7])) - data_groups["nx"][nx]["bytes_per_second"].append(benchmark["bytes_per_second"]) - data_groups["nx"][nx]["gpu_mem_occupancy"].append(benchmark["gpu_mem_occupancy"]) - -# data_dict = [{ "on_gpu": int(benchmark["name"].split("/")[1]), "nx": int(benchmark["name"].split("/")[4]), @@ -43,9 +31,6 @@ "gpu_mem_occupancy": benchmark["gpu_mem_occupancy"] } for benchmark in data["benchmarks"]] - - - plotter = lambda plt, x_name, y_name, data_dict_sorted, filter : plt.plot([item[x_name] for item in data_dict_sorted if filter(item)], [item[y_name] for item in data_dict_sorted if filter(item)], marker='o', markersize=5, label=f'nx={nx}') ######## @@ -60,8 +45,9 @@ plotter(plt, "ny", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and not item["on_gpu"]) ny_min = min([item["ny"] for item in data_dict_sorted if item["on_gpu"]]) -x = np.linspace(ny_min, 20*ny_min) -plt.plot(x, np.mean([item["bytes_per_second"] for item in data_dict_sorted if item["ny"]==ny_min and not item["on_gpu"]])/ny_min*x, linestyle='--', color='black', label='perfect scaling') +if len([item for item in data_dict_sorted if item["ny"]==ny_min and not item["on_gpu"]]) != 0: + x = np.linspace(ny_min, 20*ny_min) + plt.plot(x, np.mean([item["bytes_per_second"] for item in data_dict_sorted if item["ny"]==ny_min and not item["on_gpu"]])/ny_min*x, linestyle='--', color='black', label='perfect scaling') plt.grid() plt.xscale("log") @@ -69,15 +55,15 @@ plt.ylabel("Throughput [B/s]") plt.title("Throughput on CPU") plt.legend() -plt.savefig("throughput_ny.png") plt.subplot(1, 2, 2) for nx in nx_values: plotter(plt, "ny", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["on_gpu"]) ny_min = min([item["ny"] for item in data_dict_sorted if item["on_gpu"]]) -x = np.linspace(ny_min, 20*ny_min) -plt.plot(x, np.mean([item["bytes_per_second"] for item in data_dict_sorted if item["ny"]==ny_min and item["on_gpu"]])/ny_min*x, linestyle='--', color='black', label='perfect scaling') +if len([item for item in data_dict_sorted if item["ny"]==ny_min and item["on_gpu"]]) != 0: + x = np.linspace(ny_min, 20*ny_min) + plt.plot(x, np.mean([item["bytes_per_second"] for item in data_dict_sorted if item["ny"]==ny_min and item["on_gpu"]])/ny_min*x, linestyle='--', color='black', label='perfect scaling') plt.grid() plt.xscale("log") @@ -121,9 +107,8 @@ plt.xscale("log") plt.xlabel("cols_per_chunk") plt.ylabel("Throughput [B/s]") -plt.title("Throughput on CPU (with ny="+str([item["ny"] for item in data_dict_sorted if not item["on_gpu"]][0])+")"); +plt.title("Throughput on CPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")"); plt.legend() -plt.savefig("throughput_cols.png") plt.subplot(1, 2, 2) for nx in nx_values: @@ -134,7 +119,7 @@ plt.xscale("log") plt.xlabel("cols_per_chunk") plt.ylabel("Throughput [B/s]") -plt.title("Throughput on GPU (with ny="+str([item["ny"] for item in data_dict_sorted if item["on_gpu"]][0])+")"); +plt.title("Throughput on GPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")"); plt.legend() plt.savefig("throughput_cols.png") @@ -142,23 +127,32 @@ ## preconditionner ## ##################### -# Plotting the data for each group -plt.figure(figsize=(8, 6)) -for nx, group_data in data_groups["nx"].items(): - preconditionner_max_block_size = group_data["preconditionner_max_block_size"] - throughput = [group_data["bytes_per_second"][i] for i in range(len(preconditionner_max_block_size))] - plt.plot(preconditionner_max_block_size, throughput, marker='o', markersize=5, label=f'nx={nx}') +data_dict_sorted = sorted(data_dict, key=itemgetter("nx","cols_per_chunk")) +plt.figure(figsize=(16, 6)) -x = [(int)(data["context"]["preconditionner_max_block_size_ref"]), (int)(data["context"]["preconditionner_max_block_size_ref"])*1.001]; -plt.plot(x, [0.99*min([min(group_data["bytes_per_second"]) for nx, group_data in data_groups["nx"].items()]), 1.01*max([max(group_data["bytes_per_second"]) for nx, group_data in data_groups["nx"].items()])], linestyle='dotted', color='black', label='reference config') +plt.subplot(1, 2, 1) +for nx in nx_values: + plotter(plt, "preconditionner_max_block_size", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and not item["on_gpu"]) # Plotting the data plt.grid() plt.xscale("log") plt.xlabel("preconditionner_max_block_size") plt.ylabel("Throughput [B/s]") -plt.title("Throughput on "+str.upper(data["context"]["chip"])+" (with ny=100000)"); +plt.title("Throughput on CPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")"); +plt.legend() + +plt.subplot(1, 2, 2) +for nx in nx_values: + plotter(plt, "preconditionner_max_block_size", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["on_gpu"]) + +# Plotting the data +plt.grid() +plt.xscale("log") +plt.xlabel("cols_per_chunk") +plt.ylabel("Throughput [B/s]") +plt.title("Throughput on GPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")"); plt.legend() -plt.savefig("throughput_precond.png") +plt.savefig("throughput_cols.png") plt.close(); From 21c4516a02e703314f44ed7d3d3b0b35cdecfa22 Mon Sep 17 00:00:00 2001 From: blegouix Date: Tue, 18 Jun 2024 21:38:42 +0200 Subject: [PATCH 16/36] add degree and uniformity --- benchmarks/splines.cpp | 4 ++-- benchmarks/splines_plot.py | 48 +++++++++++++++++++++++++++++++------- 2 files changed, 41 insertions(+), 11 deletions(-) diff --git a/benchmarks/splines.cpp b/benchmarks/splines.cpp index 39bce9243..9c766cae4 100644 --- a/benchmarks/splines.cpp +++ b/benchmarks/splines.cpp @@ -271,7 +271,6 @@ unsigned int preconditionner_max_block_size_ref = 32u; std::size_t ny_ref = 1000; // Sweep on uniform/non-uniform and spline order -/* BENCHMARK(characteristics_advection) ->RangeMultiplier(2) ->Ranges( @@ -284,7 +283,6 @@ BENCHMARK(characteristics_advection) {preconditionner_max_block_size_ref, preconditionner_max_block_size_ref}}) ->MinTime(3) ->UseRealTime(); -*/ /* // Sweep on nx and ny BENCHMARK(characteristics_advection) @@ -315,6 +313,7 @@ BENCHMARK(characteristics_advection) ->MinTime(3) ->UseRealTime(); */ +/* // Sweep on nx and preconditionner_max_block_size BENCHMARK(characteristics_advection) ->RangeMultiplier(2) @@ -328,6 +327,7 @@ BENCHMARK(characteristics_advection) {1, 32}}) ->MinTime(3) ->UseRealTime(); +*/ int main(int argc, char** argv) { diff --git a/benchmarks/splines_plot.py b/benchmarks/splines_plot.py index 43fea3bef..0bc00aaf8 100644 --- a/benchmarks/splines_plot.py +++ b/benchmarks/splines_plot.py @@ -23,6 +23,8 @@ nx_values = sorted(set(int(benchmark["name"].split("/")[4]) for benchmark in data["benchmarks"])) data_dict = [{ "on_gpu": int(benchmark["name"].split("/")[1]), +"non_uniform": int(benchmark["name"].split("/")[2]), +"degree_x": int(benchmark["name"].split("/")[3]), "nx": int(benchmark["name"].split("/")[4]), "ny": int(benchmark["name"].split("/")[5]), "cols_per_chunk": int(benchmark["name"].split("/")[6]), @@ -31,7 +33,39 @@ "gpu_mem_occupancy": benchmark["gpu_mem_occupancy"] } for benchmark in data["benchmarks"]] -plotter = lambda plt, x_name, y_name, data_dict_sorted, filter : plt.plot([item[x_name] for item in data_dict_sorted if filter(item)], [item[y_name] for item in data_dict_sorted if filter(item)], marker='o', markersize=5, label=f'nx={nx}') +plotter = lambda plt, x_name, y_name, data_dict_sorted, filter : plt.plot([item[x_name] for item in data_dict_sorted if filter(item)], [item[y_name] for item in data_dict_sorted if filter(item)], marker='o', markersize=5, label=f"{'non_uniform' if any(filter(item) and item['non_uniform'] for item in data_dict_sorted) else 'uniform'} nx={nx}") + +############################# +## non_uniform && degree_x ## +############################# + +data_dict_sorted = sorted(data_dict, key=itemgetter("nx", "non_uniform", "degree_x")) +plt.figure(figsize=(16, 6)) + +plt.subplot(1, 2, 1) +for non_uniform in (False,True): + for nx in nx_values: + plotter(plt, "degree_x", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["non_uniform"]==non_uniform and not item["on_gpu"]) + +plt.grid() +plt.xscale("log") +plt.xlabel("degree_x") +plt.ylabel("Throughput [B/s]") +plt.title("Throughput on CPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") +plt.legend() + +plt.subplot(1, 2, 2) +for non_uniform in (False,True): + for nx in nx_values: + plotter(plt, "degree_x", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["non_uniform"]==non_uniform and item["on_gpu"]) + +plt.grid() +plt.xscale("log") +plt.xlabel("degree_x") +plt.ylabel("Throughput [B/s]") +plt.title("Throughput on GPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") +plt.legend() +plt.savefig("throughput_uniformity_degree_x.png") ######## ## ny ## @@ -102,24 +136,22 @@ for nx in nx_values: plotter(plt, "cols_per_chunk", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and not item["on_gpu"]) -# Plotting the data plt.grid() plt.xscale("log") plt.xlabel("cols_per_chunk") plt.ylabel("Throughput [B/s]") -plt.title("Throughput on CPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")"); +plt.title("Throughput on CPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") plt.legend() plt.subplot(1, 2, 2) for nx in nx_values: plotter(plt, "cols_per_chunk", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["on_gpu"]) -# Plotting the data plt.grid() plt.xscale("log") plt.xlabel("cols_per_chunk") plt.ylabel("Throughput [B/s]") -plt.title("Throughput on GPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")"); +plt.title("Throughput on GPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") plt.legend() plt.savefig("throughput_cols.png") @@ -134,24 +166,22 @@ for nx in nx_values: plotter(plt, "preconditionner_max_block_size", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and not item["on_gpu"]) -# Plotting the data plt.grid() plt.xscale("log") plt.xlabel("preconditionner_max_block_size") plt.ylabel("Throughput [B/s]") -plt.title("Throughput on CPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")"); +plt.title("Throughput on CPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") plt.legend() plt.subplot(1, 2, 2) for nx in nx_values: plotter(plt, "preconditionner_max_block_size", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["on_gpu"]) -# Plotting the data plt.grid() plt.xscale("log") plt.xlabel("cols_per_chunk") plt.ylabel("Throughput [B/s]") -plt.title("Throughput on GPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")"); +plt.title("Throughput on GPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") plt.legend() plt.savefig("throughput_cols.png") From d6eaac3204bab93cc73965895b80cdf7ca563b08 Mon Sep 17 00:00:00 2001 From: blegouix Date: Tue, 18 Jun 2024 21:51:04 +0200 Subject: [PATCH 17/36] finish plotter restructurateion --- benchmarks/splines_plot.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/benchmarks/splines_plot.py b/benchmarks/splines_plot.py index 0bc00aaf8..dbb84b847 100644 --- a/benchmarks/splines_plot.py +++ b/benchmarks/splines_plot.py @@ -33,7 +33,7 @@ "gpu_mem_occupancy": benchmark["gpu_mem_occupancy"] } for benchmark in data["benchmarks"]] -plotter = lambda plt, x_name, y_name, data_dict_sorted, filter : plt.plot([item[x_name] for item in data_dict_sorted if filter(item)], [item[y_name] for item in data_dict_sorted if filter(item)], marker='o', markersize=5, label=f"{'non_uniform' if any(filter(item) and item['non_uniform'] for item in data_dict_sorted) else 'uniform'} nx={nx}") +plotter = lambda plt, x_name, y_name, data_dict_sorted, filter : plt.plot([item[x_name] for item in data_dict_sorted if filter(item)], [item[y_name] for item in data_dict_sorted if filter(item)], marker='o', markersize=5, label=f"{'non uniform' if any(filter(item) and item['non_uniform'] for item in data_dict_sorted) else 'uniform'} nx={nx}") ############################# ## non_uniform && degree_x ## @@ -48,8 +48,8 @@ plotter(plt, "degree_x", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["non_uniform"]==non_uniform and not item["on_gpu"]) plt.grid() -plt.xscale("log") -plt.xlabel("degree_x") +plt.xscale("linear") +plt.xlabel("Splines degree") plt.ylabel("Throughput [B/s]") plt.title("Throughput on CPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") plt.legend() @@ -60,8 +60,8 @@ plotter(plt, "degree_x", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["non_uniform"]==non_uniform and item["on_gpu"]) plt.grid() -plt.xscale("log") -plt.xlabel("degree_x") +plt.xscale("linear") +plt.xlabel("Splines degree") plt.ylabel("Throughput [B/s]") plt.title("Throughput on GPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") plt.legend() @@ -85,7 +85,7 @@ plt.grid() plt.xscale("log") -plt.xlabel("ny") +plt.xlabel("Batch size") plt.ylabel("Throughput [B/s]") plt.title("Throughput on CPU") plt.legend() @@ -101,7 +101,7 @@ plt.grid() plt.xscale("log") -plt.xlabel("ny") +plt.xlabel("Batch size") plt.ylabel("Throughput [B/s]") plt.title("Throughput on GPU") plt.legend() @@ -119,7 +119,7 @@ plt.grid() plt.xscale("log") -plt.xlabel("ny") +plt.xlabel("Batch size") plt.ylabel("Relative memory overhead [%]") plt.title("Relative memory occupancy overhead") plt.legend() @@ -138,7 +138,7 @@ plt.grid() plt.xscale("log") -plt.xlabel("cols_per_chunk") +plt.xlabel("Number of right_hand sides per chunk") plt.ylabel("Throughput [B/s]") plt.title("Throughput on CPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") plt.legend() @@ -149,7 +149,7 @@ plt.grid() plt.xscale("log") -plt.xlabel("cols_per_chunk") +plt.xlabel("Number of right_hand sides per chunk") plt.ylabel("Throughput [B/s]") plt.title("Throughput on GPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") plt.legend() @@ -168,7 +168,7 @@ plt.grid() plt.xscale("log") -plt.xlabel("preconditionner_max_block_size") +plt.xlabel("Max block size of preconditioner") plt.ylabel("Throughput [B/s]") plt.title("Throughput on CPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") plt.legend() @@ -179,10 +179,10 @@ plt.grid() plt.xscale("log") -plt.xlabel("cols_per_chunk") +plt.xlabel("Max block size of preconditioner") plt.ylabel("Throughput [B/s]") plt.title("Throughput on GPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") plt.legend() -plt.savefig("throughput_cols.png") +plt.savefig("throughput_precond.png") plt.close(); From 3232c76e93c663f3839d1d8e74310baacad3287a Mon Sep 17 00:00:00 2001 From: blegouix Date: Wed, 19 Jun 2024 11:19:36 +0200 Subject: [PATCH 18/36] minor --- benchmarks/splines_plot.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/splines_plot.py b/benchmarks/splines_plot.py index dbb84b847..5bac84cc7 100644 --- a/benchmarks/splines_plot.py +++ b/benchmarks/splines_plot.py @@ -85,7 +85,7 @@ plt.grid() plt.xscale("log") -plt.xlabel("Batch size") +plt.xlabel("ny (batch size)") plt.ylabel("Throughput [B/s]") plt.title("Throughput on CPU") plt.legend() @@ -101,7 +101,7 @@ plt.grid() plt.xscale("log") -plt.xlabel("Batch size") +plt.xlabel("ny (batch size)") plt.ylabel("Throughput [B/s]") plt.title("Throughput on GPU") plt.legend() @@ -119,7 +119,7 @@ plt.grid() plt.xscale("log") -plt.xlabel("Batch size") +plt.xlabel("ny (batch size)") plt.ylabel("Relative memory overhead [%]") plt.title("Relative memory occupancy overhead") plt.legend() From d02968826b3c62b078b42c6bf60b1613f55b4d40 Mon Sep 17 00:00:00 2001 From: blegouix Date: Thu, 20 Jun 2024 10:51:35 +0200 Subject: [PATCH 19/36] wip --- benchmarks/splines.cpp | 12 +++++++----- benchmarks/splines_plot.py | 21 +++++++++++---------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/benchmarks/splines.cpp b/benchmarks/splines.cpp index 9c766cae4..d29138a85 100644 --- a/benchmarks/splines.cpp +++ b/benchmarks/splines.cpp @@ -14,6 +14,8 @@ #include +static const ddc::SplineSolver Backend = ddc::SplineSolver::GINKGO; + namespace DDC_HIP_5_7_ANONYMOUS_NAMESPACE_WORKAROUND(SPLINES_CPP) { struct X @@ -29,17 +31,20 @@ namespace DDC_HIP_5_7_ANONYMOUS_NAMESPACE_WORKAROUND(SPLINES_CPP) ddc::UniformBSplines> { }; + template using GrevillePoints = ddc::GrevilleInterpolationPoints< BSplinesX, ddc::BoundCond::PERIODIC, ddc::BoundCond::PERIODIC>; + template struct DDimX : GrevillePoints::interpolation_mesh_type { }; struct Y; + struct DDimY : ddc::UniformPointSampling { }; @@ -140,7 +145,7 @@ static void characteristics_advection_unitary(benchmark::State& state) DDimX, ddc::BoundCond::PERIODIC, ddc::BoundCond::PERIODIC, - ddc::SplineSolver::GINKGO, + Backend, DDimX, DDimY> spline_builder(x_mesh, cols_per_chunk, preconditionner_max_block_size); @@ -255,15 +260,12 @@ bool on_gpu_ref = true; bool non_uniform_ref = false; std::size_t degree_x_ref = 3; #ifdef KOKKOS_ENABLE_CUDA -std::string chip = "gpu"; std::size_t cols_per_chunk_ref = 65535; unsigned int preconditionner_max_block_size_ref = 1u; #elif defined(KOKKOS_ENABLE_OPENMP) -std::string chip = "cpu"; std::size_t cols_per_chunk_ref = 8192; unsigned int preconditionner_max_block_size_ref = 1u; #elif defined(KOKKOS_ENABLE_SERIAL) -std::string chip = "cpu"; std::size_t cols_per_chunk_ref = 8192; unsigned int preconditionner_max_block_size_ref = 32u; #endif @@ -332,7 +334,7 @@ BENCHMARK(characteristics_advection) int main(int argc, char** argv) { ::benchmark::Initialize(&argc, argv); - ::benchmark::AddCustomContext("chip", chip); + ::benchmark::AddCustomContext("backend", "Ginkgo"); ::benchmark::AddCustomContext("cols_per_chunk_ref", std::to_string(cols_per_chunk_ref)); ::benchmark::AddCustomContext( "preconditionner_max_block_size_ref", diff --git a/benchmarks/splines_plot.py b/benchmarks/splines_plot.py index 5bac84cc7..add97f53d 100644 --- a/benchmarks/splines_plot.py +++ b/benchmarks/splines_plot.py @@ -20,6 +20,7 @@ with open(args.json_file, 'r') as file: data = json.load(file); +backend = data["context"]["backend"] nx_values = sorted(set(int(benchmark["name"].split("/")[4]) for benchmark in data["benchmarks"])) data_dict = [{ "on_gpu": int(benchmark["name"].split("/")[1]), @@ -51,7 +52,7 @@ plt.xscale("linear") plt.xlabel("Splines degree") plt.ylabel("Throughput [B/s]") -plt.title("Throughput on CPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") +plt.title(str(backend)+": Throughput on CPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") plt.legend() plt.subplot(1, 2, 2) @@ -63,7 +64,7 @@ plt.xscale("linear") plt.xlabel("Splines degree") plt.ylabel("Throughput [B/s]") -plt.title("Throughput on GPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") +plt.title(str(backend)+": Throughput on GPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") plt.legend() plt.savefig("throughput_uniformity_degree_x.png") @@ -87,7 +88,7 @@ plt.xscale("log") plt.xlabel("ny (batch size)") plt.ylabel("Throughput [B/s]") -plt.title("Throughput on CPU") +plt.title(str(backend)+": Throughput on CPU") plt.legend() plt.subplot(1, 2, 2) @@ -103,7 +104,7 @@ plt.xscale("log") plt.xlabel("ny (batch size)") plt.ylabel("Throughput [B/s]") -plt.title("Throughput on GPU") +plt.title(str(backend)+": Throughput on GPU") plt.legend() plt.savefig("throughput_ny.png") @@ -120,8 +121,8 @@ plt.grid() plt.xscale("log") plt.xlabel("ny (batch size)") -plt.ylabel("Relative memory overhead [%]") -plt.title("Relative memory occupancy overhead") +plt.ylabel("Relative GPU memory overhead [%]") +plt.title(str(backend)+": Relative GPU memory occupancy overhead (100%=nx*ny*8 Bytes)") plt.legend() plt.savefig("gpu_mem_occupancy.png") @@ -140,7 +141,7 @@ plt.xscale("log") plt.xlabel("Number of right_hand sides per chunk") plt.ylabel("Throughput [B/s]") -plt.title("Throughput on CPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") +plt.title(str(backend)+": Throughput on CPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") plt.legend() plt.subplot(1, 2, 2) @@ -151,7 +152,7 @@ plt.xscale("log") plt.xlabel("Number of right_hand sides per chunk") plt.ylabel("Throughput [B/s]") -plt.title("Throughput on GPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") +plt.title(str(backend)+": Throughput on GPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") plt.legend() plt.savefig("throughput_cols.png") @@ -170,7 +171,7 @@ plt.xscale("log") plt.xlabel("Max block size of preconditioner") plt.ylabel("Throughput [B/s]") -plt.title("Throughput on CPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") +plt.title(str(backend)+": Throughput on CPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") plt.legend() plt.subplot(1, 2, 2) @@ -181,7 +182,7 @@ plt.xscale("log") plt.xlabel("Max block size of preconditioner") plt.ylabel("Throughput [B/s]") -plt.title("Throughput on GPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") +plt.title(str(backend)+": Throughput on GPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") plt.legend() plt.savefig("throughput_precond.png") From d4fa4283744e536dac92a055f574e66b7abaf3ad Mon Sep 17 00:00:00 2001 From: blegouix Date: Thu, 20 Jun 2024 11:10:44 +0200 Subject: [PATCH 20/36] wip --- benchmarks/splines.cpp | 4 ++-- benchmarks/splines_plot.py | 23 +++++++++++------------ 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/benchmarks/splines.cpp b/benchmarks/splines.cpp index d29138a85..f2d17dd46 100644 --- a/benchmarks/splines.cpp +++ b/benchmarks/splines.cpp @@ -272,6 +272,7 @@ unsigned int preconditionner_max_block_size_ref = 32u; // std::size_t ny_ref = 100000; std::size_t ny_ref = 1000; +/* // Sweep on uniform/non-uniform and spline order BENCHMARK(characteristics_advection) ->RangeMultiplier(2) @@ -285,7 +286,7 @@ BENCHMARK(characteristics_advection) {preconditionner_max_block_size_ref, preconditionner_max_block_size_ref}}) ->MinTime(3) ->UseRealTime(); -/* +*/ // Sweep on nx and ny BENCHMARK(characteristics_advection) ->RangeMultiplier(2) @@ -299,7 +300,6 @@ BENCHMARK(characteristics_advection) {preconditionner_max_block_size_ref, preconditionner_max_block_size_ref}}) ->MinTime(3) ->UseRealTime(); -*/ /* // Sweep on nx and cols_per_chunk BENCHMARK(characteristics_advection) diff --git a/benchmarks/splines_plot.py b/benchmarks/splines_plot.py index add97f53d..91c5003c6 100644 --- a/benchmarks/splines_plot.py +++ b/benchmarks/splines_plot.py @@ -9,6 +9,7 @@ import argparse from operator import itemgetter +import itertools import matplotlib.pyplot as plt import json import numpy as np @@ -44,8 +45,7 @@ plt.figure(figsize=(16, 6)) plt.subplot(1, 2, 1) -for non_uniform in (False,True): - for nx in nx_values: +for (non_uniform, nx) in itertools.product((False,True), nx_values): plotter(plt, "degree_x", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["non_uniform"]==non_uniform and not item["on_gpu"]) plt.grid() @@ -56,8 +56,7 @@ plt.legend() plt.subplot(1, 2, 2) -for non_uniform in (False,True): - for nx in nx_values: +for (non_uniform, nx) in itertools.product((False,True), nx_values): plotter(plt, "degree_x", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["non_uniform"]==non_uniform and item["on_gpu"]) plt.grid() @@ -76,7 +75,7 @@ plt.figure(figsize=(16, 6)) plt.subplot(1, 2, 1) -for nx in nx_values: +for (non_uniform, nx) in itertools.product((False,True), nx_values): plotter(plt, "ny", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and not item["on_gpu"]) ny_min = min([item["ny"] for item in data_dict_sorted if item["on_gpu"]]) @@ -92,7 +91,7 @@ plt.legend() plt.subplot(1, 2, 2) -for nx in nx_values: +for (non_uniform, nx) in itertools.product((False,True), nx_values): plotter(plt, "ny", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["on_gpu"]) ny_min = min([item["ny"] for item in data_dict_sorted if item["on_gpu"]]) @@ -114,9 +113,9 @@ plt.figure(figsize=(8, 6)) -for nx in nx_values: +for (non_uniform, nx) in itertools.product((False,True), nx_values): filter = lambda item : item["nx"]==nx and item["on_gpu"] and item["ny"]>=8e3 - plt.plot([item["ny"] for item in data_dict_sorted if filter(item)], [(item["gpu_mem_occupancy"]-nx*item["ny"]*8)/(nx*item["ny"]*8)*100 for item in data_dict_sorted if filter(item)], marker='o', markersize=5, label=f'nx={nx}') + plt.plot([item["ny"] for item in data_dict_sorted if filter(item)], [(item["gpu_mem_occupancy"]-nx*item["ny"]*8)/(nx*item["ny"]*8)*100 for item in data_dict_sorted if filter(item)], marker='o', markersize=5, label=f"{'non uniform' if any(filter(item) and item['non_uniform'] for item in data_dict_sorted) else 'uniform'} nx={nx}") plt.grid() plt.xscale("log") @@ -134,7 +133,7 @@ plt.figure(figsize=(16, 6)) plt.subplot(1, 2, 1) -for nx in nx_values: +for (non_uniform, nx) in itertools.product((False,True), nx_values): plotter(plt, "cols_per_chunk", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and not item["on_gpu"]) plt.grid() @@ -145,7 +144,7 @@ plt.legend() plt.subplot(1, 2, 2) -for nx in nx_values: +for (non_uniform, nx) in itertools.product((False,True), nx_values): plotter(plt, "cols_per_chunk", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["on_gpu"]) plt.grid() @@ -164,7 +163,7 @@ plt.figure(figsize=(16, 6)) plt.subplot(1, 2, 1) -for nx in nx_values: +for (non_uniform, nx) in itertools.product((False,True), nx_values): plotter(plt, "preconditionner_max_block_size", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and not item["on_gpu"]) plt.grid() @@ -175,7 +174,7 @@ plt.legend() plt.subplot(1, 2, 2) -for nx in nx_values: +for (non_uniform, nx) in itertools.product((False,True), nx_values): plotter(plt, "preconditionner_max_block_size", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["on_gpu"]) plt.grid() From 2a4977fba3faaacded88a9502f8929b98e9f3807 Mon Sep 17 00:00:00 2001 From: blegouix Date: Thu, 20 Jun 2024 13:55:24 +0200 Subject: [PATCH 21/36] misc --- benchmarks/splines.cpp | 9 +++++---- benchmarks/splines_plot.py | 14 +++++++------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/benchmarks/splines.cpp b/benchmarks/splines.cpp index f2d17dd46..ef557cd5b 100644 --- a/benchmarks/splines.cpp +++ b/benchmarks/splines.cpp @@ -57,7 +57,8 @@ void monitorMemoryAsync(std::mutex& mutex, bool& monitorFlag, size_t& maxUsedMem size_t freeMem = 0; size_t totalMem = 0; while (monitorFlag) { - std::this_thread::sleep_for(std::chrono::milliseconds(1)); // Adjust the interval as needed + std::this_thread::sleep_for( + std::chrono::microseconds(100)); // Adjust the interval as needed // Acquire a lock to ensure thread safety when accessing CUDA functions std::lock_guard lock(mutex); @@ -292,7 +293,7 @@ BENCHMARK(characteristics_advection) ->RangeMultiplier(2) ->Ranges( {{false, true}, - {non_uniform_ref, non_uniform_ref}, + {false, true}, {degree_x_ref, degree_x_ref}, {64, 1024}, {100, 200000}, @@ -306,7 +307,7 @@ BENCHMARK(characteristics_advection) ->RangeMultiplier(2) ->Ranges( {{false, true}, - {non_uniform_ref, non_uniform_ref}, + {false, true}, {degree_x_ref, degree_x_ref}, {64, 1024}, {ny_ref, ny_ref}, @@ -321,7 +322,7 @@ BENCHMARK(characteristics_advection) ->RangeMultiplier(2) ->Ranges( {{on_gpu_ref, on_gpu_ref}, - {non_uniform_ref, non_uniform_ref}, + {false, true}, {degree_x_ref, degree_x_ref}, {64, 1024}, {ny_ref, ny_ref}, diff --git a/benchmarks/splines_plot.py b/benchmarks/splines_plot.py index 91c5003c6..a5dbc9b4c 100644 --- a/benchmarks/splines_plot.py +++ b/benchmarks/splines_plot.py @@ -76,7 +76,7 @@ plt.subplot(1, 2, 1) for (non_uniform, nx) in itertools.product((False,True), nx_values): - plotter(plt, "ny", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and not item["on_gpu"]) + plotter(plt, "ny", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["non_uniform"]==non_uniform and not item["on_gpu"]) ny_min = min([item["ny"] for item in data_dict_sorted if item["on_gpu"]]) if len([item for item in data_dict_sorted if item["ny"]==ny_min and not item["on_gpu"]]) != 0: @@ -92,7 +92,7 @@ plt.subplot(1, 2, 2) for (non_uniform, nx) in itertools.product((False,True), nx_values): - plotter(plt, "ny", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["on_gpu"]) + plotter(plt, "ny", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["non_uniform"]==non_uniform and item["on_gpu"]) ny_min = min([item["ny"] for item in data_dict_sorted if item["on_gpu"]]) if len([item for item in data_dict_sorted if item["ny"]==ny_min and item["on_gpu"]]) != 0: @@ -114,7 +114,7 @@ plt.figure(figsize=(8, 6)) for (non_uniform, nx) in itertools.product((False,True), nx_values): - filter = lambda item : item["nx"]==nx and item["on_gpu"] and item["ny"]>=8e3 + filter = lambda item : item["nx"]==nx and item["non_uniform"]==non_uniform and item["on_gpu"] and item["ny"]>=8e3 plt.plot([item["ny"] for item in data_dict_sorted if filter(item)], [(item["gpu_mem_occupancy"]-nx*item["ny"]*8)/(nx*item["ny"]*8)*100 for item in data_dict_sorted if filter(item)], marker='o', markersize=5, label=f"{'non uniform' if any(filter(item) and item['non_uniform'] for item in data_dict_sorted) else 'uniform'} nx={nx}") plt.grid() @@ -134,7 +134,7 @@ plt.subplot(1, 2, 1) for (non_uniform, nx) in itertools.product((False,True), nx_values): - plotter(plt, "cols_per_chunk", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and not item["on_gpu"]) + plotter(plt, "cols_per_chunk", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["non_uniform"]==non_uniform and not item["on_gpu"]) plt.grid() plt.xscale("log") @@ -145,7 +145,7 @@ plt.subplot(1, 2, 2) for (non_uniform, nx) in itertools.product((False,True), nx_values): - plotter(plt, "cols_per_chunk", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["on_gpu"]) + plotter(plt, "cols_per_chunk", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["non_uniform"]==non_uniform and item["on_gpu"]) plt.grid() plt.xscale("log") @@ -164,7 +164,7 @@ plt.subplot(1, 2, 1) for (non_uniform, nx) in itertools.product((False,True), nx_values): - plotter(plt, "preconditionner_max_block_size", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and not item["on_gpu"]) + plotter(plt, "preconditionner_max_block_size", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["non_uniform"]==non_uniform and not item["on_gpu"]) plt.grid() plt.xscale("log") @@ -175,7 +175,7 @@ plt.subplot(1, 2, 2) for (non_uniform, nx) in itertools.product((False,True), nx_values): - plotter(plt, "preconditionner_max_block_size", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["on_gpu"]) + plotter(plt, "preconditionner_max_block_size", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["non_uniform"]==non_uniform and item["on_gpu"]) plt.grid() plt.xscale("log") From 0b7baa4f6ad40d550688890b6aa9edb43a319825 Mon Sep 17 00:00:00 2001 From: blegouix Date: Thu, 20 Jun 2024 16:12:22 +0200 Subject: [PATCH 22/36] _suffix the file names --- benchmarks/splines_plot.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/benchmarks/splines_plot.py b/benchmarks/splines_plot.py index a5dbc9b4c..0a90c741d 100644 --- a/benchmarks/splines_plot.py +++ b/benchmarks/splines_plot.py @@ -65,7 +65,7 @@ plt.ylabel("Throughput [B/s]") plt.title(str(backend)+": Throughput on GPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") plt.legend() -plt.savefig("throughput_uniformity_degree_x.png") +plt.savefig("throughput_uniformity_degree_x_.png") ######## ## ny ## @@ -105,7 +105,7 @@ plt.ylabel("Throughput [B/s]") plt.title(str(backend)+": Throughput on GPU") plt.legend() -plt.savefig("throughput_ny.png") +plt.savefig("throughput_ny_.png") ############# ## gpu_mem ## @@ -123,7 +123,7 @@ plt.ylabel("Relative GPU memory overhead [%]") plt.title(str(backend)+": Relative GPU memory occupancy overhead (100%=nx*ny*8 Bytes)") plt.legend() -plt.savefig("gpu_mem_occupancy.png") +plt.savefig("gpu_mem_occupancy_.png") ######################## ## cols_per_chunk ## @@ -153,7 +153,7 @@ plt.ylabel("Throughput [B/s]") plt.title(str(backend)+": Throughput on GPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") plt.legend() -plt.savefig("throughput_cols.png") +plt.savefig("throughput_cols_.png") ##################### ## preconditionner ## @@ -183,6 +183,6 @@ plt.ylabel("Throughput [B/s]") plt.title(str(backend)+": Throughput on GPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") plt.legend() -plt.savefig("throughput_precond.png") +plt.savefig("throughput_precond_.png") plt.close(); From 67155d69073652024000ebc89d4851797abeaad1 Mon Sep 17 00:00:00 2001 From: blegouix Date: Thu, 20 Jun 2024 19:19:10 +0200 Subject: [PATCH 23/36] ifs in plotter --- benchmarks/splines.cpp | 36 +++--- benchmarks/splines_plot.py | 259 +++++++++++++++++++------------------ 2 files changed, 154 insertions(+), 141 deletions(-) diff --git a/benchmarks/splines.cpp b/benchmarks/splines.cpp index ef557cd5b..1a9d32868 100644 --- a/benchmarks/splines.cpp +++ b/benchmarks/splines.cpp @@ -274,7 +274,8 @@ unsigned int preconditionner_max_block_size_ref = 32u; std::size_t ny_ref = 1000; /* -// Sweep on uniform/non-uniform and spline order +// Sweep on spline order +name = "degree" BENCHMARK(characteristics_advection) ->RangeMultiplier(2) ->Ranges( @@ -288,21 +289,22 @@ BENCHMARK(characteristics_advection) ->MinTime(3) ->UseRealTime(); */ -// Sweep on nx and ny -BENCHMARK(characteristics_advection) - ->RangeMultiplier(2) - ->Ranges( - {{false, true}, - {false, true}, - {degree_x_ref, degree_x_ref}, - {64, 1024}, - {100, 200000}, - {cols_per_chunk_ref, cols_per_chunk_ref}, - {preconditionner_max_block_size_ref, preconditionner_max_block_size_ref}}) - ->MinTime(3) - ->UseRealTime(); +// Sweep on ny +name = "ny" BENCHMARK(characteristics_advection) + ->RangeMultiplier(2) + ->Ranges( + {{false, true}, + {false, true}, + {degree_x_ref, degree_x_ref}, + {64, 1024}, + {100, 200000}, + {cols_per_chunk_ref, cols_per_chunk_ref}, + {preconditionner_max_block_size_ref, preconditionner_max_block_size_ref}}) + ->MinTime(3) + ->UseRealTime(); /* -// Sweep on nx and cols_per_chunk +// Sweep on cols_per_chunk +name = "cols_per_chunk" BENCHMARK(characteristics_advection) ->RangeMultiplier(2) ->Ranges( @@ -317,7 +319,8 @@ BENCHMARK(characteristics_advection) ->UseRealTime(); */ /* -// Sweep on nx and preconditionner_max_block_size +// Sweep on preconditionner_max_block_size +name = "preconditionner_max_block_size" BENCHMARK(characteristics_advection) ->RangeMultiplier(2) ->Ranges( @@ -335,6 +338,7 @@ BENCHMARK(characteristics_advection) int main(int argc, char** argv) { ::benchmark::Initialize(&argc, argv); + ::benchmark::AddCustomContext("name", name); ::benchmark::AddCustomContext("backend", "Ginkgo"); ::benchmark::AddCustomContext("cols_per_chunk_ref", std::to_string(cols_per_chunk_ref)); ::benchmark::AddCustomContext( diff --git a/benchmarks/splines_plot.py b/benchmarks/splines_plot.py index 0a90c741d..058f107ac 100644 --- a/benchmarks/splines_plot.py +++ b/benchmarks/splines_plot.py @@ -37,152 +37,161 @@ plotter = lambda plt, x_name, y_name, data_dict_sorted, filter : plt.plot([item[x_name] for item in data_dict_sorted if filter(item)], [item[y_name] for item in data_dict_sorted if filter(item)], marker='o', markersize=5, label=f"{'non uniform' if any(filter(item) and item['non_uniform'] for item in data_dict_sorted) else 'uniform'} nx={nx}") -############################# -## non_uniform && degree_x ## -############################# - -data_dict_sorted = sorted(data_dict, key=itemgetter("nx", "non_uniform", "degree_x")) -plt.figure(figsize=(16, 6)) - -plt.subplot(1, 2, 1) -for (non_uniform, nx) in itertools.product((False,True), nx_values): - plotter(plt, "degree_x", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["non_uniform"]==non_uniform and not item["on_gpu"]) - -plt.grid() -plt.xscale("linear") -plt.xlabel("Splines degree") -plt.ylabel("Throughput [B/s]") -plt.title(str(backend)+": Throughput on CPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") -plt.legend() - -plt.subplot(1, 2, 2) -for (non_uniform, nx) in itertools.product((False,True), nx_values): - plotter(plt, "degree_x", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["non_uniform"]==non_uniform and item["on_gpu"]) - -plt.grid() -plt.xscale("linear") -plt.xlabel("Splines degree") -plt.ylabel("Throughput [B/s]") -plt.title(str(backend)+": Throughput on GPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") -plt.legend() -plt.savefig("throughput_uniformity_degree_x_.png") +############## +## degree_x ## +############## + +if name=="degree_x": + data_dict_sorted = sorted(data_dict, key=itemgetter("nx", "non_uniform", "degree_x")) + plt.figure(figsize=(16, 6)) + + plt.subplot(1, 2, 1) + for (non_uniform, nx) in itertools.product((False,True), nx_values): + plotter(plt, "degree_x", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["non_uniform"]==non_uniform and not item["on_gpu"]) + + plt.grid() + plt.xscale("linear") + plt.xlabel("Splines degree") + plt.ylabel("Throughput [B/s]") + plt.title(str(backend)+": Throughput on CPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") + plt.legend() + + plt.subplot(1, 2, 2) + for (non_uniform, nx) in itertools.product((False,True), nx_values): + plotter(plt, "degree_x", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["non_uniform"]==non_uniform and item["on_gpu"]) + + plt.grid() + plt.xscale("linear") + plt.xlabel("Splines degree") + plt.ylabel("Throughput [B/s]") + plt.title(str(backend)+": Throughput on GPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") + plt.legend() + plt.savefig("throughput_uniformity_degree_x.png") ######## ## ny ## ######## -data_dict_sorted = sorted(data_dict, key=itemgetter("nx","ny")) -plt.figure(figsize=(16, 6)) - -plt.subplot(1, 2, 1) -for (non_uniform, nx) in itertools.product((False,True), nx_values): - plotter(plt, "ny", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["non_uniform"]==non_uniform and not item["on_gpu"]) - -ny_min = min([item["ny"] for item in data_dict_sorted if item["on_gpu"]]) -if len([item for item in data_dict_sorted if item["ny"]==ny_min and not item["on_gpu"]]) != 0: - x = np.linspace(ny_min, 20*ny_min) - plt.plot(x, np.mean([item["bytes_per_second"] for item in data_dict_sorted if item["ny"]==ny_min and not item["on_gpu"]])/ny_min*x, linestyle='--', color='black', label='perfect scaling') - -plt.grid() -plt.xscale("log") -plt.xlabel("ny (batch size)") -plt.ylabel("Throughput [B/s]") -plt.title(str(backend)+": Throughput on CPU") -plt.legend() - -plt.subplot(1, 2, 2) -for (non_uniform, nx) in itertools.product((False,True), nx_values): - plotter(plt, "ny", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["non_uniform"]==non_uniform and item["on_gpu"]) - -ny_min = min([item["ny"] for item in data_dict_sorted if item["on_gpu"]]) -if len([item for item in data_dict_sorted if item["ny"]==ny_min and item["on_gpu"]]) != 0: - x = np.linspace(ny_min, 20*ny_min) - plt.plot(x, np.mean([item["bytes_per_second"] for item in data_dict_sorted if item["ny"]==ny_min and item["on_gpu"]])/ny_min*x, linestyle='--', color='black', label='perfect scaling') - -plt.grid() -plt.xscale("log") -plt.xlabel("ny (batch size)") -plt.ylabel("Throughput [B/s]") -plt.title(str(backend)+": Throughput on GPU") -plt.legend() -plt.savefig("throughput_ny_.png") +if name=="ny": + data_dict_sorted = sorted(data_dict, key=itemgetter("nx","ny")) + plt.figure(figsize=(16, 6)) + + plt.subplot(1, 2, 1) + for (non_uniform, nx) in itertools.product((False,True), nx_values): + plotter(plt, "ny", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["non_uniform"]==non_uniform and not item["on_gpu"]) + + """ + ny_min = min([item["ny"] for item in data_dict_sorted if item["on_gpu"]]) + if len([item for item in data_dict_sorted if item["ny"]==ny_min and not item["on_gpu"]]) != 0: + x = np.linspace(ny_min, 20*ny_min) + plt.plot(x, np.mean([item["bytes_per_second"] for item in data_dict_sorted if item["ny"]==ny_min and not item["on_gpu"]])/ny_min*x, linestyle='--', color='black', label='perfect scaling') + """ + + plt.grid() + plt.xscale("log") + plt.xlabel("ny (batch size)") + plt.ylabel("Throughput [B/s]") + plt.title(str(backend)+": Throughput on CPU") + plt.legend() + + plt.subplot(1, 2, 2) + for (non_uniform, nx) in itertools.product((False,True), nx_values): + plotter(plt, "ny", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["non_uniform"]==non_uniform and item["on_gpu"]) + + """ + ny_min = min([item["ny"] for item in data_dict_sorted if item["on_gpu"]]) + if len([item for item in data_dict_sorted if item["ny"]==ny_min and item["on_gpu"]]) != 0: + x = np.linspace(ny_min, 20*ny_min) + plt.plot(x, np.mean([item["bytes_per_second"] for item in data_dict_sorted if item["ny"]==ny_min and item["on_gpu"]])/ny_min*x, linestyle='--', color='black', label='perfect scaling') + """ + + plt.grid() + plt.xscale("log") + plt.xlabel("ny (batch size)") + plt.ylabel("Throughput [B/s]") + plt.title(str(backend)+": Throughput on GPU") + plt.legend() + plt.savefig("throughput_ny.png") ############# ## gpu_mem ## ############# -plt.figure(figsize=(8, 6)) +if name=="ny": + plt.figure(figsize=(8, 6)) -for (non_uniform, nx) in itertools.product((False,True), nx_values): - filter = lambda item : item["nx"]==nx and item["non_uniform"]==non_uniform and item["on_gpu"] and item["ny"]>=8e3 - plt.plot([item["ny"] for item in data_dict_sorted if filter(item)], [(item["gpu_mem_occupancy"]-nx*item["ny"]*8)/(nx*item["ny"]*8)*100 for item in data_dict_sorted if filter(item)], marker='o', markersize=5, label=f"{'non uniform' if any(filter(item) and item['non_uniform'] for item in data_dict_sorted) else 'uniform'} nx={nx}") + for (non_uniform, nx) in itertools.product((False,True), nx_values): + filter = lambda item : item["nx"]==nx and item["non_uniform"]==non_uniform and item["on_gpu"] and item["ny"]>=8e3 + plt.plot([item["ny"] for item in data_dict_sorted if filter(item)], [(item["gpu_mem_occupancy"]-nx*item["ny"]*8)/(nx*item["ny"]*8)*100 for item in data_dict_sorted if filter(item)], marker='o', markersize=5, label=f"{'non uniform' if any(filter(item) and item['non_uniform'] for item in data_dict_sorted) else 'uniform'} nx={nx}") -plt.grid() -plt.xscale("log") -plt.xlabel("ny (batch size)") -plt.ylabel("Relative GPU memory overhead [%]") -plt.title(str(backend)+": Relative GPU memory occupancy overhead (100%=nx*ny*8 Bytes)") -plt.legend() -plt.savefig("gpu_mem_occupancy_.png") + plt.grid() + plt.xscale("log") + plt.xlabel("ny (batch size)") + plt.ylabel("Relative GPU memory overhead [%]") + plt.title(str(backend)+": Relative GPU memory occupancy overhead (100%=nx*ny*8 Bytes)") + plt.legend() + plt.savefig("gpu_mem_occupancy.png") ######################## ## cols_per_chunk ## ######################## -data_dict_sorted = sorted(data_dict, key=itemgetter("nx","cols_per_chunk")) -plt.figure(figsize=(16, 6)) - -plt.subplot(1, 2, 1) -for (non_uniform, nx) in itertools.product((False,True), nx_values): - plotter(plt, "cols_per_chunk", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["non_uniform"]==non_uniform and not item["on_gpu"]) - -plt.grid() -plt.xscale("log") -plt.xlabel("Number of right_hand sides per chunk") -plt.ylabel("Throughput [B/s]") -plt.title(str(backend)+": Throughput on CPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") -plt.legend() - -plt.subplot(1, 2, 2) -for (non_uniform, nx) in itertools.product((False,True), nx_values): - plotter(plt, "cols_per_chunk", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["non_uniform"]==non_uniform and item["on_gpu"]) - -plt.grid() -plt.xscale("log") -plt.xlabel("Number of right_hand sides per chunk") -plt.ylabel("Throughput [B/s]") -plt.title(str(backend)+": Throughput on GPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") -plt.legend() -plt.savefig("throughput_cols_.png") +if name=="cols_per_chunk": + data_dict_sorted = sorted(data_dict, key=itemgetter("nx","cols_per_chunk")) + plt.figure(figsize=(16, 6)) + + plt.subplot(1, 2, 1) + for (non_uniform, nx) in itertools.product((False,True), nx_values): + plotter(plt, "cols_per_chunk", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["non_uniform"]==non_uniform and not item["on_gpu"]) + + plt.grid() + plt.xscale("log") + plt.xlabel("Number of right_hand sides per chunk") + plt.ylabel("Throughput [B/s]") + plt.title(str(backend)+": Throughput on CPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") + plt.legend() + + plt.subplot(1, 2, 2) + for (non_uniform, nx) in itertools.product((False,True), nx_values): + plotter(plt, "cols_per_chunk", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["non_uniform"]==non_uniform and item["on_gpu"]) + + plt.grid() + plt.xscale("log") + plt.xlabel("Number of right_hand sides per chunk") + plt.ylabel("Throughput [B/s]") + plt.title(str(backend)+": Throughput on GPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") + plt.legend() + plt.savefig("throughput_cols.png") ##################### ## preconditionner ## ##################### -data_dict_sorted = sorted(data_dict, key=itemgetter("nx","cols_per_chunk")) -plt.figure(figsize=(16, 6)) - -plt.subplot(1, 2, 1) -for (non_uniform, nx) in itertools.product((False,True), nx_values): - plotter(plt, "preconditionner_max_block_size", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["non_uniform"]==non_uniform and not item["on_gpu"]) - -plt.grid() -plt.xscale("log") -plt.xlabel("Max block size of preconditioner") -plt.ylabel("Throughput [B/s]") -plt.title(str(backend)+": Throughput on CPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") -plt.legend() - -plt.subplot(1, 2, 2) -for (non_uniform, nx) in itertools.product((False,True), nx_values): - plotter(plt, "preconditionner_max_block_size", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["non_uniform"]==non_uniform and item["on_gpu"]) - -plt.grid() -plt.xscale("log") -plt.xlabel("Max block size of preconditioner") -plt.ylabel("Throughput [B/s]") -plt.title(str(backend)+": Throughput on GPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") -plt.legend() -plt.savefig("throughput_precond_.png") +if name=="preconditionner_max_block_size": + data_dict_sorted = sorted(data_dict, key=itemgetter("nx","cols_per_chunk")) + plt.figure(figsize=(16, 6)) + + plt.subplot(1, 2, 1) + for (non_uniform, nx) in itertools.product((False,True), nx_values): + plotter(plt, "preconditionner_max_block_size", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["non_uniform"]==non_uniform and not item["on_gpu"]) + + plt.grid() + plt.xscale("log") + plt.xlabel("Max block size of preconditioner") + plt.ylabel("Throughput [B/s]") + plt.title(str(backend)+": Throughput on CPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") + plt.legend() + + plt.subplot(1, 2, 2) + for (non_uniform, nx) in itertools.product((False,True), nx_values): + plotter(plt, "preconditionner_max_block_size", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["non_uniform"]==non_uniform and item["on_gpu"]) + + plt.grid() + plt.xscale("log") + plt.xlabel("Max block size of preconditioner") + plt.ylabel("Throughput [B/s]") + plt.title(str(backend)+": Throughput on GPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") + plt.legend() + plt.savefig("throughput_precond.png") plt.close(); From fd031435dcd6100da19380fc6aa0b0942cf536e6 Mon Sep 17 00:00:00 2001 From: blegouix Date: Thu, 20 Jun 2024 19:55:39 +0200 Subject: [PATCH 24/36] minor --- benchmarks/splines.cpp | 35 ++++++++++++++++++----------------- benchmarks/splines_plot.py | 1 + 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/benchmarks/splines.cpp b/benchmarks/splines.cpp index 1a9d32868..58a9d163c 100644 --- a/benchmarks/splines.cpp +++ b/benchmarks/splines.cpp @@ -273,9 +273,8 @@ unsigned int preconditionner_max_block_size_ref = 32u; // std::size_t ny_ref = 100000; std::size_t ny_ref = 1000; -/* // Sweep on spline order -name = "degree" +std::string name = "degree_x"; BENCHMARK(characteristics_advection) ->RangeMultiplier(2) ->Ranges( @@ -288,23 +287,25 @@ BENCHMARK(characteristics_advection) {preconditionner_max_block_size_ref, preconditionner_max_block_size_ref}}) ->MinTime(3) ->UseRealTime(); -*/ +/* // Sweep on ny -name = "ny" BENCHMARK(characteristics_advection) - ->RangeMultiplier(2) - ->Ranges( - {{false, true}, - {false, true}, - {degree_x_ref, degree_x_ref}, - {64, 1024}, - {100, 200000}, - {cols_per_chunk_ref, cols_per_chunk_ref}, - {preconditionner_max_block_size_ref, preconditionner_max_block_size_ref}}) - ->MinTime(3) - ->UseRealTime(); +std::string name = "ny"; +BENCHMARK(characteristics_advection) + ->RangeMultiplier(2) + ->Ranges( + {{false, true}, + {false, true}, + {degree_x_ref, degree_x_ref}, + {64, 1024}, + {100, 200000}, + {cols_per_chunk_ref, cols_per_chunk_ref}, + {preconditionner_max_block_size_ref, preconditionner_max_block_size_ref}}) + ->MinTime(3) + ->UseRealTime(); +*/ /* // Sweep on cols_per_chunk -name = "cols_per_chunk" +std::string name = "cols_per_chunk"; BENCHMARK(characteristics_advection) ->RangeMultiplier(2) ->Ranges( @@ -320,7 +321,7 @@ BENCHMARK(characteristics_advection) */ /* // Sweep on preconditionner_max_block_size -name = "preconditionner_max_block_size" +std::string name = "preconditionner_max_block_size" BENCHMARK(characteristics_advection) ->RangeMultiplier(2) ->Ranges( diff --git a/benchmarks/splines_plot.py b/benchmarks/splines_plot.py index 058f107ac..d30adda83 100644 --- a/benchmarks/splines_plot.py +++ b/benchmarks/splines_plot.py @@ -21,6 +21,7 @@ with open(args.json_file, 'r') as file: data = json.load(file); +name = data["context"]["name"] backend = data["context"]["backend"] nx_values = sorted(set(int(benchmark["name"].split("/")[4]) for benchmark in data["benchmarks"])) data_dict = [{ From 7c897a63f8f2c9387de2ac339e5a53fd1c089033 Mon Sep 17 00:00:00 2001 From: blegouix Date: Thu, 20 Jun 2024 20:30:07 +0200 Subject: [PATCH 25/36] backend suffix --- benchmarks/splines_plot.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/benchmarks/splines_plot.py b/benchmarks/splines_plot.py index d30adda83..84429f9d8 100644 --- a/benchmarks/splines_plot.py +++ b/benchmarks/splines_plot.py @@ -67,7 +67,7 @@ plt.ylabel("Throughput [B/s]") plt.title(str(backend)+": Throughput on GPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") plt.legend() - plt.savefig("throughput_uniformity_degree_x.png") + plt.savefig("throughput_degree_"+str(backend).lower()+".png") ######## ## ny ## @@ -112,7 +112,7 @@ plt.ylabel("Throughput [B/s]") plt.title(str(backend)+": Throughput on GPU") plt.legend() - plt.savefig("throughput_ny.png") + plt.savefig("throughput_ny_"+str(backend).lower()+".png") ############# ## gpu_mem ## @@ -131,7 +131,7 @@ plt.ylabel("Relative GPU memory overhead [%]") plt.title(str(backend)+": Relative GPU memory occupancy overhead (100%=nx*ny*8 Bytes)") plt.legend() - plt.savefig("gpu_mem_occupancy.png") + plt.savefig("gpu_mem_occupancy_"+str(backend).lower()+".png") ######################## ## cols_per_chunk ## @@ -162,7 +162,7 @@ plt.ylabel("Throughput [B/s]") plt.title(str(backend)+": Throughput on GPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") plt.legend() - plt.savefig("throughput_cols.png") + plt.savefig("throughput_cols_"+str(backend).lower()+".png") ##################### ## preconditionner ## @@ -193,6 +193,6 @@ plt.ylabel("Throughput [B/s]") plt.title(str(backend)+": Throughput on GPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") plt.legend() - plt.savefig("throughput_precond.png") + plt.savefig("throughput_precond_"+str(backend).lower()+".png") plt.close(); From d11ae298e5ca3b7a154387d8c78944f0131af116 Mon Sep 17 00:00:00 2001 From: blegouix Date: Thu, 20 Jun 2024 20:36:55 +0200 Subject: [PATCH 26/36] minor --- benchmarks/splines.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/splines.cpp b/benchmarks/splines.cpp index 58a9d163c..9b1d20c1d 100644 --- a/benchmarks/splines.cpp +++ b/benchmarks/splines.cpp @@ -260,7 +260,7 @@ static void characteristics_advection(benchmark::State& state) bool on_gpu_ref = true; bool non_uniform_ref = false; std::size_t degree_x_ref = 3; -#ifdef KOKKOS_ENABLE_CUDA +#ifdef KOKKOS_ENABLE_CUDA or KOKKOS_ENABLE_HIP std::size_t cols_per_chunk_ref = 65535; unsigned int preconditionner_max_block_size_ref = 1u; #elif defined(KOKKOS_ENABLE_OPENMP) From a1d4475df5884c96401954bdf1fb4f6a44a1f5b5 Mon Sep 17 00:00:00 2001 From: blegouix Date: Thu, 20 Jun 2024 20:42:23 +0200 Subject: [PATCH 27/36] minor --- benchmarks/splines.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/splines.cpp b/benchmarks/splines.cpp index 9b1d20c1d..ad03e585a 100644 --- a/benchmarks/splines.cpp +++ b/benchmarks/splines.cpp @@ -260,7 +260,7 @@ static void characteristics_advection(benchmark::State& state) bool on_gpu_ref = true; bool non_uniform_ref = false; std::size_t degree_x_ref = 3; -#ifdef KOKKOS_ENABLE_CUDA or KOKKOS_ENABLE_HIP +#if (defined(KOKKOS_ENABLE_CUDA) or defined(KOKKOS_ENABLE_HIP)) std::size_t cols_per_chunk_ref = 65535; unsigned int preconditionner_max_block_size_ref = 1u; #elif defined(KOKKOS_ENABLE_OPENMP) From 54e71cd008c0ae997520b7667e7d3be9911c7ff8 Mon Sep 17 00:00:00 2001 From: blegouix Date: Thu, 20 Jun 2024 21:00:48 +0200 Subject: [PATCH 28/36] minor --- benchmarks/splines.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/benchmarks/splines.cpp b/benchmarks/splines.cpp index ad03e585a..bd52d9826 100644 --- a/benchmarks/splines.cpp +++ b/benchmarks/splines.cpp @@ -340,7 +340,9 @@ int main(int argc, char** argv) { ::benchmark::Initialize(&argc, argv); ::benchmark::AddCustomContext("name", name); - ::benchmark::AddCustomContext("backend", "Ginkgo"); + ::benchmark::AddCustomContext( + "backend", + std::is_same_v ? "GINKGO" : "LAPACK"); ::benchmark::AddCustomContext("cols_per_chunk_ref", std::to_string(cols_per_chunk_ref)); ::benchmark::AddCustomContext( "preconditionner_max_block_size_ref", From 5b0bb19e8ce3b1bc6002ebc5117e1e48a74f02f5 Mon Sep 17 00:00:00 2001 From: blegouix Date: Thu, 20 Jun 2024 21:08:35 +0200 Subject: [PATCH 29/36] fix --- benchmarks/splines.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/benchmarks/splines.cpp b/benchmarks/splines.cpp index bd52d9826..0fd98d302 100644 --- a/benchmarks/splines.cpp +++ b/benchmarks/splines.cpp @@ -340,9 +340,8 @@ int main(int argc, char** argv) { ::benchmark::Initialize(&argc, argv); ::benchmark::AddCustomContext("name", name); - ::benchmark::AddCustomContext( - "backend", - std::is_same_v ? "GINKGO" : "LAPACK"); + ::benchmark:: + AddCustomContext("backend", backend == ddc::SplineSolver::GINKGO ? "GINKGO" : "LAPACK"); ::benchmark::AddCustomContext("cols_per_chunk_ref", std::to_string(cols_per_chunk_ref)); ::benchmark::AddCustomContext( "preconditionner_max_block_size_ref", From bc6510dc09e4eb301249884ece00cc1953ebd785 Mon Sep 17 00:00:00 2001 From: blegouix Date: Thu, 20 Jun 2024 21:15:37 +0200 Subject: [PATCH 30/36] fix --- benchmarks/splines.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/splines.cpp b/benchmarks/splines.cpp index 0fd98d302..4a72df4a6 100644 --- a/benchmarks/splines.cpp +++ b/benchmarks/splines.cpp @@ -341,7 +341,7 @@ int main(int argc, char** argv) ::benchmark::Initialize(&argc, argv); ::benchmark::AddCustomContext("name", name); ::benchmark:: - AddCustomContext("backend", backend == ddc::SplineSolver::GINKGO ? "GINKGO" : "LAPACK"); + AddCustomContext("backend", Backend == ddc::SplineSolver::GINKGO ? "GINKGO" : "LAPACK"); ::benchmark::AddCustomContext("cols_per_chunk_ref", std::to_string(cols_per_chunk_ref)); ::benchmark::AddCustomContext( "preconditionner_max_block_size_ref", From 0770af3848e93c47e68789c802941920465089f4 Mon Sep 17 00:00:00 2001 From: blegouix Date: Mon, 1 Jul 2024 17:02:01 +0200 Subject: [PATCH 31/36] minor --- benchmarks/splines.cpp | 2 +- benchmarks/splines_plot.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/splines.cpp b/benchmarks/splines.cpp index 4a72df4a6..7dc26d83f 100644 --- a/benchmarks/splines.cpp +++ b/benchmarks/splines.cpp @@ -321,7 +321,7 @@ BENCHMARK(characteristics_advection) */ /* // Sweep on preconditionner_max_block_size -std::string name = "preconditionner_max_block_size" +std::string name = "preconditionner_max_block_size"; BENCHMARK(characteristics_advection) ->RangeMultiplier(2) ->Ranges( diff --git a/benchmarks/splines_plot.py b/benchmarks/splines_plot.py index 84429f9d8..257655965 100644 --- a/benchmarks/splines_plot.py +++ b/benchmarks/splines_plot.py @@ -158,7 +158,7 @@ plt.grid() plt.xscale("log") - plt.xlabel("Number of right_hand sides per chunk") + plt.xlabel("Number of right-hand sides per chunk") plt.ylabel("Throughput [B/s]") plt.title(str(backend)+": Throughput on GPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") plt.legend() From 55278314b9d7a6dcf4dd60f0f33ece204bd5ff57 Mon Sep 17 00:00:00 2001 From: blegouix Date: Tue, 2 Jul 2024 09:15:05 +0200 Subject: [PATCH 32/36] lin scale precond --- benchmarks/splines_plot.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/benchmarks/splines_plot.py b/benchmarks/splines_plot.py index 257655965..7bd869615 100644 --- a/benchmarks/splines_plot.py +++ b/benchmarks/splines_plot.py @@ -177,7 +177,6 @@ plotter(plt, "preconditionner_max_block_size", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["non_uniform"]==non_uniform and not item["on_gpu"]) plt.grid() - plt.xscale("log") plt.xlabel("Max block size of preconditioner") plt.ylabel("Throughput [B/s]") plt.title(str(backend)+": Throughput on CPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") @@ -188,7 +187,6 @@ plotter(plt, "preconditionner_max_block_size", "bytes_per_second", data_dict_sorted, lambda item : item["nx"]==nx and item["non_uniform"]==non_uniform and item["on_gpu"]) plt.grid() - plt.xscale("log") plt.xlabel("Max block size of preconditioner") plt.ylabel("Throughput [B/s]") plt.title(str(backend)+": Throughput on GPU (with ny="+str([item["ny"] for item in data_dict_sorted][0])+")") From b4f969bda13545750c354dc2f9c2273a3ab5330f Mon Sep 17 00:00:00 2001 From: Baptiste Legouix Date: Thu, 4 Jul 2024 12:47:20 +0200 Subject: [PATCH 33/36] Update benchmarks/splines.cpp --- benchmarks/splines.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/splines.cpp b/benchmarks/splines.cpp index 7dc26d83f..e5b62bc32 100644 --- a/benchmarks/splines.cpp +++ b/benchmarks/splines.cpp @@ -58,7 +58,7 @@ void monitorMemoryAsync(std::mutex& mutex, bool& monitorFlag, size_t& maxUsedMem size_t totalMem = 0; while (monitorFlag) { std::this_thread::sleep_for( - std::chrono::microseconds(100)); // Adjust the interval as needed + std::chrono::microseconds(10)); // Adjust the interval as needed // Acquire a lock to ensure thread safety when accessing CUDA functions std::lock_guard lock(mutex); From b46adea081662136ce8cf34b7234585254e9ea8a Mon Sep 17 00:00:00 2001 From: Baptiste Legouix Date: Mon, 22 Jul 2024 10:16:36 +0200 Subject: [PATCH 34/36] Update benchmarks/splines.cpp --- benchmarks/splines.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/splines.cpp b/benchmarks/splines.cpp index b851af172..9460c7795 100644 --- a/benchmarks/splines.cpp +++ b/benchmarks/splines.cpp @@ -14,7 +14,7 @@ #include -static const ddc::SplineSolver Backend = ddc::SplineSolver::GINKGO; +static const ddc::SplineSolver Backend = ddc::SplineSolver::LAPACK; namespace DDC_HIP_5_7_ANONYMOUS_NAMESPACE_WORKAROUND(SPLINES_CPP) { From 66d932b894879fa3f90762494b61c1cf33ad4682 Mon Sep 17 00:00:00 2001 From: blegouix Date: Mon, 29 Jul 2024 13:59:00 +0200 Subject: [PATCH 35/36] use mapping --- benchmarks/splines.cpp | 54 ++++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/benchmarks/splines.cpp b/benchmarks/splines.cpp index 9460c7795..1d9e2b540 100644 --- a/benchmarks/splines.cpp +++ b/benchmarks/splines.cpp @@ -212,46 +212,54 @@ static void characteristics_advection_unitary(benchmark::State& state) static void characteristics_advection(benchmark::State& state) { + long const host = 0; + long const dev = 1; + long const uniform = 0; + long const non_uniform = 1; // Preallocate 12 unitary benchs for each combination of cpu/gpu execution space, uniform/non-uniform and spline degree we may want to benchmark (those are determined at compile-time, that's why we need to build explicitely 12 variants of the bench even if we call only one of them) - std::array, 12> benchs; - benchs[0] = characteristics_advection_unitary< + std::map, std::function> benchs; + benchs[std::array {host, uniform, static_cast(3)}] = characteristics_advection_unitary< Kokkos::DefaultHostExecutionSpace, std::false_type, 3>; - benchs[1] = characteristics_advection_unitary< + benchs[std::array {host, uniform, static_cast(4)}] = characteristics_advection_unitary< Kokkos::DefaultHostExecutionSpace, std::false_type, 4>; - benchs[2] = characteristics_advection_unitary< + benchs[std::array {host, uniform, static_cast(5)}] = characteristics_advection_unitary< Kokkos::DefaultHostExecutionSpace, std::false_type, 5>; - benchs[3] = characteristics_advection_unitary< - Kokkos::DefaultHostExecutionSpace, - std::true_type, - 3>; - benchs[4] = characteristics_advection_unitary< - Kokkos::DefaultHostExecutionSpace, - std::true_type, - 4>; - benchs[5] = characteristics_advection_unitary< - Kokkos::DefaultHostExecutionSpace, - std::true_type, - 5>; - benchs[6] + benchs[std::array {host, non_uniform, static_cast(3)}] + = characteristics_advection_unitary< + Kokkos::DefaultHostExecutionSpace, + std::true_type, + 3>; + benchs[std::array {host, non_uniform, static_cast(4)}] + = characteristics_advection_unitary< + Kokkos::DefaultHostExecutionSpace, + std::true_type, + 4>; + benchs[std::array {host, non_uniform, static_cast(5)}] + = characteristics_advection_unitary< + Kokkos::DefaultHostExecutionSpace, + std::true_type, + 5>; + benchs[std::array {dev, uniform, static_cast(3)}] = characteristics_advection_unitary; - benchs[7] + benchs[std::array {dev, uniform, static_cast(4)}] = characteristics_advection_unitary; - benchs[8] + benchs[std::array {dev, uniform, static_cast(5)}] = characteristics_advection_unitary; - benchs[9] = characteristics_advection_unitary; - benchs[10] + benchs[std::array {dev, non_uniform, static_cast(3)}] + = characteristics_advection_unitary; + benchs[std::array {dev, non_uniform, static_cast(4)}] = characteristics_advection_unitary; - benchs[11] + benchs[std::array {dev, non_uniform, static_cast(5)}] = characteristics_advection_unitary; // Run the desired bench - benchs[state.range(0) * 6 + state.range(1) * 3 + state.range(2) - 3](state); + benchs.at(std::array {state.range(0), state.range(1), state.range(2)})(state); } // Reference parameters: the benchmarks sweep on two parameters and fix all the others according to those reference parameters. From a260644e938e9f6ba42acfacbf5baa95b2677a24 Mon Sep 17 00:00:00 2001 From: blegouix Date: Wed, 31 Jul 2024 08:02:52 +0200 Subject: [PATCH 36/36] L for long --- benchmarks/splines.cpp | 45 ++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/benchmarks/splines.cpp b/benchmarks/splines.cpp index 1d9e2b540..f0311fcb9 100644 --- a/benchmarks/splines.cpp +++ b/benchmarks/splines.cpp @@ -218,44 +218,41 @@ static void characteristics_advection(benchmark::State& state) long const non_uniform = 1; // Preallocate 12 unitary benchs for each combination of cpu/gpu execution space, uniform/non-uniform and spline degree we may want to benchmark (those are determined at compile-time, that's why we need to build explicitely 12 variants of the bench even if we call only one of them) std::map, std::function> benchs; - benchs[std::array {host, uniform, static_cast(3)}] = characteristics_advection_unitary< + benchs[std::array {host, uniform, 3L}] = characteristics_advection_unitary< Kokkos::DefaultHostExecutionSpace, std::false_type, 3>; - benchs[std::array {host, uniform, static_cast(4)}] = characteristics_advection_unitary< + benchs[std::array {host, uniform, 4L}] = characteristics_advection_unitary< Kokkos::DefaultHostExecutionSpace, std::false_type, 4>; - benchs[std::array {host, uniform, static_cast(5)}] = characteristics_advection_unitary< + benchs[std::array {host, uniform, 5L}] = characteristics_advection_unitary< Kokkos::DefaultHostExecutionSpace, std::false_type, 5>; - benchs[std::array {host, non_uniform, static_cast(3)}] - = characteristics_advection_unitary< - Kokkos::DefaultHostExecutionSpace, - std::true_type, - 3>; - benchs[std::array {host, non_uniform, static_cast(4)}] - = characteristics_advection_unitary< - Kokkos::DefaultHostExecutionSpace, - std::true_type, - 4>; - benchs[std::array {host, non_uniform, static_cast(5)}] - = characteristics_advection_unitary< - Kokkos::DefaultHostExecutionSpace, - std::true_type, - 5>; - benchs[std::array {dev, uniform, static_cast(3)}] + benchs[std::array {host, non_uniform, 3L}] = characteristics_advection_unitary< + Kokkos::DefaultHostExecutionSpace, + std::true_type, + 3>; + benchs[std::array {host, non_uniform, 4L}] = characteristics_advection_unitary< + Kokkos::DefaultHostExecutionSpace, + std::true_type, + 4>; + benchs[std::array {host, non_uniform, 5L}] = characteristics_advection_unitary< + Kokkos::DefaultHostExecutionSpace, + std::true_type, + 5>; + benchs[std::array {dev, uniform, 3L}] = characteristics_advection_unitary; - benchs[std::array {dev, uniform, static_cast(4)}] + benchs[std::array {dev, uniform, 4L}] = characteristics_advection_unitary; - benchs[std::array {dev, uniform, static_cast(5)}] + benchs[std::array {dev, uniform, 5L}] = characteristics_advection_unitary; - benchs[std::array {dev, non_uniform, static_cast(3)}] + benchs[std::array {dev, non_uniform, 3L}] = characteristics_advection_unitary; - benchs[std::array {dev, non_uniform, static_cast(4)}] + benchs[std::array {dev, non_uniform, 4L}] = characteristics_advection_unitary; - benchs[std::array {dev, non_uniform, static_cast(5)}] + benchs[std::array {dev, non_uniform, 5L}] = characteristics_advection_unitary; // Run the desired bench