Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Splines improve benchmark #496

Draft
wants to merge 36 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
169efae
align reference preconditioner size on default
blegouix Jun 18, 2024
20ce63c
wip
blegouix Jun 18, 2024
c498306
wip
blegouix Jun 18, 2024
cce3cfd
wip
blegouix Jun 18, 2024
8cc0125
sweep on splines degree
blegouix Jun 18, 2024
76c146c
wip
blegouix Jun 18, 2024
b0e0dde
non-uniform/uniform
blegouix Jun 18, 2024
2d217c7
wip
blegouix Jun 18, 2024
28fe503
sweep on exec_space
blegouix Jun 18, 2024
c741315
wip
blegouix Jun 18, 2024
5dba374
wip
blegouix Jun 18, 2024
93ffde4
wip
blegouix Jun 18, 2024
9310a70
wip
blegouix Jun 18, 2024
a4d64b6
wip
blegouix Jun 18, 2024
0692da1
restructurate plotter
blegouix Jun 18, 2024
21c4516
add degree and uniformity
blegouix Jun 18, 2024
d6eaac3
finish plotter restructurateion
blegouix Jun 18, 2024
3232c76
minor
blegouix Jun 19, 2024
d029688
wip
blegouix Jun 20, 2024
d4fa428
wip
blegouix Jun 20, 2024
2a4977f
misc
blegouix Jun 20, 2024
0b7baa4
_suffix the file names
blegouix Jun 20, 2024
67155d6
ifs in plotter
blegouix Jun 20, 2024
fd03143
minor
blegouix Jun 20, 2024
7c897a6
backend suffix
blegouix Jun 20, 2024
d11ae29
minor
blegouix Jun 20, 2024
a1d4475
minor
blegouix Jun 20, 2024
54e71cd
minor
blegouix Jun 20, 2024
5b0bb19
fix
blegouix Jun 20, 2024
bc6510d
fix
blegouix Jun 20, 2024
9065d88
Merge branch 'main' into splines-improve-benchmark
blegouix Jun 25, 2024
386ad4f
Merge branch 'main' into splines-improve-benchmark
blegouix Jun 28, 2024
0770af3
minor
blegouix Jul 1, 2024
26561fe
Merge branch 'splines-improve-benchmark' of github.com:Maison-de-la-S…
blegouix Jul 1, 2024
5527831
lin scale precond
blegouix Jul 2, 2024
b4f969b
Update benchmarks/splines.cpp
blegouix Jul 4, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
234 changes: 176 additions & 58 deletions benchmarks/splines.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,27 +14,37 @@

#include <benchmark/benchmark.h>

static const ddc::SplineSolver Backend = ddc::SplineSolver::GINKGO;

namespace DDC_HIP_5_7_ANONYMOUS_NAMESPACE_WORKAROUND(SPLINES_CPP)
{
static constexpr std::size_t s_degree_x = 3;

// Tag type identifying the X dimension; it is used as the template argument
// of the B-splines and of the X coordinates in this file.
struct X
{
// Compile-time flag declaring the X dimension periodic.
static constexpr bool PERIODIC = true;
};

struct BSplinesX : ddc::UniformBSplines<X, s_degree_x>
// B-splines dimension along X, parameterized at compile time.
// NonUniform is a type exposing a boolean `value` (the benchmarks pass
// std::true_type / std::false_type): when it is true the base class is
// ddc::NonUniformBSplines, otherwise ddc::UniformBSplines.
// s_degree_x is forwarded to the chosen base as the spline degree.
template <typename NonUniform, std::size_t s_degree_x>
struct BSplinesX
: std::conditional_t<
NonUniform::value,
ddc::NonUniformBSplines<X, s_degree_x>,
ddc::UniformBSplines<X, s_degree_x>>
{
};

template <typename NonUniform, std::size_t s_degree_x>
using GrevillePoints = ddc::GrevilleInterpolationPoints<
BSplinesX,
BSplinesX<NonUniform, s_degree_x>,
ddc::BoundCond::PERIODIC,
ddc::BoundCond::PERIODIC>;
struct DDimX : GrevillePoints::interpolation_mesh_type

// Discrete dimension along X: derives from the interpolation mesh type exposed
// by GrevillePoints for the same (NonUniform, s_degree_x) pair, so each
// combination of template arguments yields a distinct dimension type.
template <typename NonUniform, std::size_t s_degree_x>
struct DDimX : GrevillePoints<NonUniform, s_degree_x>::interpolation_mesh_type
{
};

struct Y;

// Discrete dimension along Y, defined as a ddc::UniformPointSampling over Y.
struct DDimY : ddc::UniformPointSampling<Y>
{
};
Expand All @@ -47,7 +57,8 @@ void monitorMemoryAsync(std::mutex& mutex, bool& monitorFlag, size_t& maxUsedMem
size_t freeMem = 0;
size_t totalMem = 0;
while (monitorFlag) {
std::this_thread::sleep_for(std::chrono::milliseconds(1)); // Adjust the interval as needed
std::this_thread::sleep_for(
std::chrono::microseconds(10)); // Adjust the interval as needed

// Acquire a lock to ensure thread safety when accessing CUDA functions
std::lock_guard<std::mutex> lock(mutex);
Expand All @@ -59,8 +70,14 @@ void monitorMemoryAsync(std::mutex& mutex, bool& monitorFlag, size_t& maxUsedMem
}
}

static void characteristics_advection(benchmark::State& state)
template <typename ExecSpace, typename NonUniform, std::size_t s_degree_x>
static void characteristics_advection_unitary(benchmark::State& state)
{
std::size_t nx = state.range(3);
std::size_t ny = state.range(4);
int cols_per_chunk = state.range(5);
int preconditionner_max_block_size = state.range(6);

size_t freeMem = 0;
size_t totalMem = 0;
#if defined(__CUDACC__)
Expand All @@ -80,76 +97,87 @@ static void characteristics_advection(benchmark::State& state)
std::ref(monitorFlag),
std::ref(maxUsedMem));

ddc::init_discrete_space<
BSplinesX>(ddc::Coordinate<X>(-1.), ddc::Coordinate<X>(1.), state.range(0));
ddc::init_discrete_space<DDimX>(ddc::GrevilleInterpolationPoints<
BSplinesX,
ddc::BoundCond::PERIODIC,
ddc::BoundCond::PERIODIC>::get_sampling<DDimX>());
if constexpr (!NonUniform::value) {
ddc::init_discrete_space<BSplinesX<
NonUniform,
s_degree_x>>(ddc::Coordinate<X>(0.), ddc::Coordinate<X>(1.), nx);
} else {
std::vector<ddc::Coordinate<X>> breaks(nx + 1);
for (std::size_t i(0); i < nx + 1; ++i) {
breaks[i] = ddc::Coordinate<X>(static_cast<double>(i) / nx);
}
ddc::init_discrete_space<BSplinesX<NonUniform, s_degree_x>>(breaks);
}
ddc::init_discrete_space<DDimX<NonUniform, s_degree_x>>(
ddc::GrevilleInterpolationPoints<
BSplinesX<NonUniform, s_degree_x>,
ddc::BoundCond::PERIODIC,
ddc::BoundCond::PERIODIC>::
template get_sampling<DDimX<NonUniform, s_degree_x>>());
ddc::DiscreteDomain<DDimY> y_domain = ddc::init_discrete_space<DDimY>(DDimY::init<DDimY>(
ddc::Coordinate<Y>(-1.),
ddc::Coordinate<Y>(1.),
ddc::DiscreteVector<DDimY>(state.range(1))));
ddc::DiscreteVector<DDimY>(ny)));

auto const x_domain = ddc::GrevilleInterpolationPoints<
BSplinesX,
BSplinesX<NonUniform, s_degree_x>,
ddc::BoundCond::PERIODIC,
ddc::BoundCond::PERIODIC>::get_domain<DDimX>();
ddc::BoundCond::PERIODIC>::template get_domain<DDimX<NonUniform, s_degree_x>>();
ddc::Chunk density_alloc(
ddc::DiscreteDomain<DDimX, DDimY>(x_domain, y_domain),
ddc::DeviceAllocator<double>());
ddc::DiscreteDomain<DDimX<NonUniform, s_degree_x>, DDimY>(x_domain, y_domain),
ddc::KokkosAllocator<double, typename ExecSpace::memory_space>());
ddc::ChunkSpan const density = density_alloc.span_view();
// Initialize the density on the main domain
ddc::DiscreteDomain<DDimX, DDimY> x_mesh
= ddc::DiscreteDomain<DDimX, DDimY>(x_domain, y_domain);
ddc::DiscreteDomain<DDimX<NonUniform, s_degree_x>, DDimY> x_mesh
= ddc::DiscreteDomain<DDimX<NonUniform, s_degree_x>, DDimY>(x_domain, y_domain);
ddc::parallel_for_each(
ExecSpace(),
x_mesh,
KOKKOS_LAMBDA(ddc::DiscreteElement<DDimX, DDimY> const ixy) {
double const x = ddc::coordinate(ddc::select<DDimX>(ixy));
KOKKOS_LAMBDA(ddc::DiscreteElement<DDimX<NonUniform, s_degree_x>, DDimY> const ixy) {
double const x = ddc::coordinate(ddc::select<DDimX<NonUniform, s_degree_x>>(ixy));
double const y = ddc::coordinate(ddc::select<DDimY>(ixy));
density(ixy) = 9.999 * Kokkos::exp(-(x * x + y * y) / 0.1 / 2);
// initial_density(ixy) = 9.999 * ((x * x + y * y) < 0.25);
});
ddc::SplineBuilder<
Kokkos::DefaultExecutionSpace,
Kokkos::DefaultExecutionSpace::memory_space,
BSplinesX,
DDimX,
ExecSpace,
typename ExecSpace::memory_space,
BSplinesX<NonUniform, s_degree_x>,
DDimX<NonUniform, s_degree_x>,
ddc::BoundCond::PERIODIC,
ddc::BoundCond::PERIODIC,
ddc::SplineSolver::GINKGO,
DDimX,
Backend,
DDimX<NonUniform, s_degree_x>,
DDimY>
spline_builder(x_mesh, state.range(2), state.range(3));
spline_builder(x_mesh, cols_per_chunk, preconditionner_max_block_size);
ddc::PeriodicExtrapolationRule<X> periodic_extrapolation;
ddc::SplineEvaluator<
Kokkos::DefaultExecutionSpace,
Kokkos::DefaultExecutionSpace::memory_space,
BSplinesX,
DDimX,
ExecSpace,
typename ExecSpace::memory_space,
BSplinesX<NonUniform, s_degree_x>,
DDimX<NonUniform, s_degree_x>,
ddc::PeriodicExtrapolationRule<X>,
ddc::PeriodicExtrapolationRule<X>,
DDimX,
DDimX<NonUniform, s_degree_x>,
DDimY>
spline_evaluator(periodic_extrapolation, periodic_extrapolation);
ddc::Chunk coef_alloc(
spline_builder.batched_spline_domain(),
ddc::KokkosAllocator<double, Kokkos::DefaultExecutionSpace::memory_space>());
ddc::KokkosAllocator<double, typename ExecSpace::memory_space>());
ddc::ChunkSpan coef = coef_alloc.span_view();
ddc::Chunk feet_coords_alloc(
spline_builder.batched_interpolation_domain(),
ddc::KokkosAllocator<
ddc::Coordinate<X, Y>,
Kokkos::DefaultExecutionSpace::memory_space>());
ddc::KokkosAllocator<ddc::Coordinate<X, Y>, typename ExecSpace::memory_space>());
ddc::ChunkSpan feet_coords = feet_coords_alloc.span_view();

for (auto _ : state) {
Kokkos::Profiling::pushRegion("FeetCharacteristics");
ddc::parallel_for_each(
ExecSpace(),
feet_coords.domain(),
KOKKOS_LAMBDA(ddc::DiscreteElement<DDimX, DDimY> const e) {
KOKKOS_LAMBDA(ddc::DiscreteElement<DDimX<NonUniform, s_degree_x>, DDimY> const e) {
feet_coords(e) = ddc::Coordinate<X, Y>(
ddc::coordinate(ddc::select<DDimX>(e))
ddc::coordinate(ddc::select<DDimX<NonUniform, s_degree_x>>(e))
- ddc::Coordinate<X>(0.0176429863),
ddc::coordinate(ddc::select<DDimY>(e)));
});
Expand All @@ -163,9 +191,7 @@ static void characteristics_advection(benchmark::State& state)
}
monitorFlag = false;
monitorThread.join();
state.SetBytesProcessed(
int64_t(state.iterations())
* int64_t(state.range(0) * state.range(1) * sizeof(double)));
state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(nx * ny * sizeof(double)));
state.counters["gpu_mem_occupancy"] = maxUsedMem - initUsedMem;
////////////////////////////////////////////////////
/// --------------- HUGE WARNING --------------- ///
Expand All @@ -175,55 +201,147 @@ static void characteristics_advection(benchmark::State& state)
/// The reason is it acts on underlying global ///
/// variables, which is always a bad idea. ///
////////////////////////////////////////////////////
ddc::detail::g_discrete_space_dual<BSplinesX>.reset();
ddc::detail::g_discrete_space_dual<ddc::UniformBsplinesKnots<BSplinesX>>.reset();
ddc::detail::g_discrete_space_dual<DDimX>.reset();
ddc::detail::g_discrete_space_dual<BSplinesX<NonUniform, s_degree_x>>.reset();
if constexpr (!NonUniform::value) {
ddc::detail::g_discrete_space_dual<ddc::UniformBsplinesKnots<BSplinesX<NonUniform, s_degree_x>>>.reset();
} else {
ddc::detail::g_discrete_space_dual<ddc::NonUniformBsplinesKnots<BSplinesX<NonUniform, s_degree_x>>>.reset();
}
ddc::detail::g_discrete_space_dual<DDimX<NonUniform, s_degree_x>>.reset();
ddc::detail::g_discrete_space_dual<DDimY>.reset();
////////////////////////////////////////////////////
}

// Tuning : 512 cols and 8 precond on CPU, 16384 cols and 1 precond on GPU
// Runtime dispatcher towards the compile-time variants of
// characteristics_advection_unitary.
//
// The execution space, the uniformity of the B-splines and the spline degree
// are template parameters, hence fixed at compile time. All 12 combinations
// are therefore instantiated explicitly, even though a given benchmark run
// calls only one of them. The variant is selected from the first three
// benchmark arguments:
//   state.range(0): 0 -> Kokkos::DefaultHostExecutionSpace,
//                   1 -> Kokkos::DefaultExecutionSpace
//   state.range(1): 0 -> std::false_type, 1 -> std::true_type (NonUniform)
//   state.range(2): spline degree, one of 3, 4, 5
// The remaining arguments are left untouched for the selected variant.
//
// If any of the three selectors is outside the ranges above, the run is
// reported through state.SkipWithError() instead of indexing the table out
// of bounds.
static void characteristics_advection(benchmark::State& state)
{
    using bench_fn = void (*)(benchmark::State&);

    constexpr std::size_t min_degree = 3;
    constexpr std::size_t nb_degrees = 3; // degrees 3, 4 and 5
    constexpr std::size_t nb_uniformities = 2; // uniform, non-uniform
    constexpr std::size_t nb_exec_spaces = 2; // host, default
    constexpr std::size_t nb_benchs = nb_exec_spaces * nb_uniformities * nb_degrees;

    // Layout: [exec space][uniformity][degree - min_degree], flattened row-major.
    // Plain function pointers are enough here: nothing is captured, so there is
    // no need for type-erased std::function objects rebuilt on every call.
    static constexpr std::array<bench_fn, nb_benchs> benchs = {
            &characteristics_advection_unitary<
                    Kokkos::DefaultHostExecutionSpace,
                    std::false_type,
                    3>,
            &characteristics_advection_unitary<
                    Kokkos::DefaultHostExecutionSpace,
                    std::false_type,
                    4>,
            &characteristics_advection_unitary<
                    Kokkos::DefaultHostExecutionSpace,
                    std::false_type,
                    5>,
            &characteristics_advection_unitary<
                    Kokkos::DefaultHostExecutionSpace,
                    std::true_type,
                    3>,
            &characteristics_advection_unitary<
                    Kokkos::DefaultHostExecutionSpace,
                    std::true_type,
                    4>,
            &characteristics_advection_unitary<
                    Kokkos::DefaultHostExecutionSpace,
                    std::true_type,
                    5>,
            &characteristics_advection_unitary<Kokkos::DefaultExecutionSpace, std::false_type, 3>,
            &characteristics_advection_unitary<Kokkos::DefaultExecutionSpace, std::false_type, 4>,
            &characteristics_advection_unitary<Kokkos::DefaultExecutionSpace, std::false_type, 5>,
            &characteristics_advection_unitary<Kokkos::DefaultExecutionSpace, std::true_type, 3>,
            &characteristics_advection_unitary<Kokkos::DefaultExecutionSpace, std::true_type, 4>,
            &characteristics_advection_unitary<Kokkos::DefaultExecutionSpace, std::true_type, 5>};

    auto const exec_space_id = state.range(0);
    auto const non_uniform_id = state.range(1);
    auto const degree = state.range(2);

    bool const valid_selectors = exec_space_id >= 0
                                 && exec_space_id < static_cast<long long>(nb_exec_spaces)
                                 && non_uniform_id >= 0
                                 && non_uniform_id < static_cast<long long>(nb_uniformities)
                                 && degree >= static_cast<long long>(min_degree)
                                 && degree < static_cast<long long>(min_degree + nb_degrees);
    if (!valid_selectors) {
        state.SkipWithError(
                "characteristics_advection: expected range(0) in {0,1}, range(1) in {0,1} "
                "and range(2) in {3,4,5}");
        return;
    }

    // Run the desired bench
    std::size_t const bench_id
            = static_cast<std::size_t>(exec_space_id) * (nb_uniformities * nb_degrees)
              + static_cast<std::size_t>(non_uniform_id) * nb_degrees
              + (static_cast<std::size_t>(degree) - min_degree);
    benchs[bench_id](state);
}

#ifdef KOKKOS_ENABLE_CUDA
std::string chip = "gpu";
// Reference parameters: the benchmarks sweep on two parameters and fix all the others according to those reference parameters.
bool on_gpu_ref = true;
bool non_uniform_ref = false;
std::size_t degree_x_ref = 3;
#if (defined(KOKKOS_ENABLE_CUDA) or defined(KOKKOS_ENABLE_HIP))
std::size_t cols_per_chunk_ref = 65535;
unsigned int preconditionner_max_block_size_ref = 1u;
#elif defined(KOKKOS_ENABLE_OPENMP)
std::string chip = "cpu";
std::size_t cols_per_chunk_ref = 8192;
unsigned int preconditionner_max_block_size_ref = 32u;
unsigned int preconditionner_max_block_size_ref = 1u;
#elif defined(KOKKOS_ENABLE_SERIAL)
std::string chip = "cpu";
std::size_t cols_per_chunk_ref = 8192;
unsigned int preconditionner_max_block_size_ref = 32u;
#endif
// std::size_t ny_ref = 100000;
std::size_t ny_ref = 1000;

// Sweep on spline order
std::string name = "degree_x";
BENCHMARK(characteristics_advection)
->RangeMultiplier(2)
->Ranges(
{{false, true},
{false, true},
{3, 5},
{64, 1024},
{ny_ref, ny_ref},
{cols_per_chunk_ref, cols_per_chunk_ref},
{preconditionner_max_block_size_ref, preconditionner_max_block_size_ref}})
->MinTime(3)
->UseRealTime();
/*
// Sweep on ny
std::string name = "ny";
BENCHMARK(characteristics_advection)
->RangeMultiplier(2)
->Ranges(
{{64, 1024},
{{false, true},
{false, true},
{degree_x_ref, degree_x_ref},
{64, 1024},
{100, 200000},
{cols_per_chunk_ref, cols_per_chunk_ref},
{preconditionner_max_block_size_ref, preconditionner_max_block_size_ref}})
->MinTime(3)
->UseRealTime();
*/
/*
// Sweep on cols_per_chunk
std::string name = "cols_per_chunk";
BENCHMARK(characteristics_advection)
->RangeMultiplier(2)
->Ranges({{64, 1024}, {100000, 100000}, {64,65535}, {preconditionner_max_block_size_ref, preconditionner_max_block_size_ref}})
->MinTime(3)->UseRealTime();
->Ranges(
{{false, true},
{false, true},
{degree_x_ref, degree_x_ref},
{64, 1024},
{ny_ref, ny_ref},
{64, 65535},
{preconditionner_max_block_size_ref, preconditionner_max_block_size_ref}})
->MinTime(3)
->UseRealTime();
*/
/*
// Sweep on preconditionner_max_block_size
std::string name = "preconditionner_max_block_size";
BENCHMARK(characteristics_advection)
->RangeMultiplier(2)
->Ranges({{64, 1024}, {100000, 100000}, {cols_per_chunk_ref, cols_per_chunk_ref}, {1, 32}})
->MinTime(3)->UseRealTime();
->Ranges(
{{on_gpu_ref, on_gpu_ref},
{false, true},
{degree_x_ref, degree_x_ref},
{64, 1024},
{ny_ref, ny_ref},
{cols_per_chunk_ref, cols_per_chunk_ref},
{1, 32}})
->MinTime(3)
->UseRealTime();
*/

int main(int argc, char** argv)
{
::benchmark::Initialize(&argc, argv);
::benchmark::AddCustomContext("chip", chip);
::benchmark::AddCustomContext("name", name);
::benchmark::
AddCustomContext("backend", Backend == ddc::SplineSolver::GINKGO ? "GINKGO" : "LAPACK");
::benchmark::AddCustomContext("cols_per_chunk_ref", std::to_string(cols_per_chunk_ref));
::benchmark::AddCustomContext(
"preconditionner_max_block_size_ref",
Expand Down
Loading