Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Splines improve benchmark #496

Draft
wants to merge 36 commits into
base: main
Choose a base branch
from
Draft
Changes from 8 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
169efae
align reference preconditioner size on default
blegouix Jun 18, 2024
20ce63c
wip
blegouix Jun 18, 2024
c498306
wip
blegouix Jun 18, 2024
cce3cfd
wip
blegouix Jun 18, 2024
8cc0125
sweep on splines degree
blegouix Jun 18, 2024
76c146c
wip
blegouix Jun 18, 2024
b0e0dde
non-uniform/uniform
blegouix Jun 18, 2024
2d217c7
wip
blegouix Jun 18, 2024
28fe503
sweep on exec_space
blegouix Jun 18, 2024
c741315
wip
blegouix Jun 18, 2024
5dba374
wip
blegouix Jun 18, 2024
93ffde4
wip
blegouix Jun 18, 2024
9310a70
wip
blegouix Jun 18, 2024
a4d64b6
wip
blegouix Jun 18, 2024
0692da1
restructurate plotter
blegouix Jun 18, 2024
21c4516
add degree and uniformity
blegouix Jun 18, 2024
d6eaac3
finish plotter restructurateion
blegouix Jun 18, 2024
3232c76
minor
blegouix Jun 19, 2024
d029688
wip
blegouix Jun 20, 2024
d4fa428
wip
blegouix Jun 20, 2024
2a4977f
misc
blegouix Jun 20, 2024
0b7baa4
_suffix the file names
blegouix Jun 20, 2024
67155d6
ifs in plotter
blegouix Jun 20, 2024
fd03143
minor
blegouix Jun 20, 2024
7c897a6
backend suffix
blegouix Jun 20, 2024
d11ae29
minor
blegouix Jun 20, 2024
a1d4475
minor
blegouix Jun 20, 2024
54e71cd
minor
blegouix Jun 20, 2024
5b0bb19
fix
blegouix Jun 20, 2024
bc6510d
fix
blegouix Jun 20, 2024
9065d88
Merge branch 'main' into splines-improve-benchmark
blegouix Jun 25, 2024
386ad4f
Merge branch 'main' into splines-improve-benchmark
blegouix Jun 28, 2024
0770af3
minor
blegouix Jul 1, 2024
26561fe
Merge branch 'splines-improve-benchmark' of github.com:Maison-de-la-S…
blegouix Jul 1, 2024
5527831
lin scale precond
blegouix Jul 2, 2024
b4f969b
Update benchmarks/splines.cpp
blegouix Jul 4, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
158 changes: 116 additions & 42 deletions benchmarks/splines.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,21 +16,26 @@

namespace DDC_HIP_5_7_ANONYMOUS_NAMESPACE_WORKAROUND(SPLINES_CPP)
{
static constexpr std::size_t s_degree_x = 3;

// Tag type for the X dimension: it is the template argument of ddc::Coordinate<X>,
// of the B-splines types and of ddc::PeriodicExtrapolationRule<X> used in this benchmark.
struct X
{
// The X dimension is declared periodic.
static constexpr bool PERIODIC = true;
};

struct BSplinesX : ddc::UniformBSplines<X, s_degree_x>
// B-splines basis along X, parameterized at compile time.
//   NonUniform : type exposing a boolean `value` (the benchmark instantiates it with
//                std::true_type / std::false_type); when true the basis derives from
//                ddc::NonUniformBSplines, otherwise from ddc::UniformBSplines.
//   s_degree_x : spline degree forwarded to the selected base class.
template <typename NonUniform, std::size_t s_degree_x>
struct BSplinesX
: std::conditional_t<
NonUniform::value,
ddc::NonUniformBSplines<X, s_degree_x>,
ddc::UniformBSplines<X, s_degree_x>>
{
};
template <typename NonUniform, std::size_t s_degree_x>
using GrevillePoints = ddc::GrevilleInterpolationPoints<
BSplinesX,
BSplinesX<NonUniform, s_degree_x>,
ddc::BoundCond::PERIODIC,
ddc::BoundCond::PERIODIC>;
struct DDimX : GrevillePoints::interpolation_mesh_type
// Discrete dimension along X: derives from the interpolation mesh type of the Greville
// interpolation points built for BSplinesX<NonUniform, s_degree_x>, so each
// (uniformity, degree) combination gets its own distinct discrete dimension type.
template <typename NonUniform, std::size_t s_degree_x>
struct DDimX : GrevillePoints<NonUniform, s_degree_x>::interpolation_mesh_type
{
};

Expand Down Expand Up @@ -59,8 +64,14 @@ void monitorMemoryAsync(std::mutex& mutex, bool& monitorFlag, size_t& maxUsedMem
}
}

static void characteristics_advection(benchmark::State& state)
template <typename NonUniform, std::size_t s_degree_x>
static void characteristics_advection_unitary(benchmark::State& state)
{
std::size_t nx = state.range(2);
std::size_t ny = state.range(3);
int cols_per_chunk = state.range(4);
int preconditionner_max_block_size = state.range(5);

size_t freeMem = 0;
size_t totalMem = 0;
#if defined(__CUDACC__)
Expand All @@ -80,56 +91,67 @@ static void characteristics_advection(benchmark::State& state)
std::ref(monitorFlag),
std::ref(maxUsedMem));

ddc::init_discrete_space<
BSplinesX>(ddc::Coordinate<X>(-1.), ddc::Coordinate<X>(1.), state.range(0));
ddc::init_discrete_space<DDimX>(ddc::GrevilleInterpolationPoints<
BSplinesX,
ddc::BoundCond::PERIODIC,
ddc::BoundCond::PERIODIC>::get_sampling<DDimX>());
if constexpr (!NonUniform::value) {
ddc::init_discrete_space<BSplinesX<
NonUniform,
s_degree_x>>(ddc::Coordinate<X>(0.), ddc::Coordinate<X>(1.), nx);
} else {
std::vector<ddc::Coordinate<X>> breaks(nx + 1);
for (std::size_t i(0); i < nx + 1; ++i) {
breaks[i] = ddc::Coordinate<X>(static_cast<double>(i) / nx);
}
ddc::init_discrete_space<BSplinesX<NonUniform, s_degree_x>>(breaks);
}
ddc::init_discrete_space<DDimX<NonUniform, s_degree_x>>(
ddc::GrevilleInterpolationPoints<
BSplinesX<NonUniform, s_degree_x>,
ddc::BoundCond::PERIODIC,
ddc::BoundCond::PERIODIC>::
template get_sampling<DDimX<NonUniform, s_degree_x>>());
ddc::DiscreteDomain<DDimY> y_domain = ddc::init_discrete_space<DDimY>(DDimY::init<DDimY>(
ddc::Coordinate<Y>(-1.),
ddc::Coordinate<Y>(1.),
ddc::DiscreteVector<DDimY>(state.range(1))));
ddc::DiscreteVector<DDimY>(ny)));

auto const x_domain = ddc::GrevilleInterpolationPoints<
BSplinesX,
BSplinesX<NonUniform, s_degree_x>,
ddc::BoundCond::PERIODIC,
ddc::BoundCond::PERIODIC>::get_domain<DDimX>();
ddc::BoundCond::PERIODIC>::template get_domain<DDimX<NonUniform, s_degree_x>>();
ddc::Chunk density_alloc(
ddc::DiscreteDomain<DDimX, DDimY>(x_domain, y_domain),
ddc::DiscreteDomain<DDimX<NonUniform, s_degree_x>, DDimY>(x_domain, y_domain),
ddc::DeviceAllocator<double>());
ddc::ChunkSpan const density = density_alloc.span_view();
// Initialize the density on the main domain
ddc::DiscreteDomain<DDimX, DDimY> x_mesh
= ddc::DiscreteDomain<DDimX, DDimY>(x_domain, y_domain);
ddc::DiscreteDomain<DDimX<NonUniform, s_degree_x>, DDimY> x_mesh
= ddc::DiscreteDomain<DDimX<NonUniform, s_degree_x>, DDimY>(x_domain, y_domain);
ddc::parallel_for_each(
x_mesh,
KOKKOS_LAMBDA(ddc::DiscreteElement<DDimX, DDimY> const ixy) {
double const x = ddc::coordinate(ddc::select<DDimX>(ixy));
KOKKOS_LAMBDA(ddc::DiscreteElement<DDimX<NonUniform, s_degree_x>, DDimY> const ixy) {
double const x = ddc::coordinate(ddc::select<DDimX<NonUniform, s_degree_x>>(ixy));
double const y = ddc::coordinate(ddc::select<DDimY>(ixy));
density(ixy) = 9.999 * Kokkos::exp(-(x * x + y * y) / 0.1 / 2);
// initial_density(ixy) = 9.999 * ((x * x + y * y) < 0.25);
});
ddc::SplineBuilder<
Kokkos::DefaultExecutionSpace,
Kokkos::DefaultExecutionSpace::memory_space,
BSplinesX,
DDimX,
BSplinesX<NonUniform, s_degree_x>,
DDimX<NonUniform, s_degree_x>,
ddc::BoundCond::PERIODIC,
ddc::BoundCond::PERIODIC,
ddc::SplineSolver::GINKGO,
DDimX,
DDimX<NonUniform, s_degree_x>,
DDimY>
spline_builder(x_mesh, state.range(2), state.range(3));
spline_builder(x_mesh, cols_per_chunk, preconditionner_max_block_size);
ddc::PeriodicExtrapolationRule<X> periodic_extrapolation;
ddc::SplineEvaluator<
Kokkos::DefaultExecutionSpace,
Kokkos::DefaultExecutionSpace::memory_space,
BSplinesX,
DDimX,
BSplinesX<NonUniform, s_degree_x>,
DDimX<NonUniform, s_degree_x>,
ddc::PeriodicExtrapolationRule<X>,
ddc::PeriodicExtrapolationRule<X>,
DDimX,
DDimX<NonUniform, s_degree_x>,
DDimY>
spline_evaluator(periodic_extrapolation, periodic_extrapolation);
ddc::Chunk coef_alloc(
Expand All @@ -147,9 +169,9 @@ static void characteristics_advection(benchmark::State& state)
Kokkos::Profiling::pushRegion("FeetCharacteristics");
ddc::parallel_for_each(
feet_coords.domain(),
KOKKOS_LAMBDA(ddc::DiscreteElement<DDimX, DDimY> const e) {
KOKKOS_LAMBDA(ddc::DiscreteElement<DDimX<NonUniform, s_degree_x>, DDimY> const e) {
feet_coords(e) = ddc::Coordinate<X, Y>(
ddc::coordinate(ddc::select<DDimX>(e))
ddc::coordinate(ddc::select<DDimX<NonUniform, s_degree_x>>(e))
- ddc::Coordinate<X>(0.0176429863),
ddc::coordinate(ddc::select<DDimY>(e)));
});
Expand All @@ -163,9 +185,7 @@ static void characteristics_advection(benchmark::State& state)
}
monitorFlag = false;
monitorThread.join();
state.SetBytesProcessed(
int64_t(state.iterations())
* int64_t(state.range(0) * state.range(1) * sizeof(double)));
state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(nx * ny * sizeof(double)));
state.counters["gpu_mem_occupancy"] = maxUsedMem - initUsedMem;
////////////////////////////////////////////////////
/// --------------- HUGE WARNING --------------- ///
Expand All @@ -175,49 +195,103 @@ static void characteristics_advection(benchmark::State& state)
/// The reason is it acts on underlying global ///
/// variables, which is always a bad idea. ///
////////////////////////////////////////////////////
ddc::detail::g_discrete_space_dual<BSplinesX>.reset();
ddc::detail::g_discrete_space_dual<ddc::UniformBsplinesKnots<BSplinesX>>.reset();
ddc::detail::g_discrete_space_dual<DDimX>.reset();
ddc::detail::g_discrete_space_dual<BSplinesX<NonUniform, s_degree_x>>.reset();
if constexpr (!NonUniform::value) {
ddc::detail::g_discrete_space_dual<ddc::UniformBsplinesKnots<BSplinesX<NonUniform, s_degree_x>>>.reset();
} else {
ddc::detail::g_discrete_space_dual<ddc::NonUniformBsplinesKnots<BSplinesX<NonUniform, s_degree_x>>>.reset();
}
ddc::detail::g_discrete_space_dual<DDimX<NonUniform, s_degree_x>>.reset();
ddc::detail::g_discrete_space_dual<DDimY>.reset();
////////////////////////////////////////////////////
}

// Tuning : 512 cols and 8 precond on CPU, 16384 cols and 1 precond on GPU
// Runtime dispatcher over the compile-time variants of the benchmark.
//
// Uniformity and spline degree are template parameters of characteristics_advection_unitary,
// so the 6 combinations we may want to benchmark have to be instantiated explicitly even
// though a given run calls only one of them.
//
// state.range(0): 0 for uniform B-splines, 1 for non-uniform B-splines.
// state.range(1): spline degree, 3, 4 or 5.
// The remaining arguments are read by the selected variant.
static void characteristics_advection(benchmark::State& state)
{
    using unitary_bench_type = void (*)(benchmark::State&);

    // Layout: [uniform: degree 3, 4, 5] then [non-uniform: degree 3, 4, 5]
    std::array<unitary_bench_type, 6> const benchs
            = {characteristics_advection_unitary<std::false_type, 3>,
               characteristics_advection_unitary<std::false_type, 4>,
               characteristics_advection_unitary<std::false_type, 5>,
               characteristics_advection_unitary<std::true_type, 3>,
               characteristics_advection_unitary<std::true_type, 4>,
               characteristics_advection_unitary<std::true_type, 5>};

    auto const non_uniform = state.range(0);
    auto const degree = state.range(1);

    // Reject argument combinations that have no instantiated variant instead of indexing
    // out of bounds.
    if (non_uniform < 0 || non_uniform > 1 || degree < 3 || degree > 5) {
        state.SkipWithError(
                "characteristics_advection: expected non_uniform in {0, 1} and degree in {3, 4, 5}");
        return;
    }

    // Run the desired bench
    benchs[static_cast<std::size_t>(non_uniform * 3 + (degree - 3))](state);
}
Copy link
Collaborator Author

@blegouix blegouix Jun 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tpadioleau any opinion about this ? It is hacky but helps a lot to make the benchmarks flexible. I did not find yet any way to write it as "nested loops", maybe there is one still.

Copy link
Collaborator Author

@blegouix blegouix Jun 24, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can imagine an improvement of TypeSeq with something like:

template<std::size_t N, typename List>
struct TypeByIndex;

template<typename Head, typename... Tail>
struct TypeByIndex<0, ddc::detail::TypeSeq<Head, Tail...>> {
    using type = Head;
};

template<std::size_t N, typename Head, typename... Tail>
struct TypeByIndex<N, ddc::detail::TypeSeq<Head, Tail...>> {
    using type = typename TypeByIndex<N-1, ddc::detail::TypeSeq<Tail...>>::type;
};

On top of this, nested parameter pack expansions over TypeSeq<Kokkos::DefaultHostExecutionSpace, Kokkos::DefaultExecutionSpace>, TypeSeq<std::false_type, std::true_type> and TypeSeq<3, 4, 5> should be feasible, but it may be a bit too complicated.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tpadioleau any opinion about this ? It is hacky but helps a lot to make the benchmarks flexible. I did not find yet any way to write it as "nested loops", maybe there is one still.

Don't you think you need different tests for Ginkgo and Lapack ? They have different parameters ? cols_per_chunk_ref and preconditionner_max_block_size_ref do not make sense for Lapack right ?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you want to select a benchmark based on a runtime parameter could we use a string and store the different benchmarks in a std::map<std::string, benchmark> my_benchmarks ? you would just call my_benchmarks.at("host")

Copy link
Collaborator Author

@blegouix blegouix Jun 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tpadioleau any opinion about this ? It is hacky but helps a lot to make the benchmarks flexible. I did not find yet any way to write it as "nested loops", maybe there is one still.

Don't you think you need different tests for Ginkgo and Lapack ? They have different parameters ? cols_per_chunk_ref and preconditionner_max_block_size_ref do not make sense for Lapack right ?

At the moment the choice of backend still has to be changed manually by editing the file. cols_per_chunk_ref and preconditionner_max_block_size_ref indeed do not make sense for Lapack, and it is weird to have them passed as optional arguments of SplineBuilder. But this is the current implementation; we could do something better with a kwArgs struct (similar to what we have in the FFT), but this requires more development.

About the benchmark, the current way to use it is to comment/uncomment the benchmarks we want to run so we just have to not run the preconditioner or cols_per_chunk benchmark with Lapack.

If you want to select a benchmark based on a runtime parameter could we use a string and store the different benchmarks in a std::map<std::string, benchmark> my_benchmarks ? you would just call my_benchmarks.at("host")

Building a richer interface for the benchmark system would be great but requires development work. And I think it would not impact the way benchmarks are performed (it would be a software layer built on top of it). Maybe gtest already provides something to do it.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(my question was more about instantiating all the possible variations of the class we would like to benchmark, explicitly combining compile-time parameters like ExecutionSpace, Degree and Uniformity)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok I think I get the problem, would this google benchmark feature https://github.com/google/benchmark/blob/main/docs/user_guide.md#templated-benchmarks-that-take-arguments help you ?

Copy link
Collaborator Author

@blegouix blegouix Jun 27, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think so, because I think it does not allow "sweeping" over those template parameters (i.e. degree={3,4,5}), so it would be at least less convenient than my solution (if I understand the capabilities of this feature correctly).


// Reference parameters: the benchmarks sweep on two parameters and fix all the others according to those reference parameters.
bool non_uniform_ref = false;
std::size_t degree_x_ref = 3;
#ifdef KOKKOS_ENABLE_CUDA
std::string chip = "gpu";
std::size_t cols_per_chunk_ref = 65535;
unsigned int preconditionner_max_block_size_ref = 1u;
#elif defined(KOKKOS_ENABLE_OPENMP)
std::string chip = "cpu";
std::size_t cols_per_chunk_ref = 8192;
unsigned int preconditionner_max_block_size_ref = 32u;
unsigned int preconditionner_max_block_size_ref = 1u;
#elif defined(KOKKOS_ENABLE_SERIAL)
std::string chip = "cpu";
std::size_t cols_per_chunk_ref = 8192;
unsigned int preconditionner_max_block_size_ref = 32u;
#endif
std::size_t ny_ref = 100000;

// Sweep on uniform/non-uniform and spline order
BENCHMARK(characteristics_advection)
->RangeMultiplier(2)
->Ranges(
{{64, 1024},
{{0, 1},
{3, 5},
{64, 1024},
{ny_ref, ny_ref},
{cols_per_chunk_ref, cols_per_chunk_ref},
{preconditionner_max_block_size_ref, preconditionner_max_block_size_ref}})
->MinTime(3)
->UseRealTime();
/*
// Sweep on nx and ny
BENCHMARK(run)
->RangeMultiplier(2)
->Ranges(
{{non_uniform_ref, non_uniform_ref},
{degree_x_ref, degree_x_ref},
{64, 1024},
{100, 200000},
{cols_per_chunk_ref, cols_per_chunk_ref},
{preconditionner_max_block_size_ref, preconditionner_max_block_size_ref}})
->MinTime(3)
->UseRealTime();
*/
/*
// Sweep on nx and cols_per_chunk
BENCHMARK(characteristics_advection)
->RangeMultiplier(2)
->Ranges({{64, 1024}, {100000, 100000}, {64,65535}, {preconditionner_max_block_size_ref, preconditionner_max_block_size_ref}})
->MinTime(3)->UseRealTime();
->Ranges(
{{non_uniform_ref, non_uniform_ref},
{degree_x_ref, degree_x_ref},
{64, 1024},
{ny_ref, ny_ref},
{64, 65535},
{preconditionner_max_block_size_ref, preconditionner_max_block_size_ref}})
->MinTime(3)
->UseRealTime();
*/
/*
// Sweep on nx and preconditionner_max_block_size
BENCHMARK(characteristics_advection)
->RangeMultiplier(2)
->Ranges({{64, 1024}, {100000, 100000}, {cols_per_chunk_ref, cols_per_chunk_ref}, {1, 32}})
->MinTime(3)->UseRealTime();
->Ranges(
{{non_uniform_ref, non_uniform_ref},
{degree_x_ref, degree_x_ref},
{64, 1024},
{ny_ref, ny_ref},
{cols_per_chunk_ref, cols_per_chunk_ref},
{1, 32}})
->MinTime(3)
->UseRealTime();
*/

int main(int argc, char** argv)
Expand Down
Loading