Skip to content

Commit

Permalink
Merge pull request #354 from bluescarni/pr/opt
Browse files Browse the repository at this point in the history
Performance optimisations
  • Loading branch information
bluescarni authored Oct 30, 2023
2 parents ba51dd0 + 442ac24 commit 2a0d493
Show file tree
Hide file tree
Showing 17 changed files with 731 additions and 65 deletions.
1 change: 1 addition & 0 deletions benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ ADD_HEYOKA_BENCHMARK(bandf)
ADD_HEYOKA_BENCHMARK(bandf_odeint)
ADD_HEYOKA_BENCHMARK(penc_comparison)
ADD_HEYOKA_BENCHMARK(large_cfunc)
ADD_HEYOKA_BENCHMARK(sims_flanagan_jac)

if(HEYOKA_WITH_MPPP AND mp++_WITH_MPFR)
ADD_HEYOKA_BENCHMARK(pendulum_mp)
Expand Down
157 changes: 157 additions & 0 deletions benchmark/sims_flanagan_jac.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
// Copyright 2020, 2021, 2022, 2023 Francesco Biscani ([email protected]), Dario Izzo ([email protected])
//
// This file is part of the heyoka library.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#include <algorithm>
#include <array>
#include <iostream>
#include <iterator>
#include <stdexcept>
#include <tuple>
#include <utility>
#include <vector>

#include <boost/program_options.hpp>

#include <spdlog/spdlog.h>

#include <heyoka/expression.hpp>
#include <heyoka/llvm_state.hpp>
#include <heyoka/logging.hpp>
#include <heyoka/math/cos.hpp>
#include <heyoka/math/kepDE.hpp>
#include <heyoka/math/sin.hpp>
#include <heyoka/math/sqrt.hpp>

#include <fmt/ranges.h>

using namespace heyoka;

using v_ex_t = std::array<expression, 3>;

const auto mu = make_vars("mu")[0];
const auto tm = make_vars("t")[0];

std::pair<v_ex_t, v_ex_t> make_lp(const v_ex_t &pos_0, const v_ex_t &vel_0)
{
const auto &[x0, y0, z0] = pos_0;
const auto &[vx0, vy0, vz0] = vel_0;

auto v02 = vx0 * vx0 + vy0 * vy0 + vz0 * vz0;
auto r0 = sqrt(x0 * x0 + y0 * y0 + z0 * z0);
auto eps = v02 * 0.5 - mu / r0;
auto a = -mu / (2. * eps);

auto sigma0 = (x0 * vx0 + y0 * vy0 + z0 * vz0) / sqrt(mu);
auto s0 = sigma0 / sqrt(a);
auto c0 = 1. - r0 / a;

auto n = sqrt(mu / (a * a * a));
auto DM = n * tm;

auto DE = kepDE(s0, c0, DM);

auto cDE = cos(DE);
auto sDE = sin(DE);

auto r = a + (r0 - a) * cDE + sigma0 * sqrt(a) * sDE;

auto F = 1. - a / r0 * (1. - cDE);
auto G = a * sigma0 / sqrt(mu) * (1. - cDE) + r0 * sqrt(a / mu) * sDE;
auto Ft = -sqrt(mu * a) / (r * r0) * sDE;
auto Gt = 1. - a / r * (1. - cDE);

return {{{F * x0 + G * vx0, F * y0 + G * vy0, F * z0 + G * vz0}},
{{Ft * x0 + Gt * vx0, Ft * y0 + Gt * vy0, Ft * z0 + Gt * vz0}}};
}

int main(int argc, char *argv[])
{
namespace po = boost::program_options;

create_logger();
auto logger = spdlog::get("heyoka");

set_logger_level_trace();

unsigned N{};

po::options_description desc("Options");

desc.add_options()("help", "produce help message")("N", po::value<unsigned>(&N)->default_value(20u),
"number of segments");

po::variables_map vm;
po::store(po::parse_command_line(argc, argv, desc), vm);
po::notify(vm);

if (vm.count("help") != 0u) {
std::cout << desc << "\n";
return 0;
}

if (N == 0u) {
throw std::invalid_argument("The number of segments cannot be zero");
}

auto [x0, y0, z0] = make_vars("x0", "y0", "z0");
auto [vx0, vy0, vz0] = make_vars("vx0", "vy0", "vz0");

std::vector<v_ex_t> Dvs;
for (auto i = 0u; i < N; ++i) {
auto [Dvx, Dvy, Dvz]
= make_vars(fmt::format("Dvx{}", i + 1u), fmt::format("Dvy{}", i + 1u), fmt::format("Dvz{}", i + 1u));
Dvs.push_back(v_ex_t{Dvx, Dvy, Dvz});
}

auto [pos_f, vel_f] = make_lp({x0, y0, z0}, {vx0, vy0, vz0});

for (auto i = 0u; i < N; ++i) {
vel_f[0] += Dvs[i][0];
vel_f[1] += Dvs[i][1];
vel_f[2] += Dvs[i][2];

std::tie(pos_f, vel_f) = make_lp(pos_f, vel_f);
}

std::vector<expression> Dvs_list;
for (const auto &Dv : Dvs) {
Dvs_list.push_back(Dv[0]);
Dvs_list.push_back(Dv[1]);
Dvs_list.push_back(Dv[2]);
}

std::vector<expression> diff_vars_list = {x0, y0, z0, vx0, vy0, vz0};
diff_vars_list.insert(diff_vars_list.end(), Dvs_list.begin(), Dvs_list.end());

auto dt
= diff_tensors({pos_f[0], pos_f[1], pos_f[2], vel_f[0], vel_f[1], vel_f[2]}, kw::diff_args = diff_vars_list);

std::vector<expression> jac;
auto jac_sr = dt.get_derivatives(1);
std::transform(jac_sr.begin(), jac_sr.end(), std::back_inserter(jac), [](const auto &p) { return p.second; });

llvm_state s;

std::vector<expression> vars_list = {x0, y0, z0, vx0, vy0, vz0, mu, tm};
vars_list.insert(vars_list.end(), Dvs_list.begin(), Dvs_list.end());

logger->trace("Adding cfunc");

add_cfunc<double>(s, "func", jac, kw::vars = vars_list);

logger->trace("Compiling cfunc");

s.compile();

logger->trace("Looking up cfunc");

auto *fptr
= reinterpret_cast<void (*)(double *, const double *, const double *, const double *)>(s.jit_lookup("func"));

logger->trace("All done");
}
8 changes: 8 additions & 0 deletions doc/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,14 @@ New
system (`#352 <https://github.com/bluescarni/heyoka/pull/352>`__).
Taylor derivatives are not implemented yet.

Changes
~~~~~~~

- Substantial performance improvements in the computation of
derivative tensors of large expressions with a high degree
of internal redundancy
(`#354 <https://github.com/bluescarni/heyoka/pull/354>`__).

Fix
~~~

Expand Down
2 changes: 2 additions & 0 deletions doc/conf.py.in
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ html_theme_options = {
"repository_url": "https://github.com/bluescarni/heyoka",
"use_repository_button": True,
"use_issues_button": True,
# See: https://github.com/pydata/pydata-sphinx-theme/issues/1492
"navigation_with_keys": False,
}


Expand Down
67 changes: 67 additions & 0 deletions include/heyoka/detail/fast_unordered.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// Copyright 2020, 2021, 2022, 2023 Francesco Biscani ([email protected]), Dario Izzo ([email protected])
//
// This file is part of the heyoka library.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef HEYOKA_DETAIL_FAST_UNORDERED_HPP
#define HEYOKA_DETAIL_FAST_UNORDERED_HPP

#include <heyoka/config.hpp>

#include <boost/version.hpp>

#if (BOOST_VERSION / 100000 > 1) || (BOOST_VERSION / 100000 == 1 && BOOST_VERSION / 100 % 1000 >= 81)

#define HEYOKA_HAVE_BOOST_UNORDERED_FLAT

#endif

#if defined(HEYOKA_HAVE_BOOST_UNORDERED_FLAT)

#include <boost/unordered/unordered_flat_map.hpp>
#include <boost/unordered/unordered_flat_set.hpp>

#else

#include <unordered_map>
#include <unordered_set>

#endif

HEYOKA_BEGIN_NAMESPACE

namespace detail
{

template <typename... Args>
using fast_uset =
#if defined(HEYOKA_HAVE_BOOST_UNORDERED_FLAT)
boost::unordered_flat_set<Args...>
#else
std::unordered_set<Args...>
#endif
;

template <typename... Args>
using fast_umap =
#if defined(HEYOKA_HAVE_BOOST_UNORDERED_FLAT)
boost::unordered_flat_map<Args...>
#else
std::unordered_map<Args...>
#endif
;

} // namespace detail

HEYOKA_END_NAMESPACE

#if defined(HEYOKA_HAVE_BOOST_UNORDERED_FLAT)

#undef HEYOKA_HAVE_BOOST_UNORDERED_FLAT

#endif

#endif
33 changes: 3 additions & 30 deletions include/heyoka/detail/func_cache.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,47 +16,20 @@
#define HEYOKA_DETAIL_FUNC_CACHE_HPP

#include <heyoka/config.hpp>

#include <boost/version.hpp>

#if (BOOST_VERSION / 100000 > 1) || (BOOST_VERSION / 100000 == 1 && BOOST_VERSION / 100 % 1000 >= 81)

#include <boost/unordered/unordered_flat_map.hpp>
#include <boost/unordered/unordered_flat_set.hpp>
#include <heyoka/detail/fast_unordered.hpp>

HEYOKA_BEGIN_NAMESPACE

namespace detail
{

using funcptr_set = boost::unordered_flat_set<const void *>;
using funcptr_set = fast_uset<const void *>;

template <typename T>
using funcptr_map = boost::unordered_flat_map<const void *, T>;
using funcptr_map = fast_umap<const void *, T>;

} // namespace detail

HEYOKA_END_NAMESPACE

#else

#include <unordered_map>
#include <unordered_set>

HEYOKA_BEGIN_NAMESPACE

namespace detail
{

using funcptr_set = std::unordered_set<const void *>;

template <typename T>
using funcptr_map = std::unordered_map<const void *, T>;

} // namespace detail

HEYOKA_END_NAMESPACE

#endif

#endif
2 changes: 2 additions & 0 deletions include/heyoka/expression.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,8 @@ class HEYOKA_DLL_PUBLIC dtens

[[nodiscard]] subrange get_derivatives(std::uint32_t, std::uint32_t) const;
[[nodiscard]] subrange get_derivatives(std::uint32_t) const;
[[nodiscard]] std::vector<expression> get_gradient() const;
[[nodiscard]] std::vector<expression> get_jacobian() const;
};

HEYOKA_DLL_PUBLIC std::ostream &operator<<(std::ostream &, const dtens &);
Expand Down
55 changes: 52 additions & 3 deletions src/expression_basic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include <variant>
#include <vector>

#include <boost/container_hash/hash.hpp>
#include <boost/safe_numerics/safe_integer.hpp>

#include <llvm/IR/Function.h>
Expand Down Expand Up @@ -494,12 +495,60 @@ std::size_t hash(funcptr_map<std::size_t> &func_map, const expression &ex) noexc
ex.value());
}

// NOLINTNEXTLINE(bugprone-exception-escape)
// NOLINTNEXTLINE(bugprone-exception-escape,misc-no-recursion)
std::size_t hash(const expression &ex) noexcept
{
detail::funcptr_map<std::size_t> func_map;
// NOTE: we implement an optimisation here: if either the expression is **not** a function,
// or if it is a function and none of its arguments are functions (i.e., it is a non-recursive
// expression), then we do not recurse into detail::hash() and we do not create/update the
// funcptr_map cache. This allows us to avoid memory allocations, in an effort to optimise the
// computation of hashes in decompositions, where the elementary subexpressions are
// non-recursive by definition.
// NOTE: we must be very careful with this optimisation: we need to make sure the hash computation
// result is the same whether the optimisation is enabled or not. That is, the hash
// **must** be the same regardless of whether a non-recursive expression is standalone
// or it is part of a larger expression.
return std::visit(
// NOLINTNEXTLINE(misc-no-recursion)
[&ex](const auto &v) {
using type = detail::uncvref_t<decltype(v)>;

if constexpr (std::is_same_v<type, func>) {
const auto no_func_args = std::none_of(v.args().begin(), v.args().end(), [](const auto &x) {
return std::holds_alternative<func>(x.value());
});

if (no_func_args) {
// NOTE: it is very important that here we replicate *exactly* the logic
// of func::hash(), so that we produce the same hash value that would
// be produced if ex were part of a larger expression.
std::size_t seed = std::hash<std::string>{}(v.get_name());

for (const auto &arg : v.args()) {
// NOTE: for non-function arguments, calling hash(arg)
// is identical to calling detail::hash(func_map, arg)
// (as we do in func::hash()):
// both ultimately end up using directly the std::hash
// specialisation on the arg.
boost::hash_combine(seed, hash(arg));
}

return hash(func_map, ex);
return seed;
} else {
// The regular, non-optimised path.
detail::funcptr_map<std::size_t> func_map;

return hash(func_map, ex);
}
} else {
// ex is not a function, compute its hash directly
// via a std::hash specialisation.
// NOTE: this is the same logic implemented in detail::hash(),
// except we avoid the unnecessary creation of a funcptr_map.
return std::hash<type>{}(v);
}
},
ex.value());
}

} // namespace detail
Expand Down
Loading

0 comments on commit 2a0d493

Please sign in to comment.