Skip to content

Commit

Permalink
Merge pull request #446 from bluescarni/pr/parjittaylor
Browse files Browse the repository at this point in the history
Parallel compilation for Taylor integrators
  • Loading branch information
bluescarni authored Aug 31, 2024
2 parents e704228 + 57189c7 commit 985efad
Show file tree
Hide file tree
Showing 48 changed files with 2,332 additions and 1,436 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ set(HEYOKA_SRC_FILES
"${CMAKE_CURRENT_SOURCE_DIR}/src/detail/tm_data.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/src/detail/debug.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/src/detail/aligned_buffer.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/src/detail/type_traits.cpp"
# NOTE: this will be an empty file in case we are not
# building with support for real.
"${CMAKE_CURRENT_SOURCE_DIR}/src/detail/real_helpers.cpp"
Expand Down
13 changes: 12 additions & 1 deletion benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
# NOTE: we look for Boost in CONFIG mode first, as that has become the official supported way
# of locating Boost in recent Boost/CMake versions. If we fail, we try again in
# MODULE mode as last resort.
# NOTE: don't find a specific version as we already checked
# outside that the Boost version is appropriate.
find_package(Boost REQUIRED COMPONENTS program_options)
find_package(Boost QUIET COMPONENTS program_options CONFIG)
if(NOT ${Boost_FOUND})
message(STATUS "Boost not found in CONFIG mode, retrying in MODULE mode.")
find_package(Boost QUIET MODULE COMPONENTS program_options)
endif()
if(NOT ${Boost_FOUND})
message(FATAL_ERROR "Could not locate Boost in either CONFIG or MODULE mode.")
endif()
if(NOT TARGET Boost::program_options)
message(STATUS "The 'Boost::program_options' imported target is missing, creating it.")
add_library(Boost::program_options UNKNOWN IMPORTED)
Expand Down Expand Up @@ -73,6 +83,7 @@ ADD_HEYOKA_BENCHMARK(sims_flanagan_jac)
ADD_HEYOKA_BENCHMARK(cfunc_mt)
ADD_HEYOKA_BENCHMARK(diff_tensors)
ADD_HEYOKA_BENCHMARK(var_construction)
ADD_HEYOKA_BENCHMARK(sgp4_dynamics)

if(HEYOKA_WITH_MPPP AND mp++_WITH_MPFR)
ADD_HEYOKA_BENCHMARK(pendulum_mp)
Expand Down
9 changes: 8 additions & 1 deletion benchmark/n_body_creation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <cstdint>
#include <iostream>
#include <numeric>
#include <variant>
#include <vector>

#include <boost/program_options.hpp>
Expand Down Expand Up @@ -56,7 +57,13 @@ int main(int argc, char *argv[])
std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - start)
.count());

std::cout << ta.get_llvm_state().get_ir() << '\n';
if (compact_mode) {
for (auto ir : std::get<1>(ta.get_llvm_state()).get_ir()) {
std::cout << ir << '\n';
}
} else {
std::cout << std::get<0>(ta.get_llvm_state()).get_ir() << '\n';
}

auto counter = 0u;
for (const auto &ex : ta.get_decomposition()) {
Expand Down
91 changes: 91 additions & 0 deletions benchmark/sgp4_dynamics.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
// Copyright 2020, 2021, 2022, 2023, 2024 Francesco Biscani ([email protected]), Dario Izzo ([email protected])
//
// This file is part of the heyoka library.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#include <utility>
#include <vector>

#include <boost/program_options.hpp>

#include <heyoka/expression.hpp>
#include <heyoka/kw.hpp>
#include <heyoka/math/sum.hpp>
#include <heyoka/math/time.hpp>
#include <heyoka/model/sgp4.hpp>
#include <heyoka/taylor.hpp>

#include <heyoka/logging.hpp>

using namespace heyoka;

std::vector<std::pair<heyoka::expression, heyoka::expression>> construct_sgp4_ode()
{
// Fetch sgp4's formulae.
auto sgp4_func = heyoka::model::sgp4();

// The variable representing tsince in the sgp4 formulae.
const auto tsince = heyoka::expression("tsince");

// In sgp4_func, replace the TLE data with params, and tsince
// with tsince + par[7].
sgp4_func = heyoka::subs(sgp4_func, {{"n0", heyoka::par[0]},
{"e0", heyoka::par[1]},
{"i0", heyoka::par[2]},
{"node0", heyoka::par[3]},
{"omega0", heyoka::par[4]},
{"m0", heyoka::par[5]},
{"bstar", heyoka::par[6]},
{"tsince", tsince + heyoka::par[7]}});

// Compute the rhs of the sgp4 ODE, substituting tsince with the time placeholder.
const auto dt = heyoka::diff_tensors(sgp4_func, {tsince});
auto sgp4_rhs = heyoka::subs(dt.get_jacobian(), {{tsince, heyoka::time}});

// Create the state variables for the ODE.
auto [x, y, z, vx, vy, vz, e, r] = heyoka::make_vars("x", "y", "z", "vx", "vy", "vz", "e", "r");

// Add the differential equation for r.
// NOTE: do **not** use vx/vy/vz here. Apparently, in the SGP4 algorithm, if one takes the
// time derivatives of x/y/z one does not get *exactly* the same values as the vx/vy/vz returned
// by SGP4. In order for the differential equation for r to be correct, we need the the true time
// derivatives of x/y/z, and we cannot use what SGP4 says are the velocities.
sgp4_rhs.push_back(heyoka::sum({x * sgp4_rhs[0], y * sgp4_rhs[1], z * sgp4_rhs[2]}) / r);

// Return the ODE sys.
using heyoka::prime;
return {prime(x) = sgp4_rhs[0], prime(y) = sgp4_rhs[1], prime(z) = sgp4_rhs[2], prime(vx) = sgp4_rhs[3],
prime(vy) = sgp4_rhs[4], prime(vz) = sgp4_rhs[5], prime(e) = sgp4_rhs[6], prime(r) = sgp4_rhs[7]};
}

int main(int argc, char *argv[])
{
set_logger_level_trace();

namespace po = boost::program_options;

bool parjit = false;

po::options_description desc("Options");

desc.add_options()("help", "produce help message")("parjit", "parallel JIT compilation");

po::variables_map vm;
po::store(po::parse_command_line(argc, argv, desc), vm);
po::notify(vm);

if (vm.count("help")) {
std::cout << desc << "\n";
return 0;
}

if (vm.count("parjit")) {
parjit = true;
}

taylor_adaptive<double> ta{construct_sgp4_ode(), std::vector<double>(8u), kw::high_accuracy = true,
kw::compact_mode = true, kw::parjit = parjit};
}
10 changes: 5 additions & 5 deletions config.hpp.in
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,17 @@

// NOTE: handy Boost library for this since 1.73:
// https://www.boost.org/doc/libs/1_73_0/libs/predef/doc/index.html
//
// NOTE: it makes sense here to handle only the GCC/MSVC macros here
// (on the assumption that clang is identical to GCC in this respect).
// No point in using macros provided by compilers we do not test on.
#if defined(_ARCH_PPC) || defined(_M_PPC)

#define HEYOKA_ARCH_PPC

#endif

#if defined(__arm__) || defined(_M_ARM) || defined(__arm) || defined(__aarch64__)
#if defined(__arm__) || defined(_M_ARM) || defined(_M_ARMT) || defined(__aarch64__) || defined(_M_ARM64)

#define HEYOKA_ARCH_ARM

Expand All @@ -71,10 +75,6 @@

#endif

// Maximum number of blocks that can be processed in parallel
// when computing the Taylor derivatives in parallel mode.
#define HEYOKA_CM_PAR_MAX_INVOKE_N 20

// Setup of the ABI versioning and tagging
// machinery.

Expand Down
5 changes: 5 additions & 0 deletions doc/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@ Changelog
New
~~~

- Implement parallel compilation for Taylor integrators
and compiled functions
(`#446 <https://github.com/bluescarni/heyoka/pull/446>`__,
`#444 <https://github.com/bluescarni/heyoka/pull/444>`__,
`#441 <https://github.com/bluescarni/heyoka/pull/441>`__).
- Add the possibility of specifying the LLVM code model
used for JIT compilation
(`#440 <https://github.com/bluescarni/heyoka/pull/440>`__).
Expand Down
2 changes: 1 addition & 1 deletion doc/install.rst
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ heyoka (and all its dependencies) have been compiled with a compiler supporting
128-bit precision
^^^^^^^^^^^^^^^^^

On platforms where ``long double`` is a quadruple-precision floating-point datatype (e.g., 64-bit ARM),
On platforms where ``long double`` is a quadruple-precision floating-point datatype (e.g., 64-bit Linux ARM),
quadruple-precision integrations are always supported via ``long double``. Otherwise,
on platforms such as x86-64, quadruple-precision computations are supported if:

Expand Down
14 changes: 14 additions & 0 deletions doc/known_issues.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,24 @@ Unsolved

The root cause is most likely a code-generation/optimisation problem in LLVM.
This issue is currently under investigation.
* The parallel compilation feature (added in heyoka 6.0.0) is currently disabled
by default on 64-bit ARM processors (this includes the Apple M1 and its successors).
The reason is a likely thread scheduling bug in LLVM's parallel compilation facilities
that very rarely results in a multiply-defined symbol, which ultimately leads to compilation
failure. The issue is currently under investigation by the LLVM developers. In the
meantime, you can explicitly turn on parallel compilation via the ``kw::parjit``
:ref:`keyword argument <kwargs>` when constructing an integrator or a compiled
function.

Solved
======

* Due to an `upstream bug <https://github.com/llvm/llvm-project/issues/88115>`__,
the option for selecting the code used model for JIT compilation
(added in heyoka 6.0.0) is ignored by LLVM and the default code model
is always used. This issue affects all LLVM versions up to and including LLVM 18.
A patch for LLVM 18 that rectifies the issue is available
`here <https://github.com/llvm/llvm-project/pull/90599>`__.
* Certain LLVM versions fail to correctly free memory when objects used to
implement just-in-time compilation are destroyed. In practice this may result
in exhausting the available RAM if many integrators and/or compiled functions
Expand Down
2 changes: 1 addition & 1 deletion doc/tut_extended_precision.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ not only in single and double precision, but also in extended precision. Specifi

How these extended precision floating-point types can be accessed and used from C++ varies depending on the platform. The 80-bit
extended-precision format is available as the C++ ``long double`` type on most platforms based on Intel x86 processors. Quadruple-precision
computations are supported either via the ``long double`` type (e.g., on 64-bit ARM processors) or via the the :cpp:class:`mppp::real128` type
computations are supported either via the ``long double`` type (e.g., on 64-bit Linux ARM) or via the the :cpp:class:`mppp::real128` type
(provided that the platform supports the nonstandard ``__float128`` floating-point type and that heyoka was compiled with support
for the mp++ library - see the :ref:`installation instructions <installation>`).

Expand Down
47 changes: 39 additions & 8 deletions include/heyoka/detail/i_data.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,15 @@

#endif

#include <array>
#include <cstddef>
#include <cstdint>
#include <optional>
#include <variant>
#include <vector>

#include <heyoka/config.hpp>
#include <heyoka/detail/aligned_buffer.hpp>
#include <heyoka/detail/dfloat.hpp>
#include <heyoka/detail/fwd_decl.hpp>
#include <heyoka/llvm_state.hpp>
Expand Down Expand Up @@ -64,8 +67,12 @@ struct taylor_adaptive<T>::i_data {
std::vector<T> m_state;
// Time.
detail::dfloat<T> m_time;
// The LLVM machinery.
llvm_state m_llvm;
// The LLVM (multi)state.
std::variant<llvm_state, llvm_multi_state> m_llvm_state;
// A template LLVM state we keep around to create states
// similar to m_llvm_state as needed. This is created with the
// same settings as m_llvm_state.
llvm_state m_tplt_state;
// Dimension of the system.
std::uint32_t m_dim{};
// Taylor decomposition.
Expand All @@ -78,10 +85,18 @@ struct taylor_adaptive<T>::i_data {
bool m_high_accuracy{};
// Compact mode.
bool m_compact_mode{};
// The steppers.
// The stepper types (non-compact mode).
using step_f_t = void (*)(T *, const T *, const T *, T *, T *) noexcept;
using step_f_e_t = void (*)(T *, const T *, const T *, const T *, T *, T *) noexcept;
std::variant<step_f_t, step_f_e_t> m_step_f;
// The stepper types (compact mode). These have an additional argument - the tape pointer.
using c_step_f_t = void (*)(T *, const T *, const T *, T *, T *, void *) noexcept;
using c_step_f_e_t = void (*)(T *, const T *, const T *, const T *, T *, T *, void *) noexcept;
// The stepper.
std::variant<step_f_t, step_f_e_t, c_step_f_t, c_step_f_e_t> m_step_f;
// Size/alignment for the compact mode tape.
std::array<std::size_t, 2> m_tape_sa{};
// Compact mode tape.
detail::aligned_buffer_t m_tape;
// The vector of parameters.
std::vector<T> m_pars;
// The vector for the Taylor coefficients.
Expand Down Expand Up @@ -118,6 +133,8 @@ struct taylor_adaptive<T>::i_data {
i_data &operator=(i_data &&) noexcept = delete;

~i_data();

void init_cm_tape();
};

template <typename T>
Expand All @@ -128,8 +145,12 @@ struct taylor_adaptive_batch<T>::i_data {
std::vector<T> m_state;
// Times.
std::vector<T> m_time_hi, m_time_lo;
// The LLVM machinery.
llvm_state m_llvm;
// The LLVM (multi)state.
std::variant<llvm_state, llvm_multi_state> m_llvm_state;
// A template LLVM state we keep around to create states
// similar to m_llvm_state as needed. This is created with the
// same settings as m_llvm_state.
llvm_state m_tplt_state;
// Dimension of the system.
std::uint32_t m_dim{};
// Taylor decomposition.
Expand All @@ -142,10 +163,18 @@ struct taylor_adaptive_batch<T>::i_data {
bool m_high_accuracy{};
// Compact mode.
bool m_compact_mode{};
// The steppers.
// The stepper types (non-compact mode).
using step_f_t = void (*)(T *, const T *, const T *, T *, T *) noexcept;
using step_f_e_t = void (*)(T *, const T *, const T *, const T *, T *, T *) noexcept;
std::variant<step_f_t, step_f_e_t> m_step_f;
// The stepper types (compact mode). These have an additional argument - the tape pointer.
using c_step_f_t = void (*)(T *, const T *, const T *, T *, T *, void *) noexcept;
using c_step_f_e_t = void (*)(T *, const T *, const T *, const T *, T *, T *, void *) noexcept;
// The stepper.
std::variant<step_f_t, step_f_e_t, c_step_f_t, c_step_f_e_t> m_step_f;
// Size/alignment for the compact mode tape.
std::array<std::size_t, 2> m_tape_sa{};
// Compact mode tape.
detail::aligned_buffer_t m_tape;
// The vector of parameters.
std::vector<T> m_pars;
// The vector for the Taylor coefficients.
Expand Down Expand Up @@ -204,6 +233,8 @@ struct taylor_adaptive_batch<T>::i_data {
i_data &operator=(i_data &&) noexcept = delete;

~i_data();

void init_cm_tape();
};

HEYOKA_END_NAMESPACE
Expand Down
3 changes: 3 additions & 0 deletions include/heyoka/detail/type_traits.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,9 @@ inline constexpr bool is_x86_fp80 = is_ieee754_binaryN<T, 64>();
template <typename T>
inline constexpr bool is_ieee754_binary128 = is_ieee754_binaryN<T, 113>();

template <typename T>
double get_fp_unit_cost();

} // namespace detail

HEYOKA_END_NAMESPACE
Expand Down
20 changes: 17 additions & 3 deletions include/heyoka/expression.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -699,7 +699,7 @@ auto cfunc_common_opts(const KwArgs &...kw_args)
template <typename>
std::tuple<llvm_multi_state, std::vector<expression>, std::vector<std::array<std::size_t, 2>>>
make_multi_cfunc(llvm_state, const std::string &, const std::vector<expression> &, const std::vector<expression> &,
std::uint32_t, bool, bool, long long);
std::uint32_t, bool, bool, long long, bool);

} // namespace detail

Expand Down Expand Up @@ -818,13 +818,27 @@ class HEYOKA_DLL_PUBLIC_INLINE_CLASS cfunc
}
}();

// Parallel JIT compilation.
auto parjit = [&p]() -> bool {
if constexpr (p.has(kw::parjit)) {
if constexpr (std::integral<std::remove_cvref_t<decltype(p(kw::parjit))>>) {
return static_cast<bool>(p(kw::parjit));
} else {
static_assert(detail::always_false_v<T>, "Invalid type for the 'parjit' keyword argument.");
}
} else {
return detail::default_parjit;
}
}();

// Build the template llvm_state from the keyword arguments.
llvm_state s(kw_args...);

return std::make_tuple(high_accuracy, compact_mode, parallel_mode, prec, batch_size, std::move(s), check_prec);
return std::make_tuple(high_accuracy, compact_mode, parallel_mode, prec, batch_size, std::move(s), check_prec,
parjit);
}
explicit cfunc(std::vector<expression>, std::vector<expression>,
std::tuple<bool, bool, bool, long long, std::optional<std::uint32_t>, llvm_state, bool>);
std::tuple<bool, bool, bool, long long, std::optional<std::uint32_t>, llvm_state, bool, bool>);

HEYOKA_DLL_LOCAL void check_valid(const char *) const;

Expand Down
Loading

0 comments on commit 985efad

Please sign in to comment.