Merge pull request #446 from bluescarni/pr/parjittaylor

Parallel compilation for Taylor integrators
bluescarni · Aug 31, 2024 · 985efad · 985efad
2 parents e704228 + 57189c7
commit 985efad
Show file tree

Hide file tree

Showing 48 changed files with 2,332 additions and 1,436 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -193,6 +193,7 @@ set(HEYOKA_SRC_FILES
     "${CMAKE_CURRENT_SOURCE_DIR}/src/detail/tm_data.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/src/detail/debug.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/src/detail/aligned_buffer.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/src/detail/type_traits.cpp"
     # NOTE: this will be an empty file in case we are not
     # building with support for real.
     "${CMAKE_CURRENT_SOURCE_DIR}/src/detail/real_helpers.cpp"

diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
@@ -1,6 +1,16 @@
+# NOTE: we look for Boost in CONFIG mode first, as that has become the official supported way
+# of locating Boost in recent Boost/CMake versions. If we fail, we try again in
+# MODULE mode as last resort.
 # NOTE: don't find a specific version as we already checked
 # outside that the Boost version is appropriate.
-find_package(Boost REQUIRED COMPONENTS program_options)
+find_package(Boost QUIET COMPONENTS program_options CONFIG)
+if(NOT ${Boost_FOUND})
+    message(STATUS "Boost not found in CONFIG mode, retrying in MODULE mode.")
+    find_package(Boost QUIET MODULE COMPONENTS program_options)
+endif()
+if(NOT ${Boost_FOUND})
+    message(FATAL_ERROR "Could not locate Boost in either CONFIG or MODULE mode.")
+endif()
 if(NOT TARGET Boost::program_options)
     message(STATUS "The 'Boost::program_options' imported target is missing, creating it.")
     add_library(Boost::program_options UNKNOWN IMPORTED)
@@ -73,6 +83,7 @@ ADD_HEYOKA_BENCHMARK(sims_flanagan_jac)
 ADD_HEYOKA_BENCHMARK(cfunc_mt)
 ADD_HEYOKA_BENCHMARK(diff_tensors)
 ADD_HEYOKA_BENCHMARK(var_construction)
+ADD_HEYOKA_BENCHMARK(sgp4_dynamics)
 
 if(HEYOKA_WITH_MPPP AND mp++_WITH_MPFR)
   ADD_HEYOKA_BENCHMARK(pendulum_mp)

diff --git a/benchmark/n_body_creation.cpp b/benchmark/n_body_creation.cpp
@@ -10,6 +10,7 @@
 #include <cstdint>
 #include <iostream>
 #include <numeric>
+#include <variant>
 #include <vector>
 
 #include <boost/program_options.hpp>
@@ -56,7 +57,13 @@ int main(int argc, char *argv[])
         std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::high_resolution_clock::now() - start)
             .count());
 
-    std::cout << ta.get_llvm_state().get_ir() << '\n';
+    if (compact_mode) {
+        for (auto ir : std::get<1>(ta.get_llvm_state()).get_ir()) {
+            std::cout << ir << '\n';
+        }
+    } else {
+        std::cout << std::get<0>(ta.get_llvm_state()).get_ir() << '\n';
+    }
 
     auto counter = 0u;
     for (const auto &ex : ta.get_decomposition()) {

diff --git a/benchmark/sgp4_dynamics.cpp b/benchmark/sgp4_dynamics.cpp
@@ -0,0 +1,91 @@
+// Copyright 2020, 2021, 2022, 2023, 2024 Francesco Biscani ([email protected]), Dario Izzo ([email protected])
+//
+// This file is part of the heyoka library.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <utility>
+#include <vector>
+
+#include <boost/program_options.hpp>
+
+#include <heyoka/expression.hpp>
+#include <heyoka/kw.hpp>
+#include <heyoka/math/sum.hpp>
+#include <heyoka/math/time.hpp>
+#include <heyoka/model/sgp4.hpp>
+#include <heyoka/taylor.hpp>
+
+#include <heyoka/logging.hpp>
+
+using namespace heyoka;
+
+std::vector<std::pair<heyoka::expression, heyoka::expression>> construct_sgp4_ode()
+{
+    // Fetch sgp4's formulae.
+    auto sgp4_func = heyoka::model::sgp4();
+
+    // The variable representing tsince in the sgp4 formulae.
+    const auto tsince = heyoka::expression("tsince");
+
+    // In sgp4_func, replace the TLE data with params, and tsince
+    // with tsince + par[7].
+    sgp4_func = heyoka::subs(sgp4_func, {{"n0", heyoka::par[0]},
+                                         {"e0", heyoka::par[1]},
+                                         {"i0", heyoka::par[2]},
+                                         {"node0", heyoka::par[3]},
+                                         {"omega0", heyoka::par[4]},
+                                         {"m0", heyoka::par[5]},
+                                         {"bstar", heyoka::par[6]},
+                                         {"tsince", tsince + heyoka::par[7]}});
+
+    // Compute the rhs of the sgp4 ODE, substituting tsince with the time placeholder.
+    const auto dt = heyoka::diff_tensors(sgp4_func, {tsince});
+    auto sgp4_rhs = heyoka::subs(dt.get_jacobian(), {{tsince, heyoka::time}});
+
+    // Create the state variables for the ODE.
+    auto [x, y, z, vx, vy, vz, e, r] = heyoka::make_vars("x", "y", "z", "vx", "vy", "vz", "e", "r");
+
+    // Add the differential equation for r.
+    // NOTE: do **not** use vx/vy/vz here. Apparently, in the SGP4 algorithm, if one takes the
+    // time derivatives of x/y/z one does not get *exactly* the same values as the vx/vy/vz returned
+    // by SGP4. In order for the differential equation for r to be correct, we need the the true time
+    // derivatives of x/y/z, and we cannot use what SGP4 says are the velocities.
+    sgp4_rhs.push_back(heyoka::sum({x * sgp4_rhs[0], y * sgp4_rhs[1], z * sgp4_rhs[2]}) / r);
+
+    // Return the ODE sys.
+    using heyoka::prime;
+    return {prime(x) = sgp4_rhs[0],  prime(y) = sgp4_rhs[1],  prime(z) = sgp4_rhs[2], prime(vx) = sgp4_rhs[3],
+            prime(vy) = sgp4_rhs[4], prime(vz) = sgp4_rhs[5], prime(e) = sgp4_rhs[6], prime(r) = sgp4_rhs[7]};
+}
+
+int main(int argc, char *argv[])
+{
+    set_logger_level_trace();
+
+    namespace po = boost::program_options;
+
+    bool parjit = false;
+
+    po::options_description desc("Options");
+
+    desc.add_options()("help", "produce help message")("parjit", "parallel JIT compilation");
+
+    po::variables_map vm;
+    po::store(po::parse_command_line(argc, argv, desc), vm);
+    po::notify(vm);
+
+    if (vm.count("help")) {
+        std::cout << desc << "\n";
+        return 0;
+    }
+
+    if (vm.count("parjit")) {
+        parjit = true;
+    }
+
+    taylor_adaptive<double> ta{construct_sgp4_ode(), std::vector<double>(8u), kw::high_accuracy = true,
+                               kw::compact_mode = true, kw::parjit = parjit};
+}
diff --git a/config.hpp.in b/config.hpp.in
@@ -53,13 +53,17 @@
 
 // NOTE: handy Boost library for this since 1.73:
 // https://www.boost.org/doc/libs/1_73_0/libs/predef/doc/index.html
+//
+// NOTE: it makes sense here to handle only the GCC/MSVC macros here
+// (on the assumption that clang is identical to GCC in this respect).
+// No point in using macros provided by compilers we do not test on.
 #if defined(_ARCH_PPC) || defined(_M_PPC)
 
 #define HEYOKA_ARCH_PPC
 
 #endif
 
-#if defined(__arm__) || defined(_M_ARM) || defined(__arm) || defined(__aarch64__)
+#if defined(__arm__) || defined(_M_ARM) || defined(_M_ARMT) || defined(__aarch64__) || defined(_M_ARM64)
 
 #define HEYOKA_ARCH_ARM
 
@@ -71,10 +75,6 @@
 
 #endif
 
-// Maximum number of blocks that can be processed in parallel
-// when computing the Taylor derivatives in parallel mode.
-#define HEYOKA_CM_PAR_MAX_INVOKE_N 20
-
 // Setup of the ABI versioning and tagging
 // machinery.
 

diff --git a/doc/changelog.rst b/doc/changelog.rst
@@ -7,6 +7,11 @@ Changelog
 New
 ~~~
 
+- Implement parallel compilation for Taylor integrators
+  and compiled functions
+  (`#446 <https://github.com/bluescarni/heyoka/pull/446>`__,
+  `#444 <https://github.com/bluescarni/heyoka/pull/444>`__,
+  `#441 <https://github.com/bluescarni/heyoka/pull/441>`__).
 - Add the possibility of specifying the LLVM code model
   used for JIT compilation
   (`#440 <https://github.com/bluescarni/heyoka/pull/440>`__).

diff --git a/doc/install.rst b/doc/install.rst
@@ -70,7 +70,7 @@ heyoka (and all its dependencies) have been compiled with a compiler supporting
 128-bit precision
 ^^^^^^^^^^^^^^^^^
 
-On platforms where ``long double`` is a quadruple-precision floating-point datatype (e.g., 64-bit ARM),
+On platforms where ``long double`` is a quadruple-precision floating-point datatype (e.g., 64-bit Linux ARM),
 quadruple-precision integrations are always supported via ``long double``. Otherwise,
 on platforms such as x86-64, quadruple-precision computations are supported if:
 

diff --git a/doc/known_issues.rst b/doc/known_issues.rst
@@ -18,10 +18,24 @@ Unsolved
 
   The root cause is most likely a code-generation/optimisation problem in LLVM.
   This issue is currently under investigation.
+* The parallel compilation feature (added in heyoka 6.0.0) is currently disabled
+  by default on 64-bit ARM processors (this includes the Apple M1 and its successors).
+  The reason is a likely thread scheduling bug in LLVM's parallel compilation facilities
+  that very rarely results in a multiply-defined symbol, which ultimately leads to compilation
+  failure. The issue is currently under investigation by the LLVM developers. In the
+  meantime, you can explicitly turn on parallel compilation via the ``kw::parjit``
+  :ref:`keyword argument <kwargs>` when constructing an integrator or a compiled
+  function.
 
 Solved
 ======
 
+* Due to an `upstream bug <https://github.com/llvm/llvm-project/issues/88115>`__,
+  the option for selecting the code used model for JIT compilation
+  (added in heyoka 6.0.0) is ignored by LLVM and the default code model
+  is always used. This issue affects all LLVM versions up to and including LLVM 18.
+  A patch for LLVM 18 that rectifies the issue is available
+  `here <https://github.com/llvm/llvm-project/pull/90599>`__.
 * Certain LLVM versions fail to correctly free memory when objects used to
   implement just-in-time compilation are destroyed. In practice this may result
   in exhausting the available RAM if many integrators and/or compiled functions

diff --git a/doc/tut_extended_precision.rst b/doc/tut_extended_precision.rst
@@ -11,7 +11,7 @@ not only in single and double precision, but also in extended precision. Specifi
 
 How these extended precision floating-point types can be accessed and used from C++ varies depending on the platform. The 80-bit
 extended-precision format is available as the C++ ``long double`` type on most platforms based on Intel x86 processors. Quadruple-precision
-computations are supported either via the ``long double`` type (e.g., on 64-bit ARM processors) or via the the :cpp:class:`mppp::real128` type
+computations are supported either via the ``long double`` type (e.g., on 64-bit Linux ARM) or via the the :cpp:class:`mppp::real128` type
 (provided that the platform supports the nonstandard ``__float128`` floating-point type and that heyoka was compiled with support
 for the mp++ library - see the :ref:`installation instructions <installation>`).
 

diff --git a/include/heyoka/detail/i_data.hpp b/include/heyoka/detail/i_data.hpp
@@ -15,12 +15,15 @@
 
 #endif
 
+#include <array>
+#include <cstddef>
 #include <cstdint>
 #include <optional>
 #include <variant>
 #include <vector>
 
 #include <heyoka/config.hpp>
+#include <heyoka/detail/aligned_buffer.hpp>
 #include <heyoka/detail/dfloat.hpp>
 #include <heyoka/detail/fwd_decl.hpp>
 #include <heyoka/llvm_state.hpp>
@@ -64,8 +67,12 @@ struct taylor_adaptive<T>::i_data {
     std::vector<T> m_state;
     // Time.
     detail::dfloat<T> m_time;
-    // The LLVM machinery.
-    llvm_state m_llvm;
+    // The LLVM (multi)state.
+    std::variant<llvm_state, llvm_multi_state> m_llvm_state;
+    // A template LLVM state we keep around to create states
+    // similar to m_llvm_state as needed. This is created with the
+    // same settings as m_llvm_state.
+    llvm_state m_tplt_state;
     // Dimension of the system.
     std::uint32_t m_dim{};
     // Taylor decomposition.
@@ -78,10 +85,18 @@ struct taylor_adaptive<T>::i_data {
     bool m_high_accuracy{};
     // Compact mode.
     bool m_compact_mode{};
-    // The steppers.
+    // The stepper types (non-compact mode).
     using step_f_t = void (*)(T *, const T *, const T *, T *, T *) noexcept;
     using step_f_e_t = void (*)(T *, const T *, const T *, const T *, T *, T *) noexcept;
-    std::variant<step_f_t, step_f_e_t> m_step_f;
+    // The stepper types (compact mode). These have an additional argument - the tape pointer.
+    using c_step_f_t = void (*)(T *, const T *, const T *, T *, T *, void *) noexcept;
+    using c_step_f_e_t = void (*)(T *, const T *, const T *, const T *, T *, T *, void *) noexcept;
+    // The stepper.
+    std::variant<step_f_t, step_f_e_t, c_step_f_t, c_step_f_e_t> m_step_f;
+    // Size/alignment for the compact mode tape.
+    std::array<std::size_t, 2> m_tape_sa{};
+    // Compact mode tape.
+    detail::aligned_buffer_t m_tape;
     // The vector of parameters.
     std::vector<T> m_pars;
     // The vector for the Taylor coefficients.
@@ -118,6 +133,8 @@ struct taylor_adaptive<T>::i_data {
     i_data &operator=(i_data &&) noexcept = delete;
 
     ~i_data();
+
+    void init_cm_tape();
 };
 
 template <typename T>
@@ -128,8 +145,12 @@ struct taylor_adaptive_batch<T>::i_data {
     std::vector<T> m_state;
     // Times.
     std::vector<T> m_time_hi, m_time_lo;
-    // The LLVM machinery.
-    llvm_state m_llvm;
+    // The LLVM (multi)state.
+    std::variant<llvm_state, llvm_multi_state> m_llvm_state;
+    // A template LLVM state we keep around to create states
+    // similar to m_llvm_state as needed. This is created with the
+    // same settings as m_llvm_state.
+    llvm_state m_tplt_state;
     // Dimension of the system.
     std::uint32_t m_dim{};
     // Taylor decomposition.
@@ -142,10 +163,18 @@ struct taylor_adaptive_batch<T>::i_data {
     bool m_high_accuracy{};
     // Compact mode.
     bool m_compact_mode{};
-    // The steppers.
+    // The stepper types (non-compact mode).
     using step_f_t = void (*)(T *, const T *, const T *, T *, T *) noexcept;
     using step_f_e_t = void (*)(T *, const T *, const T *, const T *, T *, T *) noexcept;
-    std::variant<step_f_t, step_f_e_t> m_step_f;
+    // The stepper types (compact mode). These have an additional argument - the tape pointer.
+    using c_step_f_t = void (*)(T *, const T *, const T *, T *, T *, void *) noexcept;
+    using c_step_f_e_t = void (*)(T *, const T *, const T *, const T *, T *, T *, void *) noexcept;
+    // The stepper.
+    std::variant<step_f_t, step_f_e_t, c_step_f_t, c_step_f_e_t> m_step_f;
+    // Size/alignment for the compact mode tape.
+    std::array<std::size_t, 2> m_tape_sa{};
+    // Compact mode tape.
+    detail::aligned_buffer_t m_tape;
     // The vector of parameters.
     std::vector<T> m_pars;
     // The vector for the Taylor coefficients.
@@ -204,6 +233,8 @@ struct taylor_adaptive_batch<T>::i_data {
     i_data &operator=(i_data &&) noexcept = delete;
 
     ~i_data();
+
+    void init_cm_tape();
 };
 
 HEYOKA_END_NAMESPACE

diff --git a/include/heyoka/detail/type_traits.hpp b/include/heyoka/detail/type_traits.hpp
@@ -139,6 +139,9 @@ inline constexpr bool is_x86_fp80 = is_ieee754_binaryN<T, 64>();
 template <typename T>
 inline constexpr bool is_ieee754_binary128 = is_ieee754_binaryN<T, 113>();
 
+template <typename T>
+double get_fp_unit_cost();
+
 } // namespace detail
 
 HEYOKA_END_NAMESPACE

diff --git a/include/heyoka/expression.hpp b/include/heyoka/expression.hpp
@@ -699,7 +699,7 @@ auto cfunc_common_opts(const KwArgs &...kw_args)
 template <typename>
 std::tuple<llvm_multi_state, std::vector<expression>, std::vector<std::array<std::size_t, 2>>>
 make_multi_cfunc(llvm_state, const std::string &, const std::vector<expression> &, const std::vector<expression> &,
-                 std::uint32_t, bool, bool, long long);
+                 std::uint32_t, bool, bool, long long, bool);
 
 } // namespace detail
 
@@ -818,13 +818,27 @@ class HEYOKA_DLL_PUBLIC_INLINE_CLASS cfunc
             }
         }();
 
+        // Parallel JIT compilation.
+        auto parjit = [&p]() -> bool {
+            if constexpr (p.has(kw::parjit)) {
+                if constexpr (std::integral<std::remove_cvref_t<decltype(p(kw::parjit))>>) {
+                    return static_cast<bool>(p(kw::parjit));
+                } else {
+                    static_assert(detail::always_false_v<T>, "Invalid type for the 'parjit' keyword argument.");
+                }
+            } else {
+                return detail::default_parjit;
+            }
+        }();
+
         // Build the template llvm_state from the keyword arguments.
         llvm_state s(kw_args...);
 
-        return std::make_tuple(high_accuracy, compact_mode, parallel_mode, prec, batch_size, std::move(s), check_prec);
+        return std::make_tuple(high_accuracy, compact_mode, parallel_mode, prec, batch_size, std::move(s), check_prec,
+                               parjit);
     }
     explicit cfunc(std::vector<expression>, std::vector<expression>,
-                   std::tuple<bool, bool, bool, long long, std::optional<std::uint32_t>, llvm_state, bool>);
+                   std::tuple<bool, bool, bool, long long, std::optional<std::uint32_t>, llvm_state, bool, bool>);
 
     HEYOKA_DLL_LOCAL void check_valid(const char *) const;