From d6a22b96eb2fa15fbf6041b67a9cfb9e536b291b Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 1 Jan 2025 04:36:01 +0100 Subject: [PATCH] More improvements to the macros --- numerics/sin_cos.cpp | 113 +++++++++++++++++++++++-------------------- 1 file changed, 61 insertions(+), 52 deletions(-) diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index d45469d9c1..79fdc3665c 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -15,29 +15,28 @@ #include "quantities/elementary_functions.hpp" #define OSACA_ANALYSED_FUNCTION Cos -#define UNDER_OSACA_HYPOTHESES(expression) \ - [] { \ - constexpr bool UseHardwareFMA = true; \ - constexpr double θ = 3; \ - /* From argument reduction. */ \ - constexpr std::int64_t n = \ - static_cast(θ * (2 / π) + 0.5); \ - constexpr double reduction_value = θ - n * π_over_2_high; \ - constexpr double reduction_error = n * π_over_2_low; \ - /* Used to determine whether a better argument reduction is needed. */ \ - constexpr DoublePrecision θ_reduced = \ - QuickTwoDifference(reduction_value, reduction_error); \ - /* Used in Sin to detect the near-0 case. */ \ - constexpr double abs_x = \ - θ_reduced.value > 0 ? θ_reduced.value : -θ_reduced.value; \ - /* Used throughout the top-level functions. */ \ - constexpr std::int64_t quadrant = n & 0b11; \ - /* Used in DetectDangerousRounding. */ \ - constexpr double normalized_error = 0; \ - /* Not NaN is the only part that matters; used at the end of the */ \ - /* top-level functions to determine whether to call the slow path. */ \ - constexpr double value = 1; \ - return expression; \ +#define UNDER_OSACA_HYPOTHESES(expression) \ + [&] { \ + constexpr bool UseHardwareFMA = true; \ + constexpr double θ = 0.1; \ + /* From argument reduction. */ \ + constexpr std::int64_t n = static_cast(θ * (2 / π) + 0.5); \ + constexpr double reduction_value = θ - n * π_over_2_high; \ + constexpr double reduction_error = n * π_over_2_low; \ + /* Used to determine whether a better argument reduction is needed. */ \ + constexpr DoublePrecision θ_reduced = \ + QuickTwoDifference(reduction_value, reduction_error); \ + /* Used in Sin to detect the near-0 case. */ \ + constexpr double abs_x = \ + θ_reduced.value > 0 ? θ_reduced.value : -θ_reduced.value; \ + /* Used throughout the top-level functions. */ \ + constexpr std::int64_t quadrant = n & 0b11; \ + /* Used in DetectDangerousRounding. */ \ + constexpr double normalized_error = 0; \ + /* Not NaN is the only part that matters; used at the end of the */ \ + /* top-level functions to determine whether to call the slow path. */ \ + constexpr double value = 1; \ + return expression; \ }() #if defined(OSACA_ANALYSED_FUNCTION) @@ -126,27 +125,37 @@ // — Return statements may be in if statements, and there may be several of // them, so they cannot be the end of a loop started unconditionally. Instead // we loop with goto. +// — We need to prevent the compiler from moving the start and end markers into +// the middle of register saving and restoring code, which would mess up the +// dependency analysis. This is done with additional conditional gotos. // — Some volatile reads and writes are used to clarify identity of the // registers in the generated code (where the names of `OSACA_result` and, if -// `OSACA_CARRY_LOOP_THROUGH_REGISTER`, `OSACA_loop_carry` appear in movsd -// instructions) and to improve the structure of the generated graph. +// `OSACA_CARRY_LOOP_THROUGH_REGISTER` is set to 0, `OSACA_loop_carry` appear +// in movsd instructions). // -// Putting a load of the input from memory in the analysed section prevents the -// compiler from reusing intermediate values in the next iteration, e.g., if the -// absolute value of the result is computed first, the compiler might reuse it -// instead of computing the absolute value of the input. However: +// Carrying the loop dependency through a memory load and store can make the +// dependency graph easier to understand, as it forces any usage of the input to +// depend on the initial movsd, with the loop carried by a single backward edge +// to that initial movsd. +// If the loop is carried through a register, multiple usages of the input may +// result in multiple back edges from the final instruction that computed the +// result. Carrying the loop through the memory could also potentially prevent +// the compiler from reusing intermediate values in the next iteration, e.g., if +// the the computation of f(x) depends on -x and produces -f(x) before f(x), as +// in an even function defined in terms of its positive half, the compiler might +// reuse -f(x₀)=-x₁ instead of computing -x₁ from x₁=f(x₀). However: // — it adds a spurious move to the latency; -// — some tools (IACA, LLVM-MCA) cannot see the dependency through memory. -// Set OSACA_CARRY_LOOP_THROUGH_REGISTER to 1 to carry the loop dependency -// through a register instead. +// — some tools (IACA) cannot see the dependency through memory. +// Set OSACA_CARRY_LOOP_THROUGH_REGISTER to 0 to carry the loop through memory. #define OSACA_EVALUATE_CONDITIONS 1 -#define OSACA_CARRY_LOOP_THROUGH_REGISTER 0 +#define OSACA_CARRY_LOOP_THROUGH_REGISTER 1 static bool OSACA_loop_terminator = false; #define OSACA_FUNCTION_BEGIN(arg) \ double OSACA_LOOP_CARRY_QUALIFIER OSACA_loop_carry = arg; \ + OSACA_outer_loop: \ if constexpr (std::string_view(__func__) == \ STRINGIFY_EXPANSION(OSACA_ANALYSED_FUNCTION)) { \ IACA_VC64_START; \ @@ -157,25 +166,25 @@ static bool OSACA_loop_terminator = false; _Pragma("warning(pop)"); \ arg = OSACA_loop_carry -#define OSACA_RETURN(result) \ - do { \ - if constexpr (std::string_view(__func__) == \ - STRINGIFY_EXPANSION(OSACA_ANALYSED_FUNCTION)) { \ - OSACA_loop_carry = (result); \ - if (!OSACA_loop_terminator) { \ - goto OSACA_loop; \ - } \ - double volatile OSACA_result = OSACA_loop_carry; \ - IACA_VC64_END; \ - /* The second goto prevents the the end marker from being interleaved */ \ - /* with register restoring moves. */ \ - if (!OSACA_loop_terminator) { \ - goto OSACA_loop; \ - } \ - return OSACA_result; \ - } else { \ - return (result); \ - } \ +#define OSACA_RETURN(result) \ + do { \ + if constexpr (std::string_view(__func__) == \ + STRINGIFY_EXPANSION(OSACA_ANALYSED_FUNCTION)) { \ + OSACA_loop_carry = (result); \ + if (!OSACA_loop_terminator) { \ + goto OSACA_loop; \ + } \ + double volatile OSACA_result = OSACA_loop_carry; \ + IACA_VC64_END; \ + /* The outer loop prevents the the start and end marker from being */ \ + /* interleaved with register saving and restoring moves. */ \ + if (!OSACA_loop_terminator) { \ + goto OSACA_outer_loop; \ + } \ + return OSACA_result; \ + } else { \ + return (result); \ + } \ } while (false) #if OSACA_CARRY_LOOP_THROUGH_REGISTER