diff --git a/numerics/numerics.vcxproj b/numerics/numerics.vcxproj index 6e3b3e9b8b..0de586e31a 100644 --- a/numerics/numerics.vcxproj +++ b/numerics/numerics.vcxproj @@ -10,6 +10,21 @@ + + + AssemblyCode + + + + + AssemblyCode + + + + + AssemblyCode + + diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index b79dda44df..d45469d9c1 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -14,6 +14,36 @@ #include "numerics/polynomial_evaluators.hpp" #include "quantities/elementary_functions.hpp" +#define OSACA_ANALYSED_FUNCTION Cos +#define UNDER_OSACA_HYPOTHESES(expression) \ + [] { \ + constexpr bool UseHardwareFMA = true; \ + constexpr double θ = 3; \ + /* From argument reduction. */ \ + constexpr std::int64_t n = \ + static_cast<std::int64_t>(θ * (2 / π) + 0.5); \ + constexpr double reduction_value = θ - n * π_over_2_high; \ + constexpr double reduction_error = n * π_over_2_low; \ + /* Used to determine whether a better argument reduction is needed. */ \ + constexpr DoublePrecision θ_reduced = \ + QuickTwoDifference(reduction_value, reduction_error); \ + /* Used in Sin to detect the near-0 case. */ \ + constexpr double abs_x = \ + θ_reduced.value > 0 ? θ_reduced.value : -θ_reduced.value; \ + /* Used throughout the top-level functions. */ \ + constexpr std::int64_t quadrant = n & 0b11; \ + /* Used in DetectDangerousRounding. */ \ + constexpr double normalized_error = 0; \ + /* Not NaN is the only part that matters; used at the end of the */ \ + /* top-level functions to determine whether to call the slow path. */ \ + constexpr double value = 1; \ + return expression; \ + }() + +#if defined(OSACA_ANALYSED_FUNCTION) +#define PRINCIPIA_USE_OSACA !PRINCIPIA_MACRO_IS_EMPTY(OSACA_ANALYSED_FUNCTION) +#endif + #if PRINCIPIA_USE_OSACA #include "intel/iacaMarks.h" @@ -97,12 +127,14 @@ // them, so they cannot be the end of a loop started unconditionally. Instead // we loop with goto. 
// — Some volatile reads and writes are used to clarify identity of the -// registers in the generated code (where the names of `OSACA_input` and -// 'OSACA_result' appear in movsd instructions) and to improve the structure -// of the generated graph. +// registers in the generated code (where the names of `OSACA_result` and, if +// `OSACA_CARRY_LOOP_THROUGH_REGISTER`, `OSACA_loop_carry` appear in movsd +// instructions) and to improve the structure of the generated graph. // -// Putting a load of the input from memory in the analysed section makes the -// OSACA dependency graph clearer. However: +// Putting a load of the input from memory in the analysed section prevents the +// compiler from reusing intermediate values in the next iteration, e.g., if the +// absolute value of the result is computed first, the compiler might reuse it +// instead of computing the absolute value of the input. However: // — it adds a spurious move to the latency; // — some tools (IACA, LLVM-MCA) cannot see the dependency through memory. 
// Set OSACA_CARRY_LOOP_THROUGH_REGISTER to 1 to carry the loop dependency @@ -114,38 +146,42 @@ static bool OSACA_loop_terminator = false; #define OSACA_FUNCTION_BEGIN(arg) \ - double OSACA_INPUT_QUALIFIER OSACA_input = arg; \ + double OSACA_LOOP_CARRY_QUALIFIER OSACA_loop_carry = arg; \ if constexpr (std::string_view(__func__) == \ STRINGIFY_EXPANSION(OSACA_ANALYSED_FUNCTION)) { \ IACA_VC64_START; \ } \ - double OSACA_loop_carry = OSACA_input; \ _Pragma("warning(push)"); \ _Pragma("warning(disable : 4102)"); \ OSACA_loop: \ _Pragma("warning(pop)"); \ arg = OSACA_loop_carry -#define OSACA_RETURN(result) \ - do { \ - if constexpr (std::string_view(__func__) == \ - STRINGIFY_EXPANSION(OSACA_ANALYSED_FUNCTION)) { \ - OSACA_loop_carry = (result); \ - if (!OSACA_loop_terminator) { \ - goto OSACA_loop; \ - } \ - double volatile OSACA_result = OSACA_loop_carry; \ - IACA_VC64_END; \ - return OSACA_result; \ - } else { \ - return (result); \ - } \ +#define OSACA_RETURN(result) \ + do { \ + if constexpr (std::string_view(__func__) == \ + STRINGIFY_EXPANSION(OSACA_ANALYSED_FUNCTION)) { \ + OSACA_loop_carry = (result); \ + if (!OSACA_loop_terminator) { \ + goto OSACA_loop; \ + } \ + double volatile OSACA_result = OSACA_loop_carry; \ + IACA_VC64_END; \ + /* The second goto prevents the end marker from being interleaved */ \ + /* with register restoring moves. 
*/ \ + if (!OSACA_loop_terminator) { \ + goto OSACA_loop; \ + } \ + return OSACA_result; \ + } else { \ + return (result); \ + } \ } while (false) #if OSACA_CARRY_LOOP_THROUGH_REGISTER -#define OSACA_INPUT_QUALIFIER +#define OSACA_LOOP_CARRY_QUALIFIER #else -#define OSACA_INPUT_QUALIFIER volatile +#define OSACA_LOOP_CARRY_QUALIFIER volatile #endif // The branch not taken, determined by evaluating the condition @@ -177,33 +213,6 @@ static bool OSACA_loop_terminator = false; #define OSACA_ELSE_IF else OSACA_IF // NOLINT -// Sin- and Cos-specific definitions: - -#define UNDER_OSACA_HYPOTHESES(expression) \ - [&] { \ - constexpr bool UseHardwareFMA = true; \ - constexpr double θ = 0.1; \ - /* From argument reduction. */ \ - constexpr double n_double = θ * (2 / π); \ - constexpr double reduction_value = θ - n_double * π_over_2_high; \ - constexpr double reduction_error = n_double * π_over_2_low; \ - /* Used to determine whether a better argument reduction is needed. */ \ - constexpr DoublePrecision θ_reduced = \ - QuickTwoDifference(reduction_value, reduction_error); \ - /* Used in Sin to detect the near-0 case. */ \ - constexpr double abs_x = \ - θ_reduced.value > 0 ? θ_reduced.value : -θ_reduced.value; \ - /* Used throughout the top-level functions. */ \ - constexpr std::int64_t quadrant = \ - static_cast<std::int64_t>(n_double) & 0b11; \ - /* Used in DetectDangerousRounding. */ \ - constexpr double normalized_error = 0; \ - /* Not NaN is the only part that matters; used at the end of the */ \ - /* top-level functions to determine whether to call the slow path. 
*/ \ - constexpr double value = 1; \ - return expression; \ - }() - namespace principia { namespace numerics { diff --git a/numerics/sin_cos.hpp b/numerics/sin_cos.hpp index 7d3a7eeb01..b33a11a7c1 100644 --- a/numerics/sin_cos.hpp +++ b/numerics/sin_cos.hpp @@ -7,20 +7,7 @@ namespace numerics { namespace _sin_cos { namespace internal { -#define PRINCIPIA_INLINE_SIN_COS 0 -#define OSACA_ANALYSED_FUNCTION - -#if defined(OSACA_ANALYSED_FUNCTION) -#define PRINCIPIA_USE_OSACA !PRINCIPIA_MACRO_IS_EMPTY(OSACA_ANALYSED_FUNCTION) -#endif - -#if PRINCIPIA_INLINE_SIN_COS -FORCE_INLINE(inline) -#endif double __cdecl Sin(double x); -#if PRINCIPIA_INLINE_SIN_COS -FORCE_INLINE(inline) -#endif double __cdecl Cos(double x); } // namespace internal @@ -31,7 +18,3 @@ using internal::Sin; } // namespace _sin_cos } // namespace numerics } // namespace principia - -#if PRINCIPIA_INLINE_SIN_COS -#include "numerics/sin_cos.cpp" -#endif