diff --git a/numerics/numerics.vcxproj b/numerics/numerics.vcxproj
index 6e3b3e9b8b..0de586e31a 100644
--- a/numerics/numerics.vcxproj
+++ b/numerics/numerics.vcxproj
@@ -10,6 +10,21 @@
+
+
+ AssemblyCode
+
+
+
+
+ AssemblyCode
+
+
+
+
+ AssemblyCode
+
+
diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp
index b79dda44df..d45469d9c1 100644
--- a/numerics/sin_cos.cpp
+++ b/numerics/sin_cos.cpp
@@ -14,6 +14,36 @@
#include "numerics/polynomial_evaluators.hpp"
#include "quantities/elementary_functions.hpp"
+#define OSACA_ANALYSED_FUNCTION Cos
+#define UNDER_OSACA_HYPOTHESES(expression) \
+ [] { \
+ constexpr bool UseHardwareFMA = true; \
+ constexpr double θ = 3; \
+ /* From argument reduction: n is θ * (2 / π) + 0.5, truncated. */ \
+ constexpr std::int64_t n = \
+ static_cast<std::int64_t>(θ * (2 / π) + 0.5); \
+ constexpr double reduction_value = θ - n * π_over_2_high; \
+ constexpr double reduction_error = n * π_over_2_low; \
+ /* Used to determine whether a better argument reduction is needed. */ \
+ constexpr DoublePrecision θ_reduced = \
+ QuickTwoDifference(reduction_value, reduction_error); \
+ /* Used in Sin to detect the near-0 case. */ \
+ constexpr double abs_x = \
+ θ_reduced.value > 0 ? θ_reduced.value : -θ_reduced.value; \
+ /* Used throughout the top-level functions. */ \
+ constexpr std::int64_t quadrant = n & 0b11; \
+ /* Used in DetectDangerousRounding. */ \
+ constexpr double normalized_error = 0; \
+ /* Not NaN is the only part that matters; used at the end of the */ \
+ /* top-level functions to determine whether to call the slow path. */ \
+ constexpr double value = 1; \
+ return expression; \
+ }()
+
+#if defined(OSACA_ANALYSED_FUNCTION)
+#define PRINCIPIA_USE_OSACA !PRINCIPIA_MACRO_IS_EMPTY(OSACA_ANALYSED_FUNCTION)
+#endif
+
#if PRINCIPIA_USE_OSACA
#include "intel/iacaMarks.h"
@@ -97,12 +127,14 @@
// them, so they cannot be the end of a loop started unconditionally. Instead
// we loop with goto.
// — Some volatile reads and writes are used to clarify identity of the
-// registers in the generated code (where the names of `OSACA_input` and
-// 'OSACA_result' appear in movsd instructions) and to improve the structure
-// of the generated graph.
+// registers in the generated code (where the names of `OSACA_result` and, if
+// `OSACA_CARRY_LOOP_THROUGH_REGISTER`, `OSACA_loop_carry` appear in movsd
+// instructions) and to improve the structure of the generated graph.
//
-// Putting a load of the input from memory in the analysed section makes the
-// OSACA dependency graph clearer. However:
+// Putting a load of the input from memory in the analysed section prevents the
+// compiler from reusing intermediate values in the next iteration, e.g., if the
+// absolute value of the result is computed first, the compiler might reuse it
+// instead of computing the absolute value of the input. However:
// — it adds a spurious move to the latency;
// — some tools (IACA, LLVM-MCA) cannot see the dependency through memory.
// Set OSACA_CARRY_LOOP_THROUGH_REGISTER to 1 to carry the loop dependency
@@ -114,38 +146,42 @@
static bool OSACA_loop_terminator = false;
#define OSACA_FUNCTION_BEGIN(arg) \
- double OSACA_INPUT_QUALIFIER OSACA_input = arg; \
+ double OSACA_LOOP_CARRY_QUALIFIER OSACA_loop_carry = arg; \
if constexpr (std::string_view(__func__) == \
STRINGIFY_EXPANSION(OSACA_ANALYSED_FUNCTION)) { \
IACA_VC64_START; \
} \
- double OSACA_loop_carry = OSACA_input; \
_Pragma("warning(push)"); \
_Pragma("warning(disable : 4102)"); \
OSACA_loop: \
_Pragma("warning(pop)"); \
arg = OSACA_loop_carry
-#define OSACA_RETURN(result) \
- do { \
- if constexpr (std::string_view(__func__) == \
- STRINGIFY_EXPANSION(OSACA_ANALYSED_FUNCTION)) { \
- OSACA_loop_carry = (result); \
- if (!OSACA_loop_terminator) { \
- goto OSACA_loop; \
- } \
- double volatile OSACA_result = OSACA_loop_carry; \
- IACA_VC64_END; \
- return OSACA_result; \
- } else { \
- return (result); \
- } \
+#define OSACA_RETURN(result) \
+ do { \
+ if constexpr (std::string_view(__func__) == \
+ STRINGIFY_EXPANSION(OSACA_ANALYSED_FUNCTION)) { \
+ OSACA_loop_carry = (result); \
+ if (!OSACA_loop_terminator) { \
+ goto OSACA_loop; \
+ } \
+ double volatile OSACA_result = OSACA_loop_carry; \
+ IACA_VC64_END; \
+ /* The second goto prevents the end marker from being interleaved */ \
+ /* with register restoring moves. */ \
+ if (!OSACA_loop_terminator) { \
+ goto OSACA_loop; \
+ } \
+ return OSACA_result; \
+ } else { \
+ return (result); \
+ } \
} while (false)
#if OSACA_CARRY_LOOP_THROUGH_REGISTER
-#define OSACA_INPUT_QUALIFIER
+#define OSACA_LOOP_CARRY_QUALIFIER
#else
-#define OSACA_INPUT_QUALIFIER volatile
+#define OSACA_LOOP_CARRY_QUALIFIER volatile
#endif
// The branch not taken, determined by evaluating the condition
@@ -177,33 +213,6 @@ static bool OSACA_loop_terminator = false;
#define OSACA_ELSE_IF else OSACA_IF // NOLINT
-// Sin- and Cos-specific definitions:
-
-#define UNDER_OSACA_HYPOTHESES(expression) \
- [&] { \
- constexpr bool UseHardwareFMA = true; \
- constexpr double θ = 0.1; \
- /* From argument reduction. */ \
- constexpr double n_double = θ * (2 / π); \
- constexpr double reduction_value = θ - n_double * π_over_2_high; \
- constexpr double reduction_error = n_double * π_over_2_low; \
- /* Used to determine whether a better argument reduction is needed. */ \
- constexpr DoublePrecision θ_reduced = \
- QuickTwoDifference(reduction_value, reduction_error); \
- /* Used in Sin to detect the near-0 case. */ \
- constexpr double abs_x = \
- θ_reduced.value > 0 ? θ_reduced.value : -θ_reduced.value; \
- /* Used throughout the top-level functions. */ \
- constexpr std::int64_t quadrant = \
- static_cast(n_double) & 0b11; \
- /* Used in DetectDangerousRounding. */ \
- constexpr double normalized_error = 0; \
- /* Not NaN is the only part that matters; used at the end of the */ \
- /* top-level functions to determine whether to call the slow path. */ \
- constexpr double value = 1; \
- return expression; \
- }()
-
namespace principia {
namespace numerics {
diff --git a/numerics/sin_cos.hpp b/numerics/sin_cos.hpp
index 7d3a7eeb01..b33a11a7c1 100644
--- a/numerics/sin_cos.hpp
+++ b/numerics/sin_cos.hpp
@@ -7,20 +7,7 @@ namespace numerics {
namespace _sin_cos {
namespace internal {
-#define PRINCIPIA_INLINE_SIN_COS 0
-#define OSACA_ANALYSED_FUNCTION
-
-#if defined(OSACA_ANALYSED_FUNCTION)
-#define PRINCIPIA_USE_OSACA !PRINCIPIA_MACRO_IS_EMPTY(OSACA_ANALYSED_FUNCTION)
-#endif
-
-#if PRINCIPIA_INLINE_SIN_COS
-FORCE_INLINE(inline)
-#endif
double __cdecl Sin(double x);
-#if PRINCIPIA_INLINE_SIN_COS
-FORCE_INLINE(inline)
-#endif
double __cdecl Cos(double x);
} // namespace internal
@@ -31,7 +18,3 @@ using internal::Sin;
} // namespace _sin_cos
} // namespace numerics
} // namespace principia
-
-#if PRINCIPIA_INLINE_SIN_COS
-#include "numerics/sin_cos.cpp"
-#endif