From 7d825a7ef30a1d001e4a6a0e7b942b41962aaa96 Mon Sep 17 00:00:00 2001 From: pleroy Date: Tue, 31 Dec 2024 01:47:02 +0100 Subject: [PATCH 1/8] Simplify FMA helpers. --- numerics/sin_cos.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index b79dda44df..617d35234f 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -262,8 +262,8 @@ inline std::int64_t AccurateTableIndex(double const abs_x) { template double FusedMultiplyAdd(double const a, double const b, double const c) { - OSACA_IF((fma_policy == FMAPolicy::Force && CanEmitFMAInstructions) || - (fma_policy == FMAPolicy::Auto && UseHardwareFMA)) { + static_assert(fma_policy != FMAPolicy::Auto); + if constexpr (fma_policy == FMAPolicy::Force) { using quantities::_elementary_functions::FusedMultiplyAdd; return FusedMultiplyAdd(a, b, c); } else { @@ -273,8 +273,8 @@ double FusedMultiplyAdd(double const a, double const b, double const c) { template double FusedNegatedMultiplyAdd(double const a, double const b, double const c) { - OSACA_IF((fma_policy == FMAPolicy::Force && CanEmitFMAInstructions) || - (fma_policy == FMAPolicy::Auto && UseHardwareFMA)) { + static_assert(fma_policy != FMAPolicy::Auto); + if constexpr (fma_policy == FMAPolicy::Force) { using quantities::_elementary_functions::FusedNegatedMultiplyAdd; return FusedNegatedMultiplyAdd(a, b, c); } else { @@ -372,7 +372,7 @@ Value SinImplementation(DoublePrecision const θ_reduced) { double const x² = x * x; double const x³ = x² * x; double const x³_term = FusedMultiplyAdd( - x³, SinPolynomialNearZero(x²), e); + x³, SinPolynomialNearZero(x²), e); return DetectDangerousRounding(x, x³_term); } else { __m128d const sign = _mm_and_pd(masks::sign_bit, _mm_set_sd(x)); From 57f397fa094bf05bc61eb5e85e302e8cffa8e9b7 Mon Sep 17 00:00:00 2001 From: pleroy Date: Tue, 31 Dec 2024 18:53:17 +0100 Subject: [PATCH 2/8] Use absolute value in reduction. --- numerics/sin_cos.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index 617d35234f..f5b4e294ca 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -184,6 +184,7 @@ static bool OSACA_loop_terminator = false; constexpr bool UseHardwareFMA = true; \ constexpr double θ = 0.1; \ /* From argument reduction. */ \ + constexpr double abs_θ = θ < 0 ? -θ : θ; \ constexpr double n_double = θ * (2 / π); \ constexpr double reduction_value = θ - n_double * π_over_2_high; \ constexpr double reduction_error = n_double * π_over_2_low; \ @@ -312,12 +313,13 @@ inline double DetectDangerousRounding(double const x, double const Δx) { inline void Reduce(Argument const θ, DoublePrecision& θ_reduced, std::int64_t& quadrant) { - OSACA_IF(θ < π / 4 && θ > -π / 4) { + double const abs_θ = std::abs(θ); + OSACA_IF(abs_θ < π / 4) { θ_reduced.value = θ; θ_reduced.error = 0; quadrant = 0; return; - } OSACA_ELSE_IF(θ <= π_over_2_threshold && θ >= -π_over_2_threshold) { + } OSACA_ELSE_IF(abs_θ <= π_over_2_threshold) { // We are not very sensitive to rounding errors in this expression, because // in the worst case it could cause the reduced angle to jump from the // vicinity of π / 4 to the vicinity of -π / 4 with appropriate adjustment From 3f4949d31ff1557718c834f2f822126b3ed8d278 Mon Sep 17 00:00:00 2001 From: pleroy Date: Wed, 1 Jan 2025 12:56:13 +0100 Subject: [PATCH 3/8] Better rounding logging. --- functions/sin_cos_test.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/functions/sin_cos_test.cpp b/functions/sin_cos_test.cpp index 9a1d524567..aeff0e6d20 100644 --- a/functions/sin_cos_test.cpp +++ b/functions/sin_cos_test.cpp @@ -106,7 +106,8 @@ TEST_F(SinCosTest, Random) { } if (sin_ulps_error > 0.5) { ++incorrectly_rounded_sin; - LOG(ERROR) << "Sin: " << std::setprecision(25) << principia_argument; + LOG(ERROR) << "Sin: " << sin_ulps_error << " ulps at " + << std::setprecision(25) << principia_argument; } } { @@ -122,7 +123,8 @@ TEST_F(SinCosTest, Random) { } if (cos_ulps_error > 0.5) { ++incorrectly_rounded_cos; - LOG(ERROR) << "Cos: " << std::setprecision(25) << principia_argument; + LOG(ERROR) << "Cos: " << cos_ulps_error << " ulps at " + << std::setprecision(25) << principia_argument; } } } From 23696390913c8d33908fed9dd9f3ba3a2412654b Mon Sep 17 00:00:00 2001 From: pleroy Date: Wed, 1 Jan 2025 13:12:08 +0100 Subject: [PATCH 4/8] Better management of sign in Cos removes an xor from the critical path and enables arithmetic directly from memory. --- numerics/sin_cos.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index f5b4e294ca..6245c9badc 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -413,28 +413,28 @@ Value CosImplementation(DoublePrecision const θ_reduced) { auto const& e = θ_reduced.error; double const abs_x = std::abs(x); __m128d const sign = _mm_and_pd(masks::sign_bit, _mm_set_sd(x)); + double const abs_e = _mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(e), sign)); auto const i = AccurateTableIndex(abs_x); auto const& accurate_values = SinCosAccurateTable[i]; - double const x₀ = - _mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(accurate_values.x), sign)); - double const sin_x₀ = - _mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(accurate_values.sin_x), sign)); + double const& x₀ = accurate_values.x; + double const& sin_x₀ = accurate_values.sin_x; double const& cos_x₀ = accurate_values.cos_x; // [GB91] incorporates `e` in the computation of `h`. However, `x` and `e` // don't overlap and in the first interval `x` and `h` may be of the same // order of magnitude. Instead we incorporate the terms in `e` and `e * h` // later in the computation. Note that the terms in `e * h²` and higher are // *not* computed correctly below because they don't matter. - double const h = x - x₀; + double const h = abs_x - x₀; DoublePrecision const cos_x₀_minus_h_sin_x₀ = TwoProductNegatedAdd(sin_x₀, h, cos_x₀); - double const h² = h * (h + 2 * e); + double const h² = h * (h + 2 * abs_e); double const h³ = h² * h; double const polynomial_term = FusedNegatedMultiplyAdd( sin_x₀, - FusedMultiplyAdd(h³, SinPolynomial(h²), e), + FusedMultiplyAdd( + h³, SinPolynomial(h²), abs_e), cos_x₀ * h² * CosPolynomial(h²)) + cos_x₀_minus_h_sin_x₀.error; return DetectDangerousRounding(cos_x₀_minus_h_sin_x₀.value, polynomial_term); From 47668b396ee99ee1901270d31f9f383f2f3f7dc7 Mon Sep 17 00:00:00 2001 From: pleroy Date: Wed, 1 Jan 2025 13:51:09 +0100 Subject: [PATCH 5/8] Integrate the abs_e correction in an FMA that is not depended upon from anything other than the final addition. This does one more multiplication, but on Zen3 is saves a cycle. --- numerics/sin_cos.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index 6245c9badc..e805a9d89f 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -399,9 +399,9 @@ Value SinImplementation(DoublePrecision const θ_reduced) { double const polynomial_term = FusedMultiplyAdd( cos_x₀, - FusedMultiplyAdd(h³, SinPolynomial(h²), e), + h³ * SinPolynomial(h²), sin_x₀ * h² * CosPolynomial(h²)) + - sin_x₀_plus_h_cos_x₀.error; + FusedMultiplyAdd(cos_x₀, e, sin_x₀_plus_h_cos_x₀.error); return DetectDangerousRounding(sin_x₀_plus_h_cos_x₀.value, polynomial_term); } } @@ -433,10 +433,10 @@ Value CosImplementation(DoublePrecision const θ_reduced) { double const polynomial_term = FusedNegatedMultiplyAdd( sin_x₀, - FusedMultiplyAdd( - h³, SinPolynomial(h²), abs_e), + h³ * SinPolynomial(h²), cos_x₀ * h² * CosPolynomial(h²)) + - cos_x₀_minus_h_sin_x₀.error; + FusedNegatedMultiplyAdd( + sin_x₀, abs_e, cos_x₀_minus_h_sin_x₀.error); return DetectDangerousRounding(cos_x₀_minus_h_sin_x₀.value, polynomial_term); } From 79696533bcb71ec8a762c5eec36982329a09eea1 Mon Sep 17 00:00:00 2001 From: pleroy Date: Wed, 1 Jan 2025 17:10:20 +0100 Subject: [PATCH 6/8] Clean up after merge. --- functions/functions.vcxproj | 4 +--- functions/sin_cos_test.cpp | 21 --------------------- numerics/sin_cos.cpp | 1 + 3 files changed, 2 insertions(+), 24 deletions(-) diff --git a/functions/functions.vcxproj b/functions/functions.vcxproj index 2aa4c86528..f4027c7bf7 100644 --- a/functions/functions.vcxproj +++ b/functions/functions.vcxproj @@ -5,9 +5,7 @@ functions - - AssemblyCode - + diff --git a/functions/sin_cos_test.cpp b/functions/sin_cos_test.cpp index aeff0e6d20..3b1b994413 100644 --- a/functions/sin_cos_test.cpp +++ b/functions/sin_cos_test.cpp @@ -32,25 +32,6 @@ class SinCosTest : public ::testing::Test { double a_ = 1.0; }; -// Defined in sin_cos.hpp -#if PRINCIPIA_USE_OSACA - -// A convenient skeleton for analysing code with OSACA. Note that to speed-up -// analysis, we disable all the other tests when using OSACA. -TEST_F(SinCosTest, DISABLED_OSACA) { - static_assert(PRINCIPIA_INLINE_SIN_COS == 1, - "Must force inlining to use OSACA"); - auto osaca_sin = [](double const a) { - return Sin(a); - }; - auto osaca_cos = [](double const a) { - return Cos(a); - }; - CHECK_NE(osaca_sin(a_), osaca_cos(a_)); -} - -#else - TEST_F(SinCosTest, AccurateTableIndex) { static constexpr std::int64_t iterations = 100; @@ -193,7 +174,5 @@ TEST_F(SinCosTest, HardReduction) { AlmostEquals(-4.687165924254627611122582801963884e-19, 0)); } -#endif - } // namespace functions_test } // namespace principia diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index 214cdd8562..3f8337ee4f 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -20,6 +20,7 @@ constexpr bool UseHardwareFMA = true; \ constexpr double θ = 3; \ /* From argument reduction. */ \ + constexpr double abs_θ = θ > 0 ? θ : -θ; \ constexpr std::int64_t n = static_cast(θ * (2 / π) + 0.5); \ constexpr double reduction_value = θ - n * π_over_2_high; \ constexpr double reduction_error = n * π_over_2_low; \ From 0957e787360ec97fea8db9b4a4ed301f5790cc8a Mon Sep 17 00:00:00 2001 From: pleroy Date: Wed, 1 Jan 2025 19:34:03 +0100 Subject: [PATCH 7/8] Sign restoration of the end of the Sin function. --- numerics/sin_cos.cpp | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index 3f8337ee4f..640e81b00e 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -304,6 +304,7 @@ double FusedNegatedMultiplyAdd(double const a, double const b, double const c) { // Evaluates the sum `x + Δx`. If that sum has a dangerous rounding // configuration (that is, the bits after the last mantissa bit of the sum are // either 1000... or 0111..., then returns `NaN`. Otherwise returns the sum. +// `x` is always positive. `Δx` may be positive or negative. inline double DetectDangerousRounding(double const x, double const Δx) { DoublePrecision const sum = QuickTwoSum(x, Δx); double const& value = sum.value; @@ -396,31 +397,33 @@ Value SinImplementation(DoublePrecision const θ_reduced) { return DetectDangerousRounding(x, x³_term); } else { __m128d const sign = _mm_and_pd(masks::sign_bit, _mm_set_sd(x)); + double const abs_e = _mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(e), sign)); auto const i = AccurateTableIndex(abs_x); auto const& accurate_values = SinCosAccurateTable[i]; - double const x₀ = - _mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(accurate_values.x), sign)); - double const sin_x₀ = - _mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(accurate_values.sin_x), sign)); + double const& x₀ = accurate_values.x; + double const& sin_x₀ = accurate_values.sin_x; double const& cos_x₀ = accurate_values.cos_x; // [GB91] incorporates `e` in the computation of `h`. However, `x` and `e` // don't overlap and in the first interval `x` and `h` may be of the same // order of magnitude. Instead we incorporate the terms in `e` and `e * h` // later in the computation. Note that the terms in `e * h²` and higher are // *not* computed correctly below because they don't matter. - double const h = x - x₀; + double const h = abs_x - x₀; DoublePrecision const sin_x₀_plus_h_cos_x₀ = TwoProductAdd(cos_x₀, h, sin_x₀); - double const h² = h * (h + 2 * e); + double const h² = h * (h + 2 * abs_e); double const h³ = h² * h; double const polynomial_term = FusedMultiplyAdd( cos_x₀, h³ * SinPolynomial(h²), sin_x₀ * h² * CosPolynomial(h²)) + - FusedMultiplyAdd(cos_x₀, e, sin_x₀_plus_h_cos_x₀.error); - return DetectDangerousRounding(sin_x₀_plus_h_cos_x₀.value, polynomial_term); + FusedMultiplyAdd(cos_x₀, abs_e, sin_x₀_plus_h_cos_x₀.error); + return _mm_cvtsd_f64( + _mm_xor_pd(_mm_set_sd(DetectDangerousRounding( + sin_x₀_plus_h_cos_x₀.value, polynomial_term)), + sign)); } } From 3d8d9cbce9db9d817d62086a8810232c636d71ae Mon Sep 17 00:00:00 2001 From: pleroy Date: Thu, 2 Jan 2025 03:05:23 +0100 Subject: [PATCH 8/8] After egg's review. --- numerics/sin_cos.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index 640e81b00e..23903ebf54 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -397,7 +397,7 @@ Value SinImplementation(DoublePrecision const θ_reduced) { return DetectDangerousRounding(x, x³_term); } else { __m128d const sign = _mm_and_pd(masks::sign_bit, _mm_set_sd(x)); - double const abs_e = _mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(e), sign)); + double const e_abs = _mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(e), sign)); auto const i = AccurateTableIndex(abs_x); auto const& accurate_values = SinCosAccurateTable[i]; double const& x₀ = accurate_values.x; @@ -412,14 +412,14 @@ Value SinImplementation(DoublePrecision const θ_reduced) { DoublePrecision const sin_x₀_plus_h_cos_x₀ = TwoProductAdd(cos_x₀, h, sin_x₀); - double const h² = h * (h + 2 * abs_e); + double const h² = h * (h + 2 * e_abs); double const h³ = h² * h; double const polynomial_term = FusedMultiplyAdd( cos_x₀, h³ * SinPolynomial(h²), sin_x₀ * h² * CosPolynomial(h²)) + - FusedMultiplyAdd(cos_x₀, abs_e, sin_x₀_plus_h_cos_x₀.error); + FusedMultiplyAdd(cos_x₀, e_abs, sin_x₀_plus_h_cos_x₀.error); return _mm_cvtsd_f64( _mm_xor_pd(_mm_set_sd(DetectDangerousRounding( sin_x₀_plus_h_cos_x₀.value, polynomial_term)), @@ -434,7 +434,7 @@ Value CosImplementation(DoublePrecision const θ_reduced) { auto const& e = θ_reduced.error; double const abs_x = std::abs(x); __m128d const sign = _mm_and_pd(masks::sign_bit, _mm_set_sd(x)); - double const abs_e = _mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(e), sign)); + double const e_abs = _mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(e), sign)); auto const i = AccurateTableIndex(abs_x); auto const& accurate_values = SinCosAccurateTable[i]; double const& x₀ = accurate_values.x; @@ -449,7 +449,7 @@ Value CosImplementation(DoublePrecision const θ_reduced) { DoublePrecision const cos_x₀_minus_h_sin_x₀ = TwoProductNegatedAdd(sin_x₀, h, cos_x₀); - double const h² = h * (h + 2 * abs_e); + double const h² = h * (h + 2 * e_abs); double const h³ = h² * h; double const polynomial_term = FusedNegatedMultiplyAdd( @@ -457,7 +457,7 @@ Value CosImplementation(DoublePrecision const θ_reduced) { h³ * SinPolynomial(h²), cos_x₀ * h² * CosPolynomial(h²)) + FusedNegatedMultiplyAdd( - sin_x₀, abs_e, cos_x₀_minus_h_sin_x₀.error); + sin_x₀, e_abs, cos_x₀_minus_h_sin_x₀.error); return DetectDangerousRounding(cos_x₀_minus_h_sin_x₀.value, polynomial_term); }