Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More optimizations of Sin and Cos #4154

Merged
merged 9 commits into from
Jan 2, 2025
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions functions/functions.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,7 @@
<RootNamespace>functions</RootNamespace>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<AssemblerOutput>AssemblyCode</AssemblerOutput>
</ClCompile>
<ClCompile />
</ItemDefinitionGroup>
<ItemGroup>
<ClInclude Include="accurate_table_generator.hpp" />
Expand Down
27 changes: 4 additions & 23 deletions functions/sin_cos_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,25 +32,6 @@ class SinCosTest : public ::testing::Test {
double a_ = 1.0;
};

// Defined in sin_cos.hpp
#if PRINCIPIA_USE_OSACA

// A convenient skeleton for analysing code with OSACA. Note that to speed-up
// analysis, we disable all the other tests when using OSACA.
TEST_F(SinCosTest, DISABLED_OSACA) {
static_assert(PRINCIPIA_INLINE_SIN_COS == 1,
"Must force inlining to use OSACA");
auto osaca_sin = [](double const a) {
return Sin(a);
};
auto osaca_cos = [](double const a) {
return Cos(a);
};
CHECK_NE(osaca_sin(a_), osaca_cos(a_));
}

#else

TEST_F(SinCosTest, AccurateTableIndex) {
static constexpr std::int64_t iterations = 100;

Expand Down Expand Up @@ -106,7 +87,8 @@ TEST_F(SinCosTest, Random) {
}
if (sin_ulps_error > 0.5) {
++incorrectly_rounded_sin;
LOG(ERROR) << "Sin: " << std::setprecision(25) << principia_argument;
LOG(ERROR) << "Sin: " << sin_ulps_error << " ulps at "
<< std::setprecision(25) << principia_argument;
}
}
{
Expand All @@ -122,7 +104,8 @@ TEST_F(SinCosTest, Random) {
}
if (cos_ulps_error > 0.5) {
++incorrectly_rounded_cos;
LOG(ERROR) << "Cos: " << std::setprecision(25) << principia_argument;
LOG(ERROR) << "Cos: " << cos_ulps_error << " ulps at "
<< std::setprecision(25) << principia_argument;
}
}
}
Expand Down Expand Up @@ -191,7 +174,5 @@ TEST_F(SinCosTest, HardReduction) {
AlmostEquals(-4.687165924254627611122582801963884e-19, 0));
}

#endif

} // namespace functions_test
} // namespace principia
53 changes: 29 additions & 24 deletions numerics/sin_cos.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
constexpr bool UseHardwareFMA = true; \
constexpr double θ = 3; \
/* From argument reduction. */ \
constexpr double abs_θ = θ > 0 ? θ : -θ; \
constexpr std::int64_t n = static_cast<std::int64_t>(θ * (2 / π) + 0.5); \
constexpr double reduction_value = θ - n * π_over_2_high; \
constexpr double reduction_error = n * π_over_2_low; \
Expand Down Expand Up @@ -280,8 +281,8 @@ inline std::int64_t AccurateTableIndex(double const abs_x) {

template<FMAPolicy fma_policy>
double FusedMultiplyAdd(double const a, double const b, double const c) {
OSACA_IF((fma_policy == FMAPolicy::Force && CanEmitFMAInstructions) ||
(fma_policy == FMAPolicy::Auto && UseHardwareFMA)) {
static_assert(fma_policy != FMAPolicy::Auto);
if constexpr (fma_policy == FMAPolicy::Force) {
using quantities::_elementary_functions::FusedMultiplyAdd;
return FusedMultiplyAdd(a, b, c);
} else {
Expand All @@ -291,8 +292,8 @@ double FusedMultiplyAdd(double const a, double const b, double const c) {

template<FMAPolicy fma_policy>
double FusedNegatedMultiplyAdd(double const a, double const b, double const c) {
OSACA_IF((fma_policy == FMAPolicy::Force && CanEmitFMAInstructions) ||
(fma_policy == FMAPolicy::Auto && UseHardwareFMA)) {
static_assert(fma_policy != FMAPolicy::Auto);
if constexpr (fma_policy == FMAPolicy::Force) {
using quantities::_elementary_functions::FusedNegatedMultiplyAdd;
return FusedNegatedMultiplyAdd(a, b, c);
} else {
Expand All @@ -303,6 +304,7 @@ double FusedNegatedMultiplyAdd(double const a, double const b, double const c) {
// Evaluates the sum `x + Δx`. If that sum has a dangerous rounding
// configuration (that is, the bits after the last mantissa bit of the sum are
// either 1000... or 0111..., then returns `NaN`. Otherwise returns the sum.
// `x` is always positive. `Δx` may be positive or negative.
inline double DetectDangerousRounding(double const x, double const Δx) {
DoublePrecision<double> const sum = QuickTwoSum(x, Δx);
double const& value = sum.value;
Expand Down Expand Up @@ -330,12 +332,13 @@ inline double DetectDangerousRounding(double const x, double const Δx) {
inline void Reduce(Argument const θ,
DoublePrecision<Argument>& θ_reduced,
std::int64_t& quadrant) {
OSACA_IF(θ < π / 4 && θ > -π / 4) {
double const abs_θ = std::abs(θ);
OSACA_IF(abs_θ < π / 4) {
θ_reduced.value = θ;
θ_reduced.error = 0;
quadrant = 0;
return;
} OSACA_ELSE_IF(θ <= π_over_2_threshold && θ >= -π_over_2_threshold) {
} OSACA_ELSE_IF(abs_θ <= π_over_2_threshold) {
// We are not very sensitive to rounding errors in this expression, because
// in the worst case it could cause the reduced angle to jump from the
// vicinity of π / 4 to the vicinity of -π / 4 with appropriate adjustment
Expand Down Expand Up @@ -390,35 +393,37 @@ Value SinImplementation(DoublePrecision<Argument> const θ_reduced) {
double const x² = x * x;
double const x³ = x² * x;
double const x³_term = FusedMultiplyAdd<fma_policy>(
x³, SinPolynomialNearZero<fma_policy>(x²), e);
x³, SinPolynomialNearZero<fma_policy>(x²), e);
return DetectDangerousRounding(x, x³_term);
} else {
__m128d const sign = _mm_and_pd(masks::sign_bit, _mm_set_sd(x));
double const abs_e = _mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(e), sign));
auto const i = AccurateTableIndex(abs_x);
auto const& accurate_values = SinCosAccurateTable[i];
double const x₀ =
_mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(accurate_values.x), sign));
double const sin_x₀ =
_mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(accurate_values.sin_x), sign));
double const& x₀ = accurate_values.x;
double const& sin_x₀ = accurate_values.sin_x;
double const& cos_x₀ = accurate_values.cos_x;
// [GB91] incorporates `e` in the computation of `h`. However, `x` and `e`
// don't overlap and in the first interval `x` and `h` may be of the same
// order of magnitude. Instead we incorporate the terms in `e` and `e * h`
// later in the computation. Note that the terms in `e * h²` and higher are
// *not* computed correctly below because they don't matter.
double const h = x - x₀;
double const h = abs_x - x₀;

DoublePrecision<double> const sin_x₀_plus_h_cos_x₀ =
TwoProductAdd<fma_policy>(cos_x₀, h, sin_x₀);
double const h² = h * (h + 2 * e);
double const h² = h * (h + 2 * abs_e);
double const h³ = h² * h;
double const polynomial_term =
FusedMultiplyAdd<fma_policy>(
cos_x₀,
FusedMultiplyAdd<fma_policy>(h³, SinPolynomial<fma_policy>(h²), e),
h³ * SinPolynomial<fma_policy>(h²),
sin_x₀ * h² * CosPolynomial<fma_policy>(h²)) +
sin_x₀_plus_h_cos_x₀.error;
return DetectDangerousRounding(sin_x₀_plus_h_cos_x₀.value, polynomial_term);
FusedMultiplyAdd<fma_policy>(cos_x₀, abs_e, sin_x₀_plus_h_cos_x₀.error);
return _mm_cvtsd_f64(
_mm_xor_pd(_mm_set_sd(DetectDangerousRounding(
sin_x₀_plus_h_cos_x₀.value, polynomial_term)),
sign));
}
}

Expand All @@ -429,30 +434,30 @@ Value CosImplementation(DoublePrecision<Argument> const θ_reduced) {
auto const& e = θ_reduced.error;
double const abs_x = std::abs(x);
__m128d const sign = _mm_and_pd(masks::sign_bit, _mm_set_sd(x));
double const abs_e = _mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(e), sign));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
double const abs_e = _mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(e), sign));
double const e_abs = _mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(e), sign));

(meaning eabs, short for something like eabs θ) otherwise it looks like it is abs e.

auto const i = AccurateTableIndex(abs_x);
auto const& accurate_values = SinCosAccurateTable[i];
double const x₀ =
_mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(accurate_values.x), sign));
double const sin_x₀ =
_mm_cvtsd_f64(_mm_xor_pd(_mm_set_sd(accurate_values.sin_x), sign));
double const& x₀ = accurate_values.x;
double const& sin_x₀ = accurate_values.sin_x;
double const& cos_x₀ = accurate_values.cos_x;
// [GB91] incorporates `e` in the computation of `h`. However, `x` and `e`
// don't overlap and in the first interval `x` and `h` may be of the same
// order of magnitude. Instead we incorporate the terms in `e` and `e * h`
// later in the computation. Note that the terms in `e * h²` and higher are
// *not* computed correctly below because they don't matter.
double const h = x - x₀;
double const h = abs_x - x₀;

DoublePrecision<double> const cos_x₀_minus_h_sin_x₀ =
TwoProductNegatedAdd<fma_policy>(sin_x₀, h, cos_x₀);
double const h² = h * (h + 2 * e);
double const h² = h * (h + 2 * abs_e);
double const h³ = h² * h;
double const polynomial_term =
FusedNegatedMultiplyAdd<fma_policy>(
sin_x₀,
FusedMultiplyAdd<fma_policy>(h³, SinPolynomial<fma_policy>(h²), e),
h³ * SinPolynomial<fma_policy>(h²),
cos_x₀ * h² * CosPolynomial<fma_policy>(h²)) +
cos_x₀_minus_h_sin_x₀.error;
FusedNegatedMultiplyAdd<fma_policy>(
sin_x₀, abs_e, cos_x₀_minus_h_sin_x₀.error);
return DetectDangerousRounding(cos_x₀_minus_h_sin_x₀.value, polynomial_term);
}

Expand Down