Skip to content

Commit

Permalink
A working implementation.
Browse files Browse the repository at this point in the history
  • Loading branch information
pleroy committed May 31, 2024
1 parent 571d445 commit ac3623c
Show file tree
Hide file tree
Showing 2 changed files with 99 additions and 63 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -123,3 +123,6 @@ unity/

# Visual Studio Code
.vscode/

# Nuget
.nuget/
159 changes: 96 additions & 63 deletions benchmarks/elementary_functions_experiments_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

#include "benchmark/benchmark.h"
#include "numerics/double_precision.hpp"
#include "numerics/scale_b.hpp"
#include "quantities/elementary_functions.hpp"
#include "quantities/numbers.hpp" // 🧙 For π.

Expand All @@ -15,8 +16,11 @@ namespace functions {

// TODO(phl): The polynomials in this file should use class
// |PolynomialInMonomialBasis|.
// TODO(phl): Study the effect of rounding the polynomial coefficients to
// machine numbers.

using namespace principia::numerics::_double_precision;
using namespace principia::numerics::_scale_b;
using namespace principia::quantities::_elementary_functions;

using Value = double;
Expand Down Expand Up @@ -52,17 +56,14 @@ class TableSpacingImplementation {
accurate_values_;
};

template<Argument max_table_spacing, std::int64_t number_of_tables>
class MultiTableImplementation {
public:
void Initialize();
static constexpr double max_table_spacing = 2.0 / 1024.0;
static constexpr std::int64_t number_of_tables = 9;

Value Sin(Argument x);
Value Cos(Argument x);

private:
// ArcSin(2^-(n + 1)) for n in [0, 8] rounded towards positive infinity.
static constexpr std::array<double, number_of_tables> cutoffs_{
// ArcSin(2^-(n + 1)) for n in [0, 8] rounded towards positive infinity. The
// entry at index n has n leading zeroes in the mantissa of its sinus.
static constexpr std::array<double, number_of_tables> cutoffs{
0x1.0C152382D7366p-1,
0x1.02BE9CE0B87CEp-2,
0x1.00ABE0C129E1Fp-3,
Expand All @@ -73,6 +74,24 @@ class MultiTableImplementation {
0x1.00002AAABDDDFp-8,
0x1.00000AAAABDDEp-9};

// The spacing between arguments above each cutoff.
static constexpr std::array<double, number_of_tables> table_spacings{
ScaleB(max_table_spacing, 0),
ScaleB(max_table_spacing, -1),
ScaleB(max_table_spacing, -2),
ScaleB(max_table_spacing, -3),
ScaleB(max_table_spacing, -4),
ScaleB(max_table_spacing, -5),
ScaleB(max_table_spacing, -6),
ScaleB(max_table_spacing, -7),
ScaleB(max_table_spacing, -8)};

void Initialize();

Value Sin(Argument x);
Value Cos(Argument x);

private:
// Despite the name these are not accurate values, but for the purposes of
// benchmarking it doesn't matter.
struct AccurateValues {
Expand All @@ -81,12 +100,14 @@ class MultiTableImplementation {
Value cos_x;
};

//TODO(phl)templatize
Value SinPolynomial(Argument x);
Value CosPolynomial(Argument x);
// |i| is the index of the binade in |cutoffs_|,
Value CosPolynomial(std::int64_t i, Argument x);

// Because the interval [π / 6, π / 4] is shorter than the next, the maximum
// value is reached between the first two cutoffs.
static constexpr std::int64_t table_size =
static_cast<std::int64_t>((2 * x_min - x_min) / max_table_spacing);
static_cast<std::int64_t>((cutoffs[0] - cutoffs[1]) / table_spacings[1]);

std::array<std::int64_t, number_of_tables> one_over_table_spacings_;
std::array<std::array<AccurateValues, table_size>, number_of_tables>
Expand Down Expand Up @@ -140,7 +161,8 @@ Value TableSpacingImplementation<table_spacing>::Cos(Argument const x) {
}

template<Argument table_spacing>
Value TableSpacingImplementation<table_spacing>::SinPolynomial(Argument const x) {
Value TableSpacingImplementation<table_spacing>::SinPolynomial(
Argument const x) {
if constexpr (table_spacing == 2.0 / 256.0) {
// 71 bits.
return -0.166666666666666666666421797625 +
Expand All @@ -153,7 +175,8 @@ Value TableSpacingImplementation<table_spacing>::SinPolynomial(Argument const x)
}

template<Argument table_spacing>
Value TableSpacingImplementation<table_spacing>::CosPolynomial(Argument const x) {
Value TableSpacingImplementation<table_spacing>::CosPolynomial(
Argument const x) {
if constexpr (table_spacing == 2.0 / 256.0) {
// 77 bits.
return -0.499999999999999999999999974543 +
Expand All @@ -166,43 +189,37 @@ Value TableSpacingImplementation<table_spacing>::CosPolynomial(Argument const x)
}
}

template<Argument max_table_spacing, std::int64_t number_of_tables>
void MultiTableImplementation<max_table_spacing, number_of_tables>::
Initialize() {
void MultiTableImplementation::Initialize() {
Argument current_x_max = x_max;
Argument current_x_min = cutoffs_[0];
Argument current_table_spacing = max_table_spacing;
Argument current_x_min;
for (std::int64_t i = 0; i < number_of_tables; ++i) {
one_over_table_spacings_[i] = 1.0 / current_table_spacing;
Argument x = current_x_min;
current_x_min = cutoffs[i];
one_over_table_spacings_[i] = 1.0 / table_spacings[i];
Argument x = current_x_min + table_spacings[i] / 2;
for (std::int64_t j = 0; j < table_size && x < current_x_max; ++j) {
accurate_values_[i][j] = {.x = x,
.sin_x = std::sin(x),
.cos_x = std::cos(x)};
x += current_table_spacing;
x += table_spacings[i];
}
current_x_max = current_x_min;
current_x_min /= 2;
current_table_spacing /= 2;
}
}

template<Argument max_table_spacing, std::int64_t number_of_tables>
FORCE_INLINE(inline)
Value MultiTableImplementation<max_table_spacing, number_of_tables>::
Sin(Argument const x) {
Value MultiTableImplementation::Sin(Argument const x) {
// Because the intervals are unequal, this loop does on average 2.28
// comparisons, which is better than a binary tree.
std::int64_t i = -1;
for (std::int64_t k = 0; k < cutoffs_.size(); ++k) {
if (cutoffs_[k] <= x) {
for (std::int64_t k = 0; k < cutoffs.size(); ++k) {
if (cutoffs[k] <= x) {
i = k;
break;
}
}

Argument const x_minus_cutoff = x - cutoffs_[i]
auto const j = static_cast<std::int64_t>((x_minus_cutoff) *
Argument const x_minus_cutoff = x - cutoffs[i];
auto const j = static_cast<std::int64_t>(x_minus_cutoff *
one_over_table_spacings_[i]);
auto const& accurate_values = accurate_values_[i][j];
auto const& x₀ = accurate_values.x;
Expand All @@ -212,19 +229,25 @@ Sin(Argument const x) {
auto const h² = h * h;
auto const h³ = h² * h;
auto const sin_x₀_plus_h_cos_x₀ = TwoProductAdd(cos_x₀, h, sin_x₀);
return sin_x₀_plus_h_cos_x₀.value +
((sin_x₀ * h² * CosPolynomial(h²) + cos_x₀ * h³ * SinPolynomial(h²)) +
sin_x₀_plus_h_cos_x₀.error);
return sin_x₀_plus_h_cos_x₀.value + ((sin_x₀ * h² * CosPolynomial(i, h²) +
cos_x₀ * h³ * SinPolynomial(h²)) +
sin_x₀_plus_h_cos_x₀.error);
}

template<Argument max_table_spacing, std::int64_t number_of_tables>
FORCE_INLINE(inline)
Value MultiTableImplementation<max_table_spacing, number_of_tables>::
Cos(Argument const x) {
int x_exponent;
auto const x_mantissa = std::frexp(x, &x_exponent);
auto const i = number_of_tables + x_exponent;
auto const j = static_cast<std::int64_t>((x_mantissa - 0.5) *
Value MultiTableImplementation::Cos(Argument const x) {
// Because the intervals are unequal, this loop does on average 2.28
// comparisons, which is better than a binary tree.
std::int64_t i = -1;
for (std::int64_t k = 0; k < cutoffs.size(); ++k) {
if (cutoffs[k] <= x) {
i = k;
break;
}
}

Argument const x_minus_cutoff = x - cutoffs[i];
auto const j = static_cast<std::int64_t>(x_minus_cutoff *
one_over_table_spacings_[i]);
auto const& accurate_values = accurate_values_[i][j];
auto const& x₀ = accurate_values.x;
Expand All @@ -234,25 +257,33 @@ Cos(Argument const x) {
auto const h² = h * h;
auto const h³ = h² * h;
auto const cos_x₀_minus_h_sin_x₀ = TwoProductNegatedAdd(sin_x₀, h, cos_x₀);
return cos_x₀_minus_h_sin_x₀.value +
((cos_x₀ * h² * CosPolynomial(h²) - sin_x₀ * h³ * SinPolynomial(h²)) +
cos_x₀_minus_h_sin_x₀.error);
return cos_x₀_minus_h_sin_x₀.value + ((cos_x₀ * h² * CosPolynomial(i, h²) -
sin_x₀ * h³ * SinPolynomial(h²)) +
cos_x₀_minus_h_sin_x₀.error);
}

template<Argument max_table_spacing, std::int64_t number_of_tables>
Value MultiTableImplementation<max_table_spacing, number_of_tables>::
SinPolynomial(Argument const x) {
Value MultiTableImplementation::SinPolynomial(Argument const x) {
// 85 bits.
return -0.166666666666666666666666651721 +
0.00833333316093951937646271666739 * x;
}

template<Argument max_table_spacing, std::int64_t number_of_tables>
Value MultiTableImplementation<max_table_spacing, number_of_tables>::
CosPolynomial(Argument const x) {
// 72 bits.
return -0.499999999999999999999872434553 +
0.0416666654823785864634569932662 * x;
Value MultiTableImplementation::CosPolynomial(std::int64_t const i,
Argument const x) {
// i == 1 goes first because it is the largest argument interval.
if (i == 1) {
// 78 bits.
return -0.499999999999999999999998006790 +
0.0416666663705946726372008045758 * x;
} else if (i == 0) {
// 72 bits.
return -0.499999999999999999999872434553 +
0.0416666654823785864634569932662 * x;
} else {
// 84 bits.
return -0.499999999999999999999999968856 +
0.0416666665926486697856340784849 * x;
}
}

template<Argument table_spacing>
Expand Down Expand Up @@ -313,13 +344,14 @@ void BM_ExperimentCosTableSpacing(benchmark::State& state) {
}
}

template<Argument max_table_spacing, std::int64_t number_of_tables>
void BM_ExperimentSinMultiTable(benchmark::State& state) {
std::mt19937_64 random(42);
std::uniform_real_distribution<> uniformly_at(
std::scalbln(x_min, 1 - number_of_tables), x_max);
MultiTableImplementation::cutoffs
[MultiTableImplementation::number_of_tables - 1],
x_max);

MultiTableImplementation<max_table_spacing, number_of_tables> implementation;
MultiTableImplementation implementation;
implementation.Initialize();

Argument a[number_of_iterations];
Expand All @@ -336,20 +368,22 @@ void BM_ExperimentSinMultiTable(benchmark::State& state) {
// The implementation is not accurate, but let's check that it's not
// broken.
auto const absolute_error = Abs(v[i] - std::sin(a[i]));
CHECK_LT(absolute_error, 5.6e-17);
//LOG(ERROR)<<absolute_error;
CHECK_LT(absolute_error, 1.2e-16);
#endif
}
benchmark::DoNotOptimize(v);
}
}

template<Argument max_table_spacing, std::int64_t number_of_tables>
void BM_ExperimentCosMultiTable(benchmark::State& state) {
std::mt19937_64 random(42);
std::uniform_real_distribution<> uniformly_at(
std::scalbln(x_min, 1 - number_of_tables), x_max);
MultiTableImplementation::cutoffs
[MultiTableImplementation::number_of_tables - 1],
x_max);

MultiTableImplementation<max_table_spacing, number_of_tables> implementation;
MultiTableImplementation implementation;
implementation.Initialize();

Argument a[number_of_iterations];
Expand All @@ -366,6 +400,7 @@ void BM_ExperimentCosMultiTable(benchmark::State& state) {
// The implementation is not accurate, but let's check that it's not
// broken.
auto const absolute_error = Abs(v[i] - std::cos(a[i]));
//LOG(ERROR)<<absolute_error;
CHECK_LT(absolute_error, 1.2e-16);
#endif
}
Expand All @@ -381,10 +416,8 @@ BENCHMARK_TEMPLATE(BM_ExperimentCosTableSpacing, 2.0 / 256.0)
->Unit(benchmark::kNanosecond);
BENCHMARK_TEMPLATE(BM_ExperimentCosTableSpacing, 2.0 / 1024.0)
->Unit(benchmark::kNanosecond);
BENCHMARK_TEMPLATE(BM_ExperimentSinMultiTable, 2.0 / 1024.0, 9)
->Unit(benchmark::kNanosecond);
BENCHMARK_TEMPLATE(BM_ExperimentCosMultiTable, 2.0 / 256.0, 9)
->Unit(benchmark::kNanosecond);
BENCHMARK(BM_ExperimentSinMultiTable)->Unit(benchmark::kNanosecond);
BENCHMARK(BM_ExperimentCosMultiTable)->Unit(benchmark::kNanosecond);

} // namespace functions
} // namespace principia

0 comments on commit ac3623c

Please sign in to comment.