Benchmarking that sometimes works

eggrobin · Oct 19, 2024 · c7da637 · c7da637
1 parent ff268a0
commit c7da637
Show file tree

Hide file tree

Showing 5 changed files with 300 additions and 0 deletions.
diff --git a/functions/benchmarking.cpp b/functions/benchmarking.cpp
@@ -0,0 +1,198 @@
+#include "functions/benchmarking.hpp"
+
+#include "absl/strings/str_format.h"
+#include "geometry/sign.hpp"
+#include "numerics/root_finders.hpp"
+#include "quantities/elementary_functions.hpp"
+#include "quantities/quantities.hpp"
+
+namespace principia {
+namespace functions {
+namespace _benchmarking {
+
+using namespace principia::geometry::_sign;
+using namespace principia::numerics::_root_finders;
+using namespace principia::quantities::_elementary_functions;
+using namespace principia::quantities::_quantities;
+
+// Formats the measurement in concise parenthetical notation, e.g., 12.34(56)
+// for a value of 12.34 with a standard uncertainty of 0.56.
+// The uncertainty is rounded up and shown with two significant digits, or with
+// more if that is needed for the parenthetical to reach the units digit, e.g.,
+// 123400(5600); the value is printed to the same decimal place.  When exactly
+// one fractional digit is shown, the parenthetical carries its own decimal
+// point, e.g., 123.4(5.6).
+// A measurement whose standard uncertainty is exactly 0 is formatted with
+// `DebugString(value)` and has no parenthetical.
+std::string MeasurementResult::ToGUMString() const {
+  if (standard_uncertainty == 0) {
+    return DebugString(value);
+  }
+  double const floor_log10_u = std::floor(std::log10(standard_uncertainty));
+  std::int64_t uncertainty_digits = 2;
+  // The decimal place of the last digit shown depends only on the uncertainty.
+  // It must not be derived via log10(value): those terms cancel anyway, and
+  // for value ≤ 0 (which happens, since results are differences of
+  // measurements) the logarithm is NaN or -∞, whose conversion to an integer
+  // is undefined.
+  std::int64_t fractional_digits_shown =
+      uncertainty_digits - 1 - static_cast<std::int64_t>(floor_log10_u);
+  if (fractional_digits_shown < 0) {
+    // The last of the two significant digits of the uncertainty lies to the
+    // left of the units digit; show more digits of uncertainty so that both
+    // the value and the parenthetical stop at the units digit.
+    uncertainty_digits += -fractional_digits_shown;
+    fractional_digits_shown = 0;
+  }
+  if (fractional_digits_shown == 1) {
+    // The two digits of uncertainty straddle the decimal point, so the
+    // parenthetical is written as a decimal number rather than as an integer.
+    CHECK_EQ(uncertainty_digits, 2);
+    double uncertainty_parenthetical =
+        std::ceil(10 * standard_uncertainty) / 10;
+    return absl::StrFormat("%.1f(%03.1f)", value, uncertainty_parenthetical);
+  } else {
+    // Scale the uncertainty so that its `uncertainty_digits` leading digits
+    // form an integer, rounding up.
+    std::int64_t uncertainty_parenthetical =
+        std::ceil(standard_uncertainty *
+                  std::pow(10, uncertainty_digits - 1 - floor_log10_u));
+    return absl::StrFormat("%.*f(%0*d)",
+                           fractional_digits_shown,
+                           value,
+                           uncertainty_digits,
+                           uncertainty_parenthetical);
+  }
+}
+
+// From [Coh51].
+// Given samples `x`, computes a value α in [0, x₁), where x₁ is the smallest
+// sample, as a root of the function λ below, together with a standard
+// uncertainty for it.  The formulæ operate on log(xᵢ - α), i.e., α is treated
+// as the terminus (lower bound) of a log-normal distribution fitted to `x`.
+// Special cases: returns {0, 0} if `x` is empty, {x[0], 0} if it has a single
+// element, and {x₁, 0} if no sign change of λ is found below x₁.
+// NOTE(review): λ(0) takes log(xᵢ), so this presumably requires all samples to
+// be positive; confirm against callers.
+MeasurementResult LogNormalTerminus(std::vector<double> const& x) {
+  if (x.empty()) {
+    return {0, 0};
+  }
+  if (x.size() == 1) {
+    return {x[0], 0};
+  }
+  double const n = x.size();
+  double const n² = n * n;
+  // Σ₁ⁿ(summand) is the sum of summand(i) over all sample indices i.
+  auto const Σ₁ⁿ = [n](auto const summand) {
+    double Σ = 0;
+    for (int i = 0; i < n; ++i) {
+      Σ += summand(i);
+    }
+    return Σ;
+  };
+  // The function of the candidate terminus α whose root is sought below.
+  auto const λ = [&Σ₁ⁿ, &x, n, n²](double α) {
+    double const Σ₁ⁿlog_xᵢ_minus_α =
+        Σ₁ⁿ([&](int i) { return std::log(x[i] - α); });
+    double const Σ₁ⁿlog²_xᵢ_minus_α =
+        Σ₁ⁿ([&](int i) { return Pow<2>(std::log(x[i] - α)); });
+    return Σ₁ⁿ([&](int i) { return 1 / (x[i] - α); }) *
+               (n * Σ₁ⁿlog_xᵢ_minus_α - n * Σ₁ⁿlog²_xᵢ_minus_α +
+                Pow<2>(Σ₁ⁿlog_xᵢ_minus_α)) -
+           n² * Σ₁ⁿ([&](int i) { return std::log(x[i] - α) / (x[i] - α); });
+  };
+  Sign sign_λ_0(λ(0));
+  // λ(x₁) is NaN, and λ is so ill-conditioned there that it has the wrong
+  // sign just below, so we use a cheesy factor.
+  double const x₁ = *std::min_element(x.begin(), x.end());
+  double cheese = 1;
+  // Halve the cheese until λ has different signs at 0 and at (1 - cheese) x₁,
+  // so that these two points bracket a root.
+  while (Sign(λ((1 - cheese) * x₁)) == sign_λ_0) {
+    cheese /= 2;
+    if (cheese < 0x1p-53) {
+      // The MLE is very close to the minimum; in that limit the variance
+      // becomes 0.
+      return {.value = x₁, .standard_uncertainty = 0};
+    }
+  }
+  double const α = Brent(λ, 0.0, x₁ * (1 - cheese));
+  double const Σ₁ⁿlog_xᵢ_minus_α =
+      Σ₁ⁿ([&](int i) { return std::log(x[i] - α); });
+  double const Σ₁ⁿlog²_xᵢ_minus_α =
+      Σ₁ⁿ([&](int i) { return Pow<2>(std::log(x[i] - α)); });
+  // β is the geometric mean of the xᵢ - α.
+  double const β = std::exp(1 / n * Σ₁ⁿlog_xᵢ_minus_α);
+  // γ² is the variance (with divisor n) of the log(xᵢ - α).
+  double const γ² =
+      1 / n * Σ₁ⁿlog²_xᵢ_minus_α - Pow<2>(1 / n * Σ₁ⁿlog_xᵢ_minus_α);
+  double const ω = std::exp(γ²);
+  double const β² = β * β;
+  // The variance of the estimate α; its square root is reported as the
+  // standard uncertainty.
+  double const α_variance = β² * γ² / (n * ω * (ω * (1 + γ²) - 2 * γ² - 1));
+  return {.value = α, .standard_uncertainty = Sqrt(α_variance)};
+}
+
+// Returns its argument unchanged.  Declared `noinline` and `__cdecl` so that it
+// is called through the same kind of function pointer as the functions being
+// benchmarked; the three-argument benchmark functions below measure it to
+// obtain the overhead that they subtract.
+__declspec(noinline) double __cdecl identity(double x) {
+  return x;
+}
+
+// Measures the number of time stamp counter ticks (as read by `__rdtsc`) per
+// call of `f` when `f` is applied to 1 << 16 independent inputs in a row.
+// `samples` such runs are timed; the per-call counts are reduced to a single
+// measurement by `LogNormalTerminus`, from which `identity_throughput` is
+// subtracted.  The standard uncertainty of the result combines, in quadrature,
+// the uncertainty of the measurement and that of `identity_throughput`.
+// Pass `MeasurementResult{0, 0}` as `identity_throughput` to get the raw
+// measurement.
+__declspec(noinline) MeasurementResult
+    BenchmarkFunctionThroughput(
+    double (__cdecl *f)(double),
+    std::function<double()> get_input,
+    std::int64_t const samples,
+    MeasurementResult const identity_throughput) {
+  std::vector<double> cycle_counts;
+  constexpr std::int64_t n = 1 << 16;
+  std::array<double, static_cast<std::size_t>(n)> inputs{};
+  for (std::int64_t j = 0; j < samples; ++j) {
+    for (std::int64_t i = 0; i < n; ++i) {
+      // The new input is computed from the previous contents of the slot,
+      // i.e., from the result of the previous timed run.
+      // NOTE(review): presumably this is what keeps those results from being
+      // dead stores; confirm before simplifying.
+      inputs[i] = (inputs[i] + get_input()) - inputs[i];
+    }
+    auto const start = __rdtsc();
+    for (std::int64_t i = 0; i < n; ++i) {
+      // Exactly one call per element: the tick count below is divided by `n`,
+      // so a second call here would double the reported cost per call.
+      inputs[i] = f(inputs[i]);
+    }
+    auto const stop = __rdtsc();
+    cycle_counts.push_back(static_cast<double>(stop - start) / n);
+  }
+  MeasurementResult const raw_throughput = LogNormalTerminus(cycle_counts);
+  // The uncertainties of the measurement and of the subtracted baseline are
+  // combined in quadrature.
+  return {.value = raw_throughput.value - identity_throughput.value,
+          .standard_uncertainty =
+              Sqrt(Pow<2>(raw_throughput.standard_uncertainty) +
+                   Pow<2>(identity_throughput.standard_uncertainty))};
+}
+
+// Measures the throughput of `f` net of overhead: the same measurement is
+// first made on the non-inlined `identity` function, and that result is used
+// as the baseline subtracted from the measurement of `f`.
+MeasurementResult BenchmarkFunctionThroughput(double (__cdecl *f)(double),
+                                              std::function<double()> get_input,
+                                              std::int64_t const samples) {
+  // Nothing is subtracted from the measurement of the baseline itself.
+  MeasurementResult const identity_throughput = BenchmarkFunctionThroughput(
+      &identity, get_input, samples, MeasurementResult{0, 0});
+  return BenchmarkFunctionThroughput(
+      f, get_input, samples, identity_throughput);
+}
+
+// Measures the number of time stamp counter ticks (as read by `__rdtsc`) per
+// call of `f` in a chain of 1 << 16 calls where the argument of each call
+// depends on the result of the previous one.  `samples` such chains are timed;
+// the per-call counts are reduced to a single measurement by
+// `LogNormalTerminus`, from which `identity_latency` is subtracted.  The
+// standard uncertainty of the result combines, in quadrature, the uncertainty
+// of the measurement and that of `identity_latency`.
+// Pass `MeasurementResult{0, 0}` as `identity_latency` to get the raw
+// measurement.
+// Side effects: logs the final `x` of every chain at INFO, and, when `f` is
+// `identity`, logs the result at ERROR.
+__declspec(noinline) MeasurementResult
+    BenchmarkFunctionLatency(
+    double (__cdecl *f)(double),
+    std::function<double()> get_input,
+    std::int64_t const samples,
+    MeasurementResult const identity_latency) {
+  std::vector<double> cycle_counts;
+  constexpr std::int64_t n = 1 << 16;
+  // The counter has the same type as `samples`, so that it cannot overflow
+  // before reaching it.
+  for (std::int64_t j = 0; j < samples; ++j) {
+    std::array<double, static_cast<std::size_t>(n)> inputs;
+    for (std::int64_t i = 0; i < n; ++i) {
+      inputs[i] = get_input();
+    }
+    auto const start = __rdtsc();
+    double x = inputs[0];
+    for (std::int64_t i = 0; i < n; ++i) {
+      double const result = f(x);
+      // The next argument is computed from `result`, so that the next call
+      // cannot start before this one has returned.
+      x = result + inputs[i] - result;
+    }
+    auto const stop = __rdtsc();
+    // NOTE(review): presumably logged so that the chain has an observable
+    // result; confirm before removing.
+    LOG(INFO) << x;
+    cycle_counts.push_back(static_cast<double>(stop - start) / n);
+  }
+  MeasurementResult latency = LogNormalTerminus(cycle_counts);
+  // The uncertainties of the measurement and of the subtracted baseline are
+  // combined in quadrature.
+  latency = {.value = latency.value - identity_latency.value,
+             .standard_uncertainty =
+                 Sqrt(Pow<2>(latency.standard_uncertainty) +
+                      Pow<2>(identity_latency.standard_uncertainty))};
+  if (f == &identity) {
+    LOG(ERROR) << "Identity latency:" << latency.ToGUMString();
+  }
+  return latency;
+}
+
+// Measures the latency of `f` net of overhead: the same measurement is made on
+// the non-inlined `identity` function, and that result is used as the baseline
+// subtracted from the measurement of `f`.
+// Before that, `f` is measured once more with no baseline, solely in order to
+// log the latency including overhead at ERROR; `f` is thus benchmarked twice.
+MeasurementResult BenchmarkFunctionLatency(double (__cdecl *f)(double),
+                                           std::function<double()> get_input,
+                                           std::int64_t const samples) {
+  LOG(ERROR) << "Latency including overhead:"
+             << BenchmarkFunctionLatency(
+                    f, get_input, samples, MeasurementResult{0, 0}).ToGUMString();
+  // Nothing is subtracted from the measurement of the baseline itself.
+  MeasurementResult const identity_latency = BenchmarkFunctionLatency(
+      &identity, get_input, samples, MeasurementResult{0, 0});
+  return BenchmarkFunctionLatency(f, get_input, samples, identity_latency);
+}
+
+}  // namespace _benchmarking
+}  // namespace functions
+}  // namespace principia
diff --git a/functions/benchmarking.hpp b/functions/benchmarking.hpp
@@ -0,0 +1,26 @@
+#pragma once
+
+#include <functional>
+#include <string>
+
+namespace principia {
+namespace functions {
+namespace _benchmarking {
+
+// The result of a measurement: an estimate together with its standard
+// uncertainty, both in the same unit.  Both default to 0.
+struct MeasurementResult {
+  // The estimate of the measured quantity.
+  double value{};
+  // The standard uncertainty of `value`; 0 means that none is reported.
+  double standard_uncertainty{};
+  // Returns a string of the form value(uncertainty), e.g., 12.34(56), or just
+  // the value if `standard_uncertainty` is 0.
+  std::string ToGUMString() const;
+};
+
+// Measures the cost per call of `f`, in time stamp counter ticks, when `f` is
+// applied to many independent inputs in a row.  `get_input` is called to
+// produce each input; `samples` is the number of timed runs.  The result is
+// net of the cost of calling a non-inlined identity function, which is
+// measured in the same way.
+MeasurementResult BenchmarkFunctionThroughput(double (__cdecl *f)(double),
+                                              std::function<double()> get_input,
+                                              std::int64_t const samples);
+
+// Measures the cost per call of `f`, in time stamp counter ticks, when the
+// argument of each call depends on the result of the previous call.
+// `get_input` is called to produce the inputs; `samples` is the number of
+// timed runs.  The result is net of the cost of calling a non-inlined identity
+// function, which is measured in the same way.  Intermediate results are
+// logged.
+MeasurementResult BenchmarkFunctionLatency(double (__cdecl *f)(double),
+                                           std::function<double()> get_input,
+                                           std::int64_t const samples);
+
+}  // namespace _benchmarking
+}  // namespace functions
+}  // namespace principia
diff --git a/functions/functions.vcxproj b/functions/functions.vcxproj
@@ -7,12 +7,15 @@
   <ItemGroup>
     <ClInclude Include="accurate_table_generator.hpp" />
     <ClInclude Include="accurate_table_generator_body.hpp" />
+    <ClInclude Include="benchmarking.hpp" />
     <ClInclude Include="multiprecision.hpp" />
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="accurate_table_generator_test.cpp" />
+    <ClCompile Include="benchmarking.cpp" />
     <ClCompile Include="core_math_accuracy_test.cpp" />
     <ClCompile Include="multiprecision.cpp" />
+    <ClCompile Include="sin_cos_benchmark.cpp" />
     <ClCompile Include="sin_cos_test.cpp" />
     <ClCompile Include="std_accuracy_test.cpp" />
   </ItemGroup>

diff --git a/functions/functions.vcxproj.filters b/functions/functions.vcxproj.filters
@@ -33,6 +33,12 @@
     <ClCompile Include="sin_cos_test.cpp">
       <Filter>Test Files</Filter>
     </ClCompile>
+    <ClCompile Include="benchmarking.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
+    <ClCompile Include="sin_cos_benchmark.cpp">
+      <Filter>Test Files</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="multiprecision.hpp">
@@ -44,5 +50,8 @@
     <ClInclude Include="accurate_table_generator_body.hpp">
       <Filter>Source Files</Filter>
     </ClInclude>
+    <ClInclude Include="benchmarking.hpp">
+      <Filter>Header Files</Filter>
+    </ClInclude>
   </ItemGroup>
 </Project>
diff --git a/functions/sin_cos_benchmark.cpp b/functions/sin_cos_benchmark.cpp
@@ -0,0 +1,64 @@
+#include <algorithm>
+#include <limits>
+#include <random>
+
+#include "boost/multiprecision/cpp_int.hpp"
+#include "functions/benchmarking.hpp"
+#include "functions/multiprecision.hpp"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+#include "numerics/next.hpp"
+#include "numerics/sin_cos.hpp"
+#include "quantities/numbers.hpp"
+#include "testing_utilities/almost_equals.hpp"
+
+// This test lives in `functions` to avoid pulling `boost` into `numerics`.
+namespace principia {
+namespace numerics {
+namespace _sin_cos {
+
+using namespace functions::_benchmarking;
+
+// Fixture for the benchmarks below; provides the source of their inputs.
+class SinCosBenchmark : public ::testing::Test {
+ protected:
+  // Seeded with a constant, so that every run draws the same inputs.
+  std::mt19937_64 random_{42};
+  // The arguments at which the functions are evaluated lie in [-2π, 2π).
+  std::uniform_real_distribution<> uniformly_at_{-2 * π, 2 * π};
+};
+
+// Prints the latency of `std::sin` measured over 10000 samples.  Nothing is
+// asserted.
+TEST_F(SinCosBenchmark, StdSinLatency) {
+  // NOTE(review): `std::sin` is passed directly as a function pointer here,
+  // not wrapped in a lambda; this relies on the `double` overload being
+  // convertible to `double (__cdecl *)(double)`.  Confirm that this holds for
+  // every calling convention in use.
+  std::cout << "std::sin latency: "
+            << BenchmarkFunctionLatency(
+                   &std::sin, [this]() { return uniformly_at_(random_); }, 10000)
+                   .ToGUMString()
+            << " cycles\n";
+}
+
+// Prints the latency of `Sin` measured over 10000 samples.  Nothing is
+// asserted.
+TEST_F(SinCosBenchmark, PrincipiaSinLatency) {
+  std::cout << "Principia Sin latency: "
+            << BenchmarkFunctionLatency(
+                   &Sin, [this]() { return uniformly_at_(random_); }, 10000)
+                   .ToGUMString()
+            << " cycles\n";
+}
+
+// Prints the reciprocal throughput of `std::sin` measured over 10000 samples.
+// Nothing is asserted.
+TEST_F(SinCosBenchmark, StdSinThroughput) {
+  std::cout << "std::sin reciprocal throughput: "
+            << BenchmarkFunctionThroughput(
+                   &std::sin, [this]() { return uniformly_at_(random_); }, 10000)
+                   .ToGUMString()
+            << " cycles\n";
+}
+
+// Prints the reciprocal throughput of `Sin` measured over 10000 samples.
+// Nothing is asserted.
+TEST_F(SinCosBenchmark, PrincipiaSinThroughput) {
+  std::cout << "Principia Sin reciprocal throughput: "
+            << BenchmarkFunctionThroughput(
+                   &Sin, [this]() { return uniformly_at_(random_); }, 10000)
+                   .ToGUMString()
+            << " cycles\n";
+}
+
+}  // namespace _sin_cos
+}  // namespace numerics
+}  // namespace principia