perf(elreal,ci): cut sanitizer/Debug CI time ~20x (depth, -O1, ctest -j)

Ravenwater · claude · Ravenwater · commit 37cf1b839c96 · 2026-06-03T18:14:59.000-04:00
The ASan/UBSan jobs grew past an hour because the elreal transcendental tests
(el_math_trigonometry/hyperbolic/exponent/constants) run a heavy O(depth^4)
workload at -O0 with no test parallelism. Four independent, multiplicative levers,
none of which lose any asserted coverage:

1) Test generation depth (the dominant, quartic lever). The math functions default
   to depth 4 (~212 bits) for high-precision use, but the tests assert only loose
   host tolerances (1e-10/1e-5) plus structural identities and the 0-overlap
   invariant -- all satisfied with wide margin at far smaller depth (depth d yields
   ~d*53 bits on a double host). Thread an explicit small test depth (LIBRARY DEFAULTS
   UNCHANGED): single-series fns (atan/sin/cos/tan/exp/log/sinh/cosh/tanh) at
   depth 2 (~106 bits); the deepest compositions (asin/acos, pow=exp(y*log(x))) at
   depth 3; constants generated at depth 6 instead of the 16/32 defaults, with one
   deliberately-deep pi check kept at 8. Full elreal Debug+assertions ctest drops
   178s -> 8.2s locally (constants alone 175s -> 8.3s), 32/32 pass on gcc + clang.

2) Sanitizer builds at -O1 (CMakeLists.txt). AddressSanitizer's recommended
   setting: 2-4x faster than -O0 on this template-heavy header-only library with
   equal-or-better ASan detection (Debug -g retained for stacks). UBSan loses a few
   UB sites the optimizer elides before instrumentation -- an accepted trade.

3) ctest -j 2 in sanitizers.yml (was serial). Matches the 2-vCPU ubuntu-latest
   runner; safe under ASan's ~2-3x shadow-memory multiplier in 7 GB (the sibling
   cmake.yml already runs ctest -j 4 on the same runner).

4) --timeout 180 in sanitizers.yml: caps a hung instrumented test at 180s instead
   of CTest's 1500s default (tail-latency insurance, no effect on green runs).

Also tightens the priestRenorm iterate-to-fixpoint loop bound from 6 to 3
(threeAdd.hpp): a cancellation residual converges in two extra passes, so 3 is
ample headroom and removes the recently-added worst-case rescue cost.

Validated: full elreal suite (32 tests) passes on gcc and clang in Release and
Debug (assertions on); ASCII clean.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
diff --git a/.github/workflows/sanitizers.yml b/.github/workflows/sanitizers.yml
@@ -69,15 +69,18 @@ jobs:
         env:
           ASAN_OPTIONS: ${{ matrix.sanitizer == 'ASan' && 'detect_leaks=0:halt_on_error=1' || '' }}
           UBSAN_OPTIONS: ${{ matrix.sanitizer == 'UBSan' && 'print_stacktrace=1:halt_on_error=1' || '' }}
-        run: ctest --output-on-failure
+        # -j 2 matches the 2-vCPU ubuntu-latest runner; safe under ASan's ~2-3x
+        # shadow-memory multiplier in 7 GB. --timeout caps any hung instrumented
+        # test at 180s instead of CTest's 1500s default.
+        run: ctest -j 2 --timeout 180 --output-on-failure
 
       - name: Rerun failed tests
         if: failure()
         working-directory: ${{github.workspace}}/build
         env:
           ASAN_OPTIONS: ${{ matrix.sanitizer == 'ASan' && 'detect_leaks=0:halt_on_error=1' || '' }}
           UBSAN_OPTIONS: ${{ matrix.sanitizer == 'UBSan' && 'print_stacktrace=1:halt_on_error=1' || '' }}
-        run: ctest --rerun-failed --output-on-failure
+        run: ctest -j 2 --timeout 180 --rerun-failed --output-on-failure
 
       - name: Upload test logs
         if: failure()
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -478,16 +478,20 @@ if(CMAKE_COMPILER_IS_GNUCXX OR MINGW OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
 		set(EXTRA_C_FLAGS_RELEASE "${EXTRA_C_FLAGS_RELEASE} -ffp-contract=off")
 	endif()
 
-	# Sanitizer support (GCC/Clang only)
+	# Sanitizer support (GCC/Clang only). Compile at -O1: this is the AddressSanitizer
+	# project's recommended setting -- it runs 2-4x faster than -O0 (a large win for
+	# this template-heavy header-only library) with equal-or-better ASan detection,
+	# and keeps the Debug -g for symbolized stacks. UBSan loses a few UB sites the
+	# optimizer elides before instrumentation, an accepted trade for the speedup.
 	if(UNIVERSAL_ENABLE_ASAN)
-		message(STATUS "AddressSanitizer enabled")
-		set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
+		message(STATUS "AddressSanitizer enabled (-O1)")
+		set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fsanitize=address -fno-omit-frame-pointer -O1")
 		set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=address")
 	endif()
 
 	if(UNIVERSAL_ENABLE_UBSAN)
-		message(STATUS "UndefinedBehaviorSanitizer enabled")
-		set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fsanitize=undefined -fno-omit-frame-pointer")
+		message(STATUS "UndefinedBehaviorSanitizer enabled (-O1)")
+		set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fsanitize=undefined -fno-omit-frame-pointer -O1")
 		set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=undefined")
 	endif()
 
diff --git a/elastic/elreal/math/constants.cpp b/elastic/elreal/math/constants.cpp
@@ -53,46 +53,54 @@ int check_value(const sw::universal::ZBCL<FpType>& z, double ref, double tol, co
     return n;
 }
 
+// Test generation depth. The constant generators default to deep precision (e=32,
+// pi=16 blocks) for high-precision *use*, but these tests only compare against the
+// host-double std::numbers reference (a ~1e-12 tolerance). Six blocks is ~300+ bits
+// on a double host -- far past the reference -- while cutting generation cost to
+// ~(6/16)^4 of the 16-block default (less still for e), the dominant saving on
+// this, the most expensive elreal math test, under the -O0/instrumented CI tiers.
+static constexpr std::size_t kConstDepth = 6;
+
 // Series constants -- tested on double / float only.
 template <typename FpType>
-int verify_series(double tol, const std::string& host) {
+int verify_series(double tol, const std::string& host, std::size_t depth) {
     using namespace sw::universal;
     int n = 0;
-    n += check_value(e_zbcl<FpType>(),        std::numbers::e_v<double>,    tol, host + " e");
-    n += check_value(ln2_zbcl<FpType>(),      std::numbers::ln2_v<double>,  tol, host + " ln2");
-    n += check_value(ln10_zbcl<FpType>(),     std::numbers::ln10_v<double>, tol, host + " ln10");
-    n += check_value(log2_10_zbcl<FpType>(),  std::log2(10.0),              tol, host + " log2(10)");
-    n += check_value(pi_zbcl<FpType>(),       std::numbers::pi_v<double>,   tol, host + " pi");
+    n += check_value(e_zbcl<FpType>(depth),        std::numbers::e_v<double>,    tol, host + " e");
+    n += check_value(ln2_zbcl<FpType>(depth),      std::numbers::ln2_v<double>,  tol, host + " ln2");
+    n += check_value(ln10_zbcl<FpType>(depth),     std::numbers::ln10_v<double>, tol, host + " ln10");
+    n += check_value(log2_10_zbcl<FpType>(depth),  std::log2(10.0),              tol, host + " log2(10)");
+    n += check_value(pi_zbcl<FpType>(depth),       std::numbers::pi_v<double>,   tol, host + " pi");
     return n;
 }
 
 // Radical constants + euler_gamma -- tested on all hosts (stay in range).
 template <typename FpType>
-int verify_radicals(double tol, const std::string& host) {
+int verify_radicals(double tol, const std::string& host, std::size_t depth) {
     using namespace sw::universal;
     int n = 0;
     struct { ZBCL<FpType> z; double ref; const char* name; } rad[] = {
-        { sqrt2_zbcl<FpType>(), std::numbers::sqrt2_v<double>, "sqrt2" },
-        { sqrt3_zbcl<FpType>(), std::numbers::sqrt3_v<double>, "sqrt3" },
-        { sqrt5_zbcl<FpType>(), std::sqrt(5.0),                "sqrt5" },
-        { phi_zbcl<FpType>(),   std::numbers::phi_v<double>,   "phi"   },
+        { sqrt2_zbcl<FpType>(depth), std::numbers::sqrt2_v<double>, "sqrt2" },
+        { sqrt3_zbcl<FpType>(depth), std::numbers::sqrt3_v<double>, "sqrt3" },
+        { sqrt5_zbcl<FpType>(depth), std::sqrt(5.0),                "sqrt5" },
+        { phi_zbcl<FpType>(depth),   std::numbers::phi_v<double>,   "phi"   },
     };
     for (auto& r : rad) n += check_value(r.z, r.ref, tol, host + " " + r.name);
 
     // Algebraic identities (tolerance-based; sqrt/phi are approximate streams).
     for (double v : { 2.0, 3.0, 5.0 }) {
-        ZBCL<FpType> s = sqrt(from_native<FpType>(v));
+        ZBCL<FpType> s = sqrt(from_native<FpType>(v), depth);
         if (std::abs(est::approx(mul(s, s)) - v) > tol) {
             std::cout << host << " sqrt(" << v << ")^2 != " << v << '\n'; ++n;
         }
     }
     {   // phi^2 == phi + 1
-        ZBCL<FpType> phi = phi_zbcl<FpType>();
+        ZBCL<FpType> phi = phi_zbcl<FpType>(depth);
         double lhs = est::approx(mul(phi, phi));
         double rhs = est::approx(add(phi, from_native<FpType>(1.0)));
         if (std::abs(lhs - rhs) > tol) { std::cout << host << " phi^2 != phi+1\n"; ++n; }
     }
-    n += check_value(euler_gamma_zbcl<FpType>(), std::numbers::egamma_v<double>, tol, host + " egamma");
+    n += check_value(euler_gamma_zbcl<FpType>(depth), std::numbers::egamma_v<double>, tol, host + " egamma");
     return n;
 }
 
@@ -108,7 +116,9 @@ int verify_highprec_double() {
         block<double>{ -2.9947698097183397e-33,  0 },
     };
     ZBCL<double> ref = zbcl_from_blocks<double>(priestRenorm(pilimbs));
-    ZBCL<double> diff = add(pi_zbcl<double>(16), negate(ref));
+    // depth 8 (~424 bits) is the suite's one deliberately-deep generation, well
+    // past the 3-limb (~159-bit) reference it is compared against.
+    ZBCL<double> diff = add(pi_zbcl<double>(8), negate(ref));
     // Agreement to > 100 bits (~30 digits) demonstrates precision well beyond the
     // host double (the 3-limb reference itself caps the check at ~159 bits). The
     // leading limb cancels exactly, so the residual magnitude is the real signal
@@ -130,12 +140,12 @@ try {
     bool reportTestCases = false;
     ReportTestSuiteHeader(test_suite, reportTestCases);
 
-    nrOfFailedTestCases += verify_series<double>(1e-12, "const<double>");
-    nrOfFailedTestCases += verify_series<float>(1e-6, "const<float>");
+    nrOfFailedTestCases += verify_series<double>(1e-12, "const<double>", kConstDepth);
+    nrOfFailedTestCases += verify_series<float>(1e-6, "const<float>", kConstDepth);
 
-    nrOfFailedTestCases += verify_radicals<double>(1e-12, "const<double>");
-    nrOfFailedTestCases += verify_radicals<float>(1e-6, "const<float>");
-    nrOfFailedTestCases += verify_radicals<bfloat16>(1e-2, "const<bfloat16>");
+    nrOfFailedTestCases += verify_radicals<double>(1e-12, "const<double>", kConstDepth);
+    nrOfFailedTestCases += verify_radicals<float>(1e-6, "const<float>", kConstDepth);
+    nrOfFailedTestCases += verify_radicals<bfloat16>(1e-2, "const<bfloat16>", kConstDepth);
 
     nrOfFailedTestCases += verify_highprec_double();
 
diff --git a/elastic/elreal/math/exponent.cpp b/elastic/elreal/math/exponent.cpp
@@ -43,51 +43,58 @@ int near(const sw::universal::ZBCL<FpType>& z, double ref, double tol, const std
     return n;
 }
 
+// Test depths. exp/log are single-series and fine at depth 2, but verify_explog
+// also exercises general pow = exp(y*log(x)) (two stacked series), so it runs at
+// depth 3 (~159 bits) for guard bits; still only ~(3/4)^4 the cost of the default 4.
+// The integer pow path is a pure multiply and needs no series headroom (depth 2).
+static constexpr std::size_t kExpDepth    = 3;
+static constexpr std::size_t kPowIntDepth = 2;
+
 // exp/log/general-pow -- {double, float}.
 template <typename FpType>
-int verify_explog(double tol, const std::string& host) {
+int verify_explog(double tol, const std::string& host, std::size_t depth) {
     using namespace sw::universal;
     int n = 0;
 
     for (double v : { 0.0, 1.0, 2.0, -1.0, 0.5, 3.5 })
-        n += near(exp(from_native<FpType>(v)), std::exp(v), tol, host + " exp(" + std::to_string(v) + ")");
+        n += near(exp(from_native<FpType>(v), depth), std::exp(v), tol, host + " exp(" + std::to_string(v) + ")");
     for (double v : { 1.0, 2.0, 5.0, 10.0, 0.5, 100.0 })
-        n += near(log(from_native<FpType>(v)), std::log(v), tol, host + " log(" + std::to_string(v) + ")");
+        n += near(log(from_native<FpType>(v), depth), std::log(v), tol, host + " log(" + std::to_string(v) + ")");
 
     // round trips
     for (double v : { 0.5, 2.0, 3.0 }) {
-        n += near(log(exp(from_native<FpType>(v))), v, tol, host + " log(exp)");
+        n += near(log(exp(from_native<FpType>(v), depth), depth), v, tol, host + " log(exp)");
         ZBCL<FpType> pos = from_native<FpType>(std::exp(v));   // x = e^v > 0
-        n += near(exp(log(pos)), std::exp(v), tol * std::exp(v), host + " exp(log)");
+        n += near(exp(log(pos, depth), depth), std::exp(v), tol * std::exp(v), host + " exp(log)");
     }
     // exp(a+b) == exp(a)*exp(b)
     {
         ZBCL<FpType> a = from_native<FpType>(1.0), b = from_native<FpType>(0.5);
-        double lhs = est::approx(exp(add(a, b)));
-        double rhs = est::approx(mul(exp(a), exp(b)));
+        double lhs = est::approx(exp(add(a, b), depth));
+        double rhs = est::approx(mul(exp(a, depth), exp(b, depth)));
         if (std::abs(lhs - rhs) > tol * std::exp(1.5)) { std::cout << host << " exp(a+b)!=exp(a)exp(b)\n"; ++n; }
     }
     // log(x*y) == log(x)+log(y)
     {
         ZBCL<FpType> x = from_native<FpType>(3.0), y = from_native<FpType>(7.0);
-        double lhs = est::approx(log(mul(x, y)));
-        double rhs = est::approx(add(log(x), log(y)));
+        double lhs = est::approx(log(mul(x, y), depth));
+        double rhs = est::approx(add(log(x, depth), log(y, depth)));
         if (std::abs(lhs - rhs) > tol) { std::cout << host << " log(xy)!=log(x)+log(y)\n"; ++n; }
     }
     // general pow via exp(y*log(x))
-    n += near(pow(from_native<FpType>(2.0), from_native<FpType>(0.5)), std::sqrt(2.0), tol, host + " pow(2,0.5)");
-    n += near(pow(from_native<FpType>(9.0), from_native<FpType>(0.5)), 3.0, tol, host + " pow(9,0.5)");
+    n += near(pow(from_native<FpType>(2.0), from_native<FpType>(0.5), depth), std::sqrt(2.0), tol, host + " pow(2,0.5)");
+    n += near(pow(from_native<FpType>(9.0), from_native<FpType>(0.5), depth), 3.0, tol, host + " pow(9,0.5)");
     return n;
 }
 
 // pow integer fast path (pure multiply) -- all hosts.
 template <typename FpType>
-int verify_pow_int(double tol, const std::string& host) {
+int verify_pow_int(double tol, const std::string& host, std::size_t depth) {
     using namespace sw::universal;
     int n = 0;
     struct { double b, e, r; } cases[] = { {2,10,1024}, {3,4,81}, {5,3,125}, {2,0,1}, {7,2,49} };
     for (auto& c : cases) {
-        ZBCL<FpType> p = pow(from_native<FpType>(c.b), from_native<FpType>(c.e));
+        ZBCL<FpType> p = pow(from_native<FpType>(c.b), from_native<FpType>(c.e), depth);
         if (std::abs(est::approx(p) - c.r) > tol * std::max(1.0, c.r)) {
             std::cout << host << " pow(" << c.b << "," << c.e << ") = " << est::approx(p)
                       << " != " << c.r << '\n'; ++n;
@@ -107,12 +114,12 @@ try {
     bool reportTestCases = false;
     ReportTestSuiteHeader(test_suite, reportTestCases);
 
-    nrOfFailedTestCases += verify_explog<double>(1e-10, "explog<double>");
-    nrOfFailedTestCases += verify_explog<float>(1e-5, "explog<float>");
+    nrOfFailedTestCases += verify_explog<double>(1e-10, "explog<double>", kExpDepth);
+    nrOfFailedTestCases += verify_explog<float>(1e-5, "explog<float>", kExpDepth);
 
-    nrOfFailedTestCases += verify_pow_int<double>(1e-12, "pow<double>");
-    nrOfFailedTestCases += verify_pow_int<float>(1e-5, "pow<float>");
-    nrOfFailedTestCases += verify_pow_int<bfloat16>(1e-1, "pow<bfloat16>");
+    nrOfFailedTestCases += verify_pow_int<double>(1e-12, "pow<double>", kPowIntDepth);
+    nrOfFailedTestCases += verify_pow_int<float>(1e-5, "pow<float>", kPowIntDepth);
+    nrOfFailedTestCases += verify_pow_int<bfloat16>(1e-1, "pow<bfloat16>", kPowIntDepth);
 
     ReportTestSuiteResults(test_suite, nrOfFailedTestCases);
     return (nrOfFailedTestCases > 0 ? EXIT_FAILURE : EXIT_SUCCESS);
diff --git a/elastic/elreal/math/hyperbolic.cpp b/elastic/elreal/math/hyperbolic.cpp
@@ -37,35 +37,41 @@ int near(const sw::universal::ZBCL<FpType>& z, double ref, double tol, const std
     return n;
 }
 
+// Test depth. sinh/cosh/tanh are single-series (exp-based) functions; depth 2
+// (~106 bits on a double host) clears the 1e-10/1e-5 tolerances and the
+// cosh^2-sinh^2 identity with wide margin; since cost scales as ~O(depth^4), that
+// is ~1/16 the cost of the default depth 4 -- the dominant lever on the instrumented CI tiers.
+static constexpr std::size_t kHypDepth = 2;
+
 template <typename FpType>
-int verify_all(double tol, const std::string& host) {
+int verify_all(double tol, const std::string& host, std::size_t depth) {
     using namespace sw::universal;
     int n = 0;
 
     for (double x : { 0.0, 0.5, 1.0, -1.5, 2.0, -0.25 }) {
         const std::string sx = std::to_string(x);
-        n += near(sinh(from_native<FpType>(x)), std::sinh(x), tol, host + " sinh(" + sx + ")");
-        n += near(cosh(from_native<FpType>(x)), std::cosh(x), tol, host + " cosh(" + sx + ")");
-        n += near(tanh(from_native<FpType>(x)), std::tanh(x), tol, host + " tanh(" + sx + ")");
+        n += near(sinh(from_native<FpType>(x), depth), std::sinh(x), tol, host + " sinh(" + sx + ")");
+        n += near(cosh(from_native<FpType>(x), depth), std::cosh(x), tol, host + " cosh(" + sx + ")");
+        n += near(tanh(from_native<FpType>(x), depth), std::tanh(x), tol, host + " tanh(" + sx + ")");
     }
 
     // cosh^2 - sinh^2 == 1
     for (double x : { 0.5, 1.3, 2.0 }) {
-        ZBCL<FpType> c = cosh(from_native<FpType>(x)), s = sinh(from_native<FpType>(x));
+        ZBCL<FpType> c = cosh(from_native<FpType>(x), depth), s = sinh(from_native<FpType>(x), depth);
         double id = est::approx(add(mul(c, c), negate(mul(s, s))));
         if (std::abs(id - 1.0) > tol) { std::cout << host << " cosh^2-sinh^2 != 1 at " << x << " (" << id << ")\n"; ++n; }
     }
     // tanh == sinh/cosh
     for (double x : { 0.7, 1.5 }) {
-        ZBCL<FpType> th = tanh(from_native<FpType>(x));
-        ZBCL<FpType> sc = div(sinh(from_native<FpType>(x)), cosh(from_native<FpType>(x)));
+        ZBCL<FpType> th = tanh(from_native<FpType>(x), depth);
+        ZBCL<FpType> sc = div(sinh(from_native<FpType>(x), depth), cosh(from_native<FpType>(x), depth));
         if (std::abs(est::approx(th) - est::approx(sc)) > tol) { std::cout << host << " tanh!=sinh/cosh at " << x << '\n'; ++n; }
     }
     // parity
     {
         ZBCL<FpType> a = from_native<FpType>(1.1);
-        if (std::abs(est::approx(sinh(negate(a))) + est::approx(sinh(a))) > tol) { std::cout << host << " sinh parity\n"; ++n; }
-        if (std::abs(est::approx(cosh(negate(a))) - est::approx(cosh(a))) > tol) { std::cout << host << " cosh parity\n"; ++n; }
+        if (std::abs(est::approx(sinh(negate(a), depth)) + est::approx(sinh(a, depth))) > tol) { std::cout << host << " sinh parity\n"; ++n; }
+        if (std::abs(est::approx(cosh(negate(a), depth)) - est::approx(cosh(a, depth))) > tol) { std::cout << host << " cosh parity\n"; ++n; }
     }
     return n;
 }
@@ -80,8 +86,8 @@ try {
     bool reportTestCases = false;
     ReportTestSuiteHeader(test_suite, reportTestCases);
 
-    nrOfFailedTestCases += verify_all<double>(1e-10, "hyp<double>");
-    nrOfFailedTestCases += verify_all<float>(1e-5, "hyp<float>");
+    nrOfFailedTestCases += verify_all<double>(1e-10, "hyp<double>", kHypDepth);
+    nrOfFailedTestCases += verify_all<float>(1e-5, "hyp<float>", kHypDepth);
 
     ReportTestSuiteResults(test_suite, nrOfFailedTestCases);
     return (nrOfFailedTestCases > 0 ? EXIT_FAILURE : EXIT_SUCCESS);
diff --git a/elastic/elreal/math/trigonometry.cpp b/elastic/elreal/math/trigonometry.cpp
diff --git a/include/sw/universal/number/elreal/threeAdd.hpp b/include/sw/universal/number/elreal/threeAdd.hpp