Merge pull request #280 from hvdijk/horner-precision

Improve precision of horner_polynomial
uxlfoundation · Jan 8, 2024 · f732b95 · f732b95
2 parents ae30b07 + c59ec90
commit f732b95
Show file tree

Hide file tree

Showing 46 changed files with 234 additions and 323 deletions.
diff --git a/modules/compiler/builtins/abacus/include/abacus/internal/atan_unsafe.h b/modules/compiler/builtins/abacus/include/abacus/internal/atan_unsafe.h
@@ -70,7 +70,7 @@ T atan_unsafe(const T &x) {
       -0.27671732461794832889713721816369215e-1,
       0.12384767226415412063082059444905478e-1};
 
-  T r = c * abacus::internal::horner_polynomial<T, 15>(c * c, polynomial);
+  T r = c * abacus::internal::horner_polynomial(c * c, polynomial);
 
   // atan(PI/7) in two constants for Cody & Waite reduction
   const abacus_double atank1 =

diff --git a/modules/compiler/builtins/abacus/include/abacus/internal/exp_unsafe.h b/modules/compiler/builtins/abacus/include/abacus/internal/exp_unsafe.h
@@ -58,8 +58,7 @@ struct exp_unsafe_helper<T, abacus_half> {
                                        1.0833740234375e-2f16};
 
     // Minimax polynomial approximation in the domain [0, ln(2)] of e^x
-    const T result =
-        abacus::internal::horner_polynomial<T, 6>(rr_x, polynomial);
+    const T result = abacus::internal::horner_polynomial(rr_x, polynomial);
 
     // Polynomial approximation * 2^k
     return abacus::internal::ldexp_unsafe(result, k);
@@ -87,7 +86,7 @@ struct exp_unsafe_helper<T, abacus_float> {
         0.272579824216659e-5f};
 
     // minimax from 0 -> ln(2) of e^x
-    const T result = abacus::internal::horner_polynomial<T, 10>(r, polynomial);
+    const T result = abacus::internal::horner_polynomial(r, polynomial);
 
     return abacus::internal::ldexp_unsafe(result, k);
   }
@@ -126,7 +125,7 @@ struct exp_unsafe_helper<T, abacus_double> {
                                           0.294609311301038779771435680411e-8};
 
     // minimax from 0 -> ln(2) of e^x
-    const T result = abacus::internal::horner_polynomial<T, 15>(r, polynomial);
+    const T result = abacus::internal::horner_polynomial(r, polynomial);
 
     return abacus::internal::ldexp_unsafe(result, k);
   }

diff --git a/modules/compiler/builtins/abacus/include/abacus/internal/half_sincos_approx.h b/modules/compiler/builtins/abacus/include/abacus/internal/half_sincos_approx.h
@@ -34,9 +34,9 @@ inline T half_sincos_approx(const T &x, T *cosVal) {
 
   // minimax polynomials from 0 -> PI/4
   const T xx = x * x;
-  *cosVal = abacus::internal::horner_polynomial<T, 4>(xx, _half_sincos_coefc);
+  *cosVal = abacus::internal::horner_polynomial(xx, _half_sincos_coefc);
 
-  return x * abacus::internal::horner_polynomial<T, 4>(xx, _half_sincos_coefs);
+  return x * abacus::internal::horner_polynomial(xx, _half_sincos_coefs);
 }
 }  // namespace internal
 }  // namespace abacus

diff --git a/modules/compiler/builtins/abacus/include/abacus/internal/horner_polynomial.h b/modules/compiler/builtins/abacus/include/abacus/internal/horner_polynomial.h
@@ -22,71 +22,21 @@
 
 namespace abacus {
 namespace internal {
-#ifdef __CA_BUILTINS_HALF_SUPPORT
-inline abacus_half horner_polynomial(const abacus_half x,
-                                     ABACUS_CONSTANT abacus_half *p_coef,
-                                     int N) {
-  abacus_half coef_sum = p_coef[N - 1];
+template <typename T, typename TCoef>
+inline T horner_polynomial(const T x, const TCoef *p_coef, size_t N) {
+  T coef_sum = T(p_coef[N - 1]);
 
-  for (int n = N - 1; n > 0; n--) {
-    coef_sum = p_coef[n - 1] + x * coef_sum;
+  for (size_t n = N - 1; n > 0; n--) {
+    coef_sum = __abacus_fma(coef_sum, x, T(p_coef[n - 1]));
   }
 
   return coef_sum;
 }
-#endif  // __CA_BUILTINS_HALF_SUPPORT
 
-inline abacus_float horner_polynomial(const abacus_float x,
-                                      ABACUS_CONSTANT abacus_float *p_coef,
-                                      int N) {
-  abacus_float coef_sum = p_coef[N - 1];
-
-  for (int n = N - 1; n > 0; n--) {
-    coef_sum = p_coef[n - 1] + x * coef_sum;
-  }
-
-  return coef_sum;
-}
-
-#ifdef __CA_BUILTINS_DOUBLE_SUPPORT
-inline abacus_double horner_polynomial(const abacus_double x,
-                                       ABACUS_CONSTANT abacus_double *p_coef,
-                                       int N) {
-  abacus_double coef_sum = p_coef[N - 1];
-
-  for (int n = N - 1; n > 0; n--) {
-    coef_sum = p_coef[n - 1] + x * coef_sum;
-  }
-
-  return coef_sum;
-}
-#endif  // __CA_BUILTINS_DOUBLE_SUPPORT
-
-template <typename T, unsigned int N>
-inline T horner_polynomial(
-    const T &x, const typename TypeTraits<T>::ElementType *coefficients) {
-  T sum = coefficients[N - 1];
-
-  for (unsigned int n = N - 1; n > 0; n--) {
-    sum = ((T)coefficients[n - 1]) + x * sum;
-  }
-
-  return sum;
-}
-
-#ifdef __OPENCL_VERSION__
-template <typename T, unsigned int N>
-inline T horner_polynomial(const T &x, ABACUS_CONSTANT
-                           typename TypeTraits<T>::ElementType *coefficients) {
-  T sum = coefficients[N - 1];
-
-  for (unsigned int n = N - 1; n > 0; n--) {
-    sum = ((T)coefficients[n - 1]) + x * sum;
-  }
-
-  return sum;
+template <typename T, size_t N, typename TCoef>
+inline T horner_polynomial(const T x, const TCoef (&coef)[N]) {
+  return horner_polynomial(x, coef, N);
 }
-#endif
 }  // namespace internal
 }  // namespace abacus
 

diff --git a/modules/compiler/builtins/abacus/include/abacus/internal/lgamma_positive.h b/modules/compiler/builtins/abacus/include/abacus/internal/lgamma_positive.h
@@ -175,8 +175,9 @@ inline T lgamma_positive(const T &x) {
     const SignedType cond = x > _intervals[i];
     interval = __abacus_select(interval, i, cond);
 
-    const T poly = abacus::internal::horner_polynomial<T, 8>(
-        x - _lgamma_translation[i], __codeplay_lgamma_positive_coeff + i * 8);
+    const T poly = abacus::internal::horner_polynomial(
+        x - _lgamma_translation[i], __codeplay_lgamma_positive_coeff + i * 8,
+        8);
 
     ans = __abacus_select(ans, poly, cond);
   }
@@ -212,9 +213,9 @@ inline T lgamma_positive_half(const T &x) {
     const SignedType cond = x > _intervals_half[i];
     interval = __abacus_select(interval, i, cond);
 
-    const T poly = abacus::internal::horner_polynomial<T, 8>(
+    const T poly = abacus::internal::horner_polynomial(
         x - _lgamma_translation_half[i],
-        __codeplay_lgamma_positive_coeff_half + i * 8);
+        __codeplay_lgamma_positive_coeff_half + i * 8, 8);
 
     ans = __abacus_select(ans, poly, cond);
   }

diff --git a/modules/compiler/builtins/abacus/include/abacus/internal/log2_extended_precision.h b/modules/compiler/builtins/abacus/include/abacus/internal/log2_extended_precision.h
@@ -35,8 +35,10 @@ namespace {
 // Aprroximation of log2(x+1) between [-0.25;0.5]
 // See log2_extended_precision.sollya for derivation
 // In order to gain performance we've dropped the first term of the polynomial
-// out, 1.442689418792724609375, since it's in 32-bit precision and we'll
-// manually add it in exactly by splitting into 16-bit values.
+// out, since it's in 32-bit precision and we'll manually add it in exactly by
+// splitting into 16-bit values.
+static constexpr ABACUS_CONSTANT abacus_float
+    __codeplay_log2_extended_precision_coeffH0 = 1.442689418792724609375f;
 static ABACUS_CONSTANT abacus_half
     __codeplay_log2_extended_precision_coeffH[5] = {
         -0.72119140625f16, 0.4814453125f16, -0.369384765625f16, 0.2919921875f16,
@@ -64,7 +66,7 @@ struct log2_extended_precision_helper<T, abacus_float> {
   static T _(const T &xMant, T *out_remainder) {
     T xMAnt1m = xMant - 1.0f;
 
-    T poly = abacus::internal::horner_polynomial<T, 16>(
+    T poly = abacus::internal::horner_polynomial(
         xMAnt1m, __codeplay_log2_extended_precision_coeff);
 
     T poly_times_x_lo;
@@ -147,7 +149,7 @@ T log2_extended_precision_half_unsafe(const T &x, T *ans_lo, T *xExp) {
 
   // Approximate log2(x+1) with polynomial
   xMant = xMant - 1.0f16;
-  T poly_start = abacus::internal::horner_polynomial<T, 5>(
+  T poly_start = abacus::internal::horner_polynomial(
       xMant, __codeplay_log2_extended_precision_coeffH);
 
   T poly_lo;
@@ -222,7 +224,7 @@ T log2_extended_precision_half_safe(const T &x, T *ans_lo, T *hiExp, T *loExp) {
   // Approximate log2(x+1) with polynomial
   xMant = xMant - 1.0f16;
 
-  T poly_start = abacus::internal::horner_polynomial<T, 5>(
+  T poly_start = abacus::internal::horner_polynomial(
       xMant, __codeplay_log2_extended_precision_coeffH);
 
   // Avoid creating denormal numbers in the lo components of exact add and

diff --git a/modules/compiler/builtins/abacus/include/abacus/internal/log_extended_precision.h b/modules/compiler/builtins/abacus/include/abacus/internal/log_extended_precision.h
@@ -61,7 +61,7 @@ namespace internal {
 template <typename T>
 inline T log_extended_precision(const T &xMant, T *out_remainder) {
   const T xMant1m = xMant - 1.0f;
-  const T poly = abacus::internal::horner_polynomial<T, 26>(
+  const T poly = abacus::internal::horner_polynomial(
       xMant1m, __codeplay_natural_log_extended_precision_coeffD);
 
   // We need xMant1m*(xMant1m*(xMant1m*(xMant1m*poly + 1/3) - 0.5) + 1)

diff --git a/modules/compiler/builtins/abacus/include/abacus/internal/pow_unsafe.h b/modules/compiler/builtins/abacus/include/abacus/internal/pow_unsafe.h
@@ -161,7 +161,7 @@ struct pow_unsafe_helper<T, abacus_half> {
 
     // Now a normal exp2 should do the trick:
     // We know that 0 <= y_times_log2X <= 1 so we can just use a polynomial
-    T result = abacus::internal::horner_polynomial<T, 6>(
+    T result = abacus::internal::horner_polynomial(
         y_times_log2X, __codeplay_pow_unsafe_coeffH);
 
     // We do the same trick as for __log2_extra_precision here, using some extra
@@ -238,8 +238,8 @@ struct pow_unsafe_helper<T, abacus_float> {
     exponent_mantissa -= abacus::detail::cast::convert<T>(
         abacus::internal::floor_unsafe(exponent_mantissa));
 
-    T result = abacus::internal::horner_polynomial<T, 8>(
-        exponent_mantissa, __codeplay_pow_unsafe_coeff);
+    T result = abacus::internal::horner_polynomial(exponent_mantissa,
+                                                   __codeplay_pow_unsafe_coeff);
 
     result = __abacus_ldexp(result, exponent_floor);
 
@@ -326,10 +326,9 @@ struct pow_unsafe_helper<T, abacus_double> {
     exponent_floor += mantissa_trunc;
     exponent_mantissa -= abacus::detail::cast::convert<T>(mantissa_trunc);
 
-    T result =
-        (T)1.0 + exponent_mantissa *
-                     abacus::internal::horner_polynomial<T, 18>(
-                         exponent_mantissa, __codeplay_pow_unsafe_coeffD);
+    T result = (T)1.0 + exponent_mantissa * abacus::internal::horner_polynomial(
+                                                exponent_mantissa,
+                                                __codeplay_pow_unsafe_coeffD);
 
     result = __abacus_ldexp(
         result, abacus::detail::cast::convert<IntVecType>(exponent_floor));

diff --git a/modules/compiler/builtins/abacus/include/abacus/internal/sincos_approx.h b/modules/compiler/builtins/abacus/include/abacus/internal/sincos_approx.h
@@ -62,10 +62,9 @@ struct sincos_approx_helper<T, abacus_half> {
     using UnsignedType = typename TypeTraits<T>::UnsignedType;
 
     const T xx = x * x;
-    *out_cos = abacus::internal::horner_polynomial<T, 4>(xx, _sincos_coefcH);
+    *out_cos = abacus::internal::horner_polynomial(xx, _sincos_coefcH);
 
-    const T sin =
-        x * abacus::internal::horner_polynomial<T, 4>(xx, _sincos_coefsH);
+    const T sin = x * abacus::internal::horner_polynomial(xx, _sincos_coefsH);
 
     // 0.151611328125 (0x30da) is only slightly above 2 ULP. It's faster to
     // add a special case for this input using select instead of changing the
@@ -81,9 +80,9 @@ template <typename T>
 struct sincos_approx_helper<T, abacus_float> {
   static T _(const T &x, T *out_cos) {
     const T xx = x * x;
-    *out_cos = abacus::internal::horner_polynomial<T, 4>(xx, _sincos_coefc);
+    *out_cos = abacus::internal::horner_polynomial(xx, _sincos_coefc);
 
-    return x * abacus::internal::horner_polynomial<T, 4>(xx, _sincos_coefs);
+    return x * abacus::internal::horner_polynomial(xx, _sincos_coefs);
   }
 };
 
@@ -92,9 +91,9 @@ template <typename T>
 struct sincos_approx_helper<T, abacus_double> {
   static T _(const T &x, T *out_cos) {
     const T xx = x * x;
-    *out_cos = abacus::internal::horner_polynomial<T, 7>(xx, _sincos_coefcD);
+    *out_cos = abacus::internal::horner_polynomial(xx, _sincos_coefcD);
 
-    return x * abacus::internal::horner_polynomial<T, 7>(xx, _sincos_coefsD);
+    return x * abacus::internal::horner_polynomial(xx, _sincos_coefsD);
   }
 };
 #endif  // __CA_BUILTINS_DOUBLE_SUPPORT

diff --git a/modules/compiler/builtins/abacus/source/abacus_math/acos.cpp b/modules/compiler/builtins/abacus/source/abacus_math/acos.cpp
@@ -156,8 +156,8 @@ T acos(const T x) {
     const SignedType cond = xAbs <= intervals[i];
     interval = __abacus_select(interval, i, cond);
 
-    const T poly = abacus::internal::horner_polynomial<T, 4>(
-        i < 12 ? oneMinusXAbs : xAbs, polynomial + i * 4);
+    const T poly = abacus::internal::horner_polynomial(
+        i < 12 ? oneMinusXAbs : xAbs, polynomial + i * 4, 4);
 
     ans = __abacus_select(ans, poly, cond);
   }
@@ -217,7 +217,7 @@ T ABACUS_API acos_half(T x) {
 
   xAbs = xAbs - T(1.0f16);
   T ansBig =
-      xAbs * abacus::internal::horner_polynomial<T, 3>(xAbs, __codeplay_acos_1);
+      xAbs * abacus::internal::horner_polynomial(xAbs, __codeplay_acos_1);
 
   ansBig = abacus::internal::sqrt(ansBig);
 

diff --git a/modules/compiler/builtins/abacus/source/abacus_math/acosh.cpp b/modules/compiler/builtins/abacus/source/abacus_math/acosh.cpp
@@ -75,7 +75,7 @@ T acosh_half(const T x) {
 
   const T small_return =
       abacus::internal::sqrt(x - 1.0f16) *
-      abacus::internal::horner_polynomial<T, 3>(x - T(1.0f16), _acoshH);
+      abacus::internal::horner_polynomial(x - T(1.0f16), _acoshH);
 
   ans = __abacus_select(ans, small_return, SignedType(x < T(2.0f16)));
 
@@ -86,8 +86,7 @@ template <>
 abacus_half acosh_half(const abacus_half x) {
   if (x < 2.0f16) {
     return abacus::internal::sqrt(x - 1.0f16) *
-           abacus::internal::horner_polynomial<abacus_half, 3>(x - 1.0f16,
-                                                               _acoshH);
+           abacus::internal::horner_polynomial(x - 1.0f16, _acoshH);
   }
 
   // As x > 11.7, acosh(x) = log(x + sqrt(x^2 - 1)) converges to

diff --git a/modules/compiler/builtins/abacus/source/abacus_math/acospi.cpp b/modules/compiler/builtins/abacus/source/abacus_math/acospi.cpp
@@ -50,10 +50,10 @@ T acospi_half(const T x) {
 
   // TODO maybe there's a way to pick the coefficients of the polynomials
   // branchlessly, then this wouldn't branch
-  const T poly1 = abacus::internal::horner_polynomial<T, 3>(
-      x2, __codeplay_acospi_coeff_halfH2);
-  const T poly2 = abacus::internal::horner_polynomial<T, 3>(
-      xAbs, __codeplay_acospi_coeff_halfH1);
+  const T poly1 =
+      abacus::internal::horner_polynomial(x2, __codeplay_acospi_coeff_halfH2);
+  const T poly2 =
+      abacus::internal::horner_polynomial(xAbs, __codeplay_acospi_coeff_halfH1);
 
   ans = xAbs * __abacus_select(poly1, poly2, cond1);
 
@@ -71,13 +71,13 @@ abacus_half acospi_half(const abacus_half x) {
 
   if (xAbs > 5.9375E-1f16) {
     xAbs = xAbs - 1.0f16;
-    ans = xAbs * abacus::internal::horner_polynomial<abacus_half, 3>(
+    ans = xAbs * abacus::internal::horner_polynomial(
                      xAbs, __codeplay_acospi_coeff_halfH1);
 
     ans = abacus::internal::sqrt(ans);
   } else {
     const abacus_half x2 = x * x;
-    ans = xAbs * abacus::internal::horner_polynomial<abacus_half, 3>(
+    ans = xAbs * abacus::internal::horner_polynomial(
                      x2, __codeplay_acospi_coeff_halfH2);
 
     ans = ans + 0.5f16;

diff --git a/modules/compiler/builtins/abacus/source/abacus_math/asin.cpp b/modules/compiler/builtins/abacus/source/abacus_math/asin.cpp
@@ -161,12 +161,12 @@ T asin_half(const T x) {
   // around x = 1 we want to estimate (asin(x) - pi/2)^2
   SignedType xBig = (xAbs > T(5.9375E-1f16));
   const T x2 = x * x;
-  T ans = x * abacus::internal::horner_polynomial<T, 3>(x2, __codeplay_asin_2);
+  T ans = x * abacus::internal::horner_polynomial(x2, __codeplay_asin_2);
 
   xAbs = xAbs - T(1.0f16);
 
   T ansBig =
-      xAbs * abacus::internal::horner_polynomial<T, 3>(xAbs, __codeplay_asin_1);
+      xAbs * abacus::internal::horner_polynomial(xAbs, __codeplay_asin_1);
 
   ansBig = -abacus::internal::sqrt(ansBig) + ABACUS_PI_2_H;
   ansBig = __abacus_copysign(ansBig, x);
@@ -197,17 +197,15 @@ abacus_half asin_half(const abacus_half x) {
 
     // Estimate (asin(x + 1) - pi / 2)^2 (see solly script)
     abacus_half ans =
-        xAbs * abacus::internal::horner_polynomial<abacus_half, 3>(
-                   xAbs, __codeplay_asin_1);
+        xAbs * abacus::internal::horner_polynomial(xAbs, __codeplay_asin_1);
 
     ans = -abacus::internal::sqrt(ans) + ABACUS_PI_2_H;
     return __abacus_copysign(ans, x);
   }
 
   // Estimate the remaining values
   const abacus_half x2 = x * x;
-  return x * abacus::internal::horner_polynomial<abacus_half, 3>(
-                 x2, __codeplay_asin_2);
+  return x * abacus::internal::horner_polynomial(x2, __codeplay_asin_2);
 }
 }  // namespace
 
@@ -229,8 +227,8 @@ T asin(const T x) {
     const SignedType cond = xAbs < intervals[i];
     interval = __abacus_select(interval, i, cond);
 
-    const T poly = abacus::internal::horner_polynomial<T, 5>(
-        i < 9 ? oneMinusXAbs : xAbs, __codeplay_asin_coeff + i * 5);
+    const T poly = abacus::internal::horner_polynomial(
+        i < 9 ? oneMinusXAbs : xAbs, __codeplay_asin_coeff + i * 5, 5);
 
     ans = __abacus_select(ans, poly, cond);
   }