Skip to content

Commit

Permalink
Merge pull request #280 from hvdijk/horner-precision
Browse files Browse the repository at this point in the history
Improve precision of horner_polynomial
  • Loading branch information
hvdijk authored Jan 8, 2024
2 parents ae30b07 + c59ec90 commit f732b95
Show file tree
Hide file tree
Showing 46 changed files with 234 additions and 323 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ T atan_unsafe(const T &x) {
-0.27671732461794832889713721816369215e-1,
0.12384767226415412063082059444905478e-1};

T r = c * abacus::internal::horner_polynomial<T, 15>(c * c, polynomial);
T r = c * abacus::internal::horner_polynomial(c * c, polynomial);

// atan(PI/7) in two constants for Cody & Waite reduction
const abacus_double atank1 =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,7 @@ struct exp_unsafe_helper<T, abacus_half> {
1.0833740234375e-2f16};

// Minimax polynomial approximation in the domain [0, ln(2)] of e^x
const T result =
abacus::internal::horner_polynomial<T, 6>(rr_x, polynomial);
const T result = abacus::internal::horner_polynomial(rr_x, polynomial);

// Polynomial approximation * 2^k
return abacus::internal::ldexp_unsafe(result, k);
Expand Down Expand Up @@ -87,7 +86,7 @@ struct exp_unsafe_helper<T, abacus_float> {
0.272579824216659e-5f};

// minimax from 0 -> ln(2) of e^x
const T result = abacus::internal::horner_polynomial<T, 10>(r, polynomial);
const T result = abacus::internal::horner_polynomial(r, polynomial);

return abacus::internal::ldexp_unsafe(result, k);
}
Expand Down Expand Up @@ -126,7 +125,7 @@ struct exp_unsafe_helper<T, abacus_double> {
0.294609311301038779771435680411e-8};

// minimax from 0 -> ln(2) of e^x
const T result = abacus::internal::horner_polynomial<T, 15>(r, polynomial);
const T result = abacus::internal::horner_polynomial(r, polynomial);

return abacus::internal::ldexp_unsafe(result, k);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@ inline T half_sincos_approx(const T &x, T *cosVal) {

// minimax polynomials from 0 -> PI/4
const T xx = x * x;
*cosVal = abacus::internal::horner_polynomial<T, 4>(xx, _half_sincos_coefc);
*cosVal = abacus::internal::horner_polynomial(xx, _half_sincos_coefc);

return x * abacus::internal::horner_polynomial<T, 4>(xx, _half_sincos_coefs);
return x * abacus::internal::horner_polynomial(xx, _half_sincos_coefs);
}
} // namespace internal
} // namespace abacus
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,71 +22,21 @@

namespace abacus {
namespace internal {
#ifdef __CA_BUILTINS_HALF_SUPPORT
inline abacus_half horner_polynomial(const abacus_half x,
ABACUS_CONSTANT abacus_half *p_coef,
int N) {
abacus_half coef_sum = p_coef[N - 1];
template <typename T, typename TCoef>
inline T horner_polynomial(const T x, const TCoef *p_coef, size_t N) {
T coef_sum = T(p_coef[N - 1]);

for (int n = N - 1; n > 0; n--) {
coef_sum = p_coef[n - 1] + x * coef_sum;
for (size_t n = N - 1; n > 0; n--) {
coef_sum = __abacus_fma(coef_sum, x, T(p_coef[n - 1]));
}

return coef_sum;
}
#endif // __CA_BUILTINS_HALF_SUPPORT

inline abacus_float horner_polynomial(const abacus_float x,
ABACUS_CONSTANT abacus_float *p_coef,
int N) {
abacus_float coef_sum = p_coef[N - 1];

for (int n = N - 1; n > 0; n--) {
coef_sum = p_coef[n - 1] + x * coef_sum;
}

return coef_sum;
}

#ifdef __CA_BUILTINS_DOUBLE_SUPPORT
inline abacus_double horner_polynomial(const abacus_double x,
ABACUS_CONSTANT abacus_double *p_coef,
int N) {
abacus_double coef_sum = p_coef[N - 1];

for (int n = N - 1; n > 0; n--) {
coef_sum = p_coef[n - 1] + x * coef_sum;
}

return coef_sum;
}
#endif // __CA_BUILTINS_DOUBLE_SUPPORT

template <typename T, unsigned int N>
inline T horner_polynomial(
const T &x, const typename TypeTraits<T>::ElementType *coefficients) {
T sum = coefficients[N - 1];

for (unsigned int n = N - 1; n > 0; n--) {
sum = ((T)coefficients[n - 1]) + x * sum;
}

return sum;
}

#ifdef __OPENCL_VERSION__
template <typename T, unsigned int N>
inline T horner_polynomial(const T &x, ABACUS_CONSTANT
typename TypeTraits<T>::ElementType *coefficients) {
T sum = coefficients[N - 1];

for (unsigned int n = N - 1; n > 0; n--) {
sum = ((T)coefficients[n - 1]) + x * sum;
}

return sum;
template <typename T, size_t N, typename TCoef>
inline T horner_polynomial(const T x, const TCoef (&coef)[N]) {
return horner_polynomial(x, coef, N);
}
#endif
} // namespace internal
} // namespace abacus

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -175,8 +175,9 @@ inline T lgamma_positive(const T &x) {
const SignedType cond = x > _intervals[i];
interval = __abacus_select(interval, i, cond);

const T poly = abacus::internal::horner_polynomial<T, 8>(
x - _lgamma_translation[i], __codeplay_lgamma_positive_coeff + i * 8);
const T poly = abacus::internal::horner_polynomial(
x - _lgamma_translation[i], __codeplay_lgamma_positive_coeff + i * 8,
8);

ans = __abacus_select(ans, poly, cond);
}
Expand Down Expand Up @@ -212,9 +213,9 @@ inline T lgamma_positive_half(const T &x) {
const SignedType cond = x > _intervals_half[i];
interval = __abacus_select(interval, i, cond);

const T poly = abacus::internal::horner_polynomial<T, 8>(
const T poly = abacus::internal::horner_polynomial(
x - _lgamma_translation_half[i],
__codeplay_lgamma_positive_coeff_half + i * 8);
__codeplay_lgamma_positive_coeff_half + i * 8, 8);

ans = __abacus_select(ans, poly, cond);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,10 @@ namespace {
// Aprroximation of log2(x+1) between [-0.25;0.5]
// See log2_extended_precision.sollya for derivation
// In order to gain performance we've dropped the first term of the polynomial
// out, 1.442689418792724609375, since it's in 32-bit precision and we'll
// manually add it in exactly by splitting into 16-bit values.
// out, since it's in 32-bit precision and we'll manually add it in exactly by
// splitting into 16-bit values.
static constexpr ABACUS_CONSTANT abacus_float
__codeplay_log2_extended_precision_coeffH0 = 1.442689418792724609375f;
static ABACUS_CONSTANT abacus_half
__codeplay_log2_extended_precision_coeffH[5] = {
-0.72119140625f16, 0.4814453125f16, -0.369384765625f16, 0.2919921875f16,
Expand Down Expand Up @@ -64,7 +66,7 @@ struct log2_extended_precision_helper<T, abacus_float> {
static T _(const T &xMant, T *out_remainder) {
T xMAnt1m = xMant - 1.0f;

T poly = abacus::internal::horner_polynomial<T, 16>(
T poly = abacus::internal::horner_polynomial(
xMAnt1m, __codeplay_log2_extended_precision_coeff);

T poly_times_x_lo;
Expand Down Expand Up @@ -147,7 +149,7 @@ T log2_extended_precision_half_unsafe(const T &x, T *ans_lo, T *xExp) {

// Approximate log2(x+1) with polynomial
xMant = xMant - 1.0f16;
T poly_start = abacus::internal::horner_polynomial<T, 5>(
T poly_start = abacus::internal::horner_polynomial(
xMant, __codeplay_log2_extended_precision_coeffH);

T poly_lo;
Expand Down Expand Up @@ -222,7 +224,7 @@ T log2_extended_precision_half_safe(const T &x, T *ans_lo, T *hiExp, T *loExp) {
// Approximate log2(x+1) with polynomial
xMant = xMant - 1.0f16;

T poly_start = abacus::internal::horner_polynomial<T, 5>(
T poly_start = abacus::internal::horner_polynomial(
xMant, __codeplay_log2_extended_precision_coeffH);

// Avoid creating denormal numbers in the lo components of exact add and
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ namespace internal {
template <typename T>
inline T log_extended_precision(const T &xMant, T *out_remainder) {
const T xMant1m = xMant - 1.0f;
const T poly = abacus::internal::horner_polynomial<T, 26>(
const T poly = abacus::internal::horner_polynomial(
xMant1m, __codeplay_natural_log_extended_precision_coeffD);

// We need xMant1m*(xMant1m*(xMant1m*(xMant1m*poly + 1/3) - 0.5) + 1)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ struct pow_unsafe_helper<T, abacus_half> {

// Now a normal exp2 should do the trick:
// We know that 0 <= y_times_log2X <= 1 so we can just use a polynomial
T result = abacus::internal::horner_polynomial<T, 6>(
T result = abacus::internal::horner_polynomial(
y_times_log2X, __codeplay_pow_unsafe_coeffH);

// We do the same trick as for __log2_extra_precision here, using some extra
Expand Down Expand Up @@ -238,8 +238,8 @@ struct pow_unsafe_helper<T, abacus_float> {
exponent_mantissa -= abacus::detail::cast::convert<T>(
abacus::internal::floor_unsafe(exponent_mantissa));

T result = abacus::internal::horner_polynomial<T, 8>(
exponent_mantissa, __codeplay_pow_unsafe_coeff);
T result = abacus::internal::horner_polynomial(exponent_mantissa,
__codeplay_pow_unsafe_coeff);

result = __abacus_ldexp(result, exponent_floor);

Expand Down Expand Up @@ -326,10 +326,9 @@ struct pow_unsafe_helper<T, abacus_double> {
exponent_floor += mantissa_trunc;
exponent_mantissa -= abacus::detail::cast::convert<T>(mantissa_trunc);

T result =
(T)1.0 + exponent_mantissa *
abacus::internal::horner_polynomial<T, 18>(
exponent_mantissa, __codeplay_pow_unsafe_coeffD);
T result = (T)1.0 + exponent_mantissa * abacus::internal::horner_polynomial(
exponent_mantissa,
__codeplay_pow_unsafe_coeffD);

result = __abacus_ldexp(
result, abacus::detail::cast::convert<IntVecType>(exponent_floor));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,9 @@ struct sincos_approx_helper<T, abacus_half> {
using UnsignedType = typename TypeTraits<T>::UnsignedType;

const T xx = x * x;
*out_cos = abacus::internal::horner_polynomial<T, 4>(xx, _sincos_coefcH);
*out_cos = abacus::internal::horner_polynomial(xx, _sincos_coefcH);

const T sin =
x * abacus::internal::horner_polynomial<T, 4>(xx, _sincos_coefsH);
const T sin = x * abacus::internal::horner_polynomial(xx, _sincos_coefsH);

// 0.151611328125 (0x30da) is only slightly above 2 ULP. It's faster to
// add a special case for this input using select instead of changing the
Expand All @@ -81,9 +80,9 @@ template <typename T>
struct sincos_approx_helper<T, abacus_float> {
static T _(const T &x, T *out_cos) {
const T xx = x * x;
*out_cos = abacus::internal::horner_polynomial<T, 4>(xx, _sincos_coefc);
*out_cos = abacus::internal::horner_polynomial(xx, _sincos_coefc);

return x * abacus::internal::horner_polynomial<T, 4>(xx, _sincos_coefs);
return x * abacus::internal::horner_polynomial(xx, _sincos_coefs);
}
};

Expand All @@ -92,9 +91,9 @@ template <typename T>
struct sincos_approx_helper<T, abacus_double> {
static T _(const T &x, T *out_cos) {
const T xx = x * x;
*out_cos = abacus::internal::horner_polynomial<T, 7>(xx, _sincos_coefcD);
*out_cos = abacus::internal::horner_polynomial(xx, _sincos_coefcD);

return x * abacus::internal::horner_polynomial<T, 7>(xx, _sincos_coefsD);
return x * abacus::internal::horner_polynomial(xx, _sincos_coefsD);
}
};
#endif // __CA_BUILTINS_DOUBLE_SUPPORT
Expand Down
6 changes: 3 additions & 3 deletions modules/compiler/builtins/abacus/source/abacus_math/acos.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,8 +156,8 @@ T acos(const T x) {
const SignedType cond = xAbs <= intervals[i];
interval = __abacus_select(interval, i, cond);

const T poly = abacus::internal::horner_polynomial<T, 4>(
i < 12 ? oneMinusXAbs : xAbs, polynomial + i * 4);
const T poly = abacus::internal::horner_polynomial(
i < 12 ? oneMinusXAbs : xAbs, polynomial + i * 4, 4);

ans = __abacus_select(ans, poly, cond);
}
Expand Down Expand Up @@ -217,7 +217,7 @@ T ABACUS_API acos_half(T x) {

xAbs = xAbs - T(1.0f16);
T ansBig =
xAbs * abacus::internal::horner_polynomial<T, 3>(xAbs, __codeplay_acos_1);
xAbs * abacus::internal::horner_polynomial(xAbs, __codeplay_acos_1);

ansBig = abacus::internal::sqrt(ansBig);

Expand Down
5 changes: 2 additions & 3 deletions modules/compiler/builtins/abacus/source/abacus_math/acosh.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ T acosh_half(const T x) {

const T small_return =
abacus::internal::sqrt(x - 1.0f16) *
abacus::internal::horner_polynomial<T, 3>(x - T(1.0f16), _acoshH);
abacus::internal::horner_polynomial(x - T(1.0f16), _acoshH);

ans = __abacus_select(ans, small_return, SignedType(x < T(2.0f16)));

Expand All @@ -86,8 +86,7 @@ template <>
abacus_half acosh_half(const abacus_half x) {
if (x < 2.0f16) {
return abacus::internal::sqrt(x - 1.0f16) *
abacus::internal::horner_polynomial<abacus_half, 3>(x - 1.0f16,
_acoshH);
abacus::internal::horner_polynomial(x - 1.0f16, _acoshH);
}

// As x > 11.7, acosh(x) = log(x + sqrt(x^2 - 1)) converges to
Expand Down
12 changes: 6 additions & 6 deletions modules/compiler/builtins/abacus/source/abacus_math/acospi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,10 @@ T acospi_half(const T x) {

// TODO maybe there's a way to pick the coefficients of the polynomials
// branchlessly, then this wouldn't branch
const T poly1 = abacus::internal::horner_polynomial<T, 3>(
x2, __codeplay_acospi_coeff_halfH2);
const T poly2 = abacus::internal::horner_polynomial<T, 3>(
xAbs, __codeplay_acospi_coeff_halfH1);
const T poly1 =
abacus::internal::horner_polynomial(x2, __codeplay_acospi_coeff_halfH2);
const T poly2 =
abacus::internal::horner_polynomial(xAbs, __codeplay_acospi_coeff_halfH1);

ans = xAbs * __abacus_select(poly1, poly2, cond1);

Expand All @@ -71,13 +71,13 @@ abacus_half acospi_half(const abacus_half x) {

if (xAbs > 5.9375E-1f16) {
xAbs = xAbs - 1.0f16;
ans = xAbs * abacus::internal::horner_polynomial<abacus_half, 3>(
ans = xAbs * abacus::internal::horner_polynomial(
xAbs, __codeplay_acospi_coeff_halfH1);

ans = abacus::internal::sqrt(ans);
} else {
const abacus_half x2 = x * x;
ans = xAbs * abacus::internal::horner_polynomial<abacus_half, 3>(
ans = xAbs * abacus::internal::horner_polynomial(
x2, __codeplay_acospi_coeff_halfH2);

ans = ans + 0.5f16;
Expand Down
14 changes: 6 additions & 8 deletions modules/compiler/builtins/abacus/source/abacus_math/asin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -161,12 +161,12 @@ T asin_half(const T x) {
// around x = 1 we want to estimate (asin(x) - pi/2)^2
SignedType xBig = (xAbs > T(5.9375E-1f16));
const T x2 = x * x;
T ans = x * abacus::internal::horner_polynomial<T, 3>(x2, __codeplay_asin_2);
T ans = x * abacus::internal::horner_polynomial(x2, __codeplay_asin_2);

xAbs = xAbs - T(1.0f16);

T ansBig =
xAbs * abacus::internal::horner_polynomial<T, 3>(xAbs, __codeplay_asin_1);
xAbs * abacus::internal::horner_polynomial(xAbs, __codeplay_asin_1);

ansBig = -abacus::internal::sqrt(ansBig) + ABACUS_PI_2_H;
ansBig = __abacus_copysign(ansBig, x);
Expand Down Expand Up @@ -197,17 +197,15 @@ abacus_half asin_half(const abacus_half x) {

// Estimate (asin(x + 1) - pi / 2)^2 (see solly script)
abacus_half ans =
xAbs * abacus::internal::horner_polynomial<abacus_half, 3>(
xAbs, __codeplay_asin_1);
xAbs * abacus::internal::horner_polynomial(xAbs, __codeplay_asin_1);

ans = -abacus::internal::sqrt(ans) + ABACUS_PI_2_H;
return __abacus_copysign(ans, x);
}

// Estimate the remaining values
const abacus_half x2 = x * x;
return x * abacus::internal::horner_polynomial<abacus_half, 3>(
x2, __codeplay_asin_2);
return x * abacus::internal::horner_polynomial(x2, __codeplay_asin_2);
}
} // namespace

Expand All @@ -229,8 +227,8 @@ T asin(const T x) {
const SignedType cond = xAbs < intervals[i];
interval = __abacus_select(interval, i, cond);

const T poly = abacus::internal::horner_polynomial<T, 5>(
i < 9 ? oneMinusXAbs : xAbs, __codeplay_asin_coeff + i * 5);
const T poly = abacus::internal::horner_polynomial(
i < 9 ? oneMinusXAbs : xAbs, __codeplay_asin_coeff + i * 5, 5);

ans = __abacus_select(ans, poly, cond);
}
Expand Down
Loading

0 comments on commit f732b95

Please sign in to comment.