Complex asinh accuracy refinement #6428
Open
s-oboyle wants to merge 22 commits into NVIDIA:main from s-oboyle:complex_asinh_accuracy_refinement
+329 −20
Commits (22)
1401314  moving machine (s-oboyle)
4469c9d  Merge branch 'NVIDIA:main' into complex_asinh_accuracy_refinement (s-oboyle)
8b659d0  Comment cleanup (s-oboyle)
c16cd26  clang-format (s-oboyle)
99c51dc  More const's, comment cleanup (s-oboyle)
63898de  Remove unneeded headers. May need to add some back in when the roots.… (s-oboyle)
962f48a  Merge branch 'NVIDIA:main' into complex_asinh_accuracy_refinement (s-oboyle)
7844a7e  re-enable header error (s-oboyle)
08f5133  Add noexcept to internal function (s-oboyle)
a8c2333  spell-check (s-oboyle)
c24b96d  Apply suggestions from code review (s-oboyle)
bce04ea  Update libcudacxx/include/cuda/std/__complex/inverse_hyperbolic_funct… (s-oboyle)
d69eafc  Updated all constant initialization (s-oboyle)
9ad1b20  Update libcudacxx/include/cuda/std/__complex/inverse_hyperbolic_funct… (s-oboyle)
7996458  More non-cuda compiler guards (s-oboyle)
55c63ad  Update libcudacxx/include/cuda/std/__complex/inverse_hyperbolic_funct… (s-oboyle)
7b6aec1  Changed return method for extended_sqrt (s-oboyle)
8f447e8  Added headers back (s-oboyle)
9629a78  Merge branch 'main' into complex_asinh_accuracy_refinement (s-oboyle)
ba49bcb  removed noexcept in favour of _CCCL_FORCEINLINE (s-oboyle)
590322c  last commit does not work, switched back to using inline and noexcept… (s-oboyle)
2d4ad99  Merge branch 'main' into complex_asinh_accuracy_refinement (s-oboyle)
Changes from all commits (libcudacxx/include/cuda/std/__complex/inverse_hyperbolic_funct…):

```diff
@@ -39,51 +39,360 @@
 _CCCL_BEGIN_NAMESPACE_CUDA_STD

+// Specialization of 1/sqrt(x), uses device-only rsqrtf/rsqrt when viable.
+// Sometimes compilers can optimize 1/sqrt(x) so we don't suffer the slowdown of the divide.
+// It can happen that the double 1/sqrt() on device gets optimized better than CUDA rsqrt(),
+// but it depends on the calling function. We needs to optimize on an individual basis.
+template <class _Tp>
+[[nodiscard]] _CCCL_API inline _Tp __internal_rsqrt_inverse_hyperbloic(_Tp __x) noexcept
+{
+#if _CCCL_CUDA_COMPILATION()
+  if constexpr (is_same_v<_Tp, float>)
+  {
+    NV_IF_TARGET(NV_IS_DEVICE, (return ::rsqrtf(__x);))
+  }
+  if constexpr (is_same_v<_Tp, double>)
+  {
+    NV_IF_TARGET(NV_IS_DEVICE, (return ::rsqrt(__x);))
+  }
+#endif // _CCCL_CUDA_COMPILATION()
+  return _Tp{1} / ::cuda::std::sqrt(__x);
+}
```
```diff
+template <class _Tp>
+struct _CCCL_ALIGNAS(2 * sizeof(_Tp)) __cccl_asinh_sqrt_return_hilo
+{
+  _Tp __hi;
+  _Tp __lo;
+};
+
+// An unsafe sqrt(_Tp + _Tp) extended precision sqrt.
+template <typename _Tp>
+static _CCCL_API inline __cccl_asinh_sqrt_return_hilo<_Tp>
+__internal_double_Tp_sqrt_unsafe(_Tp __hi, _Tp __lo) noexcept
+{
+  // rsqrt
+  const _Tp __initial_guess = __internal_rsqrt_inverse_hyperbloic<_Tp>(__hi);
+
+  // Newton-Raphson, assume we have a reasonable rsqrt above and
+  // that we don't need to update the first term (__initial_guess).
+  // x_(n+1) = x_n - 0.5*x_n*((__hi + __lo)(x_n^2) - 1)
+
+  // __initial_guess^2:
+  const _Tp __init_sq_hi = __initial_guess * __initial_guess;
+  const _Tp __init_sq_lo = ::cuda::std::fma(__initial_guess, __initial_guess, -__init_sq_hi);
+
+  // Times (__hi + __lo).
+  // We need to add the -1.0 here in an fma, or different compilers (eg host vs device)
+  // optimize differently and give (sometimes very) different results.
+  const _Tp _hi_hi_hi = ::cuda::std::fma(__hi, __init_sq_hi, _Tp{-1.0});
+  // Low part not needed for fp32/fp64, but might be if this is extended to other types:
+  // _Tp _hi_hi_lo = fma(__hi, __init_sq_hi, -_hi_hi_hi);
+
+  // Add all terms
+  const _Tp __full_term = _hi_hi_hi + (::cuda::std::fma(__lo, __init_sq_hi, __hi * __init_sq_lo) /*+ _hi_hi_lo*/);
+
+  const _Tp __correction_term = _Tp(-0.5) * __initial_guess * __full_term;
+
+  // rsqrt(hi + lo) is now estimated well by (__initial_guess + __correction_term)
+  // Multiply everything by (hi + lo) to get sqrt(hi + lo)
+  const _Tp __ans_hi_hi = __hi * __initial_guess;
+  _Tp __ans_hi_lo = ::cuda::std::fma(__hi, __initial_guess, -__ans_hi_hi);
+
+  // All terms needed, allow the compiler to pick which way to
+  // optimize this to fma, same accuracy.
+  __ans_hi_lo += __initial_guess * __lo + __correction_term * __hi;
+
+  __cccl_asinh_sqrt_return_hilo<_Tp> __retval_hilo;
+  __retval_hilo.__hi = __ans_hi_hi;
+  __retval_hilo.__lo = __ans_hi_lo;
+
+  return __retval_hilo;
+}
```
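The helper above applies one Newton-Raphson correction to a hardware rsqrt estimate and then multiplies back by the input to get a square root with a hi/lo error term. A minimal host-only sketch of that refinement step, using hypothetical standalone names rather than the PR's `__internal_double_Tp_sqrt_unsafe`, might look like:

```cpp
#include <cmath>
#include <cstdio>

// Illustrative only: one Newton-Raphson step for 1/sqrt(a), where a = hi + lo
// is carried as an unevaluated double-double pair.
double refined_sqrt(double hi, double lo)
{
  const double y     = 1.0 / std::sqrt(hi);        // initial guess for rsqrt(hi)
  const double y2_hi = y * y;                      // y^2, high part
  const double y2_lo = std::fma(y, y, -y2_hi);     // y^2, rounding error recovered via fma
  // r = (hi + lo) * y^2 - 1, the residual of the rsqrt estimate
  const double r = std::fma(hi, y2_hi, -1.0) + std::fma(lo, y2_hi, hi * y2_lo);
  const double y_refined = y - 0.5 * y * r;        // x_(n+1) = x_n - 0.5*x_n*r
  return hi * y_refined + lo * y_refined;          // sqrt(a) ~= a * rsqrt(a)
}

int main()
{
  std::printf("%.17g\n", refined_sqrt(2.0, 1e-17));
}
```

The `fma` calls recover the rounding error of each product, which is what lets the low-order term survive the refinement.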
```diff
 // asinh

 template <class _Tp>
 [[nodiscard]] _CCCL_API inline complex<_Tp> asinh(const complex<_Tp>& __x)
 {
-  constexpr _Tp __pi = __numbers<_Tp>::__pi();
-  if (::cuda::std::isinf(__x.real()))
+  // Uint of the same size as our fp type.
+  using __uint_t = __fp_storage_of_t<_Tp>;
+
+  constexpr int32_t __mant_nbits = __fp_mant_nbits_v<__fp_format_of_v<_Tp>>;
+  constexpr int32_t __exp_max = __fp_exp_max_v<__fp_format_of_v<_Tp>>;
+
+  constexpr _Tp __pi = __numbers<_Tp>::__pi();
+  constexpr _Tp __ln2 = __numbers<_Tp>::__ln2();
+
+  _Tp __realx = ::cuda::std::fabs(__x.real());
+  _Tp __imagx = ::cuda::std::fabs(__x.imag());
+
+  // Special cases that do not pass through:
+  if (!::cuda::std::isfinite(__realx) || !::cuda::std::isfinite(__imagx))
   {
-    if (::cuda::std::isnan(__x.imag()))
+    // If z is (x,+inf) (for any positive finite x), the result is (+inf, pi/2)
+    if (::cuda::std::isfinite(__realx) && ::cuda::std::isinf(__imagx))
     {
-      return __x;
+      return complex<_Tp>(::cuda::std::copysign(numeric_limits<_Tp>::infinity(), __x.real()),
+                          ::cuda::std::copysign(_Tp(0.5) * __pi, __x.imag()));
     }
-    if (::cuda::std::isinf(__x.imag()))
+
+    // If z is (+inf,y) (for any positive finite y), the result is (+inf,+0)
+    if (::cuda::std::isinf(__realx) && ::cuda::std::isfinite(__imagx))
     {
-      return complex<_Tp>(__x.real(), ::cuda::std::copysign(__pi * _Tp(0.25), __x.imag()));
+      return complex<_Tp>(::cuda::std::copysign(numeric_limits<_Tp>::infinity(), __x.real()),
+                          ::cuda::std::copysign(_Tp(0), __x.imag()));
     }
-    return complex<_Tp>(__x.real(), ::cuda::std::copysign(_Tp(0), __x.imag()));
-  }
-  if (::cuda::std::isnan(__x.real()))
-  {
-    if (::cuda::std::isinf(__x.imag()))
+
+    // If z is (+inf,+inf), the result is (+inf, pi/4)
+    if (::cuda::std::isinf(__realx) && ::cuda::std::isinf(__imagx))
     {
-      return complex<_Tp>(__x.imag(), __x.real());
+      return complex<_Tp>(::cuda::std::copysign(numeric_limits<_Tp>::infinity(), __x.real()),
+                          ::cuda::std::copysign(_Tp(0.25) * __pi, __x.imag()));
     }
-    if (__x.imag() == _Tp(0))
+
+    // If z is (+inf,NaN), the result is (+inf,NaN)
+    if (::cuda::std::isinf(__realx) && ::cuda::std::isnan(__imagx))
     {
       return __x;
     }
-    return complex<_Tp>(__x.real(), __x.real());
+
+    // If z is (NaN,+0), the result is (NaN,+0)
+    if (::cuda::std::isnan(__realx) && (__imagx == _Tp{0}))
+    {
+      return __x;
+    }
+
+    // If z is (NaN,+inf), the result is (±INF,NaN) (the sign of the real part is unspecified)
+    if (::cuda::std::isnan(__realx) && ::cuda::std::isinf(__imagx))
+    {
+      return complex<_Tp>(__x.imag(), numeric_limits<_Tp>::quiet_NaN());
+    }
   }
-  if (::cuda::std::isinf(__x.imag()))
+
+  // Special case that for various reasons does not pass
+  // easily through the algorithm below:
+  if ((__realx == _Tp{0}) && (__imagx == _Tp(1)))
   {
-    return complex<_Tp>(::cuda::std::copysign(__x.imag(), __x.real()),
-                        ::cuda::std::copysign(__pi / _Tp(2), __x.imag()));
+    return complex<_Tp>(__x.real(), ::cuda::std::copysign(_Tp{0.5} * __pi, __x.imag()));
   }
-  complex<_Tp> __z = ::cuda::std::log(__x + ::cuda::std::sqrt(::cuda::std::__sqr(__x) + _Tp(1)));
-  return complex<_Tp>(::cuda::std::copysign(__z.real(), __x.real()), ::cuda::std::copysign(__z.imag(), __x.imag()));
```
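In terms of the quantities the code below constructs, the computation being refined is the textbook identity, evaluated so that the real part goes through `log1p` and the imaginary part through `atan2` (this is a summary of the code that follows, not an addition to it):

$$
\operatorname{asinh} z = \log\!\bigl(z + \sqrt{z^2 + 1}\bigr),\qquad
\operatorname{Re} = \tfrac{1}{2}\,\mathrm{log1p}\!\Bigl(\bigl|z + \sqrt{z^2+1}\bigr|^2 - 1\Bigr),\qquad
\operatorname{Im} = \operatorname{atan2}\!\Bigl(\operatorname{Im} z + \operatorname{Im}\sqrt{z^2+1},\; \operatorname{Re} z + \operatorname{Re}\sqrt{z^2+1}\Bigr).
$$

All of the work is done on `(__realx, __imagx) = (|Re z|, |Im z|)` and the signs are copied back at the very end.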
```diff
+  // It is a little involved to account for large inputs in an inlined-into-existing-code fashion.
+  // The easiest place to account for large values appears to be here at the start.
+  // Use asinh(x) ~ log(2x) for large x.
+  // Get the largest exponent that passes through the algorithm without issue.
+  // This is ~(max_exponent / 4), with a small bias to make sure edge cases get caught
+  // ~254 for double, ~30 for float
+  constexpr int32_t __max_allowed_exponent = (__exp_max / 4) - 2;
+  constexpr __uint_t __max_allowed_val_as_uint = __uint_t(__max_allowed_exponent + __exp_max) << __mant_nbits;
+
+  // Check if the largest component of __x is > 2^__max_allowed_exponent:
+  _Tp __x_big_factor = _Tp{0};
+  const _Tp __max = (__realx > __imagx) ? __realx : __imagx;
+  const bool __x_big = ::cuda::std::bit_cast<__uint_t>(__max) > __max_allowed_val_as_uint;
+
+  if (__x_big)
+  {
+    // We need __max to be <= ~(2^__max_allowed_exponent),
+    // but not small enough that the asinh(x) ~ log(2x) estimate does
+    // not break down. We are not able to reduce this with a single simple reduction,
+    // so we do a fast/inlined frexp/ldexp:
+    const int32_t __exp_biased = int32_t(::cuda::std::bit_cast<__uint_t>(__max) >> __mant_nbits);
+
+    // Get a factor such that (__max * __exp_mul_factor) <= __max_allowed_exponent
+    const __uint_t __exp_reduce_factor =
+      __uint_t((2 * __exp_max) + __max_allowed_exponent - __exp_biased) << __mant_nbits;
+    const _Tp __exp_mul_factor = ::cuda::std::bit_cast<_Tp>(__exp_reduce_factor);
+
+    // Scale down to a working range.
+    __realx *= __exp_mul_factor;
+    __imagx *= __exp_mul_factor;
+
+    __x_big_factor = _Tp((__exp_biased - __exp_max) - __max_allowed_exponent) * __ln2;
+  }
```
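The reduction above leans on the standard large-argument approximation. With the input scaled by $2^{-k}$ (the `__exp_mul_factor` built from exponent bits), the missing $k \ln 2$ is restored later through `__x_big_factor`:

$$
\operatorname{asinh} z = \ln\!\bigl(z + \sqrt{z^2+1}\bigr) \approx \ln(2z)
= \ln\!\bigl(2 \cdot 2^{-k} z\bigr) + k \ln 2
\qquad (|z| \gg 1).
$$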
```diff
+  // let compiler pick which way to fma this, accuracy stays the same.
+  const _Tp __diffx_m1 = __realx * __realx - (__imagx - _Tp{1}) * (__imagx + _Tp{1});
+
+  // Get the real and imag parts of |sqrt(z^2 + 1)|^2
+  // This equates to calculating:
+  //   sqrt((re*re + (im + 1.0)*(im + 1.0))*(re*re + (im - 1.0)*(im - 1.0)));
+  // Where we need both the term inside the sqrt in extended precision, as well
+  // as evaluation the sqrt itself in extended precision.
+
+  // Get re^2 + im^2 + 1 in extended precision.
+  // The low part of re^2 doesn't seem to matter.
+  const _Tp __imagx_sq_hi = __imagx * __imagx;
+  const _Tp __imagx_sq_lo = ::cuda::std::fma(__imagx, __imagx, -__imagx_sq_hi);
+
+  const _Tp __x_abs_sq_hi = __imagx_sq_hi;
+  const _Tp __x_abs_sq_lo = ::cuda::std::fma(__realx, __realx, __imagx_sq_lo);
+
+  // Add one:
+  const _Tp __x_abs_sq_p1_hi = (__x_abs_sq_hi + _Tp{1});
+  const _Tp __x_abs_sq_p1_lo = __x_abs_sq_lo - ((__x_abs_sq_p1_hi - _Tp{1}) - __x_abs_sq_hi);
+
+  // square:
+  const _Tp __x_abs_sq_p1_sq_hi = __x_abs_sq_p1_hi * __x_abs_sq_p1_hi;
+  _Tp __x_abs_sq_p1_sq_lo = ::cuda::std::fma(__x_abs_sq_p1_hi, __x_abs_sq_p1_hi, -__x_abs_sq_p1_sq_hi);
+
+  // Add in the lower square terms, all needed
+  __x_abs_sq_p1_sq_lo =
+    ::cuda::std::fma(__x_abs_sq_p1_lo, (_Tp{2} * __x_abs_sq_p1_hi + __x_abs_sq_p1_lo), __x_abs_sq_p1_sq_lo);
+
+  // Get __x_abs_sq_p1_sq_hi/lo - 4.0*__imagx_sq_hi/lo:
+  // Subtract high parts:
+  _Tp __inner_most_term_hi = __x_abs_sq_p1_sq_hi - _Tp{4} * __imagx_sq_hi;
+  _Tp __inner_most_term_lo = ((__x_abs_sq_p1_sq_hi - __inner_most_term_hi) - _Tp{4} * __imagx_sq_hi);
+  // lo parts, all needed:
+  __inner_most_term_lo += __x_abs_sq_p1_sq_lo - _Tp{4} * __imagx_sq_lo;
```
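The hi/lo pairs above follow the usual error-free transformation idioms. A self-contained sketch with hypothetical helper names (not part of the patch) of the two building blocks involved:

```cpp
#include <cmath>
#include <cstdio>

// hi + lo reconstructs the exact result of the operation below.
struct hilo { double hi, lo; };

// Exact product: lo is the rounding error of the rounded product a * b.
hilo two_prod(double a, double b)
{
  const double hi = a * b;
  const double lo = std::fma(a, b, -hi);
  return {hi, lo};
}

// Exact sum (Fast2Sum), valid assuming |a| >= |b|.
hilo fast_two_sum(double a, double b)
{
  const double hi = a + b;
  const double lo = b - (hi - a);
  return {hi, lo};
}

int main()
{
  const hilo sq = two_prod(1.0000001, 1.0000001);   // like __imagx_sq_hi/lo
  const hilo p1 = fast_two_sum(sq.hi, 1.0);         // like the "Add one" step
  std::printf("%.17g %.17g %.17g %.17g\n", sq.hi, sq.lo, p1.hi, p1.lo);
}
```

Every `fma(a, b, -a*b)` in the diff is the `two_prod` error term; the additions recover their error terms in the same recomputed-difference style.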
```diff
+  // We can have some slightly bad cases here due to catastrohip cancellation that can't be fixed easily.
+  // We still need to to the extended-sqrt on these values, so we fix them now.
+  // It occurs around "real ~= small" and "imag ~= (1 - small)", and imag < 1.
+  // Worked out through targeted testing on fp64 and fp32.
+  _Tp __realx_small_bound = _Tp{1.0e-13};
+  _Tp __imagx_close_bound = _Tp{0.98};
+
+  if constexpr (is_same_v<_Tp, float>)
+  {
+    __realx_small_bound = _Tp{1.0e-5f};
+    __imagx_close_bound = _Tp{0.9f};
+  }
+
+  if ((__realx < __realx_small_bound) && (__imagx_close_bound < __imagx) && (__imagx <= _Tp{1}))
+  {
+    // Get (real^2 + (1 - imag)^2) * (real^2 + (1 + imag)^2) in double-double:
+    // term1 = (real^2 + (1 - imag)^2)
+    const _Tp __term1_hi = (_Tp{1} - __imagx) * (_Tp{1} - __imagx);
+    const _Tp __term1_lo = ::cuda::std::fma(_Tp{1} - __imagx, _Tp{1} - __imagx, -__term1_hi) + __realx * __realx;
+
+    // Need (1.0 + __imagx)^2 with nearly full accuracy.
+    const _Tp __term2_sum_hi = (_Tp{1} + __imagx);
+    const _Tp __term2_sum_lo = ((_Tp{1} - __term2_sum_hi) + __imagx);
+
+    const _Tp __term2_sq_hi = __term2_sum_hi * __term2_sum_hi;
+    _Tp __term2_sq_lo = ::cuda::std::fma(__term2_sum_hi, __term2_sum_hi, -__term2_sq_hi);
+    __term2_sq_lo += _Tp{2} * __term2_sum_hi * __term2_sum_lo;
+
+    // Multiple __term1_hi/lo and __term2_sq_hi/lo:
+    __inner_most_term_hi = __term1_hi * __term2_sq_hi;
+    __inner_most_term_lo = ::cuda::std::fma(__term1_hi, __term2_sq_hi, -__inner_most_term_hi);
+    // All needed:
+    __inner_most_term_lo += __term1_hi * __term2_sq_lo + __term1_lo * __term2_sq_hi;
+  }
```
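The special region exists because the general form and the factored form of the same quantity behave very differently near $re \approx 0$, $im \approx 1$. Both branches compute

$$
\bigl|z^2+1\bigr|^2
= \bigl(re^2 + im^2 + 1\bigr)^2 - 4\,im^2
= \bigl(re^2 + (1-im)^2\bigr)\bigl(re^2 + (1+im)^2\bigr),
$$

but the first expression loses almost all significant bits to cancellation in that corner, so the code switches to the factored product there.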
```diff
+  // Normalize the above (assumed in the extended-sqrt function):
+  const _Tp __norm_hi = __inner_most_term_hi + __inner_most_term_lo;
+  const _Tp __norm_lo = -((__norm_hi - __inner_most_term_hi) - __inner_most_term_lo);
+  __inner_most_term_hi = __norm_hi;
+  __inner_most_term_lo = __norm_lo;
+
+  // Extended sqrt function:
+  // (__extended_sqrt_hi + __extended_sqrt_lo) = sqrt(__inner_most_term_hi + __inner_most_term_lo)
+  const __cccl_asinh_sqrt_return_hilo<_Tp> __extended_sqrt_hilo =
+    __internal_double_Tp_sqrt_unsafe<_Tp>(__inner_most_term_hi, __inner_most_term_lo);
+
+  _Tp __extended_sqrt_hi = __extended_sqrt_hilo.__hi;
+  _Tp __extended_sqrt_lo = __extended_sqrt_hilo.__lo;
+
+  // 0.0, and some very particular values, do not survive this unsafe sqrt function.
+  // This case occurs when (1 + x^2) is zero or denormal. (and rsqrt(x)*rsqrt(x) become inf).
+  constexpr __uint_t __min_normal_bits = __uint_t(0x1) << __mant_nbits;
+  const _Tp __min_normal = ::cuda::std::bit_cast<_Tp>(__min_normal_bits);
+
+  if (__inner_most_term_hi <= _Tp{2} * __min_normal)
+  {
+    __extended_sqrt_hi = _Tp{2} * __realx;
+    __extended_sqrt_lo = _Tp{0};
+  }
+
+  // Get sqrt(0.5*(__extended_sqrt_hi + __diffx_m1))
+  // This can result in catastrophic cancellation if __diffx_m1 < 0, in this case
+  // We instead use the equivalent
+  //   (__realx*__imagx) / sqrt(0.5*(__extended_sqrt_hi - __diffx_m1))
+
+  const _Tp __inside_sqrt_term = _Tp{0.5} * (::cuda::std::fabs(__diffx_m1) + __extended_sqrt_hi);
+
+  // Allow for rsqrt optimization:
+  // We can have two slightly different paths depending on whether rsqrt is available
+  // or not, aka are we on device or host.
+
+  const _Tp __recip_sqrt = __internal_rsqrt_inverse_hyperbloic<_Tp>(__inside_sqrt_term);
+  _Tp __pos_evaluation_real;
+
+  // This reuses the sqrt calculated on CPU already in __recip_sqrt,
+  // And gets sqrt quickly on device using the rsqrt already calculated.
+#if _CCCL_CUDA_COMPILATION()
+  NV_IF_ELSE_TARGET(NV_IS_DEVICE,
+                    (__pos_evaluation_real = (__recip_sqrt * __inside_sqrt_term);),
+                    (__pos_evaluation_real = ::cuda::std::sqrt(__inside_sqrt_term);))
+#else
+  __pos_evaluation_real = ::cuda::std::sqrt(__inside_sqrt_term);
+#endif // _CCCL_CUDA_COMPILATION()
+
+  // Here, in a happy coincidence(?), we happen to intermediately calculate an accurate
+  // return value for the real part of the answer in the case that __realx is small,
+  // as you would obtain from the Taylor expansion of asinh. (~ real/sqrt(1 - imag^2)).
+  // The following parts of the calculation result in bad catastrophic cancellation for
+  // this case, so we save this intermediate value:
+  const _Tp __small_x_real_return_val = __realx * __recip_sqrt;
+  const _Tp __pos_evaluation_imag = __imagx * __small_x_real_return_val;
+
+  const _Tp __sqrt_real_part = (__diffx_m1 > _Tp{0}) ? __pos_evaluation_real : __pos_evaluation_imag;
+  const _Tp __sqrt_imag_part = (__diffx_m1 > _Tp{0}) ? __pos_evaluation_imag : __pos_evaluation_real;
```
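The two candidate evaluations implement the usual real/imaginary decomposition of a complex square root, written here for $w = z^2 + 1$ (so $\operatorname{Re} w$ is `__diffx_m1` and $\operatorname{Im} w = 2\,re\cdot im$):

$$
\operatorname{Re}\sqrt{w} = \sqrt{\tfrac{1}{2}\bigl(|w| + \operatorname{Re} w\bigr)},\qquad
\operatorname{Im}\sqrt{w} = \frac{\operatorname{Im} w}{2\operatorname{Re}\sqrt{w}} = \frac{re\cdot im}{\operatorname{Re}\sqrt{w}},
$$

with the roles swapped when $\operatorname{Re} w < 0$, so the square root is always taken of an addition rather than a cancellation-prone subtraction.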
```diff
+  // For an accurate log, we calculate |(__sqrt_real_part + i*__sqrt_imag_part)| - 1 and use log1p.
+  // This can normally have bad catastrophic cancellation, however
+  // we have a lot of retained enough accuracy to subtract fairly simply:
+  const _Tp __m1 = __extended_sqrt_hi - _Tp{1};
+  const _Tp __rem = -((__m1 + _Tp{1}) - __extended_sqrt_hi);
+
+  __extended_sqrt_hi = __m1;
+  __extended_sqrt_lo += __rem;
+
+  // Final sum before sending it to log1p, all terms needed.
+  // Add our sum via three terms, adding equally sized components.
+  const _Tp __sum1 = (__x_abs_sq_hi + __extended_sqrt_hi);
+  const _Tp __sum2 = _Tp{2} * (__realx * __sqrt_real_part + __imagx * __sqrt_imag_part);
+  const _Tp __sum3 = (__extended_sqrt_lo + __x_abs_sq_lo);
+
+  const _Tp __abs_sqrt_part_sq = __sum1 + (__sum2 + __sum3);
+
+  const _Tp __atan2_input1 = __imagx + __sqrt_imag_part;
+  const _Tp __atan2_input2 = __realx + __sqrt_real_part;
+
+  _Tp __ans_real = _Tp{0.5} * ::cuda::std::log1p(__abs_sqrt_part_sq);
+  const _Tp __ans_imag = ::cuda::std::atan2(__atan2_input1, __atan2_input2);
+
+  // The small |real| case, as mentioned above.
+  // Bounds found by testing.
+  _Tp __realx_small_bound_override = _Tp{2.220446e-16};
+
+  if constexpr (is_same_v<_Tp, float>)
+  {
+    __realx_small_bound_override = _Tp{6.0e-08f};
+  }
+
+  if (__realx < __realx_small_bound_override && __imagx < _Tp(1))
+  {
+    __ans_real = __small_x_real_return_val;
+  }
+
+  __ans_real += __x_big_factor;
+
+  // Copy signs back in
+  return complex<_Tp>(::cuda::std::copysign(__ans_real, __x.real()), ::cuda::std::copysign(__ans_imag, __x.imag()));
+}
```
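A small host-side spot check of the kind of input the patch targets; this is illustrative only, assumes the CCCL headers are on the include path, and is not part of the PR or its test suite:

```cpp
#include <cuda/std/complex>
#include <complex>
#include <cstdio>

int main()
{
  // A point near the cancellation-prone region real ~ 0, imag ~ 1.
  const cuda::std::complex<double> z{1.0e-20, 0.999999999};
  const auto r = cuda::std::asinh(z);

  // Long-double std::asinh as a rough reference for comparison.
  const auto ref = std::asinh(std::complex<long double>{1.0e-20L, 0.999999999L});

  std::printf("cuda::std : %.17g %.17g\n", r.real(), r.imag());
  std::printf("reference : %.17Lg %.17Lg\n", ref.real(), ref.imag());
}
```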
```diff
 // We have performance issues with some trigonometric functions with extended floating point types
 #if _LIBCUDACXX_HAS_NVBF16()
 template <>
 _CCCL_API inline complex<__nv_bfloat16> asinh(const complex<__nv_bfloat16>& __x)
 {
-  return complex<__nv_bfloat16>{::cuda::std::asinh(complex<float>{__x})};
+  const complex<__nv_bfloat16> ans = complex<__nv_bfloat16>{::cuda::std::asinh(complex<float>{__x})};
+
+  return ans;
 }
 #endif // _LIBCUDACXX_HAS_NVBF16()
```

Comment on lines +393 to +395: What is this needed for?