fix(unary-cuda): improve acos/reciprocal/tan numeric correctness

PanZezhong1725 · PanZezhong1725 · commit f43dc029ad9c · 2026-04-08T09:11:05.000Z
diff --git a/src/infiniop/ops/acos/cuda/kernel.cuh b/src/infiniop/ops/acos/cuda/kernel.cuh
@@ -7,17 +7,6 @@
 
 namespace op::acos::cuda {
 
-// ----------------------
-// Fast acos approximation
-// ----------------------
-__device__ __forceinline__ float fast_acosf(float x) {
-    // 高性能多项式近似 acos(x)
-    float ax = fabsf(x);
-    float t = sqrtf(1.0f - ax);
-    float r = ((-0.0187293f * ax + 0.0742610f) * ax - 0.2121144f) * ax + 1.5707288f;
-    return (x >= 0.0f ? t * r : 3.14159265358979323846f - t * r);
-}
-
 // ----------------------
 // float kernel (F32)
 // ----------------------
@@ -26,39 +15,27 @@ __device__ __forceinline__ T acos_impl(T val);
 
 template <>
 __device__ __forceinline__ float acos_impl<float>(float val) {
-    return fast_acosf(val);
+    return ::acosf(val);
 }
 
 // ----------------------
 // half kernel (F16)
 // ----------------------
 template <>
 __device__ __forceinline__ half acos_impl<half>(half val) {
-#if (__CUDA_ARCH__ >= 530)
-    float f = __half2float(val);
-    return __float2half(fast_acosf(f));
-#else
     float f = __half2float(val);
-    return __float2half(fast_acosf(f));
-#endif
+    return __float2half(::acosf(f));
 }
 
 // ----------------------
 // half2 kernel (F16x2 vectorized)
 // ----------------------
 template <>
 __device__ __forceinline__ half2 acos_impl<half2>(half2 val) {
-#if (__CUDA_ARCH__ >= 530)
     float2 f = __half22float2(val);
-    f.x = fast_acosf(f.x);
-    f.y = fast_acosf(f.y);
+    f.x = ::acosf(f.x);
+    f.y = ::acosf(f.y);
     return __float22half2_rn(f);
-#else
-    float2 f = __half22float2(val);
-    f.x = fast_acosf(f.x);
-    f.y = fast_acosf(f.y);
-    return __float22half2_rn(f);
-#endif
 }
 
 // ----------------------
@@ -67,15 +44,20 @@ __device__ __forceinline__ half2 acos_impl<half2>(half2 val) {
 template <>
 __device__ __forceinline__ cuda_bfloat16 acos_impl<cuda_bfloat16>(cuda_bfloat16 val) {
     float f = __bfloat162float(val);
-    return __float2bfloat16(fast_acosf(f));
+    return __float2bfloat16(::acosf(f));
+}
+
+template <>
+__device__ __forceinline__ double acos_impl<double>(double val) {
+    return ::acos(val);
 }
 
 // ----------------------
 // Fallback kernel
 // ----------------------
 template <typename T>
 __device__ __forceinline__ T acos_impl(T val) {
-    return static_cast<T>(fast_acosf(static_cast<float>(val)));
+    return static_cast<T>(::acos(static_cast<double>(val)));
 }
 
 // ----------------------
diff --git a/src/infiniop/ops/reciprocal/cuda/kernel.cuh b/src/infiniop/ops/reciprocal/cuda/kernel.cuh
@@ -1,22 +1,25 @@
 #ifndef __RECIPROCAL_CUDA_H__
 #define __RECIPROCAL_CUDA_H__
 
+#include <type_traits>
+
 namespace op::reciprocal::cuda {
 typedef struct ReciprocalOp {
 public:
     static constexpr size_t num_inputs = 1;
     template <typename T>
     __device__ __forceinline__ T operator()(const T &x) const {
         if constexpr (std::is_same_v<T, half2>) {
-            return h2rcp(x);
+            float2 vf = __half22float2(x);
+            vf.x = 1.0f / vf.x;
+            vf.y = 1.0f / vf.y;
+            return __float22half2_rn(vf);
         } else if constexpr (std::is_same_v<T, half>) {
-            return hrcp(x);
+            return __float2half(1.0f / __half2float(x));
         } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
-            // bfloat16 does not have a direct hrcp intrinsic in some versions,
-            // often handled by converting to float or using specific bf16 intrinsics
-            return __float2bfloat16(1.0f / __bfloat162float(x));
+            return __float2bfloat16_rn(1.0f / __bfloat162float(x));
         } else if constexpr (std::is_same_v<T, float>) {
-            return __frcp_rd(x);
+            return 1.0f / x;
         } else {
             return static_cast<T>(1) / x;
         }
diff --git a/src/infiniop/ops/tan/cuda/kernel.cuh b/src/infiniop/ops/tan/cuda/kernel.cuh
@@ -1,26 +1,40 @@
 #ifndef __TAN_CUDA_H__
 #define __TAN_CUDA_H__
 
+#include <cmath>
+#include <type_traits>
+
 namespace op::tan::cuda {
 
 typedef struct TanOp {
 public:
     static constexpr size_t num_inputs = 1;
     template <typename T>
     __device__ __forceinline__ T operator()(const T &x) const {
-        if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+        if constexpr (std::is_same_v<T, half2>) {
+            float2 vf = __half22float2(x);
+            vf.x = ::tanf(vf.x);
+            vf.y = ::tanf(vf.y);
+            return __float22half2_rn(vf);
+        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
+            float f0 = __bfloat162float(__low2bfloat16(x));
+            float f1 = __bfloat162float(__high2bfloat16(x));
+            return __floats2bfloat162_rn(::tanf(f0), ::tanf(f1));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
             // BF16
             const float x_f = __bfloat162float(x);
-            return __float2bfloat16(__tanf(x_f));
+            return __float2bfloat16_rn(::tanf(x_f));
         } else if constexpr (std::is_same_v<T, half>) {
             // FP16
             const float x_f = __half2float(x);
-            return __float2half(__tanf(x_f));
+            return __float2half(::tanf(x_f));
         } else if constexpr (std::is_same_v<T, float>) {
             // FP32
-            return __tanf(x);
+            return ::tanf(x);
+        } else if constexpr (std::is_same_v<T, double>) {
+            return ::tan(x);
         } else {
-            return __tanf(x);
+            return static_cast<T>(::tan(static_cast<double>(x)));
         }
     }
 } TanOp;