Use faster approximations for cos and exp2 to greatly reduce CPU usage on the decoder thread

disjtqz · disjtqz · commit a14f5c03834a · 2022-10-29T11:36:28.000-07:00
diff --git a/libavcodec/mdct_template.c b/libavcodec/mdct_template.c
@@ -41,6 +41,17 @@
 #   define RSCALE(x, y) ((int)((x) + (unsigned)(y)) >> 1)
 #endif /* FFT_FIXED_32 */
 #endif
+/* Fast cos(x) approximation for x in radians. TODO: also approximate sin. */
+static av_always_inline double cosfast(double x)
+{
+    const double turns = x * (1.0 / (2.0 * M_PI));
+    /* Wrap into [-0.5, 0.5); an input of 0 lands on -0.25. */
+    const double u = turns - (0.25 + floor(turns + 0.25));
+    /* Parabola that is +1 at u == -0.25, 0 at u == 0 and -1 at u == 0.25. */
+    const double y = u * (16.0 * (fabs(u) - 0.5));
+    /* Correction term: vanishes at y == 0 and |y| == 1, reshapes in between. */
+    return y + 0.225 * y * (fabs(y) - 1.0);
+}
 
 /**
  * init MDCT or IMDCT computation.
@@ -83,10 +94,10 @@ av_cold int ff_mdct_init(FFTContext *s, int nbits, int inverse, double scale)
     for(i=0;i<n4;i++) {
         alpha = 2 * M_PI * (i + theta) / n;
 #if FFT_FIXED_32
-        s->tcos[i*tstep] = lrint(-cos(alpha) * 2147483648.0);
+        s->tcos[i*tstep] = lrint(-cosfast(alpha) * 2147483648.0);
         s->tsin[i*tstep] = lrint(-sin(alpha) * 2147483648.0);
 #else
-        s->tcos[i*tstep] = FIX15(-cos(alpha) * scale);
+        s->tcos[i*tstep] = FIX15(-cosfast(alpha) * scale);
         s->tsin[i*tstep] = FIX15(-sin(alpha) * scale);
 #endif
     }
diff --git a/libavutil/ffmath.h b/libavutil/ffmath.h
@@ -28,6 +28,27 @@
 
 #include "attributes.h"
 #include "libm.h"
+/* Approximate 2^x: builds the IEEE-754 binary64 bit pattern from a rational fit. */
+static av_always_inline double fastexp2(double x)
+{
+    union { long long i; double d; } bits; /* type-pun without aliasing UB */
+    double whole, frac, biased;
+
+    if (x != x)
+        return x; /* NaN in, NaN out (the old clamp turned NaN into ~2^-1022) */
+    /* Clamp both ends so the integer conversion below cannot overflow. */
+    x = x < -1022.0 ? -1022.0 : (x > 1024.0 ? 1024.0 : x);
+
+    whole  = floor(x);
+    frac   = x - whole;
+    /* At frac == 0 the constant terms sum to ~1023, the binary64 exponent bias. */
+    biased = whole + 1017.2740579843521 + frac * -0.49013227410614491
+           + 27.728333711624146 / (4.8425778448581696 - frac);
+    if (biased >= 2047.0)
+        return INFINITY; /* exponent field would be all ones: report overflow */
+    bits.i = (long long)(biased * 4503599627370496.0); /* scale by 2^52 */
+    return bits.d;
+}
 
 /**
  * Compute 10^x for floating point values. Note: this function is by no means
@@ -41,7 +62,7 @@
  */
 static av_always_inline double ff_exp10(double x)
 {
-    return exp2(M_LOG2_10 * x);
+    return fastexp2(M_LOG2_10 * x); /* 10^x via the approximate exp2 replacement, not libm exp2() */
 }
 
 static av_always_inline float ff_exp10f(float x)