Skip to content

Commit a14f5c0

Browse files
committed
Use faster approx for cos and exp2, greatly reduce cpu usage on decoder thread
1 parent a437fe6 commit a14f5c0

File tree

2 files changed

+35
-3
lines changed

2 files changed

+35
-3
lines changed

Diff for: libavcodec/mdct_template.c

+13-2
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,17 @@
4141
# define RSCALE(x, y) ((int)((x) + (unsigned)(y)) >> 1)
4242
#endif /* FFT_FIXED_32 */
4343
#endif
44+
static av_always_inline
45+
double cosfast(double x)
46+
{
47+
//todo:calc sin too
48+
double inv2pi = 1.0 / (2.0 * M_PI);
49+
x *= inv2pi;
50+
x -= 0.25 + floor(x + 0.25);
51+
x *= 16.0 * (fabs(x) - 0.5);
52+
x += 0.225 * x * (fabs(x) - 1.0);
53+
return x;
54+
}
4455

4556
/**
4657
* init MDCT or IMDCT computation.
@@ -83,10 +94,10 @@ av_cold int ff_mdct_init(FFTContext *s, int nbits, int inverse, double scale)
8394
for(i=0;i<n4;i++) {
8495
alpha = 2 * M_PI * (i + theta) / n;
8596
#if FFT_FIXED_32
86-
s->tcos[i*tstep] = lrint(-cos(alpha) * 2147483648.0);
97+
s->tcos[i*tstep] = lrint(-cosfast(alpha) * 2147483648.0);
8798
s->tsin[i*tstep] = lrint(-sin(alpha) * 2147483648.0);
8899
#else
89-
s->tcos[i*tstep] = FIX15(-cos(alpha) * scale);
100+
s->tcos[i*tstep] = FIX15(-cosfast(alpha) * scale);
90101
s->tsin[i*tstep] = FIX15(-sin(alpha) * scale);
91102
#endif
92103
}

Diff for: libavutil/ffmath.h

+22-1
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,27 @@
2828

2929
#include "attributes.h"
3030
#include "libm.h"
31+
static av_always_inline double fastexp2(double xmm0) {
32+
//should upper bound be clamped too?
33+
xmm0 = xmm0 > -1022 ? xmm0 : -1022;
34+
35+
double xmm2 = 4.8425778448581696;
36+
double xmm3 = 27.728333711624146;
37+
38+
double xmm1 = floor(xmm0);
39+
xmm0 = xmm0 - xmm1;
40+
xmm1 = xmm1 + 1017.2740579843521;
41+
xmm2 = xmm2 - xmm0;
42+
43+
//fmadd would be nice here
44+
xmm1 += (xmm0*-0.49013227410614491);
45+
46+
xmm2 = xmm3 / xmm2;//this could be done via rcp_ss, the value will be within float32 range and we can refine
47+
xmm0 = xmm1 + xmm2;
48+
xmm0 = xmm0 * 4503599627370496.0;
49+
signed long long r0 = (signed long long)(xmm0);
50+
return *(double*)&r0;
51+
}
3152

3253
/**
3354
* Compute 10^x for floating point values. Note: this function is by no means
@@ -41,7 +62,7 @@
4162
*/
4263
static av_always_inline double ff_exp10(double x)
4364
{
/*
 * Diff hunk: the exp2() return marked "44-" is the line removed by this
 * commit and the fastexp2() return marked "65+" is its replacement. Both
 * scale x by M_LOG2_10 (presumably log2(10) - confirm against its
 * definition) and raise 2 to that power to obtain 10^x.
 * NOTE(review): fastexp2() is an approximation, so after this change every
 * caller of ff_exp10() gets reduced precision compared with exp2();
 * confirm that this is acceptable for all users of this helper.
 */
44-
return exp2(M_LOG2_10 * x);
65+
return fastexp2(M_LOG2_10 * x);
4566
}
4667

4768
static av_always_inline float ff_exp10f(float x)

0 commit comments

Comments
 (0)