diff --git a/CHANGELOG b/CHANGELOG
index 4d445478..aae74d38 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -2,6 +2,12 @@
 * RECENT CHANGES
 *******************************************************************************
+=== 1.0.22 ===
+* Implemented functions for computing correlation between two signals.
+* Fixed error in AVX and AVX-512 implementations of lr_to_ms functions.
+* Updated build scripts.
+* Updated module versions in dependencies.
+
 === 1.0.21 ===
 * Updated build scripts.
 * Updated module versions in dependencies.
diff --git a/include/lsp-plug.in/dsp/common/correlation.h b/include/lsp-plug.in/dsp/common/correlation.h
new file mode 100644
index 00000000..bd86c0d9
--- /dev/null
+++ b/include/lsp-plug.in/dsp/common/correlation.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (C) 2024 Linux Studio Plugins Project
+ *           (C) 2024 Vladimir Sadovnikov
+ *
+ * This file is part of lsp-dsp-lib
+ * Created on: 7 Mar 2024
+ *
+ * lsp-dsp-lib is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * any later version.
+ *
+ * lsp-dsp-lib is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with lsp-dsp-lib. If not, see <https://www.gnu.org/licenses/>.
+ */
+#ifndef LSP_PLUG_IN_DSP_COMMON_CORRELATION_H_
+#define LSP_PLUG_IN_DSP_COMMON_CORRELATION_H_
+
+#include <lsp-plug.in/dsp/common/types.h>
+
+LSP_DSP_LIB_BEGIN_NAMESPACE
+
+#pragma pack(push, 1)
+
+/**
+ * Object to store correlation state.
+ *
+ * The correlation is computed using the following formula:
+ *
+ *                   sum(a[i] * b[i])
+ *   corr = ---------------------------------------
+ *          sqrt(sum(a[i]*a[i]) * sum(b[i]*b[i]))
+ *
+ * where i is in the range 0 to count-1.
+ */
+typedef struct LSP_DSP_LIB_TYPE(correlation_t)
+{
+    float v;    // the aggregated value of sum(a*b)
+    float a;    // the aggregated value of sum(a*a)
+    float b;    // the aggregated value of sum(b*b)
+} LSP_DSP_LIB_TYPE(correlation_t);
+
+#pragma pack(pop)
+
+LSP_DSP_LIB_END_NAMESPACE
+
+/**
+ * Compute the initial intermediate values of the correlation between two signals.
+ * The function can be called multiple times to accumulate state over several
+ * buffers, so the corr structure should be cleared before the first call.
+ *
+ * @param corr the object to initialize with intermediate results
+ * @param a the pointer to the first signal buffer
+ * @param b the pointer to the second signal buffer
+ * @param count number of samples to process
+ */
+LSP_DSP_LIB_SYMBOL(void, corr_init,
+    LSP_DSP_LIB_TYPE(correlation_t) *corr,
+    const float *a, const float *b,
+    size_t count);
+
+/**
+ * Compute incremental values of the normalized correlation between two signals
+ * over a sliding window. The head pointers address the samples entering the
+ * window, the tail pointers address the samples leaving it; the head element
+ * is offset from the corresponding tail element by the length of the
+ * correlation window minus one.
+ *
+ * @param corr the object that holds intermediate results
+ * @param dst destination buffer to store result
+ * @param a_head the pointer to the head of the first signal buffer
+ * @param b_head the pointer to the head of the second signal buffer
+ * @param a_tail the pointer to the tail of the first signal buffer
+ * @param b_tail the pointer to the tail of the second signal buffer
+ * @param count number of samples to process
+ */
+LSP_DSP_LIB_SYMBOL(void, corr_incr,
+    LSP_DSP_LIB_TYPE(correlation_t) *corr,
+    float *dst,
+    const float *a_head, const float *b_head,
+    const float *a_tail, const float *b_tail,
+    size_t count);
+
+#endif /* LSP_PLUG_IN_DSP_COMMON_CORRELATION_H_ */
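Taken together, corr_init and corr_incr implement a sliding-window normalized
correlation. A minimal usage sketch, not part of the patch: it assumes the C++
interface with lsp::dsp::init() dispatching to the best SIMD variant, an
arbitrary window of 512 samples, input buffers of at least lag + frames
samples, and the head/tail offset convention described in the header above.

    #include <lsp-plug.in/dsp/dsp.h>
    #include <cstring>

    void sliding_corr(float *dst, const float *a, const float *b, size_t frames)
    {
        const size_t window = 512;          // assumed correlation window size
        const size_t lag    = window - 1;   // head/tail distance per the header docs

        lsp::dsp::init();                   // select the best implementation for this CPU

        lsp::dsp::correlation_t corr;
        memset(&corr, 0, sizeof(corr));     // state must be cleared before corr_init

        // Prime the state: sum(a*b), sum(a*a), sum(b*b) over the first samples
        lsp::dsp::corr_init(&corr, a, b, lag);

        // Slide the window: each step adds one head sample, removes one tail
        // sample, and stores one normalized correlation value to dst
        lsp::dsp::corr_incr(&corr, dst,
            &a[lag], &b[lag],               // samples entering the window
            &a[0], &b[0],                   // samples leaving the window
            frames);
    }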
diff --git a/include/lsp-plug.in/dsp/dsp.h b/include/lsp-plug.in/dsp/dsp.h
index 900547f1..38c0a815 100644
--- a/include/lsp-plug.in/dsp/dsp.h
+++ b/include/lsp-plug.in/dsp/dsp.h
@@ -48,6 +48,7 @@
 #include
 #include
 #include
+#include <lsp-plug.in/dsp/common/correlation.h>
 #include
 #include
 #include
diff --git a/include/lsp-plug.in/dsp/version.h b/include/lsp-plug.in/dsp/version.h
index bd21c8d5..8fa10d85 100644
--- a/include/lsp-plug.in/dsp/version.h
+++ b/include/lsp-plug.in/dsp/version.h
@@ -25,7 +25,7 @@
 // Define version of headers
 #define LSP_DSP_LIB_MAJOR       1
 #define LSP_DSP_LIB_MINOR       0
-#define LSP_DSP_LIB_MICRO       21
+#define LSP_DSP_LIB_MICRO       22
 
 #if defined(__WINDOWS__) || defined(__WIN32__) || defined(__WIN64__) || defined(_WIN64) || defined(_WIN32) || defined(__WINNT) || defined(__WINNT__)
     #define LSP_DSP_LIB_EXPORT_MODIFIER __declspec(dllexport)
diff --git a/include/private/dsp/arch/aarch64/asimd/correlation.h b/include/private/dsp/arch/aarch64/asimd/correlation.h
new file mode 100644
index 00000000..4debc2af
--- /dev/null
+++ b/include/private/dsp/arch/aarch64/asimd/correlation.h
@@ -0,0 +1,361 @@
+/*
+ * Copyright (C) 2024 Linux Studio Plugins Project
+ *           (C) 2024 Vladimir Sadovnikov
+ *
+ * This file is part of lsp-dsp-lib
+ * Created on: 9 Mar 2024
+ *
+ * lsp-dsp-lib is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * any later version.
+ *
+ * lsp-dsp-lib is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with lsp-dsp-lib. If not, see <https://www.gnu.org/licenses/>.
+ */ + +#ifndef PRIVATE_DSP_ARCH_AARCH64_ASIMD_CORRELATION_H_ +#define PRIVATE_DSP_ARCH_AARCH64_ASIMD_CORRELATION_H_ + +#ifndef PRIVATE_DSP_ARCH_AARCH64_ASIMD_IMPL + #error "This header should not be included directly" +#endif /* PRIVATE_DSP_ARCH_AARCH64_ASIMD_IMPL */ + +namespace lsp +{ + namespace asimd + { + + void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count) + { + ARCH_AARCH64_ASM( + __ASM_EMIT("eor v0.16b, v0.16b, v0.16b") /* xv = 0 */ + __ASM_EMIT("eor v1.16b, v1.16b, v1.16b") /* xa = 0 */ + __ASM_EMIT("eor v2.16b, v2.16b, v2.16b") /* xb = 0 */ + /* 16x blocks */ + __ASM_EMIT("subs %[count], %[count], #16") + __ASM_EMIT("b.lo 2f") + __ASM_EMIT("eor v3.16b, v3.16b, v3.16b") /* xv = 0 */ + __ASM_EMIT("eor v4.16b, v4.16b, v4.16b") /* xa = 0 */ + __ASM_EMIT("eor v5.16b, v5.16b, v5.16b") /* xb = 0 */ + __ASM_EMIT("1:") + __ASM_EMIT("ldp q8, q9, [%[a], 0x00]") /* v8 = a0, v9 = a1 */ + __ASM_EMIT("ldp q10, q11, [%[a], 0x20]") /* v10 = a2, v11 = a3 */ + __ASM_EMIT("ldp q12, q13, [%[b], 0x00]") /* v12 = b0, v13 = b1 */ + __ASM_EMIT("ldp q14, q15, [%[b], 0x20]") /* v14 = b2, v15 = b3 */ + __ASM_EMIT("fmla v0.4s, v8.4s, v12.4s") /* v0 = xv + a0*b0 */ + __ASM_EMIT("fmla v3.4s, v9.4s, v13.4s") /* v3 = xv + a1*b1 */ + __ASM_EMIT("fmla v1.4s, v8.4s, v8.4s") /* v1 = xa + a0*a0 */ + __ASM_EMIT("fmla v4.4s, v9.4s, v9.4s") /* v4 = xa + a1*a1 */ + __ASM_EMIT("fmla v2.4s, v12.4s, v12.4s") /* v2 = xb + b0*b0 */ + __ASM_EMIT("fmla v5.4s, v13.4s, v13.4s") /* v5 = xb + b1*b1 */ + __ASM_EMIT("fmla v0.4s, v10.4s, v14.4s") /* v0 = xv + a0*b0 */ + __ASM_EMIT("fmla v3.4s, v11.4s, v15.4s") /* v3 = xv + a1*b1 */ + __ASM_EMIT("fmla v1.4s, v10.4s, v10.4s") /* v1 = xa + a0*a0 */ + __ASM_EMIT("fmla v4.4s, v11.4s, v11.4s") /* v4 = xa + a1*a1 */ + __ASM_EMIT("fmla v2.4s, v14.4s, v14.4s") /* v2 = xb + b0*b0 */ + __ASM_EMIT("fmla v5.4s, v15.4s, v15.4s") /* v5 = xb + b1*b1 */ + __ASM_EMIT("subs %[count], %[count], #16") + __ASM_EMIT("add %[a], %[a], #0x40") + __ASM_EMIT("add %[b], %[b], #0x40") + __ASM_EMIT("b.hs 1b") + __ASM_EMIT("fadd v0.4s, v0.4s, v3.4s") + __ASM_EMIT("fadd v1.4s, v1.4s, v4.4s") + __ASM_EMIT("fadd v2.4s, v2.4s, v5.4s") + __ASM_EMIT("2:") + /* 8x block */ + __ASM_EMIT("adds %[count], %[count], #8") + __ASM_EMIT("b.lt 4f") + __ASM_EMIT("ldp q8, q9, [%[a], 0x00]") /* v8 = a0, v9 = a1 */ + __ASM_EMIT("ldp q12, q13, [%[b], 0x00]") /* v12 = b0, v13 = b1 */ + __ASM_EMIT("fmla v0.4s, v8.4s, v12.4s") /* v0 = xv + a0*b0 */ + __ASM_EMIT("fmla v1.4s, v8.4s, v8.4s") /* v1 = xa + a0*a0 */ + __ASM_EMIT("fmla v2.4s, v12.4s, v12.4s") /* v2 = xb + b0*b0 */ + __ASM_EMIT("fmla v0.4s, v9.4s, v13.4s") /* v0 = xv + a1*b1 */ + __ASM_EMIT("fmla v1.4s, v9.4s, v9.4s") /* v1 = xa + a1*a1 */ + __ASM_EMIT("fmla v2.4s, v13.4s, v13.4s") /* v2 = xb + b1*b1 */ + __ASM_EMIT("sub %[count], %[count], #8") + __ASM_EMIT("add %[a], %[a], #0x20") + __ASM_EMIT("add %[b], %[b], #0x20") + __ASM_EMIT("4:") + /* 4x block */ + __ASM_EMIT("adds %[count], %[count], #4") + __ASM_EMIT("b.lt 6f") + __ASM_EMIT("ldr q8, [%[a], 0x00]") /* v8 = a0, v9 = a1 */ + __ASM_EMIT("ldr q12, [%[b], 0x00]") /* v12 = b0, v13 = b1 */ + __ASM_EMIT("fmla v0.4s, v8.4s, v12.4s") /* v0 = xv + a0*b0 */ + __ASM_EMIT("fmla v1.4s, v8.4s, v8.4s") /* v1 = xa + a0*a0 */ + __ASM_EMIT("fmla v2.4s, v12.4s, v12.4s") /* v2 = xb + b0*b0 */ + __ASM_EMIT("sub %[count], %[count], #4") + __ASM_EMIT("add %[a], %[a], #0x10") + __ASM_EMIT("add %[b], %[b], #0x10") + __ASM_EMIT("6:") + __ASM_EMIT("eor v6.16b, v6.16b, v6.16b") /* v6 = 0 */ + __ASM_EMIT("ext 
v3.16b, v0.16b, v6.16b, #8")/* v3 = xv2 xv3 0 0 */ + __ASM_EMIT("ext v4.16b, v1.16b, v6.16b, #8")/* v4 = xa2 xb3 0 0 */ + __ASM_EMIT("ext v5.16b, v2.16b, v6.16b, #8")/* v5 = xa2 xb3 0 0 */ + __ASM_EMIT("fadd v0.4s, v0.4s, v3.4s") /* v0 = xv0+xv2 xv1+xv3 xv2 xv3 */ + __ASM_EMIT("fadd v1.4s, v1.4s, v4.4s") /* v1 = xa0+xa2 xa1+xa3 xa2 xa3 */ + __ASM_EMIT("fadd v2.4s, v2.4s, v5.4s") /* v2 = xb0+xb2 xb1+xb3 xb2 xb3 */ + __ASM_EMIT("ext v3.16b, v0.16b, v6.16b, #4")/* v3 = xv1+xv3 xv2 xv3 0 */ + __ASM_EMIT("ext v4.16b, v1.16b, v6.16b, #4")/* v4 = xa1+xa3 xa2 xa3 0 */ + __ASM_EMIT("ext v5.16b, v2.16b, v6.16b, #4")/* v5 = xb1+xb3 xb2 xb3 0 */ + __ASM_EMIT("fadd v0.4s, v0.4s, v3.4s") /* v0 = xv0+xv1+xv2+xv3 xv1+xv2+xv3 xv2+xv3 xv3 */ + __ASM_EMIT("fadd v1.4s, v1.4s, v4.4s") /* v1 = xa0+xa1+xa2+xa3 xa1+xa2+xa3 xa2+xa3 xa3 */ + __ASM_EMIT("fadd v2.4s, v2.4s, v5.4s") /* v2 = xb0+xb1+xb2+xb3 xb1+xb2+xb3 xb2+xb3 xb3 */ + /* 1x blocks */ + __ASM_EMIT("adds %[count], %[count], #3") + __ASM_EMIT("b.lt 8f") + __ASM_EMIT("7:") + __ASM_EMIT("ld1r {v8.4s}, [%[a]]") /* v8 = a0 */ + __ASM_EMIT("ld1r {v12.4s}, [%[b]]") /* q12 = b0 */ + __ASM_EMIT("fmla v0.4s, v8.4s, v12.4s") /* v0 = xv + a0*b0 */ + __ASM_EMIT("fmla v1.4s, v8.4s, v8.4s") /* v1 = xa + a0*a0 */ + __ASM_EMIT("fmla v2.4s, v12.4s, v12.4s") /* v2 = xb + b0*b0 */ + __ASM_EMIT("subs %[count], %[count], #1") + __ASM_EMIT("add %[a], %[a], #0x04") + __ASM_EMIT("add %[b], %[b], #0x04") + __ASM_EMIT("b.ge 7b") + __ASM_EMIT("8:") + /* Store result */ + __ASM_EMIT("ld3r {v3.4s, v4.4s, v5.4s}, [%[corr]]") /* v3 = v, v4 = a, v5 = b */ + __ASM_EMIT("fadd v0.4s, v0.4s, v3.4s") + __ASM_EMIT("fadd v1.4s, v1.4s, v4.4s") + __ASM_EMIT("fadd v2.4s, v2.4s, v5.4s") + __ASM_EMIT("st3 {v0.s, v1.s, v2.s}[0], [%[corr]]") + + : [a] "+r" (a), [b] "+r" (b), [count] "+r" (count) + : [corr] "r" (corr) + : "cc", "memory", + "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" + ); + } + + static const float corr_const[] __lsp_aligned16 = + { + LSP_DSP_VEC8(1e-10f) + }; + + void corr_incr(dsp::correlation_t *corr, float *dst, + const float *a_head, const float *b_head, + const float *a_tail, const float *b_tail, + size_t count) + { + ARCH_AARCH64_ASM( + __ASM_EMIT("ld3r {v0.4s, v1.4s, v2.4s}, [%[corr]]") /* v0 = xv, v1 = xa, v2 = xb */ + __ASM_EMIT("eor v3.16b, v3.16b, v3.16b") /* v3 = 0 */ + /* 8x blocks */ + __ASM_EMIT("subs %[count], %[count], #8") + __ASM_EMIT("b.lo 2f") + __ASM_EMIT("1:") + __ASM_EMIT("ldp q4, q5, [%[a_head], 0x00]") /* v4 = ah0, v5 = ah1 */ + __ASM_EMIT("ldp q6, q7, [%[b_head], 0x00]") /* v6 = ah0, v7 = ah1 */ + __ASM_EMIT("ldp q8, q9, [%[a_tail], 0x00]") /* v8 = ah0, v9 = ah1 */ + __ASM_EMIT("ldp q10, q11, [%[b_tail], 0x00]") /* v10 = ah0, v11 = ah1 */ + __ASM_EMIT("fmul v12.4s, v4.4s, v6.4s") /* v12 = ah0*bh0 */ + __ASM_EMIT("fmul v13.4s, v5.4s, v7.4s") + __ASM_EMIT("fmul v4.4s, v4.4s, v4.4s") /* v4 = ah0*ah0 */ + __ASM_EMIT("fmul v5.4s, v5.4s, v5.4s") + __ASM_EMIT("fmul v6.4s, v6.4s, v6.4s") /* v6 = bh0*bh0 */ + __ASM_EMIT("fmul v7.4s, v7.4s, v7.4s") + __ASM_EMIT("fmls v12.4s, v8.4s, v10.4s") /* v12 = DV = ah0*bh0 - at0*bt0 */ + __ASM_EMIT("fmls v13.4s, v9.4s, v11.4s") + __ASM_EMIT("fmls v4.4s, v8.4s, v8.4s") /* v4 = DA = ah0*ah0 - at0*at0 */ + __ASM_EMIT("fmls v5.4s, v9.4s, v9.4s") + __ASM_EMIT("fmls v6.4s, v10.4s, v10.4s") /* v6 = DB = bh0*bh0 - bt0*bt0 */ + __ASM_EMIT("fmls v7.4s, v11.4s, v11.4s") + + __ASM_EMIT("ext v14.16b, v3.16b, v12.16b, #8") /* v14 = 0 0 DV[0] DV[1] */ + __ASM_EMIT("ext v15.16b, v3.16b, 
v13.16b, #8") /* v15 = 0 0 DV[4] DV[5] */ + __ASM_EMIT("fadd v12.4s, v12.4s, v14.4s") /* v12 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] */ + __ASM_EMIT("fadd v13.4s, v13.4s, v15.4s") /* v13 = DV[4] DV[5] DV[4]+DV[6] DV[5]+DV[7] */ + __ASM_EMIT("ext v14.16b, v3.16b, v12.16b, #12") /* v14 = 0 DV[0] DV[1] DV[0]+DV[2] */ + __ASM_EMIT("ext v15.16b, v3.16b, v13.16b, #12") /* v15 = 0 DV[4] DV[5] DV[4]+DV[6] */ + __ASM_EMIT("fadd v12.4s, v12.4s, v14.4s") /* v12 = V[0..3] = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] */ + __ASM_EMIT("fadd v13.4s, v13.4s, v15.4s") /* v13 = V[4..7] = DV[4] DV[4]+DV[5] DV[4]+DV[5]+DV[6] DV[4]+DV[5]+DV[6]+DV[7] */ + __ASM_EMIT("ext v14.16b, v3.16b, v4.16b, #8") /* v14 = 0 0 DA[0] DA[1] */ + __ASM_EMIT("ext v15.16b, v3.16b, v5.16b, #8") /* v15 = 0 0 DA[4] DA[5] */ + __ASM_EMIT("fadd v4.4s, v4.4s, v14.4s") /* v4 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] */ + __ASM_EMIT("fadd v5.4s, v5.4s, v15.4s") /* v5 = DA[4] DA[5] DA[4]+DA[6] DA[5]+DA[7] */ + __ASM_EMIT("ext v14.16b, v3.16b, v4.16b, #12") /* v14 = 0 DA[0] DA[1] DA[0]+DA[2] */ + __ASM_EMIT("ext v15.16b, v3.16b, v5.16b, #12") /* v15 = 0 DA[4] DA[5] DA[4]+DA[6] */ + __ASM_EMIT("fadd v4.4s, v4.4s, v14.4s") /* v4 = A[0..3] = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] */ + __ASM_EMIT("fadd v5.4s, v5.4s, v15.4s") /* v5 = A[4..7] = DA[4] DA[4]+DA[5] DA[4]+DA[5]+DA[6] DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("ext v14.16b, v3.16b, v6.16b, #8") /* v14 = 0 0 DB[0] DB[1] */ + __ASM_EMIT("ext v15.16b, v3.16b, v7.16b, #8") /* v15 = 0 0 DB[4] DB[5] */ + __ASM_EMIT("fadd v6.4s, v6.4s, v14.4s") /* v6 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] */ + __ASM_EMIT("fadd v7.4s, v7.4s, v15.4s") /* v7 = DB[4] DB[5] DB[4]+DB[6] DB[5]+DB[7] */ + __ASM_EMIT("ext v14.16b, v3.16b, v6.16b, #12") /* v14 = 0 DB[0] DB[1] DB[0]+DB[2] */ + __ASM_EMIT("ext v15.16b, v3.16b, v7.16b, #12") /* v15 = 0 DB[4] DB[5] DB[4]+DB[6] */ + __ASM_EMIT("fadd v6.4s, v6.4s, v14.4s") /* v6 = B[0..3] = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] */ + __ASM_EMIT("fadd v7.4s, v7.4s, v15.4s") /* v7 = B[4..7] = DB[4] DB[4]+DB[5] DB[4]+DB[5]+DB[6] DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("dup v8.4s, v12.s[3]") /* v8 = V[3] V[3] V[3] V[3] */ + __ASM_EMIT("dup v9.4s, v4.s[3]") /* v9 = A[3] A[3] A[3] A[3] */ + __ASM_EMIT("dup v10.4s, v6.s[3]") /* v10 = B[3] B[3] B[3] B[3] */ + __ASM_EMIT("fadd v13.4s, v13.4s, v8.4s") /* v13 = V[3]+V[4] V[3]+V[5] V[3]+V[6] V[3]+V[7] */ + __ASM_EMIT("fadd v5.4s, v5.4s, v9.4s") /* v5 = A[3]+A[4] A[3]+A[5] A[3]+A[6] A[3]+A[7] */ + __ASM_EMIT("fadd v7.4s, v7.4s, v10.4s") /* v7 = B[3]+B[4] B[3]+B[5] B[3]+B[6] B[3]+B[7] */ + + __ASM_EMIT("fadd v4.4s, v4.4s, v1.4s") /* v4 = BA = xa + A */ + __ASM_EMIT("fadd v5.4s, v5.4s, v1.4s") + __ASM_EMIT("fadd v6.4s, v6.4s, v2.4s") /* v6 = BB = xb + B */ + __ASM_EMIT("fadd v7.4s, v7.4s, v2.4s") + __ASM_EMIT("fadd v8.4s, v12.4s, v0.4s") /* v8 = T = xv + V */ + __ASM_EMIT("fadd v9.4s, v13.4s, v0.4s") + __ASM_EMIT("fmul v10.4s, v4.4s, v6.4s") /* v10 = B = BA * BB */ + __ASM_EMIT("fmul v11.4s, v5.4s, v7.4s") + __ASM_EMIT("dup v0.4s, v9.s[3]") /* v0 = xv' = T[7] */ + __ASM_EMIT("dup v1.4s, v5.s[3]") /* v1 = xa' = BA[7] */ + __ASM_EMIT("dup v2.4s, v7.s[3]") /* v2 = xb' = BB[7] */ + __ASM_EMIT("ldp q14, q15, [%[CORR_CC]]") /* v14 = 1e-10, v15 = 1e-10 */ + + __ASM_EMIT("fcmge v14.4s, v8.4s, v14.4s") /* v14 = T >= 1e-10 */ + __ASM_EMIT("fcmge v15.4s, v9.4s, v15.4s") + __ASM_EMIT("frsqrte v4.4s, v10.4s") /* v4 = x0 */ + __ASM_EMIT("frsqrte v5.4s, v11.4s") + __ASM_EMIT("fmul v6.4s, v4.4s, v10.4s") 
/* v6 = R * x0 */ + __ASM_EMIT("fmul v7.4s, v5.4s, v11.4s") + __ASM_EMIT("frsqrts v12.4s, v6.4s, v4.4s") /* v12 = (3 - R * x0 * x0) / 2 */ + __ASM_EMIT("frsqrts v13.4s, v7.4s, v5.4s") + __ASM_EMIT("fmul v4.4s, v4.4s, v12.4s") /* v4 = x1 = x0 * (3 - R * x0 * x0) / 2 */ + __ASM_EMIT("fmul v5.4s, v5.4s, v13.4s") + __ASM_EMIT("fmul v6.4s, v4.4s, v10.4s") /* v6 = R * x1 */ + __ASM_EMIT("fmul v7.4s, v5.4s, v11.4s") + __ASM_EMIT("frsqrts v12.4s, v6.4s, v4.4s") /* v12 = (3 - R * x1 * x1) / 2 */ + __ASM_EMIT("frsqrts v13.4s, v7.4s, v5.4s") + __ASM_EMIT("fmul v10.4s, v4.4s, v12.4s") /* v10 = 1/svrtf(B) = x2 = x1 * (3 - R * x1 * x1) / 2 */ + __ASM_EMIT("fmul v11.4s, v5.4s, v13.4s") + __ASM_EMIT("fmul v10.4s, v8.4s, v10.4s") /* v10 = T/svrtf(B) */ + __ASM_EMIT("fmul v11.4s, v9.4s, v11.4s") + __ASM_EMIT("and v10.16b, v10.16b, v14.16b") /* v10 = (T >= 1e-10) ? T/svrt(B) : 0 */ + __ASM_EMIT("and v11.16b, v11.16b, v15.16b") + __ASM_EMIT("add %[a_head], %[a_head], #0x20") + __ASM_EMIT("add %[b_head], %[b_head], #0x20") + __ASM_EMIT("subs %[count], %[count], #8") + __ASM_EMIT("stp q10, q11, [%[dst], 0x00]") + __ASM_EMIT("add %[a_tail], %[a_tail], #0x20") + __ASM_EMIT("add %[b_tail], %[b_tail], #0x20") + __ASM_EMIT("add %[dst], %[dst], #0x20") + __ASM_EMIT("b.hs 1b") + __ASM_EMIT("2:") + /* 4x block */ + __ASM_EMIT("adds %[count], %[count], #4") + __ASM_EMIT("b.lt 4f") + __ASM_EMIT("ldr q4, [%[a_head], 0x00]") /* v4 = ah0 */ + __ASM_EMIT("ldr q6, [%[b_head], 0x00]") /* v6 = ah0 */ + __ASM_EMIT("ldr q8, [%[a_tail], 0x00]") /* v8 = ah0 */ + __ASM_EMIT("ldr q10, [%[b_tail], 0x00]") /* v10 = ah0 */ + __ASM_EMIT("fmul v12.4s, v4.4s, v6.4s") /* v12 = ah0*bh0 */ + __ASM_EMIT("fmul v4.4s, v4.4s, v4.4s") /* v4 = ah0*ah0 */ + __ASM_EMIT("fmul v6.4s, v6.4s, v6.4s") /* v6 = bh0*bh0 */ + __ASM_EMIT("fmls v12.4s, v8.4s, v10.4s") /* v12 = DV = ah0*bh0 - at0*bt0 */ + __ASM_EMIT("fmls v4.4s, v8.4s, v8.4s") /* v4 = DA = ah0*ah0 - at0*at0 */ + __ASM_EMIT("fmls v6.4s, v10.4s, v10.4s") /* v6 = DB = bh0*bh0 - bt0*bt0 */ + + __ASM_EMIT("ext v13.16b, v3.16b, v12.16b, #8") /* v13 = 0 0 DV[0] DV[1] */ + __ASM_EMIT("ext v5.16b, v3.16b, v4.16b, #8") /* v5 = 0 0 DA[0] DA[1] */ + __ASM_EMIT("ext v7.16b, v3.16b, v6.16b, #8") /* v7 = 0 0 DB[0] DB[1] */ + __ASM_EMIT("fadd v12.4s, v12.4s, v13.4s") /* v12 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] */ + __ASM_EMIT("fadd v4.4s, v4.4s, v5.4s") /* v4 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] */ + __ASM_EMIT("fadd v6.4s, v6.4s, v7.4s") /* v6 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] */ + __ASM_EMIT("ext v13.16b, v3.16b, v12.16b, #12") /* v13 = 0 DV[0] DV[1] DV[0]+DV[2] */ + __ASM_EMIT("ext v5.16b, v3.16b, v4.16b, #12") /* v5 = 0 DA[0] DA[1] DA[0]+DA[2] */ + __ASM_EMIT("ext v7.16b, v3.16b, v6.16b, #12") /* v7 = 0 DB[0] DB[1] DB[0]+DB[2] */ + __ASM_EMIT("fadd v12.4s, v12.4s, v13.4s") /* v12 = V[0..3] = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] */ + __ASM_EMIT("fadd v4.4s, v4.4s, v5.4s") /* v4 = A[0..3] = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] */ + __ASM_EMIT("fadd v6.4s, v6.4s, v7.4s") /* v6 = B[0..3] = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] */ + + __ASM_EMIT("fadd v4.4s, v4.4s, v1.4s") /* v4 = BA = xa + A */ + __ASM_EMIT("fadd v6.4s, v6.4s, v2.4s") /* v6 = BB = xb + B */ + __ASM_EMIT("fadd v8.4s, v12.4s, v0.4s") /* v8 = T = xv + V */ + __ASM_EMIT("fmul v10.4s, v4.4s, v6.4s") /* v10 = B = BA * BB */ + __ASM_EMIT("dup v1.4s, v4.s[3]") /* v1 = xa' = BA[7] */ + __ASM_EMIT("dup v2.4s, v6.s[3]") /* v2 = xb' = BB[7] */ + __ASM_EMIT("dup v0.4s, v8.s[3]") /* v0 
= xv' = T[7] */
+                __ASM_EMIT("ldr             q14, [%[CORR_CC]]")                 /* v14 = 1e-10 */
+
+                __ASM_EMIT("fcmge           v14.4s, v8.4s, v14.4s")             /* v14 = T >= 1e-10 */
+                __ASM_EMIT("frsqrte         v4.4s, v10.4s")                     /* v4 = x0 */
+                __ASM_EMIT("fmul            v6.4s, v4.4s, v10.4s")              /* v6 = R * x0 */
+                __ASM_EMIT("frsqrts         v12.4s, v6.4s, v4.4s")              /* v12 = (3 - R * x0 * x0) / 2 */
+                __ASM_EMIT("fmul            v4.4s, v4.4s, v12.4s")              /* v4 = x1 = x0 * (3 - R * x0 * x0) / 2 */
+                __ASM_EMIT("fmul            v6.4s, v4.4s, v10.4s")              /* v6 = R * x1 */
+                __ASM_EMIT("frsqrts         v12.4s, v6.4s, v4.4s")              /* v12 = (3 - R * x1 * x1) / 2 */
+                __ASM_EMIT("fmul            v10.4s, v4.4s, v12.4s")             /* v10 = 1/sqrtf(B) = x2 = x1 * (3 - R * x1 * x1) / 2 */
+                __ASM_EMIT("fmul            v10.4s, v8.4s, v10.4s")             /* v10 = T/sqrtf(B) */
+                __ASM_EMIT("and             v10.16b, v10.16b, v14.16b")         /* v10 = (T >= 1e-10) ? T/sqrtf(B) : 0 */
+                __ASM_EMIT("add             %[a_head], %[a_head], #0x10")
+                __ASM_EMIT("add             %[b_head], %[b_head], #0x10")
+                __ASM_EMIT("sub             %[count], %[count], #4")
+                __ASM_EMIT("str             q10, [%[dst], 0x00]")
+                __ASM_EMIT("add             %[a_tail], %[a_tail], #0x10")
+                __ASM_EMIT("add             %[b_tail], %[b_tail], #0x10")
+                __ASM_EMIT("add             %[dst], %[dst], #0x10")
+                __ASM_EMIT("4:")
+                /* 1x blocks */
+                __ASM_EMIT("adds            %[count], %[count], #3")
+                __ASM_EMIT("b.lt            6f")
+                __ASM_EMIT("ldr             q3, [%[CORR_CC]]")                  /* v3 = 1e-10 */
+                __ASM_EMIT("5:")
+                __ASM_EMIT("ld1r            {v4.4s}, [%[a_head]]")              /* v4 = ah0 */
+                __ASM_EMIT("ld1r            {v6.4s}, [%[b_head]]")              /* v6 = bh0 */
+                __ASM_EMIT("ld1r            {v8.4s}, [%[a_tail]]")              /* v8 = at0 */
+                __ASM_EMIT("ld1r            {v10.4s}, [%[b_tail]]")             /* v10 = bt0 */
+                __ASM_EMIT("fmul            v12.4s, v4.4s, v6.4s")              /* v12 = ah0*bh0 */
+                __ASM_EMIT("fmul            v4.4s, v4.4s, v4.4s")               /* v4 = ah0*ah0 */
+                __ASM_EMIT("fmul            v6.4s, v6.4s, v6.4s")               /* v6 = bh0*bh0 */
+                __ASM_EMIT("fmls            v12.4s, v8.4s, v10.4s")             /* v12 = DV = ah0*bh0 - at0*bt0 */
+                __ASM_EMIT("fmls            v4.4s, v8.4s, v8.4s")               /* v4 = DA = ah0*ah0 - at0*at0 */
+                __ASM_EMIT("fmls            v6.4s, v10.4s, v10.4s")             /* v6 = DB = bh0*bh0 - bt0*bt0 */
+
+                __ASM_EMIT("fadd            v1.4s, v4.4s, v1.4s")               /* v1 = BA = xa + DA */
+                __ASM_EMIT("fadd            v2.4s, v6.4s, v2.4s")               /* v2 = BB = xb + DB */
+                __ASM_EMIT("fadd            v0.4s, v12.4s, v0.4s")              /* v0 = T = xv + DV */
+                __ASM_EMIT("fmul            v10.4s, v1.4s, v2.4s")              /* v10 = B = BA * BB */
+
+                __ASM_EMIT("fcmge           v14.4s, v0.4s, v3.4s")              /* v14 = T >= 1e-10 */
+                __ASM_EMIT("frsqrte         v4.4s, v10.4s")                     /* v4 = x0 */
+                __ASM_EMIT("fmul            v6.4s, v4.4s, v10.4s")              /* v6 = R * x0 */
+                __ASM_EMIT("frsqrts         v12.4s, v6.4s, v4.4s")              /* v12 = (3 - R * x0 * x0) / 2 */
+                __ASM_EMIT("fmul            v4.4s, v4.4s, v12.4s")              /* v4 = x1 = x0 * (3 - R * x0 * x0) / 2 */
+                __ASM_EMIT("fmul            v6.4s, v4.4s, v10.4s")              /* v6 = R * x1 */
+                __ASM_EMIT("frsqrts         v12.4s, v6.4s, v4.4s")              /* v12 = (3 - R * x1 * x1) / 2 */
+                __ASM_EMIT("fmul            v10.4s, v4.4s, v12.4s")             /* v10 = 1/sqrtf(B) = x2 = x1 * (3 - R * x1 * x1) / 2 */
+                __ASM_EMIT("fmul            v10.4s, v0.4s, v10.4s")             /* v10 = T/sqrtf(B) */
+                __ASM_EMIT("and             v10.16b, v10.16b, v14.16b")         /* v10 = (T >= 1e-10) ? T/sqrtf(B) : 0 */
+                __ASM_EMIT("add             %[a_head], %[a_head], #0x04")
+                __ASM_EMIT("add             %[b_head], %[b_head], #0x04")
+                __ASM_EMIT("subs            %[count], %[count], #1")
+                __ASM_EMIT("st1             {v10.s}[0], [%[dst]]")
+                __ASM_EMIT("add             %[a_tail], %[a_tail], #0x04")
+                __ASM_EMIT("add             %[b_tail], %[b_tail], #0x04")
+                __ASM_EMIT("add             %[dst], %[dst], #0x04")
+                __ASM_EMIT("b.ge            5b")
+                __ASM_EMIT("6:")
+                /* Store state */
+                __ASM_EMIT("st3             {v0.s, v1.s, v2.s}[0], [%[corr]]")
+
+                : [dst] "+r" (dst),
+                  [a_head] "+r" (a_head), [b_head] "+r" (b_head),
+                  [a_tail] "+r" (a_tail), [b_tail] "+r" (b_tail),
+                  [count] "+r" (count)
+                : [corr] "r" (corr),
+                  [CORR_CC] "r" (&corr_const[0])
+                : "cc", "memory",
+                  "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+                  "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
+            );
+        }
+
+    } /* namespace asimd */
+} /* namespace lsp */
+
+#endif /* PRIVATE_DSP_ARCH_AARCH64_ASIMD_CORRELATION_H_ */
diff --git a/include/private/dsp/arch/arm/neon-d32/correlation.h b/include/private/dsp/arch/arm/neon-d32/correlation.h
new file mode 100644
index 00000000..0021fdb7
--- /dev/null
+++ b/include/private/dsp/arch/arm/neon-d32/correlation.h
@@ -0,0 +1,337 @@
+/*
+ * Copyright (C) 2024 Linux Studio Plugins Project
+ *           (C) 2024 Vladimir Sadovnikov
+ *
+ * This file is part of lsp-dsp-lib
+ * Created on: 9 Mar 2024
+ *
+ * lsp-dsp-lib is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * any later version.
+ *
+ * lsp-dsp-lib is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with lsp-dsp-lib. If not, see <https://www.gnu.org/licenses/>.
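Note that the corr_incr implementations never divide: frsqrte (and vrsqrte in
the NEON variant below) produces a rough estimate x0 of 1/sqrt(R), and each
frsqrts/vrsqrts step supplies the Newton-Raphson factor (3 - R*x*x)/2. A scalar
sketch of the two refinement iterations encoded in the comments above
(illustrative only, not part of the patch):

    // Two Newton-Raphson refinements of an initial 1/sqrt(R) estimate x0,
    // mirroring the frsqrte + (frsqrts, fmul, fmul) x2 sequence in the asm
    static float rsqrt_nr2(float R, float x0)
    {
        float x1 = x0 * ((3.0f - (R * x0) * x0) * 0.5f);    /* first step */
        float x2 = x1 * ((3.0f - (R * x1) * x1) * 0.5f);    /* second step */
        return x2;                                          /* ~= 1/sqrtf(R) */
    }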
+ */ + +#ifndef PRIVATE_DSP_ARCH_ARM_NEON_D32_CORRELATION_H_ +#define PRIVATE_DSP_ARCH_ARM_NEON_D32_CORRELATION_H_ + +#ifndef PRIVATE_DSP_ARCH_ARM_NEON_D32_IMPL + #error "This header should not be included directly" +#endif /* PRIVATE_DSP_ARCH_ARM_NEON_D32_IMPL */ + +namespace lsp +{ + namespace neon_d32 + { + + void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count) + { + ARCH_ARM_ASM( + __ASM_EMIT("veor q0, q0, q0") /* xv = 0 */ + __ASM_EMIT("veor q1, q1, q1") /* xa = 0 */ + __ASM_EMIT("veor q2, q2, q2") /* xb = 0 */ + /* 16x blocks */ + __ASM_EMIT("subs %[count], #16") + __ASM_EMIT("blo 2f") + __ASM_EMIT("veor q3, q3, q3") /* xv = 0 */ + __ASM_EMIT("veor q4, q4, q4") /* xa = 0 */ + __ASM_EMIT("veor q5, q5, q5") /* xb = 0 */ + __ASM_EMIT("1:") + __ASM_EMIT("vldm %[a]!, {q8-q11}") /* q8 = a0, q9 = a1, q10 = a2, q11 = a3 */ + __ASM_EMIT("vldm %[b]!, {q12-q15}") /* q12 = b0, q13 = b1, q14 = b2, q15 = b3 */ + __ASM_EMIT("vmla.f32 q0, q8, q12") /* q0 = xv + a0*b0 */ + __ASM_EMIT("vmla.f32 q3, q9, q13") /* q3 = xv + a1*b1 */ + __ASM_EMIT("vmla.f32 q1, q8, q8") /* q1 = xa + a0*a0 */ + __ASM_EMIT("vmla.f32 q4, q9, q9") /* q4 = xa + a1*a1 */ + __ASM_EMIT("vmla.f32 q2, q12, q12") /* q2 = xb + b0*b0 */ + __ASM_EMIT("vmla.f32 q5, q13, q13") /* q5 = xb + b1*b1 */ + __ASM_EMIT("vmla.f32 q0, q10, q14") /* q0 = xv + a2*b2 */ + __ASM_EMIT("vmla.f32 q3, q11, q15") /* q3 = xv + a3*b3 */ + __ASM_EMIT("vmla.f32 q1, q10, q10") /* q1 = xa + a2*a2 */ + __ASM_EMIT("vmla.f32 q4, q11, q11") /* q4 = xa + a3*a3 */ + __ASM_EMIT("vmla.f32 q2, q14, q14") /* q2 = xb + b2*b3 */ + __ASM_EMIT("vmla.f32 q5, q15, q15") /* q5 = xb + b3*b3 */ + __ASM_EMIT("subs %[count], #16") + __ASM_EMIT("bhs 1b") + __ASM_EMIT("vadd.f32 q0, q0, q3") + __ASM_EMIT("vadd.f32 q1, q1, q4") + __ASM_EMIT("vadd.f32 q2, q2, q5") + __ASM_EMIT("2:") + /* 8x block */ + __ASM_EMIT("adds %[count], #8") + __ASM_EMIT("blt 4f") + __ASM_EMIT("vldm %[a]!, {q8-q9}") /* q8 = a0, q9 = a1 */ + __ASM_EMIT("vldm %[b]!, {q12-q13}") /* q12 = b0, q13 = b1 */ + __ASM_EMIT("vmla.f32 q0, q8, q12") /* q0 = xv + a0*b0 */ + __ASM_EMIT("vmla.f32 q1, q8, q8") /* q1 = xa + a0*a0 */ + __ASM_EMIT("vmla.f32 q2, q12, q12") /* q2 = xb + b0*b0 */ + __ASM_EMIT("vmla.f32 q0, q9, q13") /* q0 = xv + a1*b1 */ + __ASM_EMIT("vmla.f32 q1, q9, q9") /* q1 = xa + a1*a1 */ + __ASM_EMIT("vmla.f32 q2, q13, q13") /* q2 = xb + b1*b1 */ + __ASM_EMIT("sub %[count], #8") + __ASM_EMIT("4:") + /* 4x block */ + __ASM_EMIT("adds %[count], #4") + __ASM_EMIT("blt 6f") + __ASM_EMIT("vldm %[a]!, {q8}") /* q8 = a0 */ + __ASM_EMIT("vldm %[b]!, {q12}") /* q12 = b0 */ + __ASM_EMIT("vmla.f32 q0, q8, q12") /* q0 = xv + a0*b0 */ + __ASM_EMIT("vmla.f32 q1, q8, q8") /* q1 = xa + a0*a0 */ + __ASM_EMIT("vmla.f32 q2, q12, q12") /* q2 = xb + b0*b0 */ + __ASM_EMIT("sub %[count], #4") + __ASM_EMIT("6:") + __ASM_EMIT("veor q6, q6, q6") /* q6 = 0 */ + __ASM_EMIT("vext.32 q3, q0, q6, #2") /* q3 = xv2 xv3 0 0 */ + __ASM_EMIT("vext.32 q4, q1, q6, #2") /* q4 = xa2 xa3 0 0 */ + __ASM_EMIT("vext.32 q5, q2, q6, #2") /* q5 = xb2 xb3 0 0 */ + __ASM_EMIT("vadd.f32 q0, q0, q3") /* q0 = xv0+xv2 xv1+xv3 xv2 xv3 */ + __ASM_EMIT("vadd.f32 q1, q1, q4") /* q1 = xa0+xa2 xa1+xa3 xv2 xv3 */ + __ASM_EMIT("vadd.f32 q2, q2, q5") /* q2 = xb0+xb2 xb1+xb3 xv2 xv3 */ + __ASM_EMIT("vext.32 q3, q0, q6, #1") /* q3 = xv1+xv3 xv2 xv3 0 */ + __ASM_EMIT("vext.32 q4, q1, q6, #1") /* q4 = xa1+xa3 xv2 xv3 0 */ + __ASM_EMIT("vext.32 q5, q2, q6, #1") /* q5 = xb1+xb3 xv2 xv3 */ + __ASM_EMIT("vadd.f32 q0, q0, q3") /* q0 = 
xv0+xv1+xv2+xv3 xv1+xv2+xv3 xv2+xv3 xv3 */ + __ASM_EMIT("vadd.f32 q1, q1, q4") /* q1 = xa0+xa1+xa2+xa3 xa1+xa2+xa3 xa2+xa3 xa3 */ + __ASM_EMIT("vadd.f32 q2, q2, q5") /* q2 = xb0+xb1+xb2+xb3 xb1+xb2+xb3 xb2+xb3 xb3 */ + /* 1x blocks */ + __ASM_EMIT("adds %[count], #3") + __ASM_EMIT("blt 8f") + __ASM_EMIT("7:") + __ASM_EMIT("vld1.32 {d16[], d17[]}, [%[a]]!") /* q8 = a0 */ + __ASM_EMIT("vld1.32 {d24[], d25[]}, [%[b]]!") /* q12 = b0 */ + __ASM_EMIT("vmla.f32 q0, q8, q12") /* q0 = xv + a0*b0 */ + __ASM_EMIT("vmla.f32 q1, q8, q8") /* q1 = xa + a0*a0 */ + __ASM_EMIT("vmla.f32 q2, q12, q12") /* q2 = xb + b0*b0 */ + __ASM_EMIT("subs %[count], #1") + __ASM_EMIT("bge 7b") + __ASM_EMIT("8:") + /* Store result */ + __ASM_EMIT("vld3.32 {d6[], d8[], d10[]}, [%[corr]]") /* q3 = v, q4 = a, q5 = b */ + __ASM_EMIT("vadd.f32 q0, q0, q3") + __ASM_EMIT("vadd.f32 q1, q1, q4") + __ASM_EMIT("vadd.f32 q2, q2, q5") + __ASM_EMIT("vst3.32 {d0[0], d2[0], d4[0]}, [%[corr]]") + + : [a] "+r" (a), [b] "+r" (b), [count] "+r" (count) + : [corr] "r" (corr) + : "cc", "memory", + "q0", "q1", "q2", "q3", "q4", "q5", "q6", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); + } + + static const float corr_const[] __lsp_aligned16 = + { + LSP_DSP_VEC8(1e-10f) + }; + + void corr_incr(dsp::correlation_t *corr, float *dst, + const float *a_head, const float *b_head, + const float *a_tail, const float *b_tail, + size_t count) + { + ARCH_ARM_ASM( + __ASM_EMIT("vld3.32 {d0[], d2[], d4[]}, [%[corr]]") + __ASM_EMIT("vld3.32 {d1[], d3[], d5[]}, [%[corr]]") /* q0 = xv, q1 = xa, q2 = xb */ + __ASM_EMIT("veor q3, q3, q3") /* q3 = 0 */ + /* 8x blocks */ + __ASM_EMIT("subs %[count], #8") + __ASM_EMIT("blo 2f") + __ASM_EMIT("1:") + __ASM_EMIT("vldm %[a_head]!, {q4-q5}") /* q4 = ah0, q5 = ah1 */ + __ASM_EMIT("vldm %[b_head]!, {q6-q7}") /* q6 = bh0, q7 = bh1 */ + __ASM_EMIT("vldm %[a_tail]!, {q8-q9}") /* q8 = at0, q9 = at1 */ + __ASM_EMIT("vldm %[b_tail]!, {q10-q11}") /* q10 = bt0, q11 = bt1 */ + __ASM_EMIT("vmul.f32 q12, q4, q6") /* q12 = ah0*bh0 */ + __ASM_EMIT("vmul.f32 q13, q5, q7") + __ASM_EMIT("vmul.f32 q4, q4, q4") /* q4 = ah0*ah0 */ + __ASM_EMIT("vmul.f32 q5, q5, q5") + __ASM_EMIT("vmul.f32 q6, q6, q6") /* q6 = bh0*bh0 */ + __ASM_EMIT("vmul.f32 q7, q7, q7") + __ASM_EMIT("vmls.f32 q12, q8, q10") /* q12 = DV = ah0*bh0 - at0*bt0 */ + __ASM_EMIT("vmls.f32 q13, q9, q11") + __ASM_EMIT("vmls.f32 q4, q8, q8") /* q4 = DA = ah0*ah0 - at0*at0 */ + __ASM_EMIT("vmls.f32 q5, q9, q9") + __ASM_EMIT("vmls.f32 q6, q10, q10") /* q6 = DB = bh0*bh0 - bt0*bt0 */ + __ASM_EMIT("vmls.f32 q7, q11, q11") + + __ASM_EMIT("vext.32 q14, q3, q12, #2") /* q14 = 0 0 DV[0] DV[1] */ + __ASM_EMIT("vext.32 q15, q3, q13, #2") /* q15 = 0 0 DV[4] DV[5] */ + __ASM_EMIT("vadd.f32 q12, q12, q14") /* q12 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] */ + __ASM_EMIT("vadd.f32 q13, q13, q15") /* q13 = DV[4] DV[5] DV[4]+DV[6] DV[5]+DV[7] */ + __ASM_EMIT("vext.32 q14, q3, q12, #3") /* q14 = 0 DV[0] DV[1] DV[0]+DV[2] */ + __ASM_EMIT("vext.32 q15, q3, q13, #3") /* q15 = 0 DV[4] DV[5] DV[4]+DV[6] */ + __ASM_EMIT("vadd.f32 q12, q12, q14") /* q12 = V[0..3] = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] */ + __ASM_EMIT("vadd.f32 q13, q13, q15") /* q13 = V[4..7] = DV[4] DV[4]+DV[5] DV[4]+DV[5]+DV[6] DV[4]+DV[5]+DV[6]+DV[7] */ + __ASM_EMIT("vext.32 q14, q3, q4, #2") /* q14 = 0 0 DA[0] DA[1] */ + __ASM_EMIT("vext.32 q15, q3, q5, #2") /* q15 = 0 0 DA[4] DA[5] */ + __ASM_EMIT("vadd.f32 q4, q4, q14") /* q4 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] */ + __ASM_EMIT("vadd.f32 q5, q5, q15") /* 
q5 = DA[4] DA[5] DA[4]+DA[6] DA[5]+DA[7] */ + __ASM_EMIT("vext.32 q14, q3, q4, #3") /* q14 = 0 DA[0] DA[1] DA[0]+DA[2] */ + __ASM_EMIT("vext.32 q15, q3, q5, #3") /* q15 = 0 DA[4] DA[5] DA[4]+DA[6] */ + __ASM_EMIT("vadd.f32 q4, q4, q14") /* q4 = A[0..3] = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] */ + __ASM_EMIT("vadd.f32 q5, q5, q15") /* q5 = A[4..7] = DA[4] DA[4]+DA[5] DA[4]+DA[5]+DA[6] DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vext.32 q14, q3, q6, #2") /* q14 = 0 0 DB[0] DB[1] */ + __ASM_EMIT("vext.32 q15, q3, q7, #2") /* q15 = 0 0 DB[4] DB[5] */ + __ASM_EMIT("vadd.f32 q6, q6, q14") /* q6 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] */ + __ASM_EMIT("vadd.f32 q7, q7, q15") /* q7 = DB[4] DB[5] DB[4]+DB[6] DB[5]+DB[7] */ + __ASM_EMIT("vext.32 q14, q3, q6, #3") /* q14 = 0 DB[0] DB[1] DB[0]+DB[2] */ + __ASM_EMIT("vext.32 q15, q3, q7, #3") /* q15 = 0 DB[4] DB[5] DB[4]+DB[6] */ + __ASM_EMIT("vadd.f32 q6, q6, q14") /* q6 = B[0..3] = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] */ + __ASM_EMIT("vadd.f32 q7, q7, q15") /* q7 = B[4..7] = DB[4] DB[4]+DB[5] DB[4]+DB[5]+DB[6] DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vdup.32 q8, d25[1]") /* q8 = V[3] V[3] V[3] V[3] */ + __ASM_EMIT("vdup.32 q9, d9[1]") /* q9 = A[3] A[3] A[3] A[3] */ + __ASM_EMIT("vdup.32 q10, d13[1]") /* q10 = B[3] B[3] B[3] B[3] */ + __ASM_EMIT("vadd.f32 q13, q13, q8") /* q13 = V[3]+V[4] V[3]+V[5] V[3]+V[6] V[3]+V[7] */ + __ASM_EMIT("vadd.f32 q5, q5, q9") /* q5 = A[3]+A[4] A[3]+A[5] A[3]+A[6] A[3]+A[7] */ + __ASM_EMIT("vadd.f32 q7, q7, q10") /* q7 = B[3]+B[4] B[3]+B[5] B[3]+B[6] B[3]+B[7] */ + + __ASM_EMIT("vadd.f32 q4, q4, q1") /* q4 = BA = xa + A */ + __ASM_EMIT("vadd.f32 q5, q5, q1") + __ASM_EMIT("vadd.f32 q6, q6, q2") /* q6 = BB = xb + B */ + __ASM_EMIT("vadd.f32 q7, q7, q2") + __ASM_EMIT("vadd.f32 q8, q12, q0") /* q8 = T = xv + V */ + __ASM_EMIT("vadd.f32 q9, q13, q0") + __ASM_EMIT("vmul.f32 q10, q4, q6") /* q10 = B = BA * BB */ + __ASM_EMIT("vmul.f32 q11, q5, q7") + __ASM_EMIT("vdup.32 q0, d19[1]") /* q4 = xv' = T[7] */ + __ASM_EMIT("vdup.32 q1, d11[1]") /* q5 = xa' = BA[7] */ + __ASM_EMIT("vdup.32 q2, d15[1]") /* q6 = xb' = BB[7] */ + __ASM_EMIT("vldm %[CORR_CC], {q14-q15}") /* q14 = 1e-10, q15 = 1e-10 */ + + __ASM_EMIT("vcge.f32 q14, q8, q14") /* q14 = T >= 1e-10 */ + __ASM_EMIT("vcge.f32 q15, q9, q15") + __ASM_EMIT("vrsqrte.f32 q4, q10") /* q4 = x0 */ + __ASM_EMIT("vrsqrte.f32 q5, q11") + __ASM_EMIT("vmul.f32 q6, q4, q10") /* q6 = R * x0 */ + __ASM_EMIT("vmul.f32 q7, q5, q11") + __ASM_EMIT("vrsqrts.f32 q12, q6, q4") /* q12 = (3 - R * x0 * x0) / 2 */ + __ASM_EMIT("vrsqrts.f32 q13, q7, q5") + __ASM_EMIT("vmul.f32 q4, q4, q12") /* q4 = x1 = x0 * (3 - R * x0 * x0) / 2 */ + __ASM_EMIT("vmul.f32 q5, q5, q13") + __ASM_EMIT("vmul.f32 q6, q4, q10") /* q6 = R * x1 */ + __ASM_EMIT("vmul.f32 q7, q5, q11") + __ASM_EMIT("vrsqrts.f32 q12, q6, q4") /* q12 = (3 - R * x1 * x1) / 2 */ + __ASM_EMIT("vrsqrts.f32 q13, q7, q5") + __ASM_EMIT("vmul.f32 q10, q4, q12") /* q10 = 1/sqrtf(B) = x2 = x1 * (3 - R * x1 * x1) / 2 */ + __ASM_EMIT("vmul.f32 q11, q5, q13") + __ASM_EMIT("vmul.f32 q10, q8, q10") /* q10 = T/sqrtf(B) */ + __ASM_EMIT("vmul.f32 q11, q9, q11") + __ASM_EMIT("vand q10, q10, q14") /* q10 = (T >= 1e-10) ? 
T/sqrt(B) : 0 */ + __ASM_EMIT("vand q11, q11, q15") + __ASM_EMIT("subs %[count], #8") + __ASM_EMIT("vstm %[dst]!, {q10-q11}") + __ASM_EMIT("bhs 1b") + __ASM_EMIT("2:") + /* 4x block */ + __ASM_EMIT("adds %[count], #4") + __ASM_EMIT("blt 4f") + __ASM_EMIT("vldm %[a_head]!, {q4}") /* q4 = ah0 */ + __ASM_EMIT("vldm %[b_head]!, {q6}") /* q6 = bh0 */ + __ASM_EMIT("vldm %[a_tail]!, {q8}") /* q8 = at0 */ + __ASM_EMIT("vldm %[b_tail]!, {q10}") /* q10 = bt0 */ + __ASM_EMIT("vmul.f32 q12, q4, q6") /* q12 = ah0*bh0 */ + __ASM_EMIT("vmul.f32 q4, q4, q4") /* q4 = ah0*ah0 */ + __ASM_EMIT("vmul.f32 q6, q6, q6") /* q6 = bh0*bh0 */ + __ASM_EMIT("vmls.f32 q12, q8, q10") /* q12 = DV = ah0*bh0 - at0*bt0 */ + __ASM_EMIT("vmls.f32 q4, q8, q8") /* q4 = DA = ah0*ah0 - at0*at0 */ + __ASM_EMIT("vmls.f32 q6, q10, q10") /* q6 = DB = bh0*bh0 - bt0*bt0 */ + + __ASM_EMIT("vext.32 q13, q3, q12, #2") /* q13 = 0 0 DV[0] DV[1] */ + __ASM_EMIT("vext.32 q5, q3, q4, #2") /* q5 = 0 0 DA[0] DA[1] */ + __ASM_EMIT("vext.32 q7, q3, q6, #2") /* q7 = 0 0 DB[0] DB[1] */ + __ASM_EMIT("vadd.f32 q12, q12, q13") /* q12 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] */ + __ASM_EMIT("vadd.f32 q4, q4, q5") /* q4 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] */ + __ASM_EMIT("vadd.f32 q6, q6, q7") /* q6 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] */ + __ASM_EMIT("vext.32 q13, q3, q12, #3") /* q13 = 0 DV[0] DV[1] DV[0]+DV[2] */ + __ASM_EMIT("vext.32 q5, q3, q4, #3") /* q5 = 0 DA[0] DA[1] DA[0]+DA[2] */ + __ASM_EMIT("vext.32 q7, q3, q6, #3") /* q7 = 0 DB[0] DB[1] DB[0]+DB[2] */ + __ASM_EMIT("vadd.f32 q12, q12, q13") /* q12 = V = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] */ + __ASM_EMIT("vadd.f32 q4, q4, q5") /* q4 = A = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] */ + __ASM_EMIT("vadd.f32 q6, q6, q7") /* q6 = B = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] */ + + __ASM_EMIT("vadd.f32 q4, q4, q1") /* q4 = BA = xa + A */ + __ASM_EMIT("vadd.f32 q6, q6, q2") /* q6 = BB = xb + B */ + __ASM_EMIT("vadd.f32 q8, q12, q0") /* q8 = T = xv + V */ + __ASM_EMIT("vmul.f32 q10, q4, q6") /* q10 = B = BA * BB */ + __ASM_EMIT("vdup.32 q1, d9[1]") /* q1 = xa' = BA[3] */ + __ASM_EMIT("vdup.32 q2, d13[1]") /* q2 = xb' = BB[3] */ + __ASM_EMIT("vdup.32 q0, d17[1]") /* q0 = xv' = T[3] */ + __ASM_EMIT("vldm %[CORR_CC], {q14}") /* q14 = 1e-10 */ + + __ASM_EMIT("vcge.f32 q14, q8, q14") /* q14 = T >= 1e-10 */ + __ASM_EMIT("vrsqrte.f32 q4, q10") /* q4 = x0 */ + __ASM_EMIT("vmul.f32 q6, q4, q10") /* q6 = R * x0 */ + __ASM_EMIT("vrsqrts.f32 q12, q6, q4") /* q12 = (3 - R * x0 * x0) / 2 */ + __ASM_EMIT("vmul.f32 q4, q4, q12") /* q4 = x1 = x0 * (3 - R * x0 * x0) / 2 */ + __ASM_EMIT("vmul.f32 q6, q4, q10") /* q6 = R * x1 */ + __ASM_EMIT("vrsqrts.f32 q12, q6, q4") /* q12 = (3 - R * x1 * x1) / 2 */ + __ASM_EMIT("vmul.f32 q10, q4, q12") /* q10 = 1/sqrtf(B) = x2 = x1 * (3 - R * x1 * x1) / 2 */ + __ASM_EMIT("vmul.f32 q10, q8, q10") /* q10 = T/sqrtf(B) */ + __ASM_EMIT("vand q10, q10, q14") /* q10 = (T >= 1e-10) ? 
T/sqrt(B) : 0 */
+                __ASM_EMIT("sub             %[count], #4")
+                __ASM_EMIT("vstm            %[dst]!, {q10}")
+                __ASM_EMIT("4:")
+                /* 1x blocks */
+                __ASM_EMIT("adds            %[count], #3")
+                __ASM_EMIT("blt             6f")
+                __ASM_EMIT("vldm            %[CORR_CC], {q3}")                  /* q3 = 1e-10 */
+                __ASM_EMIT("5:")
+                __ASM_EMIT("vld1.32         {d8[], d9[]}, [%[a_head]]!")        /* q4 = ah0 */
+                __ASM_EMIT("vld1.32         {d12[], d13[]}, [%[b_head]]!")      /* q6 = bh0 */
+                __ASM_EMIT("vld1.32         {d16[], d17[]}, [%[a_tail]]!")      /* q8 = at0 */
+                __ASM_EMIT("vld1.32         {d20[], d21[]}, [%[b_tail]]!")      /* q10 = bt0 */
+                __ASM_EMIT("vmul.f32        q12, q4, q6")                       /* q12 = ah0*bh0 */
+                __ASM_EMIT("vmul.f32        q4, q4, q4")                        /* q4 = ah0*ah0 */
+                __ASM_EMIT("vmul.f32        q6, q6, q6")                        /* q6 = bh0*bh0 */
+                __ASM_EMIT("vmls.f32        q12, q8, q10")                      /* q12 = DV = ah0*bh0 - at0*bt0 */
+                __ASM_EMIT("vmls.f32        q4, q8, q8")                        /* q4 = DA = ah0*ah0 - at0*at0 */
+                __ASM_EMIT("vmls.f32        q6, q10, q10")                      /* q6 = DB = bh0*bh0 - bt0*bt0 */
+
+                __ASM_EMIT("vadd.f32        q1, q4, q1")                        /* q1 = BA = xa + DA */
+                __ASM_EMIT("vadd.f32        q2, q6, q2")                        /* q2 = BB = xb + DB */
+                __ASM_EMIT("vadd.f32        q0, q12, q0")                       /* q0 = T = xv + DV */
+                __ASM_EMIT("vmul.f32        q10, q1, q2")                       /* q10 = B = BA * BB */
+
+                __ASM_EMIT("vcge.f32        q14, q0, q3")                       /* q14 = T >= 1e-10 */
+                __ASM_EMIT("vrsqrte.f32     q4, q10")                           /* q4 = x0 */
+                __ASM_EMIT("vmul.f32        q6, q4, q10")                       /* q6 = R * x0 */
+                __ASM_EMIT("vrsqrts.f32     q12, q6, q4")                       /* q12 = (3 - R * x0 * x0) / 2 */
+                __ASM_EMIT("vmul.f32        q4, q4, q12")                       /* q4 = x1 = x0 * (3 - R * x0 * x0) / 2 */
+                __ASM_EMIT("vmul.f32        q6, q4, q10")                       /* q6 = R * x1 */
+                __ASM_EMIT("vrsqrts.f32     q12, q6, q4")                       /* q12 = (3 - R * x1 * x1) / 2 */
+                __ASM_EMIT("vmul.f32        q10, q4, q12")                      /* q10 = 1/sqrtf(B) = x2 = x1 * (3 - R * x1 * x1) / 2 */
+                __ASM_EMIT("vmul.f32        q10, q0, q10")                      /* q10 = T/sqrtf(B) */
+                __ASM_EMIT("vand            q10, q10, q14")                     /* q10 = (T >= 1e-10) ? T/sqrtf(B) : 0 */
+                __ASM_EMIT("subs            %[count], #1")
+                __ASM_EMIT("vst1.32         {d20[0]}, [%[dst]]!")
+                __ASM_EMIT("bge             5b")
+                __ASM_EMIT("6:")
+                /* Store state */
+                __ASM_EMIT("vst3.32         {d0[0], d2[0], d4[0]}, [%[corr]]")
+
+                : [dst] "+r" (dst),
+                  [a_head] "+r" (a_head), [b_head] "+r" (b_head),
+                  [a_tail] "+r" (a_tail), [b_tail] "+r" (b_tail),
+                  [count] "+r" (count)
+                : [corr] "r" (corr),
+                  [CORR_CC] "r" (&corr_const[0])
+                : "cc", "memory",
+                  "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+                  "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+            );
+        }
+
+    } /* namespace neon_d32 */
+} /* namespace lsp */
+
+#endif /* PRIVATE_DSP_ARCH_ARM_NEON_D32_CORRELATION_H_ */
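The generic C version that follows is the reference the SIMD variants can be
checked against. One straightforward validation is to recompute every window
from scratch and compare with the incremental output; a sketch of such a
reference (test scaffolding assumed, not part of the patch):

    #include <math.h>

    // Brute-force reference: normalized correlation of a single window
    static float corr_window(const float *a, const float *b, size_t len)
    {
        float v = 0.0f, sa = 0.0f, sb = 0.0f;
        for (size_t i = 0; i < len; ++i)
        {
            v  += a[i] * b[i];
            sa += a[i] * a[i];
            sb += b[i] * b[i];
        }
        const float B = sa * sb;
        return (B >= 1e-10f) ? v / sqrtf(B) : 0.0f;
    }

    // Each dst[i] written by corr_incr should match corr_window() computed
    // over the same window, within a small floating-point tolerance.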
+ */ + + +#ifndef PRIVATE_DSP_ARCH_GENERIC_CORRELATION_H_ +#define PRIVATE_DSP_ARCH_GENERIC_CORRELATION_H_ + +#ifndef PRIVATE_DSP_ARCH_GENERIC_IMPL + #error "This header should not be included directly" +#endif /* PRIVATE_DSP_ARCH_GENERIC_IMPL */ + +namespace lsp +{ + namespace generic + { + void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count) + { + float xv = 0.0f; + float xa = 0.0f; + float xb = 0.0f; + + if (count >= 4) + { + float T[4], A[4], B[4]; + + T[0] = 0.0f; + T[1] = 0.0f; + T[2] = 0.0f; + T[3] = 0.0f; + + A[0] = 0.0f; + A[1] = 0.0f; + A[2] = 0.0f; + A[3] = 0.0f; + + B[0] = 0.0f; + B[1] = 0.0f; + B[2] = 0.0f; + B[3] = 0.0f; + + for ( ; count >= 4; count -= 4) + { + T[0] += a[0] * b[0]; + T[1] += a[1] * b[1]; + T[2] += a[2] * b[2]; + T[3] += a[3] * b[3]; + + A[0] += a[0] * a[0]; + A[1] += a[1] * a[1]; + A[2] += a[2] * a[2]; + A[3] += a[3] * a[3]; + + B[0] += b[0] * b[0]; + B[1] += b[1] * b[1]; + B[2] += b[2] * b[2]; + B[3] += b[3] * b[3]; + + a += 4; + b += 4; + } + + xv = T[0] + T[1] + T[2] + T[3]; + xa = A[0] + A[1] + A[2] + A[3]; + xb = B[0] + B[1] + B[2] + B[3]; + } + + for ( ; count > 0; --count) + { + xv += a[0] * b[0]; + xa += a[0] * a[0]; + xb += b[0] * b[0]; + + a += 1; + b += 1; + } + + corr->v += xv; + corr->a += xa; + corr->b += xb; + } + + void corr_incr(dsp::correlation_t *corr, float *dst, + const float *a_head, const float *b_head, + const float *a_tail, const float *b_tail, + size_t count) + { + float T[4], BA[4], BB[4], B[4], DV[4], DA[4], DB[4]; + + float vv = corr->v; + float va = corr->a; + float vb = corr->b; + + for ( ; count >= 4; count -= 4) + { + DV[0] = a_head[0]*b_head[0] - a_tail[0]*b_tail[0]; + DV[1] = a_head[1]*b_head[1] - a_tail[1]*b_tail[1]; + DV[2] = a_head[2]*b_head[2] - a_tail[2]*b_tail[2]; + DV[3] = a_head[3]*b_head[3] - a_tail[3]*b_tail[3]; + + DA[0] = a_head[0]*a_head[0] - a_tail[0]*a_tail[0]; + DA[1] = a_head[1]*a_head[1] - a_tail[1]*a_tail[1]; + DA[2] = a_head[2]*a_head[2] - a_tail[2]*a_tail[2]; + DA[3] = a_head[3]*a_head[3] - a_tail[3]*a_tail[3]; + + DB[0] = b_head[0]*b_head[0] - b_tail[0]*b_tail[0]; + DB[1] = b_head[1]*b_head[1] - b_tail[1]*b_tail[1]; + DB[2] = b_head[2]*b_head[2] - b_tail[2]*b_tail[2]; + DB[3] = b_head[3]*b_head[3] - b_tail[3]*b_tail[3]; + + T[0] = vv + DV[0]; + T[1] = T[0] + DV[1]; + T[2] = T[1] + DV[2]; + T[3] = T[2] + DV[3]; + + BA[0] = va + DA[0]; + BA[1] = BA[0] + DA[1]; + BA[2] = BA[1] + DA[2]; + BA[3] = BA[2] + DA[3]; + + BB[0] = vb + DB[0]; + BB[1] = BB[0] + DB[1]; + BB[2] = BB[1] + DB[2]; + BB[3] = BB[2] + DB[3]; + + B[0] = BA[0] * BB[0]; + B[1] = BA[1] * BB[1]; + B[2] = BA[2] * BB[2]; + B[3] = BA[3] * BB[3]; + + dst[0] = (B[0] >= 1e-10f) ? T[0] / sqrtf(B[0]) : 0.0f; + dst[1] = (B[1] >= 1e-10f) ? T[1] / sqrtf(B[1]) : 0.0f; + dst[2] = (B[2] >= 1e-10f) ? T[2] / sqrtf(B[2]) : 0.0f; + dst[3] = (B[3] >= 1e-10f) ? T[3] / sqrtf(B[3]) : 0.0f; + + vv = T[3]; + va = BA[3]; + vb = BB[3]; + + a_head += 4; + b_head += 4; + a_tail += 4; + b_tail += 4; + dst += 4; + } + + for (; count > 0; --count) + { + DV[0] = a_head[0]*b_head[0] - a_tail[0]*b_tail[0]; + DA[0] = a_head[0]*a_head[0] - a_tail[0]*a_tail[0]; + DB[0] = b_head[0]*b_head[0] - b_tail[0]*b_tail[0]; + + T[0] = vv + DV[0]; + BA[0] = va + DA[0]; + BB[0] = vb + DB[0]; + B[0] = BA[0] * BB[0]; + + dst[0] = (B[0] >= 1e-10f) ? 
T[0] / sqrtf(B[0]) : 0.0f; + + vv = T[0]; + va = BA[0]; + vb = BB[0]; + + a_head += 1; + b_head += 1; + a_tail += 1; + b_tail += 1; + dst += 1; + } + + corr->v = vv; + corr->a = va; + corr->b = vb; + } + + } /* namespace generic */ +} /* namespace lsp */ + + + +#endif /* PRIVATE_DSP_ARCH_GENERIC_CORRELATION_H_ */ diff --git a/include/private/dsp/arch/x86/avx/correlation.h b/include/private/dsp/arch/x86/avx/correlation.h new file mode 100644 index 00000000..3aa8c2ff --- /dev/null +++ b/include/private/dsp/arch/x86/avx/correlation.h @@ -0,0 +1,752 @@ +/* + * Copyright (C) 2024 Linux Studio Plugins Project + * (C) 2024 Vladimir Sadovnikov + * + * This file is part of lsp-dsp-lib + * Created on: 9 мар. 2024 г. + * + * lsp-dsp-lib is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * lsp-dsp-lib is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with lsp-dsp-lib. If not, see . + */ + +#ifndef PRIVATE_DSP_ARCH_X86_AVX_CORRELATION_H_ +#define PRIVATE_DSP_ARCH_X86_AVX_CORRELATION_H_ + +#ifndef PRIVATE_DSP_ARCH_X86_AVX_IMPL + #error "This header should not be included directly" +#endif /* PRIVATE_DSP_ARCH_X86_AVX_IMPL */ + +namespace lsp +{ + namespace avx + { + + void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count) + { + IF_ARCH_X86( + size_t off; + ); + + ARCH_X86_ASM + ( + __ASM_EMIT("xor %[off], %[off]") + __ASM_EMIT("vxorps %%ymm0, %%ymm0, %%ymm0") /* xv = 0 */ + __ASM_EMIT("vxorps %%ymm1, %%ymm1, %%ymm1") /* xa = 0 */ + __ASM_EMIT("vxorps %%ymm2, %%ymm2, %%ymm2") /* xb = 0 */ + /* 16x blocks */ + __ASM_EMIT("sub $16, %[count]") + __ASM_EMIT("jb 2f") + __ASM_EMIT("1:") + __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%ymm3") /* ymm3 = a0 */ + __ASM_EMIT("vmovups 0x20(%[a], %[off]), %%ymm4") /* ymm4 = a1 */ + __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%ymm5") /* ymm5 = b0 */ + __ASM_EMIT("vmovups 0x20(%[b], %[off]), %%ymm6") /* ymm6 = b1 */ + __ASM_EMIT("vmulps %%ymm3, %%ymm5, %%ymm7") /* ymm7 = a0*b0 */ + __ASM_EMIT("vmulps %%ymm3, %%ymm3, %%ymm3") /* ymm3 = a0*a0 */ + __ASM_EMIT("vmulps %%ymm5, %%ymm5, %%ymm5") /* ymm5 = b0*b0 */ + __ASM_EMIT("vaddps %%ymm7, %%ymm0, %%ymm0") /* ymm0 = xv + a0*b0 */ + __ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1") /* ymm1 = xa + a0*a0 */ + __ASM_EMIT("vaddps %%ymm5, %%ymm2, %%ymm2") /* ymm2 = xb + b0*b0 */ + __ASM_EMIT("vmulps %%ymm4, %%ymm6, %%ymm7") /* ymm7 = a0*b0 */ + __ASM_EMIT("vmulps %%ymm4, %%ymm4, %%ymm4") /* ymm4 = a0*a0 */ + __ASM_EMIT("vmulps %%ymm6, %%ymm6, %%ymm6") /* ymm6 = b0*b0 */ + __ASM_EMIT("vaddps %%ymm7, %%ymm0, %%ymm0") /* ymm0 = xv + a1*b1 */ + __ASM_EMIT("vaddps %%ymm4, %%ymm1, %%ymm1") /* ymm1 = xa + a1*a1 */ + __ASM_EMIT("vaddps %%ymm6, %%ymm2, %%ymm2") /* ymm2 = xb + b1*b1 */ + __ASM_EMIT("add $0x40, %[off]") /* ++off */ + __ASM_EMIT("sub $16, %[count]") + __ASM_EMIT("jae 1b") + __ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm4") + __ASM_EMIT("vextractf128 $1, %%ymm1, %%xmm5") + __ASM_EMIT("vextractf128 $1, %%ymm2, %%xmm6") + __ASM_EMIT("vaddps %%xmm4, %%xmm0, %%xmm0") + __ASM_EMIT("vaddps %%xmm5, %%xmm1, %%xmm1") + __ASM_EMIT("vaddps %%xmm6, %%xmm2, %%xmm2") + 
__ASM_EMIT("2:") + /* 8x block */ + __ASM_EMIT("add $8, %[count]") + __ASM_EMIT("jl 4f") + __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */ + __ASM_EMIT("vmovups 0x10(%[a], %[off]), %%xmm4") /* xmm4 = a1 */ + __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */ + __ASM_EMIT("vmovups 0x10(%[b], %[off]), %%xmm6") /* xmm6 = b1 */ + __ASM_EMIT("vmulps %%xmm3, %%xmm5, %%xmm7") /* xmm7 = a0*b0 */ + __ASM_EMIT("vmulps %%xmm3, %%xmm3, %%xmm3") /* xmm3 = a0*a0 */ + __ASM_EMIT("vmulps %%xmm5, %%xmm5, %%xmm5") /* xmm5 = b0*b0 */ + __ASM_EMIT("vaddps %%xmm7, %%xmm0, %%xmm0") /* xmm0 = xv + a0*b0 */ + __ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1") /* xmm1 = xa + a0*a0 */ + __ASM_EMIT("vaddps %%xmm5, %%xmm2, %%xmm2") /* xmm2 = xb + b0*b0 */ + __ASM_EMIT("vmulps %%xmm4, %%xmm6, %%xmm7") /* xmm7 = a0*b0 */ + __ASM_EMIT("vmulps %%xmm4, %%xmm4, %%xmm4") /* xmm4 = a0*a0 */ + __ASM_EMIT("vmulps %%xmm6, %%xmm6, %%xmm6") /* xmm6 = b0*b0 */ + __ASM_EMIT("vaddps %%xmm7, %%xmm0, %%xmm0") /* xmm0 = xv + a1*b1 */ + __ASM_EMIT("vaddps %%xmm4, %%xmm1, %%xmm1") /* xmm1 = xa + a1*a1 */ + __ASM_EMIT("vaddps %%xmm6, %%xmm2, %%xmm2") /* xmm2 = xb + b1*b1 */ + __ASM_EMIT("sub $8, %[count]") + __ASM_EMIT("add $0x20, %[off]") /* ++off */ + __ASM_EMIT("4:") + /* 4x block */ + __ASM_EMIT("add $4, %[count]") + __ASM_EMIT("jl 6f") + __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */ + __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */ + __ASM_EMIT("vmulps %%xmm3, %%xmm5, %%xmm7") /* xmm7 = a0*b0 */ + __ASM_EMIT("vmulps %%xmm3, %%xmm3, %%xmm3") /* xmm3 = a0*a0 */ + __ASM_EMIT("vmulps %%xmm5, %%xmm5, %%xmm5") /* xmm5 = b0*b0 */ + __ASM_EMIT("vaddps %%xmm7, %%xmm0, %%xmm0") /* xmm0 = xv + a0*b0 */ + __ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1") /* xmm1 = xa + a0*a0 */ + __ASM_EMIT("vaddps %%xmm5, %%xmm2, %%xmm2") /* xmm2 = xb + b0*b0 */ + __ASM_EMIT("sub $4, %[count]") + __ASM_EMIT("add $0x10, %[off]") /* ++off */ + __ASM_EMIT("6:") + /* Do horizontal sum */ + __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0") /* xmm0 = xv0+xv1 xv2+xv3 xv0+xv1 xv2+xv3 */ + __ASM_EMIT("vhaddps %%xmm1, %%xmm1, %%xmm1") /* xmm1 = xv0+xv1 xv2+xv3 xv0+xv1 xv2+xv3 */ + __ASM_EMIT("vhaddps %%xmm2, %%xmm2, %%xmm2") /* xmm2 = xv0+xv1 xv2+xv3 xv0+xv1 xv2+xv3 */ + __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0") /* xmm0 = xv0+xv1+xv2+xv3 */ + __ASM_EMIT("vhaddps %%xmm1, %%xmm1, %%xmm1") /* xmm1 = xv0+xv1+xv2+xv3 */ + __ASM_EMIT("vhaddps %%xmm2, %%xmm2, %%xmm2") /* xmm2 = xv0+xv1+xv2+xv3 */ + /* 1x blocks */ + __ASM_EMIT("add $3, %[count]") + __ASM_EMIT("jl 8f") + __ASM_EMIT("7:") + __ASM_EMIT("vmovss 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */ + __ASM_EMIT("vmovss 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */ + __ASM_EMIT("vmulss %%xmm3, %%xmm5, %%xmm7") /* xmm7 = a0*b0 */ + __ASM_EMIT("vmulss %%xmm3, %%xmm3, %%xmm3") /* xmm3 = a0*a0 */ + __ASM_EMIT("vmulss %%xmm5, %%xmm5, %%xmm5") /* xmm5 = b0*b0 */ + __ASM_EMIT("vaddss %%xmm7, %%xmm0, %%xmm0") /* xmm0 = xv + a0*b0 */ + __ASM_EMIT("vaddss %%xmm3, %%xmm1, %%xmm1") /* xmm1 = xa + a0*a0 */ + __ASM_EMIT("vaddss %%xmm5, %%xmm2, %%xmm2") /* xmm2 = xb + b0*b0 */ + __ASM_EMIT("add $0x04, %[off]") /* ++off */ + __ASM_EMIT("dec %[count]") + __ASM_EMIT("jge 7b") + __ASM_EMIT("8:") + /* Store result */ + __ASM_EMIT("vaddss 0x00(%[corr]), %%xmm0, %%xmm0") + __ASM_EMIT("vaddss 0x04(%[corr]), %%xmm1, %%xmm1") + __ASM_EMIT("vaddss 0x08(%[corr]), %%xmm2, %%xmm2") + __ASM_EMIT("vmovss %%xmm0, 0x00(%[corr])") + __ASM_EMIT("vmovss %%xmm1, 0x04(%[corr])") + __ASM_EMIT("vmovss %%xmm2, 
0x08(%[corr])") + + : [corr] "+r" (corr), [off] "=&r" (off), [count] "+r" (count) + : [a] "r" (a), [b] "r" (b) + : "cc", "memory", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7" + ); + } + + void corr_init_fma3(dsp::correlation_t *corr, const float *a, const float *b, size_t count) + { + IF_ARCH_X86( + size_t off; + ); + + ARCH_X86_ASM + ( + __ASM_EMIT("xor %[off], %[off]") + __ASM_EMIT("vxorps %%ymm0, %%ymm0, %%ymm0") /* xv = 0 */ + __ASM_EMIT("vxorps %%ymm1, %%ymm1, %%ymm1") /* xa = 0 */ + __ASM_EMIT("vxorps %%ymm2, %%ymm2, %%ymm2") /* xb = 0 */ + /* 16x blocks */ + __ASM_EMIT("sub $16, %[count]") + __ASM_EMIT("jb 2f") + __ASM_EMIT("1:") + __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%ymm3") /* ymm3 = a0 */ + __ASM_EMIT("vmovups 0x20(%[a], %[off]), %%ymm4") /* ymm4 = a1 */ + __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%ymm5") /* ymm5 = b0 */ + __ASM_EMIT("vmovups 0x20(%[b], %[off]), %%ymm6") /* ymm6 = b1 */ + __ASM_EMIT("vfmadd231ps %%ymm3, %%ymm5, %%ymm0") /* ymm0 = xv + a0*b0 */ + __ASM_EMIT("vfmadd231ps %%ymm3, %%ymm3, %%ymm1") /* ymm1 = xa + a0*a0 */ + __ASM_EMIT("vfmadd231ps %%ymm5, %%ymm5, %%ymm2") /* ymm2 = xv + b0*b0 */ + __ASM_EMIT("vfmadd231ps %%ymm4, %%ymm6, %%ymm0") /* ymm0 = xv + a1*b1 */ + __ASM_EMIT("vfmadd231ps %%ymm4, %%ymm4, %%ymm1") /* ymm1 = xa + a1*a1 */ + __ASM_EMIT("vfmadd231ps %%ymm6, %%ymm6, %%ymm2") /* ymm2 = xv + b1*b1 */ + __ASM_EMIT("add $0x40, %[off]") /* ++off */ + __ASM_EMIT("sub $16, %[count]") + __ASM_EMIT("jae 1b") + __ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm4") + __ASM_EMIT("vextractf128 $1, %%ymm1, %%xmm5") + __ASM_EMIT("vextractf128 $1, %%ymm2, %%xmm6") + __ASM_EMIT("vaddps %%xmm4, %%xmm0, %%xmm0") + __ASM_EMIT("vaddps %%xmm5, %%xmm1, %%xmm1") + __ASM_EMIT("vaddps %%xmm6, %%xmm2, %%xmm2") + __ASM_EMIT("2:") + /* 8x block */ + __ASM_EMIT("add $8, %[count]") + __ASM_EMIT("jl 4f") + __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */ + __ASM_EMIT("vmovups 0x10(%[a], %[off]), %%xmm4") /* xmm4 = a1 */ + __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */ + __ASM_EMIT("vmovups 0x10(%[b], %[off]), %%xmm6") /* xmm6 = b1 */ + __ASM_EMIT("vfmadd231ps %%xmm3, %%xmm5, %%xmm0") /* xmm0 = xv + a0*b0 */ + __ASM_EMIT("vfmadd231ps %%xmm3, %%xmm3, %%xmm1") /* xmm1 = xa + a0*a0 */ + __ASM_EMIT("vfmadd231ps %%xmm5, %%xmm5, %%xmm2") /* xmm2 = xv + b0*b0 */ + __ASM_EMIT("vfmadd231ps %%xmm4, %%xmm6, %%xmm0") /* xmm0 = xv + a1*b1 */ + __ASM_EMIT("vfmadd231ps %%xmm4, %%xmm4, %%xmm1") /* xmm1 = xa + a1*a1 */ + __ASM_EMIT("vfmadd231ps %%xmm6, %%xmm6, %%xmm2") /* xmm2 = xv + b1*b1 */ + __ASM_EMIT("sub $8, %[count]") + __ASM_EMIT("add $0x20, %[off]") /* ++off */ + __ASM_EMIT("4:") + /* 4x block */ + __ASM_EMIT("add $4, %[count]") + __ASM_EMIT("jl 6f") + __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */ + __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */ + __ASM_EMIT("vfmadd231ps %%xmm3, %%xmm5, %%xmm0") /* xmm0 = xv + a0*b0 */ + __ASM_EMIT("vfmadd231ps %%xmm3, %%xmm3, %%xmm1") /* xmm1 = xa + a0*a0 */ + __ASM_EMIT("vfmadd231ps %%xmm5, %%xmm5, %%xmm2") /* xmm2 = xv + b0*b0 */ + __ASM_EMIT("sub $4, %[count]") + __ASM_EMIT("add $0x10, %[off]") /* ++off */ + __ASM_EMIT("6:") + /* Do horizontal sum */ + __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0") /* xmm0 = xv0+xv1 xv2+xv3 xv0+xv1 xv2+xv3 */ + __ASM_EMIT("vhaddps %%xmm1, %%xmm1, %%xmm1") /* xmm1 = xv0+xv1 xv2+xv3 xv0+xv1 xv2+xv3 */ + __ASM_EMIT("vhaddps %%xmm2, %%xmm2, %%xmm2") /* xmm2 = xv0+xv1 xv2+xv3 xv0+xv1 xv2+xv3 */ + __ASM_EMIT("vhaddps %%xmm0, %%xmm0, 
%%xmm0") /* xmm0 = xv0+xv1+xv2+xv3 */ + __ASM_EMIT("vhaddps %%xmm1, %%xmm1, %%xmm1") /* xmm1 = xv0+xv1+xv2+xv3 */ + __ASM_EMIT("vhaddps %%xmm2, %%xmm2, %%xmm2") /* xmm2 = xv0+xv1+xv2+xv3 */ + /* 1x blocks */ + __ASM_EMIT("add $3, %[count]") + __ASM_EMIT("jl 8f") + __ASM_EMIT("7:") + __ASM_EMIT("vmovss 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */ + __ASM_EMIT("vmovss 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */ + __ASM_EMIT("vfmadd231ss %%xmm3, %%xmm5, %%xmm0") /* xmm0 = xv + a0*b0 */ + __ASM_EMIT("vfmadd231ss %%xmm3, %%xmm3, %%xmm1") /* xmm1 = xa + a0*a0 */ + __ASM_EMIT("vfmadd231ss %%xmm5, %%xmm5, %%xmm2") /* xmm2 = xv + b0*b0 */ + __ASM_EMIT("add $0x04, %[off]") /* ++off */ + __ASM_EMIT("dec %[count]") + __ASM_EMIT("jge 7b") + __ASM_EMIT("8:") + /* Store result */ + __ASM_EMIT("vaddss 0x00(%[corr]), %%xmm0, %%xmm0") + __ASM_EMIT("vaddss 0x04(%[corr]), %%xmm1, %%xmm1") + __ASM_EMIT("vaddss 0x08(%[corr]), %%xmm2, %%xmm2") + __ASM_EMIT("vmovss %%xmm0, 0x00(%[corr])") + __ASM_EMIT("vmovss %%xmm1, 0x04(%[corr])") + __ASM_EMIT("vmovss %%xmm2, 0x08(%[corr])") + + : [corr] "+r" (corr), [off] "=&r" (off), [count] "+r" (count) + : [a] "r" (a), [b] "r" (b) + : "cc", "memory", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7" + ); + } + + static const float corr_const[] __lsp_aligned32 = + { + LSP_DSP_VEC8(1e-10f) + }; + + void corr_incr(dsp::correlation_t *corr, float *dst, + const float *a_head, const float *b_head, + const float *a_tail, const float *b_tail, + size_t count) + { + IF_ARCH_I386( + void *ptr; + ); + + ARCH_X86_ASM + ( + /* 8x blocks */ + __ASM_EMIT32("subl $8, %[count]") + __ASM_EMIT64("sub $8, %[count]") + __ASM_EMIT("jb 2f") + __ASM_EMIT("1:") + __ASM_EMIT("vmovups 0x00(%[a_head]), %%ymm0") /* ymm0 = ah */ + __ASM_EMIT("vmovups 0x00(%[b_head]), %%ymm1") /* ymm1 = bh */ + __ASM_EMIT("vmovups 0x00(%[a_tail]), %%ymm3") /* ymm3 = at */ + __ASM_EMIT("vmovups 0x00(%[b_tail]), %%ymm4") /* ymm4 = bt */ + __ASM_EMIT("vmulps %%ymm1, %%ymm0, %%ymm2") /* ymm2 = ah*bh */ + __ASM_EMIT("vmulps %%ymm4, %%ymm3, %%ymm5") /* ymm5 = at*bt */ + __ASM_EMIT("vmulps %%ymm0, %%ymm0, %%ymm0") /* ymm0 = ah*ah */ + __ASM_EMIT("vmulps %%ymm1, %%ymm1, %%ymm1") /* ymm1 = bh*bh */ + __ASM_EMIT("vmulps %%ymm3, %%ymm3, %%ymm3") /* ymm3 = at*at */ + __ASM_EMIT("vmulps %%ymm4, %%ymm4, %%ymm4") /* ymm4 = bt*bt */ + __ASM_EMIT("vxorps %%ymm6, %%ymm6, %%ymm6") /* ymm6 = 0 0 0 0 0 0 0 0 */ + __ASM_EMIT("vsubps %%ymm5, %%ymm2, %%ymm2") /* ymm2 = DV = ah*bh - at*bt */ + __ASM_EMIT("vsubps %%ymm3, %%ymm0, %%ymm0") /* ymm0 = DA = ah*ah - at*at */ + __ASM_EMIT("vsubps %%ymm4, %%ymm1, %%ymm1") /* ymm1 = DB = bh*bh - bt*bt */ + + __ASM_EMIT("vshufps $0x44, %%ymm0, %%ymm6, %%ymm3") /* ymm3 = 0 0 DA[0] DA[1] 0 0 DA[4] DA[5] */ + __ASM_EMIT("vshufps $0x44, %%ymm1, %%ymm6, %%ymm4") /* ymm4 = 0 0 DB[0] DB[1] 0 0 DB[4] DB[5] */ + __ASM_EMIT("vshufps $0x44, %%ymm2, %%ymm6, %%ymm5") /* ymm5 = 0 0 DV[0] DV[1] 0 0 DV[4] DV[5] */ + __ASM_EMIT("vaddps %%ymm3, %%ymm0, %%ymm0") /* ymm0 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] DA[4] DA[5] DA[4]+DA[6] DA[5]+DA[7] */ + __ASM_EMIT("vaddps %%ymm4, %%ymm1, %%ymm1") /* ymm1 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] DB[4] DB[5] DB[4]+DB[6] DB[5]+DB[7] */ + __ASM_EMIT("vaddps %%ymm5, %%ymm2, %%ymm2") /* ymm2 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] DV[4] DV[5] DV[4]+DV[6] DV[5]+DV[7] */ + __ASM_EMIT("vshufps $0x99, %%ymm0, %%ymm3, %%ymm3") /* ymm3 = 0 DA[0] DA[1] DA[0]+DA[2] 0 DA[4] DA[5] DA[4]+DA[6] */ + __ASM_EMIT("vshufps $0x99, %%ymm1, %%ymm4, %%ymm4") /* ymm4 = 0 DB[0] DB[1] 
DB[0]+DB[2] 0 DB[4] DB[5] DB[4]+DB[6] */ + __ASM_EMIT("vshufps $0x99, %%ymm2, %%ymm5, %%ymm5") /* ymm5 = 0 DV[0] DV[1] DV[0]+DV[2] 0 DV[4] DV[5] DV[4]+DV[6] */ + __ASM_EMIT("vaddps %%ymm0, %%ymm3, %%ymm3") /* ymm3 = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] DA[4] DA[4]+DA[5] DA[4]+DA[5]+DA[6] DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%ymm1, %%ymm4, %%ymm4") /* ymm4 = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] DB[4] DB[4]+DB[5] DB[4]+DB[5]+DB[6] DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vaddps %%ymm2, %%ymm5, %%ymm5") /* ymm5 = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] DV[4] DV[4]+DV[5] DV[4]+DV[5]+DV[6] DV[4]+DV[5]+DV[6]+DV[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm3, %%xmm3, %%xmm0") /* xmm0 = DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3] */ + __ASM_EMIT("vshufps $0xff, %%xmm4, %%xmm4, %%xmm1") /* xmm1 = DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3] */ + __ASM_EMIT("vshufps $0xff, %%xmm5, %%xmm5, %%xmm2") /* xmm2 = DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3] */ + __ASM_EMIT("vextractf128 $1, %%ymm3, %%xmm6") /* xmm6 = DA[4] DA[4]+DA[5] DA[4]+DA[5]+DA[6] DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vextractf128 $1, %%ymm4, %%xmm7") /* xmm7 = DB[4] DB[4]+DB[5] DB[4]+DB[5]+DB[6] DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vaddps %%xmm0, %%xmm6, %%xmm6") /* xmm6 = DA[0]+DA[1]+DA[2]+DA[3]+DA[4] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%xmm1, %%xmm7, %%xmm7") /* xmm7 = DB[0]+DB[1]+DB[2]+DB[3]+DB[4] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vextractf128 $1, %%ymm5, %%xmm0") /* xmm0 = DV[4] DV[4]+DV[5] DV[4]+DV[5]+DV[6] DV[4]+DV[5]+DV[6]+DV[7] */ + __ASM_EMIT("vinsertf128 $1, %%xmm6, %%ymm3, %%ymm3") /* ymm3 = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3]+DA[4] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0") /* xmm0 = DV[0]+DV[1]+DV[2]+DV[3]+DV[4] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6]+DV[7] */ + __ASM_EMIT("vinsertf128 $1, %%xmm7, %%ymm4, %%ymm4") /* ymm4 = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3]+DB[4] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vinsertf128 $1, %%xmm0, %%ymm5, %%ymm5") /* ymm5 = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3]+DV[4] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6]+DV[7] */ + + __ASM_EMIT32("mov %[corr], %[ptr]") + __ASM_EMIT32("vbroadcastss 0x00(%[ptr]), %%ymm0") + __ASM_EMIT32("vbroadcastss 0x04(%[ptr]), %%ymm1") + __ASM_EMIT32("vbroadcastss 0x08(%[ptr]), %%ymm2") + __ASM_EMIT64("vbroadcastss 0x00(%[corr]), %%ymm0") + __ASM_EMIT64("vbroadcastss 0x04(%[corr]), %%ymm1") + __ASM_EMIT64("vbroadcastss 0x08(%[corr]), %%ymm2") + + __ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1") /* ymm1 = BA = xa+DA[0] xa+DA[0]+DA[1] xa+DA[0]+DA[1]+DA[2] xa+DA[0]+DA[1]+DA[2]+DA[3] 
xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%ymm4, %%ymm2, %%ymm2") /* ymm2 = BB = xb+DB[0] xb+DB[0]+DB[1] xb+DB[0]+DB[1]+DB[2] xb+DB[0]+DB[1]+DB[2]+DB[3] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vaddps %%ymm5, %%ymm0, %%ymm0") /* ymm0 = T = xv+DV[0] xv+DV[0]+DV[1] xv+DV[0]+DV[1]+DV[2] xv+DV[0]+DV[1]+DV[2]+DV[3] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6]+DV[7] */ + __ASM_EMIT("vmulps %%ymm2, %%ymm1, %%ymm7") /* ymm7 = B = BA*BB */ + __ASM_EMIT("vextractf128 $1, %%ymm1, %%xmm4") /* xmm4 = BA[4] BA[5] BA[6] BA[7] */ + __ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm3") /* xmm3 = T[4] T[5] T[6] T[7] */ + __ASM_EMIT("vextractf128 $1, %%ymm2, %%xmm5") /* xmm5 = BB[4] BB[5] BB[6] BB[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm3, %%xmm3, %%xmm3") /* xmm3 = T[7] T[7] T[7] T[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm4, %%xmm4, %%xmm4") /* xmm4 = BA[7] BA[7] BA[7] BA[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm5, %%xmm5, %%xmm5") /* xmm5 = BB[7] BB[7] BB[7] BB[7] */ + + __ASM_EMIT32("vmovss %%xmm3, 0x00(%[ptr])") + __ASM_EMIT32("vmovss %%xmm4, 0x04(%[ptr])") + __ASM_EMIT32("vmovss %%xmm5, 0x08(%[ptr])") + __ASM_EMIT64("vmovss %%xmm3, 0x00(%[corr])") + __ASM_EMIT64("vmovss %%xmm4, 0x04(%[corr])") + __ASM_EMIT64("vmovss %%xmm5, 0x08(%[corr])") + + __ASM_EMIT("vsqrtps %%ymm7, %%ymm6") /* ymm6 = sqrtf(B) */ + __ASM_EMIT("vcmpps $5, %[CORR_CC], %%ymm7, %%ymm1")/* ymm1 = B >= 1e-10f */ + __ASM_EMIT("vdivps %%ymm6, %%ymm0, %%ymm0") /* ymm0 = T/sqrtf(B) */ + __ASM_EMIT32("mov %[dst], %[ptr]") + __ASM_EMIT("vandps %%ymm1, %%ymm0, %%ymm0") /* ymm0 = (B >= 1e-10f) ? 
T/sqrtf(B) : 0 */ + __ASM_EMIT("add $0x20, %[a_head]") /* ++a_head */ + __ASM_EMIT("add $0x20, %[b_head]") /* ++b_head */ + __ASM_EMIT32("vmovups %%ymm0, 0x00(%[ptr])") + __ASM_EMIT64("vmovups %%ymm0, 0x00(%[dst])") + __ASM_EMIT("add $0x20, %[a_tail]") /* ++a_tail */ + __ASM_EMIT("add $0x20, %[b_tail]") /* ++b_tail */ + __ASM_EMIT32("add $0x20, %[ptr]") + __ASM_EMIT64("add $0x20, %[dst]") + __ASM_EMIT32("mov %[ptr], %[dst]") + __ASM_EMIT32("subl $8, %[count]") + __ASM_EMIT64("sub $8, %[count]") + __ASM_EMIT("jae 1b") + __ASM_EMIT("2:") + /* 4x block */ + __ASM_EMIT("add $4, %[count]") + __ASM_EMIT("jl 4f") + __ASM_EMIT("vmovups 0x00(%[a_head]), %%xmm0") /* xmm0 = ah */ + __ASM_EMIT("vmovups 0x00(%[b_head]), %%xmm1") /* xmm1 = bh */ + __ASM_EMIT("vmovups 0x00(%[a_tail]), %%xmm3") /* xmm3 = at */ + __ASM_EMIT("vmovups 0x00(%[b_tail]), %%xmm4") /* xmm4 = bt */ + __ASM_EMIT("vmulps %%xmm1, %%xmm0, %%xmm2") /* xmm2 = ah*bh */ + __ASM_EMIT("vmulps %%xmm4, %%xmm3, %%xmm5") /* xmm5 = at*bt */ + __ASM_EMIT("vmulps %%xmm0, %%xmm0, %%xmm0") /* xmm0 = ah*ah */ + __ASM_EMIT("vmulps %%xmm1, %%xmm1, %%xmm1") /* xmm1 = bh*bh */ + __ASM_EMIT("vmulps %%xmm3, %%xmm3, %%xmm3") /* xmm3 = at*at */ + __ASM_EMIT("vmulps %%xmm4, %%xmm4, %%xmm4") /* xmm4 = bt*bt */ + __ASM_EMIT("vxorps %%xmm6, %%xmm6, %%xmm6") /* xmm6 = 0 0 0 0 0 0 0 0 */ + __ASM_EMIT("vsubps %%xmm5, %%xmm2, %%xmm2") /* xmm2 = DV = ah*bh - at*bt */ + __ASM_EMIT("vsubps %%xmm3, %%xmm0, %%xmm0") /* xmm0 = DA = ah*ah - at*at */ + __ASM_EMIT("vsubps %%xmm4, %%xmm1, %%xmm1") /* xmm1 = DB = bh*bh - bt*bt */ + + __ASM_EMIT("vmovlhps %%xmm0, %%xmm6, %%xmm3") /* xmm3 = 0 0 DA[0] DA[1] 0 0 DA[4] DA[5] */ + __ASM_EMIT("vmovlhps %%xmm1, %%xmm6, %%xmm4") /* xmm4 = 0 0 DB[0] DB[1] 0 0 DB[4] DB[5] */ + __ASM_EMIT("vmovlhps %%xmm2, %%xmm6, %%xmm5") /* xmm5 = 0 0 DV[0] DV[1] 0 0 DV[4] DV[5] */ + __ASM_EMIT("vaddps %%xmm3, %%xmm0, %%xmm0") /* xmm0 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] DA[4] DA[5] DA[4]+DA[6] DA[5]+DA[7] */ + __ASM_EMIT("vaddps %%xmm4, %%xmm1, %%xmm1") /* xmm1 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] DB[4] DB[5] DB[4]+DB[6] DB[5]+DB[7] */ + __ASM_EMIT("vaddps %%xmm5, %%xmm2, %%xmm2") /* xmm2 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] DV[4] DV[5] DV[4]+DV[6] DV[5]+DV[7] */ + __ASM_EMIT("vshufps $0x99, %%xmm0, %%xmm3, %%xmm3") /* xmm3 = 0 DA[0] DA[1] DA[0]+DA[2] 0 DA[4] DA[5] DA[4]+DA[6] */ + __ASM_EMIT("vshufps $0x99, %%xmm1, %%xmm4, %%xmm4") /* xmm4 = 0 DB[0] DB[1] DB[0]+DB[2] 0 DB[4] DB[5] DB[4]+DB[6] */ + __ASM_EMIT("vshufps $0x99, %%xmm2, %%xmm5, %%xmm5") /* xmm5 = 0 DV[0] DV[1] DV[0]+DV[2] 0 DV[4] DV[5] DV[4]+DV[6] */ + __ASM_EMIT("vaddps %%xmm0, %%xmm3, %%xmm3") /* xmm3 = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] DA[4] DA[4]+DA[5] DA[4]+DA[5]+DA[6] DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%xmm1, %%xmm4, %%xmm4") /* xmm4 = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] DB[4] DB[4]+DB[5] DB[4]+DB[5]+DB[6] DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vaddps %%xmm2, %%xmm5, %%xmm5") /* xmm5 = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] DV[4] DV[4]+DV[5] DV[4]+DV[5]+DV[6] DV[4]+DV[5]+DV[6]+DV[7] */ + + __ASM_EMIT32("mov %[corr], %[ptr]") + __ASM_EMIT32("vbroadcastss 0x00(%[ptr]), %%xmm0") + __ASM_EMIT32("vbroadcastss 0x04(%[ptr]), %%xmm1") + __ASM_EMIT32("vbroadcastss 0x08(%[ptr]), %%xmm2") + __ASM_EMIT64("vbroadcastss 0x00(%[corr]), %%xmm0") + __ASM_EMIT64("vbroadcastss 0x04(%[corr]), %%xmm1") + __ASM_EMIT64("vbroadcastss 0x08(%[corr]), %%xmm2") + + __ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1") /* xmm1 = BA = 
xa+DA[0] xa+DA[0]+DA[1] xa+DA[0]+DA[1]+DA[2] xa+DA[0]+DA[1]+DA[2]+DA[3] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%xmm4, %%xmm2, %%xmm2") /* xmm2 = BB = xb+DB[0] xb+DB[0]+DB[1] xb+DB[0]+DB[1]+DB[2] xb+DB[0]+DB[1]+DB[2]+DB[3] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vaddps %%xmm5, %%xmm0, %%xmm0") /* xmm0 = T = xv+DV[0] xv+DV[0]+DV[1] xv+DV[0]+DV[1]+DV[2] xv+DV[0]+DV[1]+DV[2]+DV[3] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6]+DV[7] */ + __ASM_EMIT("vmulps %%xmm2, %%xmm1, %%xmm7") /* xmm7 = B = BA*BB */ + __ASM_EMIT("vshufps $0xff, %%xmm0, %%xmm0, %%xmm3") /* xmm3 = T[7] T[7] T[7] T[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm1, %%xmm1, %%xmm4") /* xmm4 = BA[7] BA[7] BA[7] BA[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm2, %%xmm2, %%xmm5") /* xmm5 = BB[7] BB[7] BB[7] BB[7] */ + + __ASM_EMIT32("vmovss %%xmm3, 0x00(%[ptr])") + __ASM_EMIT32("vmovss %%xmm4, 0x04(%[ptr])") + __ASM_EMIT32("vmovss %%xmm5, 0x08(%[ptr])") + __ASM_EMIT64("vmovss %%xmm3, 0x00(%[corr])") + __ASM_EMIT64("vmovss %%xmm4, 0x04(%[corr])") + __ASM_EMIT64("vmovss %%xmm5, 0x08(%[corr])") + + __ASM_EMIT("vsqrtps %%xmm7, %%xmm6") /* xmm6 = sqrtf(B) */ + __ASM_EMIT("vcmpps $5, %[CORR_CC], %%xmm7, %%xmm1")/* xmm1 = B >= 1e-10f */ + __ASM_EMIT("vdivps %%xmm6, %%xmm0, %%xmm0") /* xmm0 = T/sqrtf(B) */ + __ASM_EMIT32("mov %[dst], %[ptr]") + __ASM_EMIT("vandps %%xmm1, %%xmm0, %%xmm0") /* xmm0 = (B >= 1e-10f) ? 
T/sqrtf(B) : 0 */ + __ASM_EMIT("add $0x10, %[a_head]") /* ++a_head */ + __ASM_EMIT("add $0x10, %[b_head]") /* ++b_head */ + __ASM_EMIT32("vmovups %%xmm0, 0x00(%[ptr])") + __ASM_EMIT64("vmovups %%xmm0, 0x00(%[dst])") + __ASM_EMIT("add $0x10, %[a_tail]") /* ++a_tail */ + __ASM_EMIT("add $0x10, %[b_tail]") /* ++b_tail */ + __ASM_EMIT32("add $0x10, %[ptr]") + __ASM_EMIT64("add $0x10, %[dst]") + __ASM_EMIT32("mov %[ptr], %[dst]") + __ASM_EMIT32("subl $4, %[count]") + __ASM_EMIT64("sub $4, %[count]") + __ASM_EMIT("4:") + + /* 1x blocks */ + __ASM_EMIT("add $3, %[count]") + __ASM_EMIT("jl 6f") + __ASM_EMIT("5:") + __ASM_EMIT("vmovss 0x00(%[a_head]), %%xmm0") /* xmm0 = ah */ + __ASM_EMIT("vmovss 0x00(%[b_head]), %%xmm1") /* xmm1 = bh */ + __ASM_EMIT("vmovss 0x00(%[a_tail]), %%xmm3") /* xmm3 = at */ + __ASM_EMIT("vmovss 0x00(%[b_tail]), %%xmm4") /* xmm4 = bt */ + __ASM_EMIT("vmulss %%xmm1, %%xmm0, %%xmm2") /* xmm2 = ah*bh */ + __ASM_EMIT("vmulss %%xmm4, %%xmm3, %%xmm5") /* xmm5 = at*bt */ + __ASM_EMIT("vmulss %%xmm0, %%xmm0, %%xmm0") /* xmm0 = ah*ah */ + __ASM_EMIT("vmulss %%xmm1, %%xmm1, %%xmm1") /* xmm1 = bh*bh */ + __ASM_EMIT("vmulss %%xmm3, %%xmm3, %%xmm3") /* xmm3 = at*at */ + __ASM_EMIT("vmulss %%xmm4, %%xmm4, %%xmm4") /* xmm4 = bt*bt */ + __ASM_EMIT("vsubss %%xmm5, %%xmm2, %%xmm5") /* xmm5 = DV = ah*bh - at*bt */ + __ASM_EMIT("vsubss %%xmm3, %%xmm0, %%xmm3") /* xmm3 = DA = ah*ah - at*at */ + __ASM_EMIT("vsubss %%xmm4, %%xmm1, %%xmm4") /* xmm4 = DB = bh*bh - bt*bt */ + + __ASM_EMIT32("mov %[corr], %[ptr]") + __ASM_EMIT32("vmovss 0x00(%[ptr]), %%xmm0") + __ASM_EMIT32("vmovss 0x04(%[ptr]), %%xmm1") + __ASM_EMIT32("vmovss 0x08(%[ptr]), %%xmm2") + __ASM_EMIT64("vmovss 0x00(%[corr]), %%xmm0") + __ASM_EMIT64("vmovss 0x04(%[corr]), %%xmm1") + __ASM_EMIT64("vmovss 0x08(%[corr]), %%xmm2") + + __ASM_EMIT("vaddss %%xmm3, %%xmm1, %%xmm1") /* xmm1 = BA = xa+DA */ + __ASM_EMIT("vaddss %%xmm4, %%xmm2, %%xmm2") /* xmm2 = BB = xb+DB */ + __ASM_EMIT("vaddss %%xmm5, %%xmm0, %%xmm0") /* xmm0 = T = xv+DV */ + __ASM_EMIT("vmulss %%xmm2, %%xmm1, %%xmm7") /* xmm7 = B = BA*BB */ + + __ASM_EMIT32("vmovss %%xmm0, 0x00(%[ptr])") + __ASM_EMIT32("vmovss %%xmm1, 0x04(%[ptr])") + __ASM_EMIT32("vmovss %%xmm2, 0x08(%[ptr])") + __ASM_EMIT64("vmovss %%xmm0, 0x00(%[corr])") + __ASM_EMIT64("vmovss %%xmm1, 0x04(%[corr])") + __ASM_EMIT64("vmovss %%xmm2, 0x08(%[corr])") + + __ASM_EMIT("vsqrtss %%xmm7, %%xmm7, %%xmm6") /* xmm6 = sqrtf(B) */ + __ASM_EMIT("vcmpss $5, %[CORR_CC], %%xmm7, %%xmm1")/* xmm1 = B >= 1e-10f */ + __ASM_EMIT("vdivss %%xmm6, %%xmm0, %%xmm0") /* xmm0 = T/sqrtf(B) */ + __ASM_EMIT32("mov %[dst], %[ptr]") + __ASM_EMIT("vandps %%xmm1, %%xmm0, %%xmm0") /* xmm0 = (B >= 1e-10f) ? 
T/sqrtf(B) : 0 */ + __ASM_EMIT("add $0x04, %[a_head]") /* ++a_head */ + __ASM_EMIT("add $0x04, %[b_head]") /* ++b_head */ + __ASM_EMIT32("vmovss %%xmm0, 0x00(%[ptr])") + __ASM_EMIT64("vmovss %%xmm0, 0x00(%[dst])") + __ASM_EMIT("add $0x04, %[a_tail]") /* ++a_tail */ + __ASM_EMIT("add $0x04, %[b_tail]") /* ++b_tail */ + __ASM_EMIT32("add $0x04, %[ptr]") + __ASM_EMIT64("add $0x04, %[dst]") + __ASM_EMIT32("mov %[ptr], %[dst]") + __ASM_EMIT32("decl %[count]") + __ASM_EMIT64("dec %[count]") + __ASM_EMIT("jge 5b") + __ASM_EMIT("6:") + + : __IF_32( + [ptr] "=&r" (ptr), + [corr] "+m" (corr), [dst] "+m" (dst), + [a_head] "+r" (a_head), [b_head] "+r" (b_head), + [a_tail] "+r" (a_tail), [b_tail] "+r" (b_tail), + [count] "+g" (count) + ) + __IF_64( + [dst] "+r" (dst), + [a_head] "+r" (a_head), [b_head] "+r" (b_head), + [a_tail] "+r" (a_tail), [b_tail] "+r" (b_tail), + [count] "+r" (count) + ) + : __IF_64( [corr] "r" (corr), ) + [CORR_CC] "o" (corr_const) + : "cc", "memory", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7" + ); + } + + void corr_incr_fma3(dsp::correlation_t *corr, float *dst, + const float *a_head, const float *b_head, + const float *a_tail, const float *b_tail, + size_t count) + { + IF_ARCH_I386( + void *ptr; + ); + + ARCH_X86_ASM + ( + /* 8x blocks */ + __ASM_EMIT32("subl $8, %[count]") + __ASM_EMIT64("sub $8, %[count]") + __ASM_EMIT("jb 2f") + __ASM_EMIT("1:") + __ASM_EMIT("vmovups 0x00(%[a_head]), %%ymm0") /* ymm0 = ah */ + __ASM_EMIT("vmovups 0x00(%[b_head]), %%ymm1") /* ymm1 = bh */ + __ASM_EMIT("vmovups 0x00(%[a_tail]), %%ymm3") /* ymm3 = at */ + __ASM_EMIT("vmovups 0x00(%[b_tail]), %%ymm4") /* ymm4 = bt */ + __ASM_EMIT("vmulps %%ymm1, %%ymm0, %%ymm2") /* ymm2 = ah*bh */ + __ASM_EMIT("vmulps %%ymm0, %%ymm0, %%ymm0") /* ymm0 = ah*ah */ + __ASM_EMIT("vmulps %%ymm1, %%ymm1, %%ymm1") /* ymm1 = bh*bh */ + __ASM_EMIT("vxorps %%ymm6, %%ymm6, %%ymm6") /* ymm6 = 0 0 0 0 0 0 0 0 */ + __ASM_EMIT("vfnmadd231ps %%ymm4, %%ymm3, %%ymm2") /* ymm2 = DV = ah*bh - at*bt */ + __ASM_EMIT("vfnmadd231ps %%ymm3, %%ymm3, %%ymm0") /* ymm0 = DA = ah*ah - at*at */ + __ASM_EMIT("vfnmadd231ps %%ymm4, %%ymm4, %%ymm1") /* ymm1 = DB = bh*bh - bt*bt */ + + __ASM_EMIT("vshufps $0x44, %%ymm0, %%ymm6, %%ymm3") /* ymm3 = 0 0 DA[0] DA[1] 0 0 DA[4] DA[5] */ + __ASM_EMIT("vshufps $0x44, %%ymm1, %%ymm6, %%ymm4") /* ymm4 = 0 0 DB[0] DB[1] 0 0 DB[4] DB[5] */ + __ASM_EMIT("vshufps $0x44, %%ymm2, %%ymm6, %%ymm5") /* ymm5 = 0 0 DV[0] DV[1] 0 0 DV[4] DV[5] */ + __ASM_EMIT("vaddps %%ymm3, %%ymm0, %%ymm0") /* ymm0 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] DA[4] DA[5] DA[4]+DA[6] DA[5]+DA[7] */ + __ASM_EMIT("vaddps %%ymm4, %%ymm1, %%ymm1") /* ymm1 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] DB[4] DB[5] DB[4]+DB[6] DB[5]+DB[7] */ + __ASM_EMIT("vaddps %%ymm5, %%ymm2, %%ymm2") /* ymm2 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] DV[4] DV[5] DV[4]+DV[6] DV[5]+DV[7] */ + __ASM_EMIT("vshufps $0x99, %%ymm0, %%ymm3, %%ymm3") /* ymm3 = 0 DA[0] DA[1] DA[0]+DA[2] 0 DA[4] DA[5] DA[4]+DA[6] */ + __ASM_EMIT("vshufps $0x99, %%ymm1, %%ymm4, %%ymm4") /* ymm4 = 0 DB[0] DB[1] DB[0]+DB[2] 0 DB[4] DB[5] DB[4]+DB[6] */ + __ASM_EMIT("vshufps $0x99, %%ymm2, %%ymm5, %%ymm5") /* ymm5 = 0 DV[0] DV[1] DV[0]+DV[2] 0 DV[4] DV[5] DV[4]+DV[6] */ + __ASM_EMIT("vaddps %%ymm0, %%ymm3, %%ymm3") /* ymm3 = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] DA[4] DA[4]+DA[5] DA[4]+DA[5]+DA[6] DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%ymm1, %%ymm4, %%ymm4") /* ymm4 = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] DB[4] DB[4]+DB[5] 
DB[4]+DB[5]+DB[6] DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vaddps %%ymm2, %%ymm5, %%ymm5") /* ymm5 = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] DV[4] DV[4]+DV[5] DV[4]+DV[5]+DV[6] DV[4]+DV[5]+DV[6]+DV[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm3, %%xmm3, %%xmm0") /* xmm0 = DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3] */ + __ASM_EMIT("vshufps $0xff, %%xmm4, %%xmm4, %%xmm1") /* xmm1 = DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3] */ + __ASM_EMIT("vshufps $0xff, %%xmm5, %%xmm5, %%xmm2") /* xmm2 = DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3] */ + __ASM_EMIT("vextractf128 $1, %%ymm3, %%xmm6") /* xmm6 = DA[4] DA[4]+DA[5] DA[4]+DA[5]+DA[6] DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vextractf128 $1, %%ymm4, %%xmm7") /* xmm7 = DB[4] DB[4]+DB[5] DB[4]+DB[5]+DB[6] DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vaddps %%xmm0, %%xmm6, %%xmm6") /* xmm6 = DA[0]+DA[1]+DA[2]+DA[3]+DA[4] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%xmm1, %%xmm7, %%xmm7") /* xmm7 = DB[0]+DB[1]+DB[2]+DB[3]+DB[4] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vextractf128 $1, %%ymm5, %%xmm0") /* xmm0 = DV[4] DV[4]+DV[5] DV[4]+DV[5]+DV[6] DV[4]+DV[5]+DV[6]+DV[7] */ + __ASM_EMIT("vinsertf128 $1, %%xmm6, %%ymm3, %%ymm3") /* ymm3 = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3]+DA[4] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0") /* xmm0 = DV[0]+DV[1]+DV[2]+DV[3]+DV[4] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6]+DV[7] */ + __ASM_EMIT("vinsertf128 $1, %%xmm7, %%ymm4, %%ymm4") /* ymm4 = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3]+DB[4] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vinsertf128 $1, %%xmm0, %%ymm5, %%ymm5") /* ymm5 = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3]+DV[4] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6]+DV[7] */ + + __ASM_EMIT32("mov %[corr], %[ptr]") + __ASM_EMIT32("vbroadcastss 0x00(%[ptr]), %%ymm0") + __ASM_EMIT32("vbroadcastss 0x04(%[ptr]), %%ymm1") + __ASM_EMIT32("vbroadcastss 0x08(%[ptr]), %%ymm2") + __ASM_EMIT64("vbroadcastss 0x00(%[corr]), %%ymm0") + __ASM_EMIT64("vbroadcastss 0x04(%[corr]), %%ymm1") + __ASM_EMIT64("vbroadcastss 0x08(%[corr]), %%ymm2") + + __ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1") /* ymm1 = BA = xa+DA[0] xa+DA[0]+DA[1] xa+DA[0]+DA[1]+DA[2] xa+DA[0]+DA[1]+DA[2]+DA[3] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%ymm4, %%ymm2, %%ymm2") /* ymm2 = BB = xb+DB[0] xb+DB[0]+DB[1] xb+DB[0]+DB[1]+DB[2] xb+DB[0]+DB[1]+DB[2]+DB[3] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6]+DB[7] */ + 
__ASM_EMIT("vaddps %%ymm5, %%ymm0, %%ymm0") /* ymm0 = T = xv+DV[0] xv+DV[0]+DV[1] xv+DV[0]+DV[1]+DV[2] xv+DV[0]+DV[1]+DV[2]+DV[3] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6]+DV[7] */ + __ASM_EMIT("vmulps %%ymm2, %%ymm1, %%ymm7") /* ymm7 = B = BA*BB */ + __ASM_EMIT("vextractf128 $1, %%ymm1, %%xmm4") /* xmm4 = BA[4] BA[5] BA[6] BA[7] */ + __ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm3") /* xmm3 = T[4] T[5] T[6] T[7] */ + __ASM_EMIT("vextractf128 $1, %%ymm2, %%xmm5") /* xmm5 = BB[4] BB[5] BB[6] BB[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm3, %%xmm3, %%xmm3") /* xmm3 = T[7] T[7] T[7] T[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm4, %%xmm4, %%xmm4") /* xmm4 = BA[7] BA[7] BA[7] BA[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm5, %%xmm5, %%xmm5") /* xmm5 = BB[7] BB[7] BB[7] BB[7] */ + + __ASM_EMIT32("vmovss %%xmm3, 0x00(%[ptr])") + __ASM_EMIT32("vmovss %%xmm4, 0x04(%[ptr])") + __ASM_EMIT32("vmovss %%xmm5, 0x08(%[ptr])") + __ASM_EMIT64("vmovss %%xmm3, 0x00(%[corr])") + __ASM_EMIT64("vmovss %%xmm4, 0x04(%[corr])") + __ASM_EMIT64("vmovss %%xmm5, 0x08(%[corr])") + + __ASM_EMIT("vsqrtps %%ymm7, %%ymm6") /* ymm6 = sqrtf(B) */ + __ASM_EMIT("vcmpps $5, %[CORR_CC], %%ymm7, %%ymm1")/* ymm1 = B >= 1e-10f */ + __ASM_EMIT("vdivps %%ymm6, %%ymm0, %%ymm0") /* ymm0 = T/sqrtf(B) */ + __ASM_EMIT32("mov %[dst], %[ptr]") + __ASM_EMIT("vandps %%ymm1, %%ymm0, %%ymm0") /* ymm0 = (B >= 1e-10f) ? T/sqrtf(B) : 0 */ + __ASM_EMIT("add $0x20, %[a_head]") /* ++a_head */ + __ASM_EMIT("add $0x20, %[b_head]") /* ++b_head */ + __ASM_EMIT32("vmovups %%ymm0, 0x00(%[ptr])") + __ASM_EMIT64("vmovups %%ymm0, 0x00(%[dst])") + __ASM_EMIT("add $0x20, %[a_tail]") /* ++a_tail */ + __ASM_EMIT("add $0x20, %[b_tail]") /* ++b_tail */ + __ASM_EMIT32("add $0x20, %[ptr]") + __ASM_EMIT64("add $0x20, %[dst]") + __ASM_EMIT32("mov %[ptr], %[dst]") + __ASM_EMIT32("subl $8, %[count]") + __ASM_EMIT64("sub $8, %[count]") + __ASM_EMIT("jae 1b") + __ASM_EMIT("2:") + /* 4x block */ + __ASM_EMIT("add $4, %[count]") + __ASM_EMIT("jl 4f") + __ASM_EMIT("vmovups 0x00(%[a_head]), %%xmm0") /* xmm0 = ah */ + __ASM_EMIT("vmovups 0x00(%[b_head]), %%xmm1") /* xmm1 = bh */ + __ASM_EMIT("vmovups 0x00(%[a_tail]), %%xmm3") /* xmm3 = at */ + __ASM_EMIT("vmovups 0x00(%[b_tail]), %%xmm4") /* xmm4 = bt */ + __ASM_EMIT("vmulps %%xmm1, %%xmm0, %%xmm2") /* xmm2 = ah*bh */ + __ASM_EMIT("vmulps %%xmm0, %%xmm0, %%xmm0") /* xmm0 = ah*ah */ + __ASM_EMIT("vmulps %%xmm1, %%xmm1, %%xmm1") /* xmm1 = bh*bh */ + __ASM_EMIT("vxorps %%xmm6, %%xmm6, %%xmm6") /* xmm6 = 0 0 0 0 0 0 0 0 */ + __ASM_EMIT("vfnmadd231ps %%xmm4, %%xmm3, %%xmm2") /* xmm2 = DV = ah*bh - at*bt */ + __ASM_EMIT("vfnmadd231ps %%xmm3, %%xmm3, %%xmm0") /* xmm0 = DA = ah*ah - at*at */ + __ASM_EMIT("vfnmadd231ps %%xmm4, %%xmm4, %%xmm1") /* xmm1 = DB = bh*bh - bt*bt */ + + __ASM_EMIT("vmovlhps %%xmm0, %%xmm6, %%xmm3") /* xmm3 = 0 0 DA[0] DA[1] 0 0 DA[4] DA[5] */ + __ASM_EMIT("vmovlhps %%xmm1, %%xmm6, %%xmm4") /* xmm4 = 0 0 DB[0] DB[1] 0 0 DB[4] DB[5] */ + __ASM_EMIT("vmovlhps %%xmm2, %%xmm6, %%xmm5") /* xmm5 = 0 0 DV[0] DV[1] 0 0 DV[4] DV[5] */ + __ASM_EMIT("vaddps %%xmm3, %%xmm0, %%xmm0") /* xmm0 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] DA[4] DA[5] DA[4]+DA[6] DA[5]+DA[7] */ + __ASM_EMIT("vaddps %%xmm4, %%xmm1, %%xmm1") /* xmm1 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] DB[4] DB[5] DB[4]+DB[6] DB[5]+DB[7] */ + __ASM_EMIT("vaddps %%xmm5, %%xmm2, %%xmm2") /* xmm2 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] DV[4] DV[5] DV[4]+DV[6] DV[5]+DV[7] 
*/ + __ASM_EMIT("vshufps $0x99, %%xmm0, %%xmm3, %%xmm3") /* xmm3 = 0 DA[0] DA[1] DA[0]+DA[2] 0 DA[4] DA[5] DA[4]+DA[6] */ + __ASM_EMIT("vshufps $0x99, %%xmm1, %%xmm4, %%xmm4") /* xmm4 = 0 DB[0] DB[1] DB[0]+DB[2] 0 DB[4] DB[5] DB[4]+DB[6] */ + __ASM_EMIT("vshufps $0x99, %%xmm2, %%xmm5, %%xmm5") /* xmm5 = 0 DV[0] DV[1] DV[0]+DV[2] 0 DV[4] DV[5] DV[4]+DV[6] */ + __ASM_EMIT("vaddps %%xmm0, %%xmm3, %%xmm3") /* xmm3 = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] DA[4] DA[4]+DA[5] DA[4]+DA[5]+DA[6] DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%xmm1, %%xmm4, %%xmm4") /* xmm4 = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] DB[4] DB[4]+DB[5] DB[4]+DB[5]+DB[6] DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vaddps %%xmm2, %%xmm5, %%xmm5") /* xmm5 = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] DV[4] DV[4]+DV[5] DV[4]+DV[5]+DV[6] DV[4]+DV[5]+DV[6]+DV[7] */ + + __ASM_EMIT32("mov %[corr], %[ptr]") + __ASM_EMIT32("vbroadcastss 0x00(%[ptr]), %%xmm0") + __ASM_EMIT32("vbroadcastss 0x04(%[ptr]), %%xmm1") + __ASM_EMIT32("vbroadcastss 0x08(%[ptr]), %%xmm2") + __ASM_EMIT64("vbroadcastss 0x00(%[corr]), %%xmm0") + __ASM_EMIT64("vbroadcastss 0x04(%[corr]), %%xmm1") + __ASM_EMIT64("vbroadcastss 0x08(%[corr]), %%xmm2") + + __ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1") /* xmm1 = BA = xa+DA[0] xa+DA[0]+DA[1] xa+DA[0]+DA[1]+DA[2] xa+DA[0]+DA[1]+DA[2]+DA[3] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%xmm4, %%xmm2, %%xmm2") /* xmm2 = BB = xb+DB[0] xb+DB[0]+DB[1] xb+DB[0]+DB[1]+DB[2] xb+DB[0]+DB[1]+DB[2]+DB[3] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vaddps %%xmm5, %%xmm0, %%xmm0") /* xmm0 = T = xv+DV[0] xv+DV[0]+DV[1] xv+DV[0]+DV[1]+DV[2] xv+DV[0]+DV[1]+DV[2]+DV[3] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6]+DV[7] */ + __ASM_EMIT("vmulps %%xmm2, %%xmm1, %%xmm7") /* xmm7 = B = BA*BB */ + __ASM_EMIT("vshufps $0xff, %%xmm0, %%xmm0, %%xmm3") /* xmm3 = T[7] T[7] T[7] T[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm1, %%xmm1, %%xmm4") /* xmm4 = BA[7] BA[7] BA[7] BA[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm2, %%xmm2, %%xmm5") /* xmm5 = BB[7] BB[7] BB[7] BB[7] */ + + __ASM_EMIT32("vmovss %%xmm3, 0x00(%[ptr])") + __ASM_EMIT32("vmovss %%xmm4, 0x04(%[ptr])") + __ASM_EMIT32("vmovss %%xmm5, 0x08(%[ptr])") + __ASM_EMIT64("vmovss %%xmm3, 0x00(%[corr])") + __ASM_EMIT64("vmovss %%xmm4, 0x04(%[corr])") + __ASM_EMIT64("vmovss %%xmm5, 0x08(%[corr])") + + __ASM_EMIT("vsqrtps %%xmm7, %%xmm6") /* xmm6 = sqrtf(B) */ + __ASM_EMIT("vcmpps $5, %[CORR_CC], %%xmm7, %%xmm1")/* xmm1 = B >= 1e-10f */ + __ASM_EMIT("vdivps %%xmm6, %%xmm0, %%xmm0") /* xmm0 = T/sqrtf(B) */ + __ASM_EMIT32("mov %[dst], %[ptr]") + __ASM_EMIT("vandps %%xmm1, %%xmm0, %%xmm0") /* xmm0 = (B >= 1e-10f) ? 
T/sqrtf(B) : 0 */ + __ASM_EMIT("add $0x10, %[a_head]") /* ++a_head */ + __ASM_EMIT("add $0x10, %[b_head]") /* ++b_head */ + __ASM_EMIT32("vmovups %%xmm0, 0x00(%[ptr])") + __ASM_EMIT64("vmovups %%xmm0, 0x00(%[dst])") + __ASM_EMIT("add $0x10, %[a_tail]") /* ++a_tail */ + __ASM_EMIT("add $0x10, %[b_tail]") /* ++b_tail */ + __ASM_EMIT32("add $0x10, %[ptr]") + __ASM_EMIT64("add $0x10, %[dst]") + __ASM_EMIT32("mov %[ptr], %[dst]") + __ASM_EMIT32("subl $4, %[count]") + __ASM_EMIT64("sub $4, %[count]") + __ASM_EMIT("4:") + + /* 1x blocks */ + __ASM_EMIT("add $3, %[count]") + __ASM_EMIT("jl 6f") + __ASM_EMIT("5:") + __ASM_EMIT("vmovss 0x00(%[a_head]), %%xmm0") /* xmm0 = ah */ + __ASM_EMIT("vmovss 0x00(%[b_head]), %%xmm1") /* xmm1 = bh */ + __ASM_EMIT("vmovss 0x00(%[a_tail]), %%xmm3") /* xmm3 = at */ + __ASM_EMIT("vmovss 0x00(%[b_tail]), %%xmm4") /* xmm4 = bt */ + __ASM_EMIT("vmulss %%xmm1, %%xmm0, %%xmm2") /* xmm2 = ah*bh */ + __ASM_EMIT("vmulss %%xmm0, %%xmm0, %%xmm0") /* xmm0 = ah*ah */ + __ASM_EMIT("vmulss %%xmm1, %%xmm1, %%xmm1") /* xmm1 = bh*bh */ + __ASM_EMIT("vfnmadd231ss %%xmm4, %%xmm3, %%xmm2") /* xmm2 = DV = ah*bh - at*bt */ + __ASM_EMIT("vfnmadd231ss %%xmm3, %%xmm3, %%xmm0") /* xmm0 = DA = ah*ah - at*at */ + __ASM_EMIT("vfnmadd231ss %%xmm4, %%xmm4, %%xmm1") /* xmm1 = DB = bh*bh - bt*bt */ + + __ASM_EMIT32("mov %[corr], %[ptr]") + __ASM_EMIT32("vmovss 0x00(%[ptr]), %%xmm3") + __ASM_EMIT32("vmovss 0x04(%[ptr]), %%xmm4") + __ASM_EMIT32("vmovss 0x08(%[ptr]), %%xmm5") + __ASM_EMIT64("vmovss 0x00(%[corr]), %%xmm3") + __ASM_EMIT64("vmovss 0x04(%[corr]), %%xmm4") + __ASM_EMIT64("vmovss 0x08(%[corr]), %%xmm5") + + __ASM_EMIT("vaddss %%xmm3, %%xmm2, %%xmm2") /* xmm2 = T = xv+DV */ + __ASM_EMIT("vaddss %%xmm4, %%xmm0, %%xmm0") /* xmm0 = BA = xa+DA */ + __ASM_EMIT("vaddss %%xmm5, %%xmm1, %%xmm1") /* xmm1 = BB = xb+DB */ + __ASM_EMIT("vmulss %%xmm1, %%xmm0, %%xmm7") /* xmm7 = B = BA*BB */ + + __ASM_EMIT32("vmovss %%xmm2, 0x00(%[ptr])") + __ASM_EMIT32("vmovss %%xmm0, 0x04(%[ptr])") + __ASM_EMIT32("vmovss %%xmm1, 0x08(%[ptr])") + __ASM_EMIT64("vmovss %%xmm2, 0x00(%[corr])") + __ASM_EMIT64("vmovss %%xmm0, 0x04(%[corr])") + __ASM_EMIT64("vmovss %%xmm1, 0x08(%[corr])") + + __ASM_EMIT("vsqrtss %%xmm7, %%xmm7, %%xmm6") /* xmm6 = sqrtf(B) */ + __ASM_EMIT("vcmpss $5, %[CORR_CC], %%xmm7, %%xmm1")/* xmm1 = B >= 1e-10f */ + __ASM_EMIT("vdivss %%xmm6, %%xmm2, %%xmm0") /* xmm0 = T/sqrtf(B) */ + __ASM_EMIT32("mov %[dst], %[ptr]") + __ASM_EMIT("vandps %%xmm1, %%xmm0, %%xmm0") /* xmm0 = (B >= 1e-10f) ? 
T/sqrtf(B) : 0 */ + __ASM_EMIT("add $0x04, %[a_head]") /* ++a_head */ + __ASM_EMIT("add $0x04, %[b_head]") /* ++b_head */ + __ASM_EMIT32("vmovss %%xmm0, 0x00(%[ptr])") + __ASM_EMIT64("vmovss %%xmm0, 0x00(%[dst])") + __ASM_EMIT("add $0x04, %[a_tail]") /* ++a_tail */ + __ASM_EMIT("add $0x04, %[b_tail]") /* ++b_tail */ + __ASM_EMIT32("add $0x04, %[ptr]") + __ASM_EMIT64("add $0x04, %[dst]") + __ASM_EMIT32("mov %[ptr], %[dst]") + __ASM_EMIT32("decl %[count]") + __ASM_EMIT64("dec %[count]") + __ASM_EMIT("jge 5b") + __ASM_EMIT("6:") + + : __IF_32( + [ptr] "=&r" (ptr), + [corr] "+m" (corr), [dst] "+m" (dst), + [a_head] "+r" (a_head), [b_head] "+r" (b_head), + [a_tail] "+r" (a_tail), [b_tail] "+r" (b_tail), + [count] "+g" (count) + ) + __IF_64( + [dst] "+r" (dst), + [a_head] "+r" (a_head), [b_head] "+r" (b_head), + [a_tail] "+r" (a_tail), [b_tail] "+r" (b_tail), + [count] "+r" (count) + ) + : __IF_64( [corr] "r" (corr), ) + [CORR_CC] "o" (corr_const) + : "cc", "memory", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7" + ); + } + + } /* namespace avx */ +} /* namespace lsp */ + + + + +#endif /* PRIVATE_DSP_ARCH_X86_AVX_CORRELATION_H_ */ diff --git a/include/private/dsp/arch/x86/avx/msmatrix.h b/include/private/dsp/arch/x86/avx/msmatrix.h index 5d158b82..bca9b985 100644 --- a/include/private/dsp/arch/x86/avx/msmatrix.h +++ b/include/private/dsp/arch/x86/avx/msmatrix.h @@ -1,6 +1,6 @@ /* - * Copyright (C) 2020 Linux Studio Plugins Project - * (C) 2020 Vladimir Sadovnikov + * Copyright (C) 2024 Linux Studio Plugins Project + * (C) 2024 Vladimir Sadovnikov * * This file is part of lsp-dsp-lib * Created on: 31 мар. 2020 г. @@ -91,8 +91,8 @@ namespace lsp __ASM_EMIT("vmovups %%xmm0, 0x00(%[mid], %[off])") __ASM_EMIT("vmovups %%xmm2, 0x00(%[side], %[off])") __ASM_EMIT("add $0x10, %[off]") - __ASM_EMIT32("subl $8, %[count]") - __ASM_EMIT64("sub $8, %[count]") + __ASM_EMIT32("subl $4, %[count]") + __ASM_EMIT64("sub $4, %[count]") __ASM_EMIT("6:") // 1x blocks __ASM_EMIT32("addl $3, %[count]") diff --git a/include/private/dsp/arch/x86/avx512/correlation.h b/include/private/dsp/arch/x86/avx512/correlation.h new file mode 100644 index 00000000..3321bf26 --- /dev/null +++ b/include/private/dsp/arch/x86/avx512/correlation.h @@ -0,0 +1,509 @@ +/* + * Copyright (C) 2024 Linux Studio Plugins Project + * (C) 2024 Vladimir Sadovnikov + * + * This file is part of lsp-dsp-lib + * Created on: 9 мар. 2024 г. + * + * lsp-dsp-lib is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * lsp-dsp-lib is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with lsp-dsp-lib. If not, see . 
+ */ + +#ifndef PRIVATE_DSP_ARCH_X86_AVX512_CORRELATION_H_ +#define PRIVATE_DSP_ARCH_X86_AVX512_CORRELATION_H_ + +#ifndef PRIVATE_DSP_ARCH_X86_AVX512_IMPL + #error "This header should not be included directly" +#endif /* PRIVATE_DSP_ARCH_X86_AVX512_IMPL */ + +namespace lsp +{ + namespace avx512 + { + void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count) + { + IF_ARCH_X86( + size_t off; + ); + + ARCH_X86_ASM + ( + __ASM_EMIT("xor %[off], %[off]") + __ASM_EMIT("vxorps %%zmm0, %%zmm0, %%zmm0") /* xv = 0 */ + __ASM_EMIT("vxorps %%zmm1, %%zmm1, %%zmm1") /* xa = 0 */ + __ASM_EMIT("vxorps %%zmm2, %%zmm2, %%zmm2") /* xb = 0 */ + /* 32x blocks */ + __ASM_EMIT("sub $32, %[count]") + __ASM_EMIT("jb 2f") + __ASM_EMIT("1:") + __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%zmm3") /* zmm3 = a0 */ + __ASM_EMIT("vmovups 0x40(%[a], %[off]), %%zmm4") /* zmm4 = a1 */ + __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%zmm5") /* zmm5 = b0 */ + __ASM_EMIT("vmovups 0x40(%[b], %[off]), %%zmm6") /* zmm6 = b1 */ + __ASM_EMIT("vfmadd231ps %%zmm3, %%zmm5, %%zmm0") /* zmm0 = xv + a0*b0 */ + __ASM_EMIT("vfmadd231ps %%zmm3, %%zmm3, %%zmm1") /* zmm1 = xa + a0*a0 */ + __ASM_EMIT("vfmadd231ps %%zmm5, %%zmm5, %%zmm2") /* zmm2 = xb + b0*b0 */ + __ASM_EMIT("vfmadd231ps %%zmm4, %%zmm6, %%zmm0") /* zmm0 = xv + a1*b1 */ + __ASM_EMIT("vfmadd231ps %%zmm4, %%zmm4, %%zmm1") /* zmm1 = xa + a1*a1 */ + __ASM_EMIT("vfmadd231ps %%zmm6, %%zmm6, %%zmm2") /* zmm2 = xb + b1*b1 */ + __ASM_EMIT("add $0x80, %[off]") /* ++off */ + __ASM_EMIT("sub $32, %[count]") + __ASM_EMIT("jae 1b") + __ASM_EMIT("vextractf64x4 $1, %%zmm0, %%ymm4") + __ASM_EMIT("vextractf64x4 $1, %%zmm1, %%ymm5") + __ASM_EMIT("vextractf64x4 $1, %%zmm2, %%ymm6") + __ASM_EMIT("vaddps %%ymm4, %%ymm0, %%ymm0") + __ASM_EMIT("vaddps %%ymm5, %%ymm1, %%ymm1") + __ASM_EMIT("vaddps %%ymm6, %%ymm2, %%ymm2") + __ASM_EMIT("2:") + /* 16x blocks */ + __ASM_EMIT("add $16, %[count]") + __ASM_EMIT("jl 4f") + __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%ymm3") /* ymm3 = a0 */ + __ASM_EMIT("vmovups 0x20(%[a], %[off]), %%ymm4") /* ymm4 = a1 */ + __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%ymm5") /* ymm5 = b0 */ + __ASM_EMIT("vmovups 0x20(%[b], %[off]), %%ymm6") /* ymm6 = b1 */ + __ASM_EMIT("vfmadd231ps %%ymm3, %%ymm5, %%ymm0") /* ymm0 = xv + a0*b0 */ + __ASM_EMIT("vfmadd231ps %%ymm3, %%ymm3, %%ymm1") /* ymm1 = xa + a0*a0 */ + __ASM_EMIT("vfmadd231ps %%ymm5, %%ymm5, %%ymm2") /* ymm2 = xb + b0*b0 */ + __ASM_EMIT("vfmadd231ps %%ymm4, %%ymm6, %%ymm0") /* ymm0 = xv + a1*b1 */ + __ASM_EMIT("vfmadd231ps %%ymm4, %%ymm4, %%ymm1") /* ymm1 = xa + a1*a1 */ + __ASM_EMIT("vfmadd231ps %%ymm6, %%ymm6, %%ymm2") /* ymm2 = xb + b1*b1 */ + __ASM_EMIT("sub $16, %[count]") + __ASM_EMIT("add $0x40, %[off]") /* ++off */ + __ASM_EMIT("4:") + __ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm4") + __ASM_EMIT("vextractf128 $1, %%ymm1, %%xmm5") + __ASM_EMIT("vextractf128 $1, %%ymm2, %%xmm6") + __ASM_EMIT("vaddps %%xmm4, %%xmm0, %%xmm0") + __ASM_EMIT("vaddps %%xmm5, %%xmm1, %%xmm1") + __ASM_EMIT("vaddps %%xmm6, %%xmm2, %%xmm2") + /* 8x block */ + __ASM_EMIT("add $8, %[count]") + __ASM_EMIT("jl 6f") + __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */ + __ASM_EMIT("vmovups 0x10(%[a], %[off]), %%xmm4") /* xmm4 = a1 */ + __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */ + __ASM_EMIT("vmovups 0x10(%[b], %[off]), %%xmm6") /* xmm6 = b1 */ + __ASM_EMIT("vfmadd231ps %%xmm3, %%xmm5, %%xmm0") /* xmm0 = xv + a0*b0 */ + __ASM_EMIT("vfmadd231ps %%xmm3, %%xmm3, %%xmm1") /* xmm1 = xa + a0*a0 */ + 
__ASM_EMIT("vfmadd231ps %%xmm5, %%xmm5, %%xmm2") /* xmm2 = xb + b0*b0 */ + __ASM_EMIT("vfmadd231ps %%xmm4, %%xmm6, %%xmm0") /* xmm0 = xv + a1*b1 */ + __ASM_EMIT("vfmadd231ps %%xmm4, %%xmm4, %%xmm1") /* xmm1 = xa + a1*a1 */ + __ASM_EMIT("vfmadd231ps %%xmm6, %%xmm6, %%xmm2") /* xmm2 = xb + b1*b1 */ + __ASM_EMIT("sub $8, %[count]") + __ASM_EMIT("add $0x20, %[off]") /* ++off */ + __ASM_EMIT("6:") + /* 4x block */ + __ASM_EMIT("add $4, %[count]") + __ASM_EMIT("jl 8f") + __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */ + __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */ + __ASM_EMIT("vfmadd231ps %%xmm3, %%xmm5, %%xmm0") /* xmm0 = xv + a0*b0 */ + __ASM_EMIT("vfmadd231ps %%xmm3, %%xmm3, %%xmm1") /* xmm1 = xa + a0*a0 */ + __ASM_EMIT("vfmadd231ps %%xmm5, %%xmm5, %%xmm2") /* xmm2 = xb + b0*b0 */ + __ASM_EMIT("sub $4, %[count]") + __ASM_EMIT("add $0x10, %[off]") /* ++off */ + __ASM_EMIT("8:") + /* Do horizontal sum */ + __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0") /* xmm0 = xv0+xv1 xv2+xv3 xv0+xv1 xv2+xv3 */ + __ASM_EMIT("vhaddps %%xmm1, %%xmm1, %%xmm1") /* xmm1 = xv0+xv1 xv2+xv3 xv0+xv1 xv2+xv3 */ + __ASM_EMIT("vhaddps %%xmm2, %%xmm2, %%xmm2") /* xmm2 = xv0+xv1 xv2+xv3 xv0+xv1 xv2+xv3 */ + __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0") /* xmm0 = xv0+xv1+xv2+xv3 */ + __ASM_EMIT("vhaddps %%xmm1, %%xmm1, %%xmm1") /* xmm1 = xv0+xv1+xv2+xv3 */ + __ASM_EMIT("vhaddps %%xmm2, %%xmm2, %%xmm2") /* xmm2 = xv0+xv1+xv2+xv3 */ + /* 1x blocks */ + __ASM_EMIT("add $3, %[count]") + __ASM_EMIT("jl 10f") + __ASM_EMIT("9:") + __ASM_EMIT("vmovss 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */ + __ASM_EMIT("vmovss 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */ + __ASM_EMIT("vfmadd231ss %%xmm3, %%xmm5, %%xmm0") /* xmm0 = xv + a0*b0 */ + __ASM_EMIT("vfmadd231ss %%xmm3, %%xmm3, %%xmm1") /* xmm1 = xa + a0*a0 */ + __ASM_EMIT("vfmadd231ss %%xmm5, %%xmm5, %%xmm2") /* xmm2 = xb + b0*b0 */ + __ASM_EMIT("add $0x04, %[off]") /* ++off */ + __ASM_EMIT("dec %[count]") + __ASM_EMIT("jge 9b") + __ASM_EMIT("10:") + /* Store result */ + __ASM_EMIT("vaddss 0x00(%[corr]), %%xmm0, %%xmm0") + __ASM_EMIT("vaddss 0x04(%[corr]), %%xmm1, %%xmm1") + __ASM_EMIT("vaddss 0x08(%[corr]), %%xmm2, %%xmm2") + __ASM_EMIT("vmovss %%xmm0, 0x00(%[corr])") + __ASM_EMIT("vmovss %%xmm1, 0x04(%[corr])") + __ASM_EMIT("vmovss %%xmm2, 0x08(%[corr])") + + : [corr] "+r" (corr), [off] "=&r" (off), [count] "+r" (count) + : [a] "r" (a), [b] "r" (b) + : "cc", "memory", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7" + ); + } + + static const float corr_const[] __lsp_aligned64 = + { + LSP_DSP_VEC16(1e-10f) + }; + + static const uint32_t corr_idx[] __lsp_aligned64 = + { + 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 7, 7, 7, 7, // 0 0 A0 A1 + 0, 0, 0, 0, 3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, // 0 A0 A1 A2 + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15 + }; + + static const uint16_t corr_kmask[] = + { + 0xff00, + 0xfff0 + }; + + void corr_incr(dsp::correlation_t *corr, float *dst, + const float *a_head, const float *b_head, + const float *a_tail, const float *b_tail, + size_t count) + { + IF_ARCH_I386( + void *ptr; + ); + + ARCH_X86_ASM + ( + /* 16x blocks */ + __ASM_EMIT32("subl $16, %[count]") + __ASM_EMIT64("sub $16, %[count]") + __ASM_EMIT("jb 2f") + __ASM_EMIT("kmovw 0x00 + %[CORR_KMASK], %%k1") + __ASM_EMIT("kmovw 0x02 + %[CORR_KMASK], %%k2") + __ASM_EMIT("1:") + __ASM_EMIT("vmovups 0x00(%[a_head]), %%zmm0") /* zmm0 = ah */ + __ASM_EMIT("vmovups 0x00(%[b_head]), %%zmm1") /* zmm1 = bh */ + __ASM_EMIT("vmovups 
0x00(%[a_tail]), %%zmm3") /* zmm3 = at */ + __ASM_EMIT("vmovups 0x00(%[b_tail]), %%zmm4") /* zmm4 = bt */ + __ASM_EMIT("vmulps %%zmm1, %%zmm0, %%zmm2") /* zmm2 = ah*bh */ + __ASM_EMIT("vmulps %%zmm0, %%zmm0, %%zmm0") /* zmm0 = ah*ah */ + __ASM_EMIT("vmulps %%zmm1, %%zmm1, %%zmm1") /* zmm1 = bh*bh */ + __ASM_EMIT("vxorps %%zmm6, %%zmm6, %%zmm6") /* zmm6 = 0 0 0 0 0 0 0 0 */ + __ASM_EMIT("vfnmadd231ps %%zmm4, %%zmm3, %%zmm2") /* zmm2 = DV = ah*bh - at*bt */ + __ASM_EMIT("vfnmadd231ps %%zmm3, %%zmm3, %%zmm0") /* zmm0 = DA = ah*ah - at*at */ + __ASM_EMIT("vfnmadd231ps %%zmm4, %%zmm4, %%zmm1") /* zmm1 = DB = bh*bh - bt*bt */ + + __ASM_EMIT("vshufps $0x44, %%zmm0, %%zmm6, %%zmm3") /* zmm3 = 0 0 DA[0] DA[1] ... */ + __ASM_EMIT("vshufps $0x44, %%zmm1, %%zmm6, %%zmm4") /* zmm4 = 0 0 DB[0] DB[1] ... */ + __ASM_EMIT("vshufps $0x44, %%zmm2, %%zmm6, %%zmm5") /* zmm5 = 0 0 DV[0] DV[1] ... */ + __ASM_EMIT("vaddps %%zmm3, %%zmm0, %%zmm0") /* zmm0 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] ... */ + __ASM_EMIT("vaddps %%zmm4, %%zmm1, %%zmm1") /* zmm1 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] ... */ + __ASM_EMIT("vaddps %%zmm5, %%zmm2, %%zmm2") /* zmm2 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] ... */ + __ASM_EMIT("vmovaps 0x00 + %[CORR_IDX], %%zmm6") /* zmm6 = permute mask */ + __ASM_EMIT("vmovaps 0x40 + %[CORR_IDX], %%zmm7") /* zmm7 = permute mask */ + __ASM_EMIT("vshufps $0x99, %%zmm0, %%zmm3, %%zmm3") /* zmm3 = 0 DA[0] DA[1] DA[0]+DA[2] ... */ + __ASM_EMIT("vshufps $0x99, %%zmm1, %%zmm4, %%zmm4") /* zmm4 = 0 DB[0] DB[1] DB[0]+DB[2] ... */ + __ASM_EMIT("vshufps $0x99, %%zmm2, %%zmm5, %%zmm5") /* zmm5 = 0 DV[0] DV[1] DV[0]+DV[2] ... */ + __ASM_EMIT("vaddps %%zmm0, %%zmm3, %%zmm3") /* zmm3 = A = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] ... */ + __ASM_EMIT("vaddps %%zmm1, %%zmm4, %%zmm4") /* zmm4 = B = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] ... */ + __ASM_EMIT("vaddps %%zmm2, %%zmm5, %%zmm5") /* zmm5 = V = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] ... 
*/ + __ASM_EMIT("vpermps %%zmm3, %%zmm6, %%zmm0") /* zmm0 = A0 A0 A0 A1 */ + __ASM_EMIT("vpermps %%zmm4, %%zmm6, %%zmm1") /* zmm1 = B0 B0 B0 B1 */ + __ASM_EMIT("vpermps %%zmm5, %%zmm6, %%zmm2") /* zmm2 = V0 V0 V0 V1 */ + __ASM_EMIT("vaddps %%zmm0, %%zmm3, %%zmm3 %{%%k1%}") /* zmm3 = A0 A1 A0+A2 A1+A3 */ + __ASM_EMIT("vaddps %%zmm1, %%zmm4, %%zmm4 %{%%k1%}") /* zmm4 = B0 B1 B0+B2 B1+B3 */ + __ASM_EMIT("vaddps %%zmm2, %%zmm5, %%zmm5 %{%%k1%}") /* zmm5 = V0 V1 V0+V2 V1+V3 */ + __ASM_EMIT("vpermps %%zmm3, %%zmm7, %%zmm0") /* zmm0 = A0 A0 A1 A0+A2 */ + __ASM_EMIT("vpermps %%zmm4, %%zmm7, %%zmm1") /* zmm1 = B0 B0 B1 B0+B2 */ + __ASM_EMIT("vpermps %%zmm5, %%zmm7, %%zmm2") /* zmm2 = V0 V0 V1 V0+V2 */ + __ASM_EMIT("vaddps %%zmm0, %%zmm3, %%zmm3 %{%%k2%}") /* zmm3 = A0 A0+A1 A0+A1+A2 A0+A1+A2+A3 */ + __ASM_EMIT("vaddps %%zmm1, %%zmm4, %%zmm4 %{%%k2%}") /* zmm4 = B0 B0+B1 B0+B1+B2 B0+B1+B2+B3 */ + __ASM_EMIT("vaddps %%zmm2, %%zmm5, %%zmm5 %{%%k2%}") /* zmm5 = V0 V0+V1 V0+V1+V2 V0+V1+V2+V3 */ + + __ASM_EMIT32("mov %[corr], %[ptr]") + __ASM_EMIT32("vbroadcastss 0x00(%[ptr]), %%zmm0") + __ASM_EMIT32("vbroadcastss 0x04(%[ptr]), %%zmm1") + __ASM_EMIT32("vbroadcastss 0x08(%[ptr]), %%zmm2") + __ASM_EMIT64("vbroadcastss 0x00(%[corr]), %%zmm0") + __ASM_EMIT64("vbroadcastss 0x04(%[corr]), %%zmm1") + __ASM_EMIT64("vbroadcastss 0x08(%[corr]), %%zmm2") + + __ASM_EMIT("vaddps %%zmm3, %%zmm1, %%zmm1") /* zmm1 = BA = xa + A */ + __ASM_EMIT("vaddps %%zmm4, %%zmm2, %%zmm2") /* zmm2 = BB = xb + B */ + __ASM_EMIT("vaddps %%zmm5, %%zmm0, %%zmm0") /* zmm0 = T = xv + V */ + __ASM_EMIT("vmovaps 0x80 + %[CORR_IDX], %%zmm6") /* zmm6 = permute mask */ + __ASM_EMIT("vmulps %%zmm2, %%zmm1, %%zmm7") /* zmm7 = B = BA*BB */ + __ASM_EMIT("vpermps %%zmm0, %%zmm6, %%zmm3") /* zmm3 = xv' */ + __ASM_EMIT("vpermps %%zmm1, %%zmm6, %%zmm4") /* zmm4 = xa' */ + __ASM_EMIT("vpermps %%zmm2, %%zmm6, %%zmm5") /* zmm5 = xb' */ + + __ASM_EMIT32("vmovss %%xmm3, 0x00(%[ptr])") + __ASM_EMIT32("vmovss %%xmm4, 0x04(%[ptr])") + __ASM_EMIT32("vmovss %%xmm5, 0x08(%[ptr])") + __ASM_EMIT64("vmovss %%xmm3, 0x00(%[corr])") + __ASM_EMIT64("vmovss %%xmm4, 0x04(%[corr])") + __ASM_EMIT64("vmovss %%xmm5, 0x08(%[corr])") + + __ASM_EMIT("vsqrtps %%zmm7, %%zmm6") /* zmm6 = sqrtf(B) */ + __ASM_EMIT("vcmpps $1, %[CORR_CC], %%zmm7, %%k3") /* k3 = B < 1e-10f */ + __ASM_EMIT("vdivps %%zmm6, %%zmm0, %%zmm0") /* zmm0 = T/sqrtf(B) */ + __ASM_EMIT32("mov %[dst], %[ptr]") + __ASM_EMIT("vxorps %%zmm0, %%zmm0, %%zmm0 %{%%k3%}") /* zmm0 = (B < 1e-10f) ? 
0 : T/sqrtf(B) */ + __ASM_EMIT("add $0x40, %[a_head]") /* ++a_head */ + __ASM_EMIT("add $0x40, %[b_head]") /* ++b_head */ + __ASM_EMIT32("vmovups %%zmm0, 0x00(%[ptr])") + __ASM_EMIT64("vmovups %%zmm0, 0x00(%[dst])") + __ASM_EMIT("add $0x40, %[a_tail]") /* ++a_tail */ + __ASM_EMIT("add $0x40, %[b_tail]") /* ++b_tail */ + __ASM_EMIT32("add $0x40, %[ptr]") + __ASM_EMIT64("add $0x40, %[dst]") + __ASM_EMIT32("mov %[ptr], %[dst]") + __ASM_EMIT32("subl $16, %[count]") + __ASM_EMIT64("sub $16, %[count]") + __ASM_EMIT("jae 1b") + __ASM_EMIT("2:") + /* 8x block */ + __ASM_EMIT32("addl $8, %[count]") + __ASM_EMIT64("add $8, %[count]") + __ASM_EMIT("jl 4f") + __ASM_EMIT("vmovups 0x00(%[a_head]), %%ymm0") /* ymm0 = ah */ + __ASM_EMIT("vmovups 0x00(%[b_head]), %%ymm1") /* ymm1 = bh */ + __ASM_EMIT("vmovups 0x00(%[a_tail]), %%ymm3") /* ymm3 = at */ + __ASM_EMIT("vmovups 0x00(%[b_tail]), %%ymm4") /* ymm4 = bt */ + __ASM_EMIT("vmulps %%ymm1, %%ymm0, %%ymm2") /* ymm2 = ah*bh */ + __ASM_EMIT("vmulps %%ymm0, %%ymm0, %%ymm0") /* ymm0 = ah*ah */ + __ASM_EMIT("vmulps %%ymm1, %%ymm1, %%ymm1") /* ymm1 = bh*bh */ + __ASM_EMIT("vxorps %%ymm6, %%ymm6, %%ymm6") /* ymm6 = 0 0 0 0 0 0 0 0 */ + __ASM_EMIT("vfnmadd231ps %%ymm4, %%ymm3, %%ymm2") /* ymm2 = DV = ah*bh - at*bt */ + __ASM_EMIT("vfnmadd231ps %%ymm3, %%ymm3, %%ymm0") /* ymm0 = DA = ah*ah - at*at */ + __ASM_EMIT("vfnmadd231ps %%ymm4, %%ymm4, %%ymm1") /* ymm1 = DB = bh*bh - bt*bt */ + + __ASM_EMIT("vshufps $0x44, %%ymm0, %%ymm6, %%ymm3") /* ymm3 = 0 0 DA[0] DA[1] 0 0 DA[4] DA[5] */ + __ASM_EMIT("vshufps $0x44, %%ymm1, %%ymm6, %%ymm4") /* ymm4 = 0 0 DB[0] DB[1] 0 0 DB[4] DB[5] */ + __ASM_EMIT("vshufps $0x44, %%ymm2, %%ymm6, %%ymm5") /* ymm5 = 0 0 DV[0] DV[1] 0 0 DV[4] DV[5] */ + __ASM_EMIT("vaddps %%ymm3, %%ymm0, %%ymm0") /* ymm0 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] DA[4] DA[5] DA[4]+DA[6] DA[5]+DA[7] */ + __ASM_EMIT("vaddps %%ymm4, %%ymm1, %%ymm1") /* ymm1 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] DB[4] DB[5] DB[4]+DB[6] DB[5]+DB[7] */ + __ASM_EMIT("vaddps %%ymm5, %%ymm2, %%ymm2") /* ymm2 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] DV[4] DV[5] DV[4]+DV[6] DV[5]+DV[7] */ + __ASM_EMIT("vshufps $0x99, %%ymm0, %%ymm3, %%ymm3") /* ymm3 = 0 DA[0] DA[1] DA[0]+DA[2] 0 DA[4] DA[5] DA[4]+DA[6] */ + __ASM_EMIT("vshufps $0x99, %%ymm1, %%ymm4, %%ymm4") /* ymm4 = 0 DB[0] DB[1] DB[0]+DB[2] 0 DB[4] DB[5] DB[4]+DB[6] */ + __ASM_EMIT("vshufps $0x99, %%ymm2, %%ymm5, %%ymm5") /* ymm5 = 0 DV[0] DV[1] DV[0]+DV[2] 0 DV[4] DV[5] DV[4]+DV[6] */ + __ASM_EMIT("vaddps %%ymm0, %%ymm3, %%ymm3") /* ymm3 = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] DA[4] DA[4]+DA[5] DA[4]+DA[5]+DA[6] DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%ymm1, %%ymm4, %%ymm4") /* ymm4 = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] DB[4] DB[4]+DB[5] DB[4]+DB[5]+DB[6] DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vaddps %%ymm2, %%ymm5, %%ymm5") /* ymm5 = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] DV[4] DV[4]+DV[5] DV[4]+DV[5]+DV[6] DV[4]+DV[5]+DV[6]+DV[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm3, %%xmm3, %%xmm0") /* xmm0 = DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3] */ + __ASM_EMIT("vshufps $0xff, %%xmm4, %%xmm4, %%xmm1") /* xmm1 = DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3] */ + __ASM_EMIT("vshufps $0xff, %%xmm5, %%xmm5, %%xmm2") /* xmm2 = DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3] */ + __ASM_EMIT("vextractf128 $1, %%ymm3, %%xmm6") 
/* xmm6 = DA[4] DA[4]+DA[5] DA[4]+DA[5]+DA[6] DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vextractf128 $1, %%ymm4, %%xmm7") /* xmm7 = DB[4] DB[4]+DB[5] DB[4]+DB[5]+DB[6] DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vaddps %%xmm0, %%xmm6, %%xmm6") /* xmm6 = DA[0]+DA[1]+DA[2]+DA[3]+DA[4] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%xmm1, %%xmm7, %%xmm7") /* xmm7 = DB[0]+DB[1]+DB[2]+DB[3]+DB[4] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vextractf128 $1, %%ymm5, %%xmm0") /* xmm0 = DV[4] DV[4]+DV[5] DV[4]+DV[5]+DV[6] DV[4]+DV[5]+DV[6]+DV[7] */ + __ASM_EMIT("vinsertf128 $1, %%xmm6, %%ymm3, %%ymm3") /* ymm3 = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3]+DA[4] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0") /* xmm0 = DV[0]+DV[1]+DV[2]+DV[3]+DV[4] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6]+DV[7] */ + __ASM_EMIT("vinsertf128 $1, %%xmm7, %%ymm4, %%ymm4") /* ymm4 = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3]+DB[4] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vinsertf128 $1, %%xmm0, %%ymm5, %%ymm5") /* ymm5 = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3]+DV[4] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6]+DV[7] */ + + __ASM_EMIT32("mov %[corr], %[ptr]") + __ASM_EMIT32("vbroadcastss 0x00(%[ptr]), %%ymm0") + __ASM_EMIT32("vbroadcastss 0x04(%[ptr]), %%ymm1") + __ASM_EMIT32("vbroadcastss 0x08(%[ptr]), %%ymm2") + __ASM_EMIT64("vbroadcastss 0x00(%[corr]), %%ymm0") + __ASM_EMIT64("vbroadcastss 0x04(%[corr]), %%ymm1") + __ASM_EMIT64("vbroadcastss 0x08(%[corr]), %%ymm2") + + __ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1") /* ymm1 = BA = xa+DA[0] xa+DA[0]+DA[1] xa+DA[0]+DA[1]+DA[2] xa+DA[0]+DA[1]+DA[2]+DA[3] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%ymm4, %%ymm2, %%ymm2") /* ymm2 = BB = xb+DB[0] xb+DB[0]+DB[1] xb+DB[0]+DB[1]+DB[2] xb+DB[0]+DB[1]+DB[2]+DB[3] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vaddps %%ymm5, %%ymm0, %%ymm0") /* ymm0 = T = xv+DV[0] xv+DV[0]+DV[1] xv+DV[0]+DV[1]+DV[2] xv+DV[0]+DV[1]+DV[2]+DV[3] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6]+DV[7] */ + __ASM_EMIT("vmulps %%ymm2, %%ymm1, %%ymm7") /* ymm7 = B = BA*BB */ + __ASM_EMIT("vextractf128 $1, %%ymm1, %%xmm4") /* xmm4 = BA[4] BA[5] BA[6] BA[7] */ + __ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm3") /* xmm3 = T[4] T[5] T[6] T[7] */ + __ASM_EMIT("vextractf128 $1, %%ymm2, %%xmm5") /* xmm5 = BB[4] BB[5] BB[6] BB[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm3, %%xmm3, %%xmm3") /* xmm3 = T[7] T[7] T[7] T[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm4, %%xmm4, %%xmm4") /* xmm4 = 
BA[7] BA[7] BA[7] BA[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm5, %%xmm5, %%xmm5") /* xmm5 = BB[7] BB[7] BB[7] BB[7] */ + + __ASM_EMIT32("vmovss %%xmm3, 0x00(%[ptr])") + __ASM_EMIT32("vmovss %%xmm4, 0x04(%[ptr])") + __ASM_EMIT32("vmovss %%xmm5, 0x08(%[ptr])") + __ASM_EMIT64("vmovss %%xmm3, 0x00(%[corr])") + __ASM_EMIT64("vmovss %%xmm4, 0x04(%[corr])") + __ASM_EMIT64("vmovss %%xmm5, 0x08(%[corr])") + + __ASM_EMIT("vsqrtps %%ymm7, %%ymm6") /* ymm6 = sqrtf(B) */ + __ASM_EMIT("vcmpps $5, %[CORR_CC], %%ymm7, %%ymm1") /* ymm1 = B >= 1e-10f */ + __ASM_EMIT("vdivps %%ymm6, %%ymm0, %%ymm0") /* ymm0 = T/sqrtf(B) */ + __ASM_EMIT32("mov %[dst], %[ptr]") + __ASM_EMIT("vandps %%ymm1, %%ymm0, %%ymm0") /* ymm0 = (B >= 1e-10f) ? T/sqrtf(B) : 0 */ + __ASM_EMIT("add $0x20, %[a_head]") /* ++a_head */ + __ASM_EMIT("add $0x20, %[b_head]") /* ++b_head */ + __ASM_EMIT32("vmovups %%ymm0, 0x00(%[ptr])") + __ASM_EMIT64("vmovups %%ymm0, 0x00(%[dst])") + __ASM_EMIT("add $0x20, %[a_tail]") /* ++a_tail */ + __ASM_EMIT("add $0x20, %[b_tail]") /* ++b_tail */ + __ASM_EMIT32("add $0x20, %[ptr]") + __ASM_EMIT64("add $0x20, %[dst]") + __ASM_EMIT32("mov %[ptr], %[dst]") + __ASM_EMIT32("subl $8, %[count]") + __ASM_EMIT64("sub $8, %[count]") + __ASM_EMIT("4:") + /* 4x block */ + __ASM_EMIT("add $4, %[count]") + __ASM_EMIT("jl 6f") + __ASM_EMIT("vmovups 0x00(%[a_head]), %%xmm0") /* xmm0 = ah */ + __ASM_EMIT("vmovups 0x00(%[b_head]), %%xmm1") /* xmm1 = bh */ + __ASM_EMIT("vmovups 0x00(%[a_tail]), %%xmm3") /* xmm3 = at */ + __ASM_EMIT("vmovups 0x00(%[b_tail]), %%xmm4") /* xmm4 = bt */ + __ASM_EMIT("vmulps %%xmm1, %%xmm0, %%xmm2") /* xmm2 = ah*bh */ + __ASM_EMIT("vmulps %%xmm0, %%xmm0, %%xmm0") /* xmm0 = ah*ah */ + __ASM_EMIT("vmulps %%xmm1, %%xmm1, %%xmm1") /* xmm1 = bh*bh */ + __ASM_EMIT("vxorps %%xmm6, %%xmm6, %%xmm6") /* xmm6 = 0 0 0 0 0 0 0 0 */ + __ASM_EMIT("vfnmadd231ps %%xmm4, %%xmm3, %%xmm2") /* xmm2 = DV = ah*bh - at*bt */ + __ASM_EMIT("vfnmadd231ps %%xmm3, %%xmm3, %%xmm0") /* xmm0 = DA = ah*ah - at*at */ + __ASM_EMIT("vfnmadd231ps %%xmm4, %%xmm4, %%xmm1") /* xmm1 = DB = bh*bh - bt*bt */ + + __ASM_EMIT("vmovlhps %%xmm0, %%xmm6, %%xmm3") /* xmm3 = 0 0 DA[0] DA[1] 0 0 DA[4] DA[5] */ + __ASM_EMIT("vmovlhps %%xmm1, %%xmm6, %%xmm4") /* xmm4 = 0 0 DB[0] DB[1] 0 0 DB[4] DB[5] */ + __ASM_EMIT("vmovlhps %%xmm2, %%xmm6, %%xmm5") /* xmm5 = 0 0 DV[0] DV[1] 0 0 DV[4] DV[5] */ + __ASM_EMIT("vaddps %%xmm3, %%xmm0, %%xmm0") /* xmm0 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] DA[4] DA[5] DA[4]+DA[6] DA[5]+DA[7] */ + __ASM_EMIT("vaddps %%xmm4, %%xmm1, %%xmm1") /* xmm1 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] DB[4] DB[5] DB[4]+DB[6] DB[5]+DB[7] */ + __ASM_EMIT("vaddps %%xmm5, %%xmm2, %%xmm2") /* xmm2 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] DV[4] DV[5] DV[4]+DV[6] DV[5]+DV[7] */ + __ASM_EMIT("vshufps $0x99, %%xmm0, %%xmm3, %%xmm3") /* xmm3 = 0 DA[0] DA[1] DA[0]+DA[2] 0 DA[4] DA[5] DA[4]+DA[6] */ + __ASM_EMIT("vshufps $0x99, %%xmm1, %%xmm4, %%xmm4") /* xmm4 = 0 DB[0] DB[1] DB[0]+DB[2] 0 DB[4] DB[5] DB[4]+DB[6] */ + __ASM_EMIT("vshufps $0x99, %%xmm2, %%xmm5, %%xmm5") /* xmm5 = 0 DV[0] DV[1] DV[0]+DV[2] 0 DV[4] DV[5] DV[4]+DV[6] */ + __ASM_EMIT("vaddps %%xmm0, %%xmm3, %%xmm3") /* xmm3 = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] DA[4] DA[4]+DA[5] DA[4]+DA[5]+DA[6] DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%xmm1, %%xmm4, %%xmm4") /* xmm4 = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] DB[4] DB[4]+DB[5] DB[4]+DB[5]+DB[6] DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vaddps %%xmm2, %%xmm5, %%xmm5") /* xmm5 = DV[0] DV[0]+DV[1] 
DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] DV[4] DV[4]+DV[5] DV[4]+DV[5]+DV[6] DV[4]+DV[5]+DV[6]+DV[7] */ + + __ASM_EMIT32("mov %[corr], %[ptr]") + __ASM_EMIT32("vbroadcastss 0x00(%[ptr]), %%xmm0") + __ASM_EMIT32("vbroadcastss 0x04(%[ptr]), %%xmm1") + __ASM_EMIT32("vbroadcastss 0x08(%[ptr]), %%xmm2") + __ASM_EMIT64("vbroadcastss 0x00(%[corr]), %%xmm0") + __ASM_EMIT64("vbroadcastss 0x04(%[corr]), %%xmm1") + __ASM_EMIT64("vbroadcastss 0x08(%[corr]), %%xmm2") + + __ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1") /* xmm1 = BA = xa+DA[0] xa+DA[0]+DA[1] xa+DA[0]+DA[1]+DA[2] xa+DA[0]+DA[1]+DA[2]+DA[3] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6]+DA[7] */ + __ASM_EMIT("vaddps %%xmm4, %%xmm2, %%xmm2") /* xmm2 = BB = xb+DB[0] xb+DB[0]+DB[1] xb+DB[0]+DB[1]+DB[2] xb+DB[0]+DB[1]+DB[2]+DB[3] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6]+DB[7] */ + __ASM_EMIT("vaddps %%xmm5, %%xmm0, %%xmm0") /* xmm0 = T = xv+DV[0] xv+DV[0]+DV[1] xv+DV[0]+DV[1]+DV[2] xv+DV[0]+DV[1]+DV[2]+DV[3] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6]+DV[7] */ + __ASM_EMIT("vmulps %%xmm2, %%xmm1, %%xmm7") /* xmm7 = B = BA*BB */ + __ASM_EMIT("vshufps $0xff, %%xmm0, %%xmm0, %%xmm3") /* xmm3 = T[7] T[7] T[7] T[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm1, %%xmm1, %%xmm4") /* xmm4 = BA[7] BA[7] BA[7] BA[7] */ + __ASM_EMIT("vshufps $0xff, %%xmm2, %%xmm2, %%xmm5") /* xmm5 = BB[7] BB[7] BB[7] BB[7] */ + + __ASM_EMIT32("vmovss %%xmm3, 0x00(%[ptr])") + __ASM_EMIT32("vmovss %%xmm4, 0x04(%[ptr])") + __ASM_EMIT32("vmovss %%xmm5, 0x08(%[ptr])") + __ASM_EMIT64("vmovss %%xmm3, 0x00(%[corr])") + __ASM_EMIT64("vmovss %%xmm4, 0x04(%[corr])") + __ASM_EMIT64("vmovss %%xmm5, 0x08(%[corr])") + + __ASM_EMIT("vsqrtps %%xmm7, %%xmm6") /* xmm6 = sqrtf(B) */ + __ASM_EMIT("vcmpps $5, %[CORR_CC], %%xmm7, %%xmm1") /* xmm1 = B >= 1e-10f */ + __ASM_EMIT("vdivps %%xmm6, %%xmm0, %%xmm0") /* xmm0 = T/sqrtf(B) */ + __ASM_EMIT32("mov %[dst], %[ptr]") + __ASM_EMIT("vandps %%xmm1, %%xmm0, %%xmm0") /* xmm0 = (B >= 1e-10f) ? 
T/sqrtf(B) : 0 */ + __ASM_EMIT("add $0x10, %[a_head]") /* ++a_head */ + __ASM_EMIT("add $0x10, %[b_head]") /* ++b_head */ + __ASM_EMIT32("vmovups %%xmm0, 0x00(%[ptr])") + __ASM_EMIT64("vmovups %%xmm0, 0x00(%[dst])") + __ASM_EMIT("add $0x10, %[a_tail]") /* ++a_tail */ + __ASM_EMIT("add $0x10, %[b_tail]") /* ++b_tail */ + __ASM_EMIT32("add $0x10, %[ptr]") + __ASM_EMIT64("add $0x10, %[dst]") + __ASM_EMIT32("mov %[ptr], %[dst]") + __ASM_EMIT32("subl $4, %[count]") + __ASM_EMIT64("sub $4, %[count]") + __ASM_EMIT("6:") + + /* 1x blocks */ + __ASM_EMIT("add $3, %[count]") + __ASM_EMIT("jl 8f") + __ASM_EMIT("7:") + __ASM_EMIT("vmovss 0x00(%[a_head]), %%xmm0") /* xmm0 = ah */ + __ASM_EMIT("vmovss 0x00(%[b_head]), %%xmm1") /* xmm1 = bh */ + __ASM_EMIT("vmovss 0x00(%[a_tail]), %%xmm3") /* xmm3 = at */ + __ASM_EMIT("vmovss 0x00(%[b_tail]), %%xmm4") /* xmm4 = bt */ + __ASM_EMIT("vmulss %%xmm1, %%xmm0, %%xmm2") /* xmm2 = ah*bh */ + __ASM_EMIT("vmulss %%xmm0, %%xmm0, %%xmm0") /* xmm0 = ah*ah */ + __ASM_EMIT("vmulss %%xmm1, %%xmm1, %%xmm1") /* xmm1 = bh*bh */ + __ASM_EMIT("vfnmadd231ss %%xmm4, %%xmm3, %%xmm2") /* xmm2 = DV = ah*bh - at*bt */ + __ASM_EMIT("vfnmadd231ss %%xmm3, %%xmm3, %%xmm0") /* xmm0 = DA = ah*ah - at*at */ + __ASM_EMIT("vfnmadd231ss %%xmm4, %%xmm4, %%xmm1") /* xmm1 = DB = bh*bh - bt*bt */ + + __ASM_EMIT32("mov %[corr], %[ptr]") + __ASM_EMIT32("vmovss 0x00(%[ptr]), %%xmm3") + __ASM_EMIT32("vmovss 0x04(%[ptr]), %%xmm4") + __ASM_EMIT32("vmovss 0x08(%[ptr]), %%xmm5") + __ASM_EMIT64("vmovss 0x00(%[corr]), %%xmm3") + __ASM_EMIT64("vmovss 0x04(%[corr]), %%xmm4") + __ASM_EMIT64("vmovss 0x08(%[corr]), %%xmm5") + + __ASM_EMIT("vaddss %%xmm3, %%xmm2, %%xmm2") /* xmm2 = T = xv+DV */ + __ASM_EMIT("vaddss %%xmm4, %%xmm0, %%xmm0") /* xmm0 = BA = xa+DA */ + __ASM_EMIT("vaddss %%xmm5, %%xmm1, %%xmm1") /* xmm1 = BB = xb+DB */ + __ASM_EMIT("vmulss %%xmm1, %%xmm0, %%xmm7") /* xmm7 = B = BA*BB */ + + __ASM_EMIT32("vmovss %%xmm2, 0x00(%[ptr])") + __ASM_EMIT32("vmovss %%xmm0, 0x04(%[ptr])") + __ASM_EMIT32("vmovss %%xmm1, 0x08(%[ptr])") + __ASM_EMIT64("vmovss %%xmm2, 0x00(%[corr])") + __ASM_EMIT64("vmovss %%xmm0, 0x04(%[corr])") + __ASM_EMIT64("vmovss %%xmm1, 0x08(%[corr])") + + __ASM_EMIT("vsqrtss %%xmm7, %%xmm7, %%xmm6") /* xmm6 = sqrtf(B) */ + __ASM_EMIT("vcmpss $5, %[CORR_CC], %%xmm7, %%xmm1") /* xmm1 = B >= 1e-10f */ + __ASM_EMIT("vdivss %%xmm6, %%xmm2, %%xmm0") /* xmm0 = T/sqrtf(B) */ + __ASM_EMIT32("mov %[dst], %[ptr]") + __ASM_EMIT("vandps %%xmm1, %%xmm0, %%xmm0") /* xmm0 = (B >= 1e-10f) ? 
T/sqrtf(B) : 0 */ + __ASM_EMIT("add $0x04, %[a_head]") /* ++a_head */ + __ASM_EMIT("add $0x04, %[b_head]") /* ++b_head */ + __ASM_EMIT32("vmovss %%xmm0, 0x00(%[ptr])") + __ASM_EMIT64("vmovss %%xmm0, 0x00(%[dst])") + __ASM_EMIT("add $0x04, %[a_tail]") /* ++a_tail */ + __ASM_EMIT("add $0x04, %[b_tail]") /* ++b_tail */ + __ASM_EMIT32("add $0x04, %[ptr]") + __ASM_EMIT64("add $0x04, %[dst]") + __ASM_EMIT32("mov %[ptr], %[dst]") + __ASM_EMIT32("decl %[count]") + __ASM_EMIT64("dec %[count]") + __ASM_EMIT("jge 7b") + __ASM_EMIT("8:") + + : __IF_32( + [ptr] "=&r" (ptr), + [corr] "+m" (corr), [dst] "+m" (dst), + [a_head] "+r" (a_head), [b_head] "+r" (b_head), + [a_tail] "+r" (a_tail), [b_tail] "+r" (b_tail), + [count] "+g" (count) + ) + __IF_64( + [dst] "+r" (dst), + [a_head] "+r" (a_head), [b_head] "+r" (b_head), + [a_tail] "+r" (a_tail), [b_tail] "+r" (b_tail), + [count] "+r" (count) + ) + : __IF_64( [corr] "r" (corr), ) + [CORR_CC] "o" (corr_const), + [CORR_IDX] "o" (corr_idx), + [CORR_KMASK] "o" (corr_kmask) + : "cc", "memory", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%k1", "%k2", "%k3" + ); + } + + } /* namespace avx512 */ +} /* namespace lsp */ + + + + + +#endif /* PRIVATE_DSP_ARCH_X86_AVX512_CORRELATION_H_ */ diff --git a/include/private/dsp/arch/x86/avx512/msmatrix.h b/include/private/dsp/arch/x86/avx512/msmatrix.h index c8798e4f..cfb1be5c 100644 --- a/include/private/dsp/arch/x86/avx512/msmatrix.h +++ b/include/private/dsp/arch/x86/avx512/msmatrix.h @@ -1,6 +1,6 @@ /* - * Copyright (C) 2023 Linux Studio Plugins Project - * (C) 2023 Vladimir Sadovnikov + * Copyright (C) 2024 Linux Studio Plugins Project + * (C) 2024 Vladimir Sadovnikov * * This file is part of lsp-dsp-lib * Created on: 9 сент. 2023 г. @@ -107,8 +107,8 @@ namespace lsp __ASM_EMIT("vmovups %%xmm0, 0x00(%[mid], %[off])") __ASM_EMIT("vmovups %%xmm2, 0x00(%[side], %[off])") __ASM_EMIT("add $0x10, %[off]") - __ASM_EMIT32("subl $8, %[count]") - __ASM_EMIT64("sub $8, %[count]") + __ASM_EMIT32("subl $4, %[count]") + __ASM_EMIT64("sub $4, %[count]") __ASM_EMIT("8:") // 1x blocks __ASM_EMIT32("addl $3, %[count]") @@ -273,8 +273,8 @@ namespace lsp ARCH_X86_ASM( __ASM_EMIT("xor %[off], %[off]") // 64x blocks - __ASM_EMIT32("subl $32, %[count]") - __ASM_EMIT64("sub $32, %[count]") + __ASM_EMIT32("subl $64, %[count]") + __ASM_EMIT64("sub $64, %[count]") __ASM_EMIT("jb 2f") __ASM_EMIT("1:") __ASM_EMIT("vmovups 0x00(%[mid], %[off]), %%zmm0") // zmm0 = m diff --git a/include/private/dsp/arch/x86/sse/correlation.h b/include/private/dsp/arch/x86/sse/correlation.h new file mode 100644 index 00000000..e40d11a3 --- /dev/null +++ b/include/private/dsp/arch/x86/sse/correlation.h @@ -0,0 +1,324 @@ +/* + * Copyright (C) 2024 Linux Studio Plugins Project + * (C) 2024 Vladimir Sadovnikov + * + * This file is part of lsp-dsp-lib + * Created on: 9 мар. 2024 г. + * + * lsp-dsp-lib is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * lsp-dsp-lib is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with lsp-dsp-lib. If not, see . 
+ */ + +#ifndef PRIVATE_DSP_ARCH_X86_SSE_CORRELATION_H_ +#define PRIVATE_DSP_ARCH_X86_SSE_CORRELATION_H_ + +#ifndef PRIVATE_DSP_ARCH_X86_SSE_IMPL + #error "This header should not be included directly" +#endif /* PRIVATE_DSP_ARCH_X86_SSE_IMPL */ + +namespace lsp +{ + namespace sse + { + + void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count) + { + IF_ARCH_X86( + size_t off; + ); + + ARCH_X86_ASM + ( + __ASM_EMIT("xor %[off], %[off]") + __ASM_EMIT("xorps %%xmm0, %%xmm0") /* xv = 0 */ + __ASM_EMIT("xorps %%xmm1, %%xmm1") /* xa = 0 */ + __ASM_EMIT("xorps %%xmm2, %%xmm2") /* xb = 0 */ + /* 8x blocks */ + __ASM_EMIT("sub $8, %[count]") + __ASM_EMIT("jb 2f") + __ASM_EMIT("1:") + __ASM_EMIT("movups 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */ + __ASM_EMIT("movups 0x10(%[a], %[off]), %%xmm4") /* xmm4 = a1 */ + __ASM_EMIT("movaps %%xmm3, %%xmm7") /* xmm7 = a0 */ + __ASM_EMIT("movups 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */ + __ASM_EMIT("movups 0x10(%[b], %[off]), %%xmm6") /* xmm6 = b1 */ + __ASM_EMIT("mulps %%xmm5, %%xmm7") /* xmm7 = a0*b0 */ + __ASM_EMIT("mulps %%xmm3, %%xmm3") /* xmm3 = a0*a0 */ + __ASM_EMIT("mulps %%xmm5, %%xmm5") /* xmm5 = b0*b0 */ + __ASM_EMIT("addps %%xmm7, %%xmm0") /* xmm0 = xv + a0*b0 */ + __ASM_EMIT("addps %%xmm3, %%xmm1") /* xmm1 = xa + a0*a0 */ + __ASM_EMIT("movaps %%xmm4, %%xmm7") /* xmm7 = a1 */ + __ASM_EMIT("addps %%xmm5, %%xmm2") /* xmm2 = xb + b0*b0 */ + __ASM_EMIT("mulps %%xmm6, %%xmm7") /* xmm7 = a1*b1 */ + __ASM_EMIT("mulps %%xmm4, %%xmm4") /* xmm4 = a1*a1 */ + __ASM_EMIT("mulps %%xmm6, %%xmm6") /* xmm6 = b1*b1 */ + __ASM_EMIT("addps %%xmm7, %%xmm0") /* xmm0 = xv + a1*b1 */ + __ASM_EMIT("addps %%xmm4, %%xmm1") /* xmm1 = xa + a1*a1 */ + __ASM_EMIT("addps %%xmm6, %%xmm2") /* xmm2 = xb + b1*b1 */ + __ASM_EMIT("add $0x20, %[off]") /* ++off */ + __ASM_EMIT("sub $8, %[count]") + __ASM_EMIT("jae 1b") + __ASM_EMIT("2:") + /* 4x block */ + __ASM_EMIT("add $4, %[count]") + __ASM_EMIT("jl 4f") + __ASM_EMIT("movups 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */ + __ASM_EMIT("movups 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */ + __ASM_EMIT("movaps %%xmm3, %%xmm7") /* xmm7 = a0 */ + __ASM_EMIT("mulps %%xmm3, %%xmm3") /* xmm3 = a0*a0 */ + __ASM_EMIT("mulps %%xmm5, %%xmm7") /* xmm7 = a0*b0 */ + __ASM_EMIT("mulps %%xmm5, %%xmm5") /* xmm5 = b0*b0 */ + __ASM_EMIT("addps %%xmm7, %%xmm0") /* xmm0 = xv + a0*b0 */ + __ASM_EMIT("addps %%xmm3, %%xmm1") /* xmm1 = xa + a0*a0 */ + __ASM_EMIT("addps %%xmm5, %%xmm2") /* xmm2 = xb + b0*b0 */ + __ASM_EMIT("add $0x10, %[off]") /* ++off */ + __ASM_EMIT("sub $4, %[count]") + __ASM_EMIT("4:") + /* Do horizontal sum */ + __ASM_EMIT("movhlps %%xmm0, %%xmm4") /* xmm4 = xv2 xv3 ? ? */ + __ASM_EMIT("movhlps %%xmm1, %%xmm5") /* xmm5 = xa2 xa3 ? ? */ + __ASM_EMIT("movhlps %%xmm2, %%xmm6") /* xmm6 = xb2 xb3 ? ? */ + __ASM_EMIT("addps %%xmm4, %%xmm0") /* xmm0 = xv0+xv2 xv1+xv3 ? ? */ + __ASM_EMIT("addps %%xmm5, %%xmm1") /* xmm1 = xa0+xa2 xa1+xa3 ? ? */ + __ASM_EMIT("addps %%xmm6, %%xmm2") /* xmm2 = xb0+xb2 xb1+xb3 ? ? */ + __ASM_EMIT("unpcklps %%xmm4, %%xmm0") /* xmm0 = xv0+xv2 xv2 xv1+xv3 xv3 */ + __ASM_EMIT("unpcklps %%xmm5, %%xmm1") /* xmm1 = xa0+xa2 xa2 xa1+xa3 xa3 */ + __ASM_EMIT("unpcklps %%xmm6, %%xmm2") /* xmm2 = xb0+xb2 xb2 xb1+xb3 xb3 */ + __ASM_EMIT("movhlps %%xmm0, %%xmm4") /* xmm4 = xv1+xv3 xv3 ? ? */ + __ASM_EMIT("movhlps %%xmm1, %%xmm5") /* xmm5 = xa1+xa3 xa3 ? ? */ + __ASM_EMIT("movhlps %%xmm2, %%xmm6") /* xmm6 = xb1+xb3 xb3 ? ? */ + __ASM_EMIT("addss %%xmm4, %%xmm0") /* xmm0 = xv0+xv1+xv2+xv3 xv2+xv3 ? ? 
*/ + __ASM_EMIT("addss %%xmm5, %%xmm1") /* xmm1 = xa0+xa1+xa2+xa3 xa2+xa3 ? ? */ + __ASM_EMIT("addss %%xmm6, %%xmm2") /* xmm2 = xb0+xb1+xb2+xb3 xb2+xb3 ? ? */ + /* 1x blocks */ + __ASM_EMIT("add $3, %[count]") + __ASM_EMIT("jl 6f") + __ASM_EMIT("5:") + __ASM_EMIT("movss 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */ + __ASM_EMIT("movss 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */ + __ASM_EMIT("movaps %%xmm3, %%xmm7") /* xmm7 = a0 */ + __ASM_EMIT("mulss %%xmm3, %%xmm3") /* xmm3 = a0*a0 */ + __ASM_EMIT("mulss %%xmm5, %%xmm7") /* xmm7 = a0*b0 */ + __ASM_EMIT("mulss %%xmm5, %%xmm5") /* xmm5 = b0*b0 */ + __ASM_EMIT("addss %%xmm7, %%xmm0") /* xmm0 = xv + a0*b0 */ + __ASM_EMIT("addss %%xmm3, %%xmm1") /* xmm1 = xa + a0*a0 */ + __ASM_EMIT("addss %%xmm5, %%xmm2") /* xmm2 = xb + b0*b0 */ + __ASM_EMIT("add $0x04, %[off]") /* ++off */ + __ASM_EMIT("dec %[count]") + __ASM_EMIT("jge 5b") + __ASM_EMIT("6:") + /* Store result */ + __ASM_EMIT("movss 0x00(%[corr]), %%xmm4") + __ASM_EMIT("movss 0x04(%[corr]), %%xmm5") + __ASM_EMIT("movss 0x08(%[corr]), %%xmm6") + __ASM_EMIT("addss %%xmm4, %%xmm0") + __ASM_EMIT("addss %%xmm5, %%xmm1") + __ASM_EMIT("addss %%xmm6, %%xmm2") + __ASM_EMIT("movss %%xmm0, 0x00(%[corr])") + __ASM_EMIT("movss %%xmm1, 0x04(%[corr])") + __ASM_EMIT("movss %%xmm2, 0x08(%[corr])") + + : [corr] "+r" (corr), [off] "=&r" (off), [count] "+r" (count) + : [a] "r" (a), [b] "r" (b) + : "cc", "memory", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7" + ); + } + + static const float corr_const[] __lsp_aligned16 = + { + LSP_DSP_VEC4(1e-10f) + }; + + void corr_incr(dsp::correlation_t *corr, float *dst, + const float *a_head, const float *b_head, + const float *a_tail, const float *b_tail, + size_t count) + { + IF_ARCH_I386( + void *ptr; + ); + + ARCH_X86_ASM + ( + /* 4x blocks */ + __ASM_EMIT32("subl $4, %[count]") + __ASM_EMIT64("sub $4, %[count]") + __ASM_EMIT("jb 2f") + __ASM_EMIT("1:") + __ASM_EMIT("movups 0x00(%[a_head]), %%xmm0") /* xmm0 = ah */ + __ASM_EMIT("movups 0x00(%[b_head]), %%xmm1") /* xmm1 = bh */ + __ASM_EMIT("movaps %%xmm0, %%xmm2") /* xmm2 = ah */ + __ASM_EMIT("movups 0x00(%[a_tail]), %%xmm3") /* xmm3 = at */ + __ASM_EMIT("movups 0x00(%[b_tail]), %%xmm4") /* xmm4 = bt */ + __ASM_EMIT("movaps %%xmm3, %%xmm5") /* xmm5 = at */ + __ASM_EMIT("mulps %%xmm1, %%xmm2") /* xmm2 = ah*bh */ + __ASM_EMIT("mulps %%xmm0, %%xmm0") /* xmm0 = ah*ah */ + __ASM_EMIT("mulps %%xmm1, %%xmm1") /* xmm1 = bh*bh */ + __ASM_EMIT("mulps %%xmm4, %%xmm5") /* xmm5 = at*bt */ + __ASM_EMIT("mulps %%xmm3, %%xmm3") /* xmm3 = at*at */ + __ASM_EMIT("mulps %%xmm4, %%xmm4") /* xmm4 = bt*bt */ + __ASM_EMIT("subps %%xmm3, %%xmm0") /* xmm0 = DA = ah*ah - at*at */ + __ASM_EMIT("subps %%xmm4, %%xmm1") /* xmm1 = DB = bh*bh - bt*bt */ + __ASM_EMIT("subps %%xmm5, %%xmm2") /* xmm2 = DV = ah*bh - at*bt */ + + __ASM_EMIT("xorps %%xmm3, %%xmm3") /* xmm3 = 0 0 0 0 */ + __ASM_EMIT("xorps %%xmm4, %%xmm4") /* xmm4 = 0 0 0 0 */ + __ASM_EMIT("xorps %%xmm5, %%xmm5") /* xmm5 = 0 0 0 0 */ + __ASM_EMIT("movlhps %%xmm0, %%xmm3") /* xmm3 = 0 0 DA[0] DA[1] */ + __ASM_EMIT("movlhps %%xmm1, %%xmm4") /* xmm4 = 0 0 DB[0] DB[1] */ + __ASM_EMIT("movlhps %%xmm2, %%xmm5") /* xmm5 = 0 0 DV[0] DV[1] */ + __ASM_EMIT("addps %%xmm3, %%xmm0") /* xmm0 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] */ + __ASM_EMIT("addps %%xmm4, %%xmm1") /* xmm1 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] */ + __ASM_EMIT("addps %%xmm5, %%xmm2") /* xmm2 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] */ + __ASM_EMIT("shufps $0x99, %%xmm0, %%xmm3") /* xmm3 = 0 DA[0] DA[1] 
DA[0]+DA[2] */
+                __ASM_EMIT("shufps $0x99, %%xmm1, %%xmm4")         /* xmm4 = 0 DB[0] DB[1] DB[0]+DB[2] */
+                __ASM_EMIT("shufps $0x99, %%xmm2, %%xmm5")         /* xmm5 = 0 DV[0] DV[1] DV[0]+DV[2] */
+                __ASM_EMIT("addps %%xmm0, %%xmm3")                 /* xmm3 = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] */
+                __ASM_EMIT("addps %%xmm1, %%xmm4")                 /* xmm4 = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] */
+                __ASM_EMIT("addps %%xmm2, %%xmm5")                 /* xmm5 = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] */
+
+                __ASM_EMIT32("mov %[corr], %[ptr]")
+                __ASM_EMIT32("movss 0x00(%[ptr]), %%xmm0")
+                __ASM_EMIT32("movss 0x04(%[ptr]), %%xmm1")
+                __ASM_EMIT32("movss 0x08(%[ptr]), %%xmm2")
+                __ASM_EMIT64("movss 0x00(%[corr]), %%xmm0")
+                __ASM_EMIT64("movss 0x04(%[corr]), %%xmm1")
+                __ASM_EMIT64("movss 0x08(%[corr]), %%xmm2")
+
+                __ASM_EMIT("shufps $0x00, %%xmm0, %%xmm0")         /* xmm0 = xv xv xv xv */
+                __ASM_EMIT("shufps $0x00, %%xmm1, %%xmm1")         /* xmm1 = xa xa xa xa */
+                __ASM_EMIT("shufps $0x00, %%xmm2, %%xmm2")         /* xmm2 = xb xb xb xb */
+                __ASM_EMIT("addps %%xmm3, %%xmm1")                 /* xmm1 = BA = xa+DA[0] xa+DA[0]+DA[1] xa+DA[0]+DA[1]+DA[2] xa+DA[0]+DA[1]+DA[2]+DA[3] */
+                __ASM_EMIT("addps %%xmm4, %%xmm2")                 /* xmm2 = BB = xb+DB[0] xb+DB[0]+DB[1] xb+DB[0]+DB[1]+DB[2] xb+DB[0]+DB[1]+DB[2]+DB[3] */
+                __ASM_EMIT("movaps %%xmm1, %%xmm6")                /* xmm6 = BA */
+                __ASM_EMIT("addps %%xmm5, %%xmm0")                 /* xmm0 = T = xv+DV[0] xv+DV[0]+DV[1] xv+DV[0]+DV[1]+DV[2] xv+DV[0]+DV[1]+DV[2]+DV[3] */
+                __ASM_EMIT("mulps %%xmm2, %%xmm1")                 /* xmm1 = B = BA*BB */
+                __ASM_EMIT("movaps %%xmm0, %%xmm5")                /* xmm5 = T */
+                __ASM_EMIT("shufps $0xff, %%xmm6, %%xmm6")         /* xmm6 = BA[3] BA[3] BA[3] BA[3] */
+                __ASM_EMIT("shufps $0xff, %%xmm5, %%xmm5")         /* xmm5 = T[3] T[3] T[3] T[3] */
+                __ASM_EMIT("shufps $0xff, %%xmm2, %%xmm2")         /* xmm2 = BB[3] BB[3] BB[3] BB[3] */
+
+                __ASM_EMIT32("movss %%xmm5, 0x00(%[ptr])")
+                __ASM_EMIT32("movss %%xmm6, 0x04(%[ptr])")
+                __ASM_EMIT32("movss %%xmm2, 0x08(%[ptr])")
+                __ASM_EMIT64("movss %%xmm5, 0x00(%[corr])")
+                __ASM_EMIT64("movss %%xmm6, 0x04(%[corr])")
+                __ASM_EMIT64("movss %%xmm2, 0x08(%[corr])")
+
+                __ASM_EMIT("sqrtps %%xmm1, %%xmm7")                /* xmm7 = sqrtf(B) */
+                __ASM_EMIT("cmpps $5, %[CORR_CC], %%xmm1")         /* xmm1 = B >= 1e-10f */
+                __ASM_EMIT("divps %%xmm7, %%xmm0")                 /* xmm0 = T/sqrtf(B) */
+                __ASM_EMIT32("mov %[dst], %[ptr]")
+                __ASM_EMIT("andps %%xmm1, %%xmm0")                 /* xmm0 = (B >= 1e-10f) ? T/sqrtf(B) : 0 */
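+                /* The comparison yields an all-ones mask only where B >= 1e-10f,
+                   so the andps keeps the division branchless and zeroes the
+                   output for near-silent windows. */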
+                __ASM_EMIT("add $0x10, %[a_head]")                 /* ++a_head */
+                __ASM_EMIT("add $0x10, %[b_head]")                 /* ++b_head */
+                __ASM_EMIT32("movups %%xmm0, 0x00(%[ptr])")
+                __ASM_EMIT64("movups %%xmm0, 0x00(%[dst])")
+                __ASM_EMIT("add $0x10, %[a_tail]")                 /* ++a_tail */
+                __ASM_EMIT("add $0x10, %[b_tail]")                 /* ++b_tail */
+                __ASM_EMIT32("add $0x10, %[ptr]")
+                __ASM_EMIT64("add $0x10, %[dst]")
+                __ASM_EMIT32("mov %[ptr], %[dst]")
+                __ASM_EMIT32("subl $4, %[count]")
+                __ASM_EMIT64("sub $4, %[count]")
+                __ASM_EMIT("jae 1b")
+                __ASM_EMIT("2:")
+                /* 1x blocks */
+                __ASM_EMIT("add $3, %[count]")
+                __ASM_EMIT("jl 4f")
+                __ASM_EMIT("3:")
+                __ASM_EMIT("movss 0x00(%[a_head]), %%xmm0")        /* xmm0 = ah */
+                __ASM_EMIT("movss 0x00(%[b_head]), %%xmm1")        /* xmm1 = bh */
+                __ASM_EMIT("movss %%xmm0, %%xmm2")                 /* xmm2 = ah */
+                __ASM_EMIT("movss 0x00(%[a_tail]), %%xmm3")        /* xmm3 = at */
+                __ASM_EMIT("movss 0x00(%[b_tail]), %%xmm4")        /* xmm4 = bt */
+                __ASM_EMIT("movss %%xmm3, %%xmm5")                 /* xmm5 = at */
+                __ASM_EMIT("mulss %%xmm1, %%xmm2")                 /* xmm2 = ah*bh */
+                __ASM_EMIT("mulss %%xmm0, %%xmm0")                 /* xmm0 = ah*ah */
+                __ASM_EMIT("mulss %%xmm1, %%xmm1")                 /* xmm1 = bh*bh */
+                __ASM_EMIT("mulss %%xmm4, %%xmm5")                 /* xmm5 = at*bt */
+                __ASM_EMIT("mulss %%xmm3, %%xmm3")                 /* xmm3 = at*at */
+                __ASM_EMIT("mulss %%xmm4, %%xmm4")                 /* xmm4 = bt*bt */
+                __ASM_EMIT("subss %%xmm3, %%xmm0")                 /* xmm0 = DA = ah*ah - at*at */
+                __ASM_EMIT("subss %%xmm4, %%xmm1")                 /* xmm1 = DB = bh*bh - bt*bt */
+                __ASM_EMIT("subss %%xmm5, %%xmm2")                 /* xmm2 = DV = ah*bh - at*bt */
+
+                __ASM_EMIT32("mov %[corr], %[ptr]")
+                __ASM_EMIT32("movss 0x00(%[ptr]), %%xmm3")
+                __ASM_EMIT32("movss 0x04(%[ptr]), %%xmm4")
+                __ASM_EMIT32("movss 0x08(%[ptr]), %%xmm5")
+                __ASM_EMIT64("movss 0x00(%[corr]), %%xmm3")
+                __ASM_EMIT64("movss 0x04(%[corr]), %%xmm4")
+                __ASM_EMIT64("movss 0x08(%[corr]), %%xmm5")
+
+                __ASM_EMIT("addss %%xmm4, %%xmm0")                 /* xmm0 = BA = xa+DA */
+                __ASM_EMIT("addss %%xmm3, %%xmm2")                 /* xmm2 = T = xv+DV */
+                __ASM_EMIT("movaps %%xmm0, %%xmm3")                /* xmm3 = BA */
+                __ASM_EMIT("addss %%xmm5, %%xmm1")                 /* xmm1 = BB = xb+DB */
+                __ASM_EMIT("mulss %%xmm1, %%xmm3")                 /* xmm3 = B = BA*BB */
+
+                __ASM_EMIT32("movss %%xmm2, 0x00(%[ptr])")
+                __ASM_EMIT32("movss %%xmm0, 0x04(%[ptr])")
+                __ASM_EMIT32("movss %%xmm1, 0x08(%[ptr])")
+                __ASM_EMIT64("movss %%xmm2, 0x00(%[corr])")
+                __ASM_EMIT64("movss %%xmm0, 0x04(%[corr])")
+                __ASM_EMIT64("movss %%xmm1, 0x08(%[corr])")
+
+                __ASM_EMIT("sqrtss %%xmm3, %%xmm7")                /* xmm7 = sqrtf(B) */
+                __ASM_EMIT("cmpss $5, %[CORR_CC], %%xmm3")         /* xmm3 = B >= 1e-10f */
+                __ASM_EMIT("divss %%xmm7, %%xmm2")                 /* xmm2 = T/sqrtf(B) */
+                __ASM_EMIT32("mov %[dst], %[ptr]")
+                __ASM_EMIT("andps %%xmm3, %%xmm2")                 /* xmm2 = (B >= 1e-10f) ?
T/sqrtf(B) : 0 */ + __ASM_EMIT("add $0x04, %[a_head]") /* ++a_head */ + __ASM_EMIT("add $0x04, %[b_head]") /* ++b_head */ + __ASM_EMIT32("movss %%xmm2, 0x00(%[ptr])") + __ASM_EMIT64("movss %%xmm2, 0x00(%[dst])") + __ASM_EMIT32("add $0x04, %[ptr]") + __ASM_EMIT64("add $0x04, %[dst]") + __ASM_EMIT("add $0x04, %[a_tail]") /* ++a_tail */ + __ASM_EMIT("add $0x04, %[b_tail]") /* ++b_tail */ + __ASM_EMIT32("mov %[ptr], %[dst]") + __ASM_EMIT32("decl %[count]") + __ASM_EMIT64("dec %[count]") + __ASM_EMIT("jge 3b") + __ASM_EMIT("4:") + + : __IF_32( + [ptr] "=&r" (ptr), + [corr] "+m" (corr), [dst] "+m" (dst), + [a_head] "+r" (a_head), [b_head] "+r" (b_head), + [a_tail] "+r" (a_tail), [b_tail] "+r" (b_tail), + [count] "+g" (count) + ) + __IF_64( + [dst] "+r" (dst), + [a_head] "+r" (a_head), [b_head] "+r" (b_head), + [a_tail] "+r" (a_tail), [b_tail] "+r" (b_tail), + [count] "+r" (count) + ) + : __IF_64( [corr] "r" (corr), ) + [CORR_CC] "o" (corr_const) + : "cc", "memory", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7" + ); + } + + } /* namespace sse */ +} /* namespace lsp */ + + + +#endif /* PRIVATE_DSP_ARCH_X86_SSE_CORRELATION_H_ */ diff --git a/include/private/dsp/arch/x86/sse3/correlation.h b/include/private/dsp/arch/x86/sse3/correlation.h new file mode 100644 index 00000000..14be34ed --- /dev/null +++ b/include/private/dsp/arch/x86/sse3/correlation.h @@ -0,0 +1,179 @@ +/* + * Copyright (C) 2024 Linux Studio Plugins Project + * (C) 2024 Vladimir Sadovnikov + * + * This file is part of lsp-dsp-lib + * Created on: 10 мар. 2024 г. + * + * lsp-dsp-lib is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * lsp-dsp-lib is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with lsp-dsp-lib. If not, see . 
+ */
+
+#ifndef PRIVATE_DSP_ARCH_X86_SSE3_CORRELATION_H_
+#define PRIVATE_DSP_ARCH_X86_SSE3_CORRELATION_H_
+
+#ifndef PRIVATE_DSP_ARCH_X86_SSE3_IMPL
+    #error "This header should not be included directly"
+#endif /* PRIVATE_DSP_ARCH_X86_SSE3_IMPL */
+
+namespace lsp
+{
+    namespace sse3
+    {
+        static const float corr_const[] __lsp_aligned16 =
+        {
+            LSP_DSP_VEC4(1e-10f)
+        };
+
+        void x64_corr_incr(dsp::correlation_t *corr, float *dst,
+            const float *a_head, const float *b_head,
+            const float *a_tail, const float *b_tail,
+            size_t count)
+        {
+            ARCH_X86_64_ASM
+            (
+                /* load data */
+                __ASM_EMIT("movss 0x00(%[corr]), %%xmm8")          /* xmm8 = xv */
+                __ASM_EMIT("movss 0x04(%[corr]), %%xmm9")          /* xmm9 = xa */
+                __ASM_EMIT("movss 0x08(%[corr]), %%xmm10")         /* xmm10 = xb */
+                __ASM_EMIT("movaps %[CORR_CC], %%xmm11")           /* xmm11 = 1e-10f */
+                /* 4x blocks */
+                __ASM_EMIT("sub $4, %[count]")
+                __ASM_EMIT("jb 2f")
+                __ASM_EMIT("shufps $0x00, %%xmm8, %%xmm8")
+                __ASM_EMIT("shufps $0x00, %%xmm9, %%xmm9")
+                __ASM_EMIT("shufps $0x00, %%xmm10, %%xmm10")
+                __ASM_EMIT(".align 0x10")
+                __ASM_EMIT("1:")
+                __ASM_EMIT("movups 0x00(%[a_head]), %%xmm0")       /* xmm0 = ah */
+                __ASM_EMIT("movups 0x00(%[b_head]), %%xmm1")       /* xmm1 = bh */
+                __ASM_EMIT("movaps %%xmm0, %%xmm2")                /* xmm2 = ah */
+                __ASM_EMIT("movups 0x00(%[a_tail]), %%xmm3")       /* xmm3 = at */
+                __ASM_EMIT("movups 0x00(%[b_tail]), %%xmm4")       /* xmm4 = bt */
+                __ASM_EMIT("movaps %%xmm3, %%xmm5")                /* xmm5 = at */
+                __ASM_EMIT("mulps %%xmm1, %%xmm2")                 /* xmm2 = ah*bh */
+                __ASM_EMIT("mulps %%xmm0, %%xmm0")                 /* xmm0 = ah*ah */
+                __ASM_EMIT("mulps %%xmm1, %%xmm1")                 /* xmm1 = bh*bh */
+                __ASM_EMIT("mulps %%xmm4, %%xmm5")                 /* xmm5 = at*bt */
+                __ASM_EMIT("mulps %%xmm3, %%xmm3")                 /* xmm3 = at*at */
+                __ASM_EMIT("mulps %%xmm4, %%xmm4")                 /* xmm4 = bt*bt */
+                __ASM_EMIT("subps %%xmm3, %%xmm0")                 /* xmm0 = DA = ah*ah - at*at */
+                __ASM_EMIT("subps %%xmm4, %%xmm1")                 /* xmm1 = DB = bh*bh - bt*bt */
+                __ASM_EMIT("subps %%xmm5, %%xmm2")                 /* xmm2 = DV = ah*bh - at*bt */
+
+                __ASM_EMIT("xorps %%xmm3, %%xmm3")                 /* xmm3 = 0 0 0 0 */
+                __ASM_EMIT("xorps %%xmm4, %%xmm4")                 /* xmm4 = 0 0 0 0 */
+                __ASM_EMIT("xorps %%xmm5, %%xmm5")                 /* xmm5 = 0 0 0 0 */
+                __ASM_EMIT("movlhps %%xmm0, %%xmm3")               /* xmm3 = 0 0 DA[0] DA[1] */
+                __ASM_EMIT("movlhps %%xmm1, %%xmm4")               /* xmm4 = 0 0 DB[0] DB[1] */
+                __ASM_EMIT("movlhps %%xmm2, %%xmm5")               /* xmm5 = 0 0 DV[0] DV[1] */
+                __ASM_EMIT("addps %%xmm3, %%xmm0")                 /* xmm0 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] */
+                __ASM_EMIT("addps %%xmm4, %%xmm1")                 /* xmm1 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] */
+                __ASM_EMIT("addps %%xmm5, %%xmm2")                 /* xmm2 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] */
+                __ASM_EMIT("shufps $0x99, %%xmm0, %%xmm3")         /* xmm3 = 0 DA[0] DA[1] DA[0]+DA[2] */
+                __ASM_EMIT("shufps $0x99, %%xmm1, %%xmm4")         /* xmm4 = 0 DB[0] DB[1] DB[0]+DB[2] */
+                __ASM_EMIT("shufps $0x99, %%xmm2, %%xmm5")         /* xmm5 = 0 DV[0] DV[1] DV[0]+DV[2] */
+                __ASM_EMIT("addps %%xmm0, %%xmm3")                 /* xmm3 = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] */
+                __ASM_EMIT("addps %%xmm1, %%xmm4")                 /* xmm4 = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] */
+                __ASM_EMIT("addps %%xmm2, %%xmm5")                 /* xmm5 = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] */
+
+                __ASM_EMIT("addps %%xmm3, %%xmm9")                 /* xmm9 = BA = xa+DA[0] xa+DA[0]+DA[1] xa+DA[0]+DA[1]+DA[2] xa+DA[0]+DA[1]+DA[2]+DA[3] */
+                __ASM_EMIT("addps %%xmm4, %%xmm10")                /* xmm10 = BB = xb+DB[0] xb+DB[0]+DB[1] xb+DB[0]+DB[1]+DB[2] xb+DB[0]+DB[1]+DB[2]+DB[3] */
+                __ASM_EMIT("movaps %%xmm9, %%xmm6")                /* xmm6 = BA */
+                __ASM_EMIT("addps %%xmm5, %%xmm8")                 /* xmm8 = T = xv+DV[0] xv+DV[0]+DV[1] xv+DV[0]+DV[1]+DV[2] xv+DV[0]+DV[1]+DV[2]+DV[3] */
%%xmm8") /* xmm8 = T = xv+DV[0] xv+DV[0]+DV[1] xv+DV[0]+DV[1]+DV[2] xv+DV[0]+DV[1]+DV[2]+DV[3] */ + __ASM_EMIT("mulps %%xmm10, %%xmm6") /* xmm6 = B = BA*BB */ + __ASM_EMIT("movaps %%xmm8, %%xmm5") /* xmm5 = T */ + __ASM_EMIT("shufps $0xff, %%xmm8, %%xmm8") /* xmm8 = T[3] T[3] T[3] T[3] */ + __ASM_EMIT("shufps $0xff, %%xmm9, %%xmm9") /* xmm9 = BA[3] BA[3] BA[3] BA[3] */ + __ASM_EMIT("shufps $0xff, %%xmm10, %%xmm10") /* xmm10 = BB[3] BB[3] BB[3] BB[3] */ + + __ASM_EMIT("sqrtps %%xmm6, %%xmm7") /* xmm7 = sqrtf(B) */ + __ASM_EMIT("cmpps $5, %%xmm11, %%xmm6") /* xmm6 = B >= 1e-10f */ + __ASM_EMIT("divps %%xmm7, %%xmm5") /* xmm5 = T/sqrtf(B) */ + __ASM_EMIT("andps %%xmm6, %%xmm5") /* xmm5 = (B >= 1e-10f) ? T/sqrtf(B) : 0 */ + __ASM_EMIT("add $0x10, %[a_head]") /* ++a_head */ + __ASM_EMIT("add $0x10, %[b_head]") /* ++b_head */ + __ASM_EMIT("movups %%xmm5, 0x00(%[dst])") + __ASM_EMIT("add $0x10, %[a_tail]") /* ++a_tail */ + __ASM_EMIT("add $0x10, %[b_tail]") /* ++b_tail */ + __ASM_EMIT("add $0x10, %[dst]") + __ASM_EMIT("sub $4, %[count]") + __ASM_EMIT("jae 1b") + __ASM_EMIT("2:") + /* 1x blocks */ + __ASM_EMIT("add $3, %[count]") + __ASM_EMIT("jl 4f") + __ASM_EMIT("3:") + __ASM_EMIT("movss 0x00(%[a_head]), %%xmm0") /* xmm0 = ah */ + __ASM_EMIT("movss 0x00(%[b_head]), %%xmm1") /* xmm1 = bh */ + __ASM_EMIT("movss %%xmm0, %%xmm2") /* xmm2 = ah */ + __ASM_EMIT("movss 0x00(%[a_tail]), %%xmm3") /* xmm3 = at */ + __ASM_EMIT("movss 0x00(%[b_tail]), %%xmm4") /* xmm4 = bt */ + __ASM_EMIT("movss %%xmm3, %%xmm5") /* xmm5 = at */ + __ASM_EMIT("mulss %%xmm1, %%xmm2") /* xmm2 = ah*bh */ + __ASM_EMIT("mulss %%xmm0, %%xmm0") /* xmm0 = ah*ah */ + __ASM_EMIT("mulss %%xmm1, %%xmm1") /* xmm1 = bh*bh */ + __ASM_EMIT("mulss %%xmm4, %%xmm5") /* xmm5 = at*bt */ + __ASM_EMIT("mulss %%xmm3, %%xmm3") /* xmm3 = at*at */ + __ASM_EMIT("mulss %%xmm4, %%xmm4") /* xmm4 = bt*bt */ + __ASM_EMIT("subss %%xmm3, %%xmm0") /* xmm0 = DA = ah*ah - at*at */ + __ASM_EMIT("subss %%xmm4, %%xmm1") /* xmm1 = DB = bh*bh - bt*bt */ + __ASM_EMIT("subss %%xmm5, %%xmm2") /* xmm2 = DV = ah*bh - at*bt */ + + __ASM_EMIT("addss %%xmm0, %%xmm9") /* xmm9 = BA = xa+DA */ + __ASM_EMIT("addss %%xmm1, %%xmm10") /* xmm10 = BB = xb+DB */ + __ASM_EMIT("movaps %%xmm9, %%xmm3") /* xmm3 = BA */ + __ASM_EMIT("addss %%xmm2, %%xmm8") /* xmm8 = T = xv+DV */ + __ASM_EMIT("mulss %%xmm10, %%xmm3") /* xmm3 = B = BA*BB */ + __ASM_EMIT("movaps %%xmm8, %%xmm0") /* xmm0 = T */ + + __ASM_EMIT("sqrtss %%xmm3, %%xmm7") /* xmm7 = sqrtf(B) */ + __ASM_EMIT("cmpss $5, %[CORR_CC], %%xmm3") /* xmm3 = B >= 1e-10f */ + __ASM_EMIT("divss %%xmm7, %%xmm0") /* xmm2 = T/sqrtf(B) */ + __ASM_EMIT32("mov %[dst], %[ptr]") + __ASM_EMIT("andps %%xmm3, %%xmm0") /* xmm0 = (B >= 1e-10f) ? 
+                __ASM_EMIT("add $0x04, %[a_head]")                 /* ++a_head */
+                __ASM_EMIT("add $0x04, %[b_head]")                 /* ++b_head */
+                __ASM_EMIT("movss %%xmm0, 0x00(%[dst])")
+                __ASM_EMIT("add $0x04, %[a_tail]")                 /* ++a_tail */
+                __ASM_EMIT("add $0x04, %[b_tail]")                 /* ++b_tail */
+                __ASM_EMIT("add $0x04, %[dst]")
+                __ASM_EMIT("dec %[count]")
+                __ASM_EMIT("jge 3b")
+                __ASM_EMIT("4:")
+
+                /* Store result */
+                __ASM_EMIT("movss %%xmm8, 0x00(%[corr])")
+                __ASM_EMIT("movss %%xmm9, 0x04(%[corr])")
+                __ASM_EMIT("movss %%xmm10, 0x08(%[corr])")
+
+                : [dst] "+r" (dst),
+                  [a_head] "+r" (a_head), [b_head] "+r" (b_head),
+                  [a_tail] "+r" (a_tail), [b_tail] "+r" (b_tail),
+                  [count] "+r" (count)
+                : [corr] "r" (corr),
+                  [CORR_CC] "o" (corr_const)
+                : "cc", "memory",
+                  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+                  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+                  "%xmm8", "%xmm9", "%xmm10", "%xmm11"
+            );
+        }
+
+    } /* namespace sse3 */
+} /* namespace lsp */
+
+
+
+
+#endif /* PRIVATE_DSP_ARCH_X86_SSE3_CORRELATION_H_ */
diff --git a/make/system.mk b/make/system.mk
index 5796e895..66c7f58d 100644
--- a/make/system.mk
+++ b/make/system.mk
@@ -1,21 +1,21 @@
#
-# Copyright (C) 2020 Linux Studio Plugins Project
-# (C) 2020 Vladimir Sadovnikov
+# Copyright (C) 2024 Linux Studio Plugins Project
+# (C) 2024 Vladimir Sadovnikov
#
-# This file is part of lsp-plugins
+# This file is part of lsp-dsp-lib
#
-# lsp-plugins is free software: you can redistribute it and/or modify
+# lsp-dsp-lib is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# any later version.
#
-# lsp-plugins is distributed in the hope that it will be useful,
+# lsp-dsp-lib is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
-# along with lsp-plugins. If not, see .
+# along with lsp-dsp-lib. If not, see .
# # Detect operating system diff --git a/make/tools.mk b/make/tools.mk index e99498dd..ade5d571 100644 --- a/make/tools.mk +++ b/make/tools.mk @@ -76,6 +76,8 @@ ifeq ($(PLATFORM),Solaris) else ifeq ($(PLATFORM),Windows) FLAG_RELRO = FLAG_STDLIB = + CFLAGS_EXT += -DWINVER=0x600 -D_WIN32_WINNT=0x600 + CXXFLAGS_EXT += -DWINVER=0x600 -D_WIN32_WINNT=0x600 EXE_FLAGS_EXT += -static-libgcc -static-libstdc++ SO_FLAGS_EXT += -static-libgcc -static-libstdc++ LDFLAGS_EXT += -T $(CURDIR)/make/ld-windows.script diff --git a/modules.mk b/modules.mk index 929f615c..8a17adde 100644 --- a/modules.mk +++ b/modules.mk @@ -19,13 +19,13 @@ # # Variables that describe dependencies -LSP_COMMON_LIB_VERSION := 1.0.34 +LSP_COMMON_LIB_VERSION := 1.0.35 LSP_COMMON_LIB_NAME := lsp-common-lib LSP_COMMON_LIB_TYPE := src LSP_COMMON_LIB_URL_RO := https://github.com/lsp-plugins/$(LSP_COMMON_LIB_NAME).git LSP_COMMON_LIB_URL_RW := git@github.com:lsp-plugins/$(LSP_COMMON_LIB_NAME).git -LSP_TEST_FW_VERSION := 1.0.24 +LSP_TEST_FW_VERSION := 1.0.25 LSP_TEST_FW_NAME := lsp-test-fw LSP_TEST_FW_TYPE := src LSP_TEST_FW_URL_RO := https://github.com/lsp-plugins/$(LSP_TEST_FW_NAME).git diff --git a/project.mk b/project.mk index 86bee130..3e183d1f 100644 --- a/project.mk +++ b/project.mk @@ -23,4 +23,4 @@ ARTIFACT_ID = LSP_DSP_LIB ARTIFACT_NAME = lsp-dsp-lib ARTIFACT_DESC = DSP library for digital signal processing ARTIFACT_HEADERS = lsp-plug.in -ARTIFACT_VERSION = 1.0.21 +ARTIFACT_VERSION = 1.0.22 diff --git a/src/main/aarch64/asimd.cpp b/src/main/aarch64/asimd.cpp index 417c89af..a5eac977 100644 --- a/src/main/aarch64/asimd.cpp +++ b/src/main/aarch64/asimd.cpp @@ -1,6 +1,6 @@ /* - * Copyright (C) 2023 Linux Studio Plugins Project - * (C) 2023 Vladimir Sadovnikov + * Copyright (C) 2024 Linux Studio Plugins Project + * (C) 2024 Vladimir Sadovnikov * * This file is part of lsp-dsp-lib * Created on: 31 мар. 2020 г. 
@@ -52,6 +52,7 @@ #define PRIVATE_DSP_ARCH_AARCH64_ASIMD_IMPL #include #include + #include #include #include #include @@ -429,6 +430,8 @@ EXPORT1(downsample_8x); EXPORT1(convolve); + EXPORT1(corr_init); + EXPORT1(corr_incr); EXPORT1(abgr32_to_bgrff32); EXPORT1(rgba32_to_bgra32); diff --git a/src/main/arm/neon-d32.cpp b/src/main/arm/neon-d32.cpp index edf61ec2..b7ae9953 100644 --- a/src/main/arm/neon-d32.cpp +++ b/src/main/arm/neon-d32.cpp @@ -50,6 +50,7 @@ #define PRIVATE_DSP_ARCH_ARM_NEON_D32_IMPL #include #include + #include #include #include #include @@ -141,6 +142,8 @@ EXPORT1(pcomplex_r2c_rdiv2); EXPORT1(convolve); + EXPORT1(corr_init); + EXPORT1(corr_incr); EXPORT1(axis_apply_lin1); EXPORT1(axis_apply_log1); diff --git a/src/main/generic/generic.cpp b/src/main/generic/generic.cpp index 12324237..a99fffed 100644 --- a/src/main/generic/generic.cpp +++ b/src/main/generic/generic.cpp @@ -47,6 +47,7 @@ namespace lsp #include #include #include + #include #include @@ -598,6 +599,8 @@ namespace lsp EXPORT1(unit_vector_p1pv); EXPORT1(convolve); + EXPORT1(corr_init); + EXPORT1(corr_incr); EXPORT1(base64_enc); EXPORT1(base64_dec); diff --git a/src/main/x86/avx.cpp b/src/main/x86/avx.cpp index 99e76b59..54e517a7 100644 --- a/src/main/x86/avx.cpp +++ b/src/main/x86/avx.cpp @@ -69,6 +69,7 @@ #include #include #include + #include #include @@ -391,6 +392,8 @@ CEXPORT1(favx, downsample_8x); CEXPORT1(favx, convolve); + CEXPORT1(favx, corr_init); + CEXPORT1(favx, corr_incr); CEXPORT1(favx, lin_inter_set); CEXPORT1(favx, lin_inter_mul2); @@ -483,6 +486,8 @@ CEXPORT2(favx, filter_transfer_apply_pc, filter_transfer_apply_pc_fma3); CEXPORT2(favx, convolve, convolve_fma3); + CEXPORT2(favx, corr_init, corr_init_fma3); + CEXPORT2(favx, corr_incr, corr_incr_fma3); CEXPORT2(favx, axis_apply_lin1, axis_apply_lin1_fma3); diff --git a/src/main/x86/avx512.cpp b/src/main/x86/avx512.cpp index 64329fb1..9946c4ba 100644 --- a/src/main/x86/avx512.cpp +++ b/src/main/x86/avx512.cpp @@ -49,6 +49,8 @@ #include #include #include + + #include #undef PRIVATE_DSP_ARCH_X86_AVX512_IMPL namespace lsp @@ -275,6 +277,9 @@ CEXPORT1(vl, uexpander_x1_curve); CEXPORT1(vl, dexpander_x1_gain); CEXPORT1(vl, dexpander_x1_curve); + + CEXPORT1(vl, corr_init); + CEXPORT1(vl, corr_incr); } } /* namespace avx2 */ } /* namespace lsp */ diff --git a/src/main/x86/sse.cpp b/src/main/x86/sse.cpp index 96d344ae..7211c429 100644 --- a/src/main/x86/sse.cpp +++ b/src/main/x86/sse.cpp @@ -66,6 +66,7 @@ #include #include + #include #include #include @@ -492,6 +493,8 @@ EXPORT1(cull_triangle_raw); EXPORT1(convolve); + EXPORT1(corr_init); + EXPORT1(corr_incr); EXPORT1(lin_inter_set); EXPORT1(lin_inter_mul2); diff --git a/src/main/x86/sse3.cpp b/src/main/x86/sse3.cpp index 5d59d951..78b9be9c 100644 --- a/src/main/x86/sse3.cpp +++ b/src/main/x86/sse3.cpp @@ -48,6 +48,7 @@ #include #include #include + #include #undef PRIVATE_DSP_ARCH_X86_SSE3_IMPL namespace lsp @@ -115,6 +116,8 @@ EXPORT1(split_triangle_raw); EXPORT1(cull_triangle_raw); + + EXPORT2_X64(corr_incr, x64_corr_incr); } #undef EXPORT2 diff --git a/src/test/ptest/convolve.cpp b/src/test/ptest/convolve.cpp index 5393dc6b..9f2d2a42 100644 --- a/src/test/ptest/convolve.cpp +++ b/src/test/ptest/convolve.cpp @@ -109,6 +109,9 @@ PTEST_BEGIN("dsp", convolve, 5, 1000) float *in = &out[buf_size*2]; float *conv = &in[buf_size]; float *backup = &conv[buf_size]; + lsp_finally { + free_aligned(data); + }; for (size_t i=0; i < buf_size*4; ++i) out[i] = randf(-1.0f, 1.0f); diff --git a/src/test/ptest/corr_incr.cpp 
b/src/test/ptest/corr_incr.cpp
new file mode 100644
index 00000000..597c7ecf
--- /dev/null
+++ b/src/test/ptest/corr_incr.cpp
@@ -0,0 +1,174 @@
+/*
+ * Copyright (C) 2024 Linux Studio Plugins Project
+ *   (C) 2024 Vladimir Sadovnikov
+ *
+ * This file is part of lsp-dsp-lib
+ * Created on: 10 мар. 2024 г.
+ *
+ * lsp-dsp-lib is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * any later version.
+ *
+ * lsp-dsp-lib is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with lsp-dsp-lib. If not, see .
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#define MIN_RANK 8
+#define MAX_RANK 15
+
+namespace lsp
+{
+    namespace generic
+    {
+        void corr_incr(dsp::correlation_t *corr, float *dst,
+            const float *a_head, const float *b_head,
+            const float *a_tail, const float *b_tail,
+            size_t count);
+    }
+
+    IF_ARCH_X86(
+        namespace sse
+        {
+            void corr_incr(dsp::correlation_t *corr, float *dst,
+                const float *a_head, const float *b_head,
+                const float *a_tail, const float *b_tail,
+                size_t count);
+        }
+
+        namespace avx
+        {
+            void corr_incr(dsp::correlation_t *corr, float *dst,
+                const float *a_head, const float *b_head,
+                const float *a_tail, const float *b_tail,
+                size_t count);
+
+            void corr_incr_fma3(dsp::correlation_t *corr, float *dst,
+                const float *a_head, const float *b_head,
+                const float *a_tail, const float *b_tail,
+                size_t count);
+        }
+
+        namespace avx512
+        {
+            void corr_incr(dsp::correlation_t *corr, float *dst,
+                const float *a_head, const float *b_head,
+                const float *a_tail, const float *b_tail,
+                size_t count);
+        }
+    )
+
+    IF_ARCH_X86_64(
+        namespace sse3
+        {
+            void x64_corr_incr(dsp::correlation_t *corr, float *dst,
+                const float *a_head, const float *b_head,
+                const float *a_tail, const float *b_tail,
+                size_t count);
+        }
+    )
+
+    IF_ARCH_ARM(
+        namespace neon_d32
+        {
+            void corr_incr(dsp::correlation_t *corr, float *dst,
+                const float *a_head, const float *b_head,
+                const float *a_tail, const float *b_tail,
+                size_t count);
+        }
+    )
+
+    IF_ARCH_AARCH64(
+        namespace asimd
+        {
+            void corr_incr(dsp::correlation_t *corr, float *dst,
+                const float *a_head, const float *b_head,
+                const float *a_tail, const float *b_tail,
+                size_t count);
+        }
+    )
+
+    typedef void (* corr_incr_t)(dsp::correlation_t *corr, float *dst,
+        const float *a_head, const float *b_head,
+        const float *a_tail, const float *b_tail,
+        size_t count);
+}
+
+//-----------------------------------------------------------------------------
+// Performance test for incremental correlation computation
+PTEST_BEGIN("dsp", corr_incr, 5, 10000)
+
+    void call(const char *label,
+        float *dst,
+        const float *a_head, const float *b_head,
+        const float *a_tail, const float *b_tail,
+        size_t count, corr_incr_t func)
+    {
+        if (!PTEST_SUPPORTED(func))
+            return;
+
+        char buf[80];
+        sprintf(buf, "%s x %d", label, int(count));
+        printf("Testing %s correlation ...\n", buf);
+
+        dsp::correlation_t corr;
+
+        PTEST_LOOP(buf,
+            corr.v = 0.0f;
+            corr.a = 0.0f;
+            corr.b = 0.0f;
+
+            func(&corr, dst, a_head, b_head, a_tail, b_tail, count);
+        );
+    }
+
+    PTEST_MAIN
+    {
+        size_t buf_size = 1 << MAX_RANK;
+        uint8_t *data = NULL;
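+        // A single aligned allocation is sliced into five consecutive buffers
+        // below: a/b head blocks, a/b tail blocks, and the destination.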
+        float *a_head = alloc_aligned<float>(data, buf_size * 5, 64);
+        float *a_tail = &a_head[buf_size];
+        float *b_head = &a_tail[buf_size];
+        float *b_tail = &b_head[buf_size];
+        float *dst = &b_tail[buf_size];
+        lsp_finally {
+            free_aligned(data);
+        };
+
+        for (size_t i=0; i < buf_size*5; ++i)
+            a_head[i] = randf(-1.0f, 1.0f);
+
+        #define CALL(func, count) \
+            call(#func, dst, a_head, b_head, a_tail, b_tail, count, func)
+
+        for (size_t i=MIN_RANK; i<=MAX_RANK; ++i)
+        {
+            const size_t count = 1 << i;
+
+            CALL(generic::corr_incr, count);
+            IF_ARCH_X86(CALL(sse::corr_incr, count));
+            IF_ARCH_X86_64(CALL(sse3::x64_corr_incr, count));
+            IF_ARCH_X86(CALL(avx::corr_incr, count));
+            IF_ARCH_X86(CALL(avx::corr_incr_fma3, count));
+            IF_ARCH_X86(CALL(avx512::corr_incr, count));
+            IF_ARCH_ARM(CALL(neon_d32::corr_incr, count));
+            IF_ARCH_AARCH64(CALL(asimd::corr_incr, count));
+
+            PTEST_SEPARATOR;
+        }
+    }
+
+PTEST_END
+
+
diff --git a/src/test/ptest/corr_init.cpp b/src/test/ptest/corr_init.cpp
new file mode 100644
index 00000000..8bba049d
--- /dev/null
+++ b/src/test/ptest/corr_init.cpp
@@ -0,0 +1,131 @@
+/*
+ * Copyright (C) 2024 Linux Studio Plugins Project
+ *   (C) 2024 Vladimir Sadovnikov
+ *
+ * This file is part of lsp-dsp-lib
+ * Created on: 9 мар. 2024 г.
+ *
+ * lsp-dsp-lib is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * any later version.
+ *
+ * lsp-dsp-lib is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with lsp-dsp-lib. If not, see .
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#define MIN_RANK 8
+#define MAX_RANK 15
+
+namespace lsp
+{
+    namespace generic
+    {
+        void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count);
+    }
+
+    IF_ARCH_X86(
+        namespace sse
+        {
+            void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count);
+        }
+
+        namespace avx
+        {
+            void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count);
+            void corr_init_fma3(dsp::correlation_t *corr, const float *a, const float *b, size_t count);
+        }
+
+        namespace avx512
+        {
+            void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count);
+        }
+    )
+
+    IF_ARCH_ARM(
+        namespace neon_d32
+        {
+            void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count);
+        }
+    )
+
+    IF_ARCH_AARCH64(
+        namespace asimd
+        {
+            void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count);
+        }
+    )
+
+    typedef void (* corr_init_t)(dsp::correlation_t *corr, const float *a, const float *b, size_t count);
+}
+
+//-----------------------------------------------------------------------------
+// Performance test for correlation initialization
+PTEST_BEGIN("dsp", corr_init, 5, 10000)
+
+    void call(const char *label, const float *a, const float *b, size_t count, corr_init_t func)
+    {
+        if (!PTEST_SUPPORTED(func))
+            return;
+
+        char buf[80];
+        sprintf(buf, "%s x %d", label, int(count));
+        printf("Testing %s correlation ...\n", buf);
+
+        dsp::correlation_t corr;
+
+        PTEST_LOOP(buf,
+            corr.v = 0.0f;
+            corr.a = 0.0f;
+            corr.b = 0.0f;
+
+            func(&corr, a, b, count);
+        );
+    }
+
+    PTEST_MAIN
+    {
+        size_t buf_size = 1 << MAX_RANK;
+        uint8_t *data = NULL;
+        float *a = alloc_aligned<float>(data, buf_size * 2, 64);
+        float *b = &a[buf_size];
+        lsp_finally {
+            free_aligned(data);
+        };
+
+        for (size_t i=0; i < buf_size*2; ++i)
+            a[i] = randf(-1.0f, 1.0f);
+
+        #define CALL(func, count) \
+            call(#func, a, b, count, func)
+
+        for (size_t i=MIN_RANK; i<=MAX_RANK; ++i)
+        {
+            const size_t count = 1 << i;
+
+            CALL(generic::corr_init, count);
+            IF_ARCH_X86(CALL(sse::corr_init, count));
+            IF_ARCH_X86(CALL(avx::corr_init, count));
+            IF_ARCH_X86(CALL(avx::corr_init_fma3, count));
+            IF_ARCH_X86(CALL(avx512::corr_init, count));
+            IF_ARCH_ARM(CALL(neon_d32::corr_init, count));
+            IF_ARCH_AARCH64(CALL(asimd::corr_init, count));
+
+            PTEST_SEPARATOR;
+        }
+    }
+
+PTEST_END
+
+
diff --git a/src/test/utest/corr_incr.cpp b/src/test/utest/corr_incr.cpp
new file mode 100644
index 00000000..34df2084
--- /dev/null
+++ b/src/test/utest/corr_incr.cpp
@@ -0,0 +1,219 @@
+/*
+ * Copyright (C) 2024 Linux Studio Plugins Project
+ *   (C) 2024 Vladimir Sadovnikov
+ *
+ * This file is part of lsp-dsp-lib
+ * Created on: 8 мар. 2024 г.
+ *
+ * lsp-dsp-lib is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * any later version.
+ *
+ * lsp-dsp-lib is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with lsp-dsp-lib. If not, see .
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+namespace lsp
+{
+    namespace generic
+    {
+        void corr_incr(dsp::correlation_t *corr, float *dst,
+            const float *a_head, const float *b_head,
+            const float *a_tail, const float *b_tail,
+            size_t count);
+    }
+
+    IF_ARCH_X86(
+        namespace sse
+        {
+            void corr_incr(dsp::correlation_t *corr, float *dst,
+                const float *a_head, const float *b_head,
+                const float *a_tail, const float *b_tail,
+                size_t count);
+        }
+
+        namespace avx
+        {
+            void corr_incr(dsp::correlation_t *corr, float *dst,
+                const float *a_head, const float *b_head,
+                const float *a_tail, const float *b_tail,
+                size_t count);
+
+            void corr_incr_fma3(dsp::correlation_t *corr, float *dst,
+                const float *a_head, const float *b_head,
+                const float *a_tail, const float *b_tail,
+                size_t count);
+        }
+
+        namespace avx512
+        {
+            void corr_incr(dsp::correlation_t *corr, float *dst,
+                const float *a_head, const float *b_head,
+                const float *a_tail, const float *b_tail,
+                size_t count);
+        }
+    )
+
+    IF_ARCH_X86_64(
+        namespace sse3
+        {
+            void x64_corr_incr(dsp::correlation_t *corr, float *dst,
+                const float *a_head, const float *b_head,
+                const float *a_tail, const float *b_tail,
+                size_t count);
+        }
+    )
+
+    IF_ARCH_ARM(
+        namespace neon_d32
+        {
+            void corr_incr(dsp::correlation_t *corr, float *dst,
+                const float *a_head, const float *b_head,
+                const float *a_tail, const float *b_tail,
+                size_t count);
+        }
+    )
+
+    IF_ARCH_AARCH64(
+        namespace asimd
+        {
+            void corr_incr(dsp::correlation_t *corr, float *dst,
+                const float *a_head, const float *b_head,
+                const float *a_tail, const float *b_tail,
+                size_t count);
+        }
+    )
+
+    static void corr_incr(dsp::correlation_t *corr, float *dst,
+        const float *a_head, const float *b_head,
+        const float *a_tail, const float *b_tail,
+        size_t count)
+    {
+        float vv = corr->v;
+        float va = corr->a;
+        float vb = corr->b;
+
+        for (size_t i=0; i<count; ++i)
+        {
+            vv += a_head[i]*b_head[i] - a_tail[i]*b_tail[i];
+            va += a_head[i]*a_head[i] - a_tail[i]*a_tail[i];
+            vb += b_head[i]*b_head[i] - b_tail[i]*b_tail[i];
+
+            float d = va * vb;
+            dst[i] = (d >= 1e-10f) ? vv / sqrtf(d) : 0.0f;
+        }
+
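+        // Persist the running sums so that a subsequent call can continue the
+        // same sliding window where this one left off.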
+        corr->v = vv;
+        corr->a = va;
+        corr->b = vb;
+    }
+
+    typedef void (* corr_incr_t)(dsp::correlation_t *corr, float *dst,
+        const float *a_head, const float *b_head,
+        const float *a_tail, const float *b_tail,
+        size_t count);
+}
+
+UTEST_BEGIN("dsp", corr_incr)
+    void call(const char *label, size_t align, corr_incr_t func)
+    {
+        if (!UTEST_SUPPORTED(func))
+            return;
+
+        for (size_t mask=0; mask <= 0x07; ++mask)
+        {
+            UTEST_FOREACH(tail, 0x80, 0x1ff)
+            {
+                UTEST_FOREACH(count, 0, 1, 2, 3, 4, 5, 8, 16, 24, 32, 33, 64, 47, 0x80, 0x1ff)
+                {
+                    FloatBuffer a(tail + count + 1, align, mask & 0x01);
+                    FloatBuffer b(tail + count + 1, align, mask & 0x02);
+                    FloatBuffer dst1(count, align, mask & 0x04);
+                    FloatBuffer dst2(count, align, mask & 0x04);
+
+                    dsp::correlation_t corr_a, corr_b;
+                    corr_a.v = 0.0f;
+                    corr_a.a = 0.0f;
+                    corr_a.b = 0.0f;
+                    dsp::corr_init(&corr_a, a, b, tail);
+                    corr_b = corr_a;
+
+                    printf("Testing %s correlation tail=%d on buffer count=%d mask=0x%x\n", label, int(tail), int(count), int(mask));
+
+                    const float *a_tail = a;
+                    const float *b_tail = b;
+                    const float *a_head = &a_tail[tail];
+                    const float *b_head = &b_tail[tail];
+
+                    corr_incr(&corr_a, dst1, a_head, b_head, a_tail, b_tail, count);
+                    func(&corr_b, dst2, a_head, b_head, a_tail, b_tail, count);
+
+                    UTEST_ASSERT_MSG(a.valid(), "Buffer A corrupted");
+                    UTEST_ASSERT_MSG(b.valid(), "Buffer B corrupted");
+                    UTEST_ASSERT_MSG(dst1.valid(), "Destination buffer 1 corrupted");
+                    UTEST_ASSERT_MSG(dst2.valid(), "Destination buffer 2 corrupted");
+
+                    // Compare buffers
+                    if (!dst1.equals_adaptive(dst2, 1e-4))
+                    {
+                        a.dump("a   ");
+                        b.dump("b   ");
+                        dst1.dump("dst1");
+                        dst2.dump("dst2");
+                        UTEST_FAIL_MSG("Output of functions for test '%s' differs at index %d, value=%f vs %f\n"
+                            "correlation state a={%f, %f, %f}, b={%f, %f, %f}",
+                            label, int(dst1.last_diff()), dst1.get(dst1.last_diff()), dst2.get(dst1.last_diff()),
+                            corr_a.v, corr_a.a, corr_a.b,
+                            corr_b.v, corr_b.a, corr_b.b);
+                    }
+
+                    // Compare state
+                    if ((!float_equals_adaptive(corr_a.v, corr_b.v, 1e-5)) ||
+                        (!float_equals_adaptive(corr_a.a, corr_b.a, 1e-5)) ||
+                        (!float_equals_adaptive(corr_a.b, corr_b.b, 1e-5)))
+                    {
+                        UTEST_FAIL_MSG("Correlation state differs a={%f, %f, %f}, b={%f, %f, %f}",
+                            corr_a.v, corr_a.a, corr_a.b,
+                            corr_b.v, corr_b.a, corr_b.b);
+                    }
+                }
+            }
+        }
+    }
+
+    UTEST_MAIN
+    {
+        #define CALL(func, align) \
+            call(#func, align, func)
+
+        CALL(generic::corr_incr, 16);
+        IF_ARCH_X86(CALL(sse::corr_incr, 16));
+        IF_ARCH_X86_64(CALL(sse3::x64_corr_incr, 16));
+        IF_ARCH_X86(CALL(avx::corr_incr, 32));
+        IF_ARCH_X86(CALL(avx::corr_incr_fma3, 32));
+        IF_ARCH_X86(CALL(avx512::corr_incr, 64));
+        IF_ARCH_ARM(CALL(neon_d32::corr_incr, 16));
+        IF_ARCH_AARCH64(CALL(asimd::corr_incr, 16));
+    }
+
+UTEST_END;
+
+
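The reference implementation and the test above also document the intended calling pattern: corr_init() accumulates the first window of samples, after which corr_incr() emits one normalized correlation value per step while sliding that window forward. A minimal usage sketch — the function name process() and the window/count handling are illustrative assumptions, not library code:

    #include <lsp-plug.in/dsp/dsp.h>

    // Sketch: dst receives (count - window) normalized correlation values,
    // one per window position; requires count >= window.
    void process(float *dst, const float *a, const float *b,
        size_t window, size_t count)
    {
        dsp::correlation_t corr;
        corr.v = 0.0f;  // running sum(a*b)
        corr.a = 0.0f;  // running sum(a*a)
        corr.b = 0.0f;  // running sum(b*b)

        dsp::corr_init(&corr, a, b, window);    // prime the first window

        // Head pointers feed the samples entering the window, tail pointers
        // the samples leaving it, so the window length stays constant.
        dsp::corr_incr(&corr, dst,
            &a[window], &b[window],             // heads
            a, b,                               // tails
            count - window);
    }
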
diff --git a/src/test/utest/corr_init.cpp b/src/test/utest/corr_init.cpp
new file mode 100644
index 00000000..a1a3c325
--- /dev/null
+++ b/src/test/utest/corr_init.cpp
@@ -0,0 +1,147 @@
+/*
+ * Copyright (C) 2024 Linux Studio Plugins Project
+ *   (C) 2024 Vladimir Sadovnikov
+ *
+ * This file is part of lsp-dsp-lib
+ * Created on: 8 мар. 2024 г.
+ *
+ * lsp-dsp-lib is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * any later version.
+ *
+ * lsp-dsp-lib is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with lsp-dsp-lib. If not, see .
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+namespace lsp
+{
+    namespace generic
+    {
+        void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count);
+    }
+
+    IF_ARCH_X86(
+        namespace sse
+        {
+            void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count);
+        }
+
+        namespace avx
+        {
+            void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count);
+            void corr_init_fma3(dsp::correlation_t *corr, const float *a, const float *b, size_t count);
+        }
+
+        namespace avx512
+        {
+            void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count);
+        }
+    )
+
+    IF_ARCH_ARM(
+        namespace neon_d32
+        {
+            void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count);
+        }
+    )
+
+    IF_ARCH_AARCH64(
+        namespace asimd
+        {
+            void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count);
+        }
+    )
+
+    static void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count)
+    {
+        float vv = 0.0f;
+        float va = 0.0f;
+        float vb = 0.0f;
+
+        for (size_t i=0; i<count; ++i)
+        {
+            vv += a[i]*b[i];
+            va += a[i]*a[i];
+            vb += b[i]*b[i];
+        }
+
+        corr->v += vv;
+        corr->a += va;
+        corr->b += vb;
+    }
+
+    typedef void (* corr_init_t)(dsp::correlation_t *corr, const float *a, const float *b, size_t count);
+}
+
+UTEST_BEGIN("dsp", corr_init)
+    void call(const char *label, size_t align, corr_init_t func)
+    {
+        if (!UTEST_SUPPORTED(func))
+            return;
+
+        for (size_t mask=0; mask <= 0x07; ++mask)
+        {
+            UTEST_FOREACH(count, 0, 1, 2, 3, 4, 5, 8, 16, 24, 32, 33, 64, 47, 0x80, 0x1ff)
+            {
+                FloatBuffer a(count, align, mask & 0x01);
+                FloatBuffer b(count, align, mask & 0x02);
+
+                dsp::correlation_t corr_a, corr_b;
+                corr_a.v = 0.1f;
+                corr_a.a = 0.2f;
+                corr_a.b = 0.3f;
+                corr_b.v = 0.1f;
+                corr_b.a = 0.2f;
+                corr_b.b = 0.3f;
+
+                printf("Testing %s corr_init on buffer count=%d mask=0x%x\n", label, int(count), int(mask));
+
+                corr_init(&corr_a, a, b, count);
+                func(&corr_b, a, b, count);
+
+                UTEST_ASSERT_MSG(a.valid(), "Buffer A corrupted");
+                UTEST_ASSERT_MSG(b.valid(), "Buffer B corrupted");
+
+                // Compare state
+                if ((!float_equals_adaptive(corr_a.v, corr_b.v, 1e-5f)) ||
+                    (!float_equals_adaptive(corr_a.a, corr_b.a, 1e-5f)) ||
+                    (!float_equals_adaptive(corr_a.b, corr_b.b, 1e-5f)))
+                {
+                    UTEST_FAIL_MSG("Correlation state differs a={%f, %f, %f}, b={%f, %f, %f}",
+                        corr_a.v, corr_a.a, corr_a.b,
+                        corr_b.v, corr_b.a, corr_b.b);
+                }
+            }
+        }
+    }
+
+    UTEST_MAIN
+    {
+        #define CALL(func, align) \
+            call(#func, align, func)
+
+        CALL(generic::corr_init, 16);
+        IF_ARCH_X86(CALL(sse::corr_init, 16));
+        IF_ARCH_X86(CALL(avx::corr_init, 32));
+        IF_ARCH_X86(CALL(avx::corr_init_fma3, 32));
+        IF_ARCH_X86(CALL(avx512::corr_init, 64));
+        IF_ARCH_ARM(CALL(neon_d32::corr_init, 16));
+        IF_ARCH_AARCH64(CALL(asimd::corr_init, 16));
+    }
+
+UTEST_END;
+
+
diff --git a/src/test/utest/msmatrix/conv2.cpp b/src/test/utest/msmatrix/conv2.cpp
index f22e21e4..540e3c46 100644
--- a/src/test/utest/msmatrix/conv2.cpp
+++ b/src/test/utest/msmatrix/conv2.cpp
@@ -1,6 +1,6 @@
 /*
- * Copyright (C) 2023 Linux Studio Plugins Project
- * (C) 2023 Vladimir Sadovnikov
+ * Copyright (C) 2024 Linux Studio Plugins Project
+
* (C) 2024 Vladimir Sadovnikov * * This file is part of lsp-dsp-lib * Created on: 31 мар. 2020 г. @@ -85,7 +85,7 @@ UTEST_BEGIN("dsp.msmatrix", conv2) return; UTEST_FOREACH(count, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 32, 64, 65, 100, 999, 0xfff) + 32, 33, 34, 35, 36, 37, 38, 39, 40, 63, 64, 65, 100, 999, 0xfff) { for (size_t mask=0; mask <= 0x0f; ++mask) { diff --git a/src/test/utest/msmatrix/conv2x1.cpp b/src/test/utest/msmatrix/conv2x1.cpp index ba75853e..889e8a74 100644 --- a/src/test/utest/msmatrix/conv2x1.cpp +++ b/src/test/utest/msmatrix/conv2x1.cpp @@ -1,6 +1,6 @@ /* - * Copyright (C) 2023 Linux Studio Plugins Project - * (C) 2023 Vladimir Sadovnikov + * Copyright (C) 2024 Linux Studio Plugins Project + * (C) 2024 Vladimir Sadovnikov * * This file is part of lsp-dsp-lib * Created on: 31 мар. 2020 г. @@ -97,7 +97,7 @@ UTEST_BEGIN("dsp.msmatrix", conv2x1) return; UTEST_FOREACH(count, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 32, 64, 65, 100, 999, 0xfff) + 32, 33, 34, 35, 36, 37, 38, 39, 40, 63, 64, 65, 100, 999, 0xfff) { for (size_t mask=0; mask <= 0x07; ++mask) { diff --git a/src/test/utest/resampling/oversampling.cpp b/src/test/utest/resampling/oversampling.cpp index 6ec2ae51..2e484c9f 100644 --- a/src/test/utest/resampling/oversampling.cpp +++ b/src/test/utest/resampling/oversampling.cpp @@ -23,6 +23,7 @@ #include #include #include +#include namespace lsp {