diff --git a/CHANGELOG b/CHANGELOG
index 4d445478..aae74d38 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -2,6 +2,12 @@
* RECENT CHANGES
*******************************************************************************
+=== 1.0.22 ===
+* Implemented functions for computing correlation between two signals.
+* Fixed error in AVX and AVX-512 implementation of lr_to_ms functions.
+* Updated build scripts.
+* Updated module versions in dependencies.
+
=== 1.0.21 ===
* Updated build scripts.
* Updated module versions in dependencies.
diff --git a/include/lsp-plug.in/dsp/common/correlation.h b/include/lsp-plug.in/dsp/common/correlation.h
new file mode 100644
index 00000000..bd86c0d9
--- /dev/null
+++ b/include/lsp-plug.in/dsp/common/correlation.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (C) 2024 Linux Studio Plugins Project
+ * (C) 2024 Vladimir Sadovnikov
+ *
+ * This file is part of lsp-dsp-lib
+ * Created on: 7 мар. 2024 г.
+ *
+ * lsp-dsp-lib is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * any later version.
+ *
+ * lsp-dsp-lib is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with lsp-dsp-lib. If not, see <https://www.gnu.org/licenses/>.
+ */
+#ifndef LSP_PLUG_IN_DSP_COMMON_CORRELATION_H_
+#define LSP_PLUG_IN_DSP_COMMON_CORRELATION_H_
+
+#include <lsp-plug.in/dsp/common/types.h>
+
+LSP_DSP_LIB_BEGIN_NAMESPACE
+
+#pragma pack(push, 1)
+
+/**
+ * Object to store correlation state.
+ *
+ * The correlation is computed using the following formula:
+ *
+ * sum(a[i] * b[i])
+ * corr = ---------------------------------------
+ * sqrt(sum(a[i]*a[i]) * sum(b[i]*b[i]))
+ *
+ * where i is in range of 0 to count-1.
+ *
+ */
+typedef struct LSP_DSP_LIB_TYPE(correlation_t)
+{
+ float v; // the aggregated value of sum(a*b)
+ float a; // the aggregated value of sum(a*a)
+ float b; // the aggregated value of sum(b*b)
+} LSP_DSP_LIB_TYPE(correlation_t);
+
+#pragma pack(pop)
+
+LSP_DSP_LIB_END_NAMESPACE
+
+/**
+ * Compute the initial intermediate values of correlation between two signals,
+ * the function can be called multiple times, so the value of corr structure
+ * should be cleared before first call.
+ *
+ * @param corr the object to initialize with intermediate results
+ * @param a the pointer to the first signal buffer
+ * @param b the pointer to the second signal buffer
+ * @param count number of samples to process
+ */
+LSP_DSP_LIB_SYMBOL(void, corr_init,
+ LSP_DSP_LIB_TYPE(correlation_t) *corr,
+ const float *a, const float *b,
+ size_t count);
+
+/**
+ * Compute incremental value of normalized correlation between two signals
+ *
+ * @param corr the object that holds intermediate results
+ * @param dst destination buffer to store result
+ * @param a_head the pointer to the head of the first signal buffer
+ * @param b_head the pointer to the head of the second signal buffer
+ * @param a_tail the pointer to the tail of the first signal buffer
+ * @param b_tail the pointer to the tail of the second signal buffer
+ * @param head the offset of the head element relative to the first one in correlation (length of the window minus one)
+ * @param count number of samples to process
+ */
+LSP_DSP_LIB_SYMBOL(void, corr_incr,
+ LSP_DSP_LIB_TYPE(correlation_t) *corr,
+ float *dst,
+ const float *a_head, const float *b_head,
+ const float *a_tail, const float *b_tail,
+ size_t count);
+
+#endif /* LSP_PLUG_IN_DSP_COMMON_CORRELATION_H_ */
diff --git a/include/lsp-plug.in/dsp/dsp.h b/include/lsp-plug.in/dsp/dsp.h
index 900547f1..38c0a815 100644
--- a/include/lsp-plug.in/dsp/dsp.h
+++ b/include/lsp-plug.in/dsp/dsp.h
@@ -48,6 +48,7 @@
#include
#include
#include
+#include <lsp-plug.in/dsp/common/correlation.h>
#include
#include
#include
diff --git a/include/lsp-plug.in/dsp/version.h b/include/lsp-plug.in/dsp/version.h
index bd21c8d5..8fa10d85 100644
--- a/include/lsp-plug.in/dsp/version.h
+++ b/include/lsp-plug.in/dsp/version.h
@@ -25,7 +25,7 @@
// Define version of headers
#define LSP_DSP_LIB_MAJOR 1
#define LSP_DSP_LIB_MINOR 0
-#define LSP_DSP_LIB_MICRO 21
+#define LSP_DSP_LIB_MICRO 22
#if defined(__WINDOWS__) || defined(__WIN32__) || defined(__WIN64__) || defined(_WIN64) || defined(_WIN32) || defined(__WINNT) || defined(__WINNT__)
#define LSP_DSP_LIB_EXPORT_MODIFIER __declspec(dllexport)
diff --git a/include/private/dsp/arch/aarch64/asimd/correlation.h b/include/private/dsp/arch/aarch64/asimd/correlation.h
new file mode 100644
index 00000000..4debc2af
--- /dev/null
+++ b/include/private/dsp/arch/aarch64/asimd/correlation.h
@@ -0,0 +1,361 @@
+/*
+ * Copyright (C) 2024 Linux Studio Plugins Project
+ * (C) 2024 Vladimir Sadovnikov
+ *
+ * This file is part of lsp-dsp-lib
+ * Created on: 9 мар. 2024 г.
+ *
+ * lsp-dsp-lib is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * any later version.
+ *
+ * lsp-dsp-lib is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with lsp-dsp-lib. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#ifndef PRIVATE_DSP_ARCH_AARCH64_ASIMD_CORRELATION_H_
+#define PRIVATE_DSP_ARCH_AARCH64_ASIMD_CORRELATION_H_
+
+#ifndef PRIVATE_DSP_ARCH_AARCH64_ASIMD_IMPL
+ #error "This header should not be included directly"
+#endif /* PRIVATE_DSP_ARCH_AARCH64_ASIMD_IMPL */
+
+namespace lsp
+{
+ namespace asimd
+ {
+
+ void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count)
+ {
+ ARCH_AARCH64_ASM(
+ __ASM_EMIT("eor v0.16b, v0.16b, v0.16b") /* xv = 0 */
+ __ASM_EMIT("eor v1.16b, v1.16b, v1.16b") /* xa = 0 */
+ __ASM_EMIT("eor v2.16b, v2.16b, v2.16b") /* xb = 0 */
+ /* 16x blocks */
+ __ASM_EMIT("subs %[count], %[count], #16")
+ __ASM_EMIT("b.lo 2f")
+ __ASM_EMIT("eor v3.16b, v3.16b, v3.16b") /* xv = 0 */
+ __ASM_EMIT("eor v4.16b, v4.16b, v4.16b") /* xa = 0 */
+ __ASM_EMIT("eor v5.16b, v5.16b, v5.16b") /* xb = 0 */
+ __ASM_EMIT("1:")
+ __ASM_EMIT("ldp q8, q9, [%[a], 0x00]") /* v8 = a0, v9 = a1 */
+ __ASM_EMIT("ldp q10, q11, [%[a], 0x20]") /* v10 = a2, v11 = a3 */
+ __ASM_EMIT("ldp q12, q13, [%[b], 0x00]") /* v12 = b0, v13 = b1 */
+ __ASM_EMIT("ldp q14, q15, [%[b], 0x20]") /* v14 = b2, v15 = b3 */
+ __ASM_EMIT("fmla v0.4s, v8.4s, v12.4s") /* v0 = xv + a0*b0 */
+ __ASM_EMIT("fmla v3.4s, v9.4s, v13.4s") /* v3 = xv + a1*b1 */
+ __ASM_EMIT("fmla v1.4s, v8.4s, v8.4s") /* v1 = xa + a0*a0 */
+ __ASM_EMIT("fmla v4.4s, v9.4s, v9.4s") /* v4 = xa + a1*a1 */
+ __ASM_EMIT("fmla v2.4s, v12.4s, v12.4s") /* v2 = xb + b0*b0 */
+ __ASM_EMIT("fmla v5.4s, v13.4s, v13.4s") /* v5 = xb + b1*b1 */
+ __ASM_EMIT("fmla v0.4s, v10.4s, v14.4s") /* v0 = xv + a0*b0 */
+ __ASM_EMIT("fmla v3.4s, v11.4s, v15.4s") /* v3 = xv + a1*b1 */
+ __ASM_EMIT("fmla v1.4s, v10.4s, v10.4s") /* v1 = xa + a0*a0 */
+ __ASM_EMIT("fmla v4.4s, v11.4s, v11.4s") /* v4 = xa + a1*a1 */
+ __ASM_EMIT("fmla v2.4s, v14.4s, v14.4s") /* v2 = xb + b0*b0 */
+ __ASM_EMIT("fmla v5.4s, v15.4s, v15.4s") /* v5 = xb + b1*b1 */
+ __ASM_EMIT("subs %[count], %[count], #16")
+ __ASM_EMIT("add %[a], %[a], #0x40")
+ __ASM_EMIT("add %[b], %[b], #0x40")
+ __ASM_EMIT("b.hs 1b")
+ __ASM_EMIT("fadd v0.4s, v0.4s, v3.4s")
+ __ASM_EMIT("fadd v1.4s, v1.4s, v4.4s")
+ __ASM_EMIT("fadd v2.4s, v2.4s, v5.4s")
+ __ASM_EMIT("2:")
+ /* 8x block */
+ __ASM_EMIT("adds %[count], %[count], #8")
+ __ASM_EMIT("b.lt 4f")
+ __ASM_EMIT("ldp q8, q9, [%[a], 0x00]") /* v8 = a0, v9 = a1 */
+ __ASM_EMIT("ldp q12, q13, [%[b], 0x00]") /* v12 = b0, v13 = b1 */
+ __ASM_EMIT("fmla v0.4s, v8.4s, v12.4s") /* v0 = xv + a0*b0 */
+ __ASM_EMIT("fmla v1.4s, v8.4s, v8.4s") /* v1 = xa + a0*a0 */
+ __ASM_EMIT("fmla v2.4s, v12.4s, v12.4s") /* v2 = xb + b0*b0 */
+ __ASM_EMIT("fmla v0.4s, v9.4s, v13.4s") /* v0 = xv + a1*b1 */
+ __ASM_EMIT("fmla v1.4s, v9.4s, v9.4s") /* v1 = xa + a1*a1 */
+ __ASM_EMIT("fmla v2.4s, v13.4s, v13.4s") /* v2 = xb + b1*b1 */
+ __ASM_EMIT("sub %[count], %[count], #8")
+ __ASM_EMIT("add %[a], %[a], #0x20")
+ __ASM_EMIT("add %[b], %[b], #0x20")
+ __ASM_EMIT("4:")
+ /* 4x block */
+ __ASM_EMIT("adds %[count], %[count], #4")
+ __ASM_EMIT("b.lt 6f")
+ __ASM_EMIT("ldr q8, [%[a], 0x00]") /* v8 = a0 */
+ __ASM_EMIT("ldr q12, [%[b], 0x00]") /* v12 = b0 */
+ __ASM_EMIT("fmla v0.4s, v8.4s, v12.4s") /* v0 = xv + a0*b0 */
+ __ASM_EMIT("fmla v1.4s, v8.4s, v8.4s") /* v1 = xa + a0*a0 */
+ __ASM_EMIT("fmla v2.4s, v12.4s, v12.4s") /* v2 = xb + b0*b0 */
+ __ASM_EMIT("sub %[count], %[count], #4")
+ __ASM_EMIT("add %[a], %[a], #0x10")
+ __ASM_EMIT("add %[b], %[b], #0x10")
+ __ASM_EMIT("6:")
+ __ASM_EMIT("eor v6.16b, v6.16b, v6.16b") /* v6 = 0 */
+ __ASM_EMIT("ext v3.16b, v0.16b, v6.16b, #8")/* v3 = xv2 xv3 0 0 */
+ __ASM_EMIT("ext v4.16b, v1.16b, v6.16b, #8")/* v4 = xa2 xa3 0 0 */
+ __ASM_EMIT("ext v5.16b, v2.16b, v6.16b, #8")/* v5 = xb2 xb3 0 0 */
+ __ASM_EMIT("fadd v0.4s, v0.4s, v3.4s") /* v0 = xv0+xv2 xv1+xv3 xv2 xv3 */
+ __ASM_EMIT("fadd v1.4s, v1.4s, v4.4s") /* v1 = xa0+xa2 xa1+xa3 xa2 xa3 */
+ __ASM_EMIT("fadd v2.4s, v2.4s, v5.4s") /* v2 = xb0+xb2 xb1+xb3 xb2 xb3 */
+ __ASM_EMIT("ext v3.16b, v0.16b, v6.16b, #4")/* v3 = xv1+xv3 xv2 xv3 0 */
+ __ASM_EMIT("ext v4.16b, v1.16b, v6.16b, #4")/* v4 = xa1+xa3 xa2 xa3 0 */
+ __ASM_EMIT("ext v5.16b, v2.16b, v6.16b, #4")/* v5 = xb1+xb3 xb2 xb3 0 */
+ __ASM_EMIT("fadd v0.4s, v0.4s, v3.4s") /* v0 = xv0+xv1+xv2+xv3 xv1+xv2+xv3 xv2+xv3 xv3 */
+ __ASM_EMIT("fadd v1.4s, v1.4s, v4.4s") /* v1 = xa0+xa1+xa2+xa3 xa1+xa2+xa3 xa2+xa3 xa3 */
+ __ASM_EMIT("fadd v2.4s, v2.4s, v5.4s") /* v2 = xb0+xb1+xb2+xb3 xb1+xb2+xb3 xb2+xb3 xb3 */
+ /* 1x blocks */
+ __ASM_EMIT("adds %[count], %[count], #3")
+ __ASM_EMIT("b.lt 8f")
+ __ASM_EMIT("7:")
+ __ASM_EMIT("ld1r {v8.4s}, [%[a]]") /* v8 = a0 */
+ __ASM_EMIT("ld1r {v12.4s}, [%[b]]") /* v12 = b0 */
+ __ASM_EMIT("fmla v0.4s, v8.4s, v12.4s") /* v0 = xv + a0*b0 */
+ __ASM_EMIT("fmla v1.4s, v8.4s, v8.4s") /* v1 = xa + a0*a0 */
+ __ASM_EMIT("fmla v2.4s, v12.4s, v12.4s") /* v2 = xb + b0*b0 */
+ __ASM_EMIT("subs %[count], %[count], #1")
+ __ASM_EMIT("add %[a], %[a], #0x04")
+ __ASM_EMIT("add %[b], %[b], #0x04")
+ __ASM_EMIT("b.ge 7b")
+ __ASM_EMIT("8:")
+ /* Store result */
+ __ASM_EMIT("ld3r {v3.4s, v4.4s, v5.4s}, [%[corr]]") /* v3 = v, v4 = a, v5 = b */
+ __ASM_EMIT("fadd v0.4s, v0.4s, v3.4s")
+ __ASM_EMIT("fadd v1.4s, v1.4s, v4.4s")
+ __ASM_EMIT("fadd v2.4s, v2.4s, v5.4s")
+ __ASM_EMIT("st3 {v0.s, v1.s, v2.s}[0], [%[corr]]")
+
+ : [a] "+r" (a), [b] "+r" (b), [count] "+r" (count)
+ : [corr] "r" (corr)
+ : "cc", "memory",
+ "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+ "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
+ );
+ }
+
+ static const float corr_const[] __lsp_aligned16 =
+ {
+ LSP_DSP_VEC8(1e-10f)
+ };
+
+ void corr_incr(dsp::correlation_t *corr, float *dst,
+ const float *a_head, const float *b_head,
+ const float *a_tail, const float *b_tail,
+ size_t count)
+ {
+ ARCH_AARCH64_ASM(
+ __ASM_EMIT("ld3r {v0.4s, v1.4s, v2.4s}, [%[corr]]") /* v0 = xv, v1 = xa, v2 = xb */
+ __ASM_EMIT("eor v3.16b, v3.16b, v3.16b") /* v3 = 0 */
+ /* 8x blocks */
+ __ASM_EMIT("subs %[count], %[count], #8")
+ __ASM_EMIT("b.lo 2f")
+ __ASM_EMIT("1:")
+ __ASM_EMIT("ldp q4, q5, [%[a_head], 0x00]") /* v4 = ah0, v5 = ah1 */
+ __ASM_EMIT("ldp q6, q7, [%[b_head], 0x00]") /* v6 = bh0, v7 = bh1 */
+ __ASM_EMIT("ldp q8, q9, [%[a_tail], 0x00]") /* v8 = at0, v9 = at1 */
+ __ASM_EMIT("ldp q10, q11, [%[b_tail], 0x00]") /* v10 = bt0, v11 = bt1 */
+ __ASM_EMIT("fmul v12.4s, v4.4s, v6.4s") /* v12 = ah0*bh0 */
+ __ASM_EMIT("fmul v13.4s, v5.4s, v7.4s")
+ __ASM_EMIT("fmul v4.4s, v4.4s, v4.4s") /* v4 = ah0*ah0 */
+ __ASM_EMIT("fmul v5.4s, v5.4s, v5.4s")
+ __ASM_EMIT("fmul v6.4s, v6.4s, v6.4s") /* v6 = bh0*bh0 */
+ __ASM_EMIT("fmul v7.4s, v7.4s, v7.4s")
+ __ASM_EMIT("fmls v12.4s, v8.4s, v10.4s") /* v12 = DV = ah0*bh0 - at0*bt0 */
+ __ASM_EMIT("fmls v13.4s, v9.4s, v11.4s")
+ __ASM_EMIT("fmls v4.4s, v8.4s, v8.4s") /* v4 = DA = ah0*ah0 - at0*at0 */
+ __ASM_EMIT("fmls v5.4s, v9.4s, v9.4s")
+ __ASM_EMIT("fmls v6.4s, v10.4s, v10.4s") /* v6 = DB = bh0*bh0 - bt0*bt0 */
+ __ASM_EMIT("fmls v7.4s, v11.4s, v11.4s")
+
+ __ASM_EMIT("ext v14.16b, v3.16b, v12.16b, #8") /* v14 = 0 0 DV[0] DV[1] */
+ __ASM_EMIT("ext v15.16b, v3.16b, v13.16b, #8") /* v15 = 0 0 DV[4] DV[5] */
+ __ASM_EMIT("fadd v12.4s, v12.4s, v14.4s") /* v12 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] */
+ __ASM_EMIT("fadd v13.4s, v13.4s, v15.4s") /* v13 = DV[4] DV[5] DV[4]+DV[6] DV[5]+DV[7] */
+ __ASM_EMIT("ext v14.16b, v3.16b, v12.16b, #12") /* v14 = 0 DV[0] DV[1] DV[0]+DV[2] */
+ __ASM_EMIT("ext v15.16b, v3.16b, v13.16b, #12") /* v15 = 0 DV[4] DV[5] DV[4]+DV[6] */
+ __ASM_EMIT("fadd v12.4s, v12.4s, v14.4s") /* v12 = V[0..3] = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] */
+ __ASM_EMIT("fadd v13.4s, v13.4s, v15.4s") /* v13 = V[4..7] = DV[4] DV[4]+DV[5] DV[4]+DV[5]+DV[6] DV[4]+DV[5]+DV[6]+DV[7] */
+ __ASM_EMIT("ext v14.16b, v3.16b, v4.16b, #8") /* v14 = 0 0 DA[0] DA[1] */
+ __ASM_EMIT("ext v15.16b, v3.16b, v5.16b, #8") /* v15 = 0 0 DA[4] DA[5] */
+ __ASM_EMIT("fadd v4.4s, v4.4s, v14.4s") /* v4 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] */
+ __ASM_EMIT("fadd v5.4s, v5.4s, v15.4s") /* v5 = DA[4] DA[5] DA[4]+DA[6] DA[5]+DA[7] */
+ __ASM_EMIT("ext v14.16b, v3.16b, v4.16b, #12") /* v14 = 0 DA[0] DA[1] DA[0]+DA[2] */
+ __ASM_EMIT("ext v15.16b, v3.16b, v5.16b, #12") /* v15 = 0 DA[4] DA[5] DA[4]+DA[6] */
+ __ASM_EMIT("fadd v4.4s, v4.4s, v14.4s") /* v4 = A[0..3] = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] */
+ __ASM_EMIT("fadd v5.4s, v5.4s, v15.4s") /* v5 = A[4..7] = DA[4] DA[4]+DA[5] DA[4]+DA[5]+DA[6] DA[4]+DA[5]+DA[6]+DA[7] */
+ __ASM_EMIT("ext v14.16b, v3.16b, v6.16b, #8") /* v14 = 0 0 DB[0] DB[1] */
+ __ASM_EMIT("ext v15.16b, v3.16b, v7.16b, #8") /* v15 = 0 0 DB[4] DB[5] */
+ __ASM_EMIT("fadd v6.4s, v6.4s, v14.4s") /* v6 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] */
+ __ASM_EMIT("fadd v7.4s, v7.4s, v15.4s") /* v7 = DB[4] DB[5] DB[4]+DB[6] DB[5]+DB[7] */
+ __ASM_EMIT("ext v14.16b, v3.16b, v6.16b, #12") /* v14 = 0 DB[0] DB[1] DB[0]+DB[2] */
+ __ASM_EMIT("ext v15.16b, v3.16b, v7.16b, #12") /* v15 = 0 DB[4] DB[5] DB[4]+DB[6] */
+ __ASM_EMIT("fadd v6.4s, v6.4s, v14.4s") /* v6 = B[0..3] = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] */
+ __ASM_EMIT("fadd v7.4s, v7.4s, v15.4s") /* v7 = B[4..7] = DB[4] DB[4]+DB[5] DB[4]+DB[5]+DB[6] DB[4]+DB[5]+DB[6]+DB[7] */
+ __ASM_EMIT("dup v8.4s, v12.s[3]") /* v8 = V[3] V[3] V[3] V[3] */
+ __ASM_EMIT("dup v9.4s, v4.s[3]") /* v9 = A[3] A[3] A[3] A[3] */
+ __ASM_EMIT("dup v10.4s, v6.s[3]") /* v10 = B[3] B[3] B[3] B[3] */
+ __ASM_EMIT("fadd v13.4s, v13.4s, v8.4s") /* v13 = V[3]+V[4] V[3]+V[5] V[3]+V[6] V[3]+V[7] */
+ __ASM_EMIT("fadd v5.4s, v5.4s, v9.4s") /* v5 = A[3]+A[4] A[3]+A[5] A[3]+A[6] A[3]+A[7] */
+ __ASM_EMIT("fadd v7.4s, v7.4s, v10.4s") /* v7 = B[3]+B[4] B[3]+B[5] B[3]+B[6] B[3]+B[7] */
+
+ __ASM_EMIT("fadd v4.4s, v4.4s, v1.4s") /* v4 = BA = xa + A */
+ __ASM_EMIT("fadd v5.4s, v5.4s, v1.4s")
+ __ASM_EMIT("fadd v6.4s, v6.4s, v2.4s") /* v6 = BB = xb + B */
+ __ASM_EMIT("fadd v7.4s, v7.4s, v2.4s")
+ __ASM_EMIT("fadd v8.4s, v12.4s, v0.4s") /* v8 = T = xv + V */
+ __ASM_EMIT("fadd v9.4s, v13.4s, v0.4s")
+ __ASM_EMIT("fmul v10.4s, v4.4s, v6.4s") /* v10 = B = BA * BB */
+ __ASM_EMIT("fmul v11.4s, v5.4s, v7.4s")
+ __ASM_EMIT("dup v0.4s, v9.s[3]") /* v0 = xv' = T[7] */
+ __ASM_EMIT("dup v1.4s, v5.s[3]") /* v1 = xa' = BA[7] */
+ __ASM_EMIT("dup v2.4s, v7.s[3]") /* v2 = xb' = BB[7] */
+ __ASM_EMIT("ldp q14, q15, [%[CORR_CC]]") /* v14 = 1e-10, v15 = 1e-10 */
+
+ __ASM_EMIT("fcmge v14.4s, v8.4s, v14.4s") /* v14 = T >= 1e-10 */
+ __ASM_EMIT("fcmge v15.4s, v9.4s, v15.4s")
+ __ASM_EMIT("frsqrte v4.4s, v10.4s") /* v4 = x0 */
+ __ASM_EMIT("frsqrte v5.4s, v11.4s")
+ __ASM_EMIT("fmul v6.4s, v4.4s, v10.4s") /* v6 = R * x0 */
+ __ASM_EMIT("fmul v7.4s, v5.4s, v11.4s")
+ __ASM_EMIT("frsqrts v12.4s, v6.4s, v4.4s") /* v12 = (3 - R * x0 * x0) / 2 */
+ __ASM_EMIT("frsqrts v13.4s, v7.4s, v5.4s")
+ __ASM_EMIT("fmul v4.4s, v4.4s, v12.4s") /* v4 = x1 = x0 * (3 - R * x0 * x0) / 2 */
+ __ASM_EMIT("fmul v5.4s, v5.4s, v13.4s")
+ __ASM_EMIT("fmul v6.4s, v4.4s, v10.4s") /* v6 = R * x1 */
+ __ASM_EMIT("fmul v7.4s, v5.4s, v11.4s")
+ __ASM_EMIT("frsqrts v12.4s, v6.4s, v4.4s") /* v12 = (3 - R * x1 * x1) / 2 */
+ __ASM_EMIT("frsqrts v13.4s, v7.4s, v5.4s")
+ __ASM_EMIT("fmul v10.4s, v4.4s, v12.4s") /* v10 = 1/sqrtf(B) = x2 = x1 * (3 - R * x1 * x1) / 2 */
+ __ASM_EMIT("fmul v11.4s, v5.4s, v13.4s")
+ __ASM_EMIT("fmul v10.4s, v8.4s, v10.4s") /* v10 = T/sqrtf(B) */
+ __ASM_EMIT("fmul v11.4s, v9.4s, v11.4s")
+ __ASM_EMIT("and v10.16b, v10.16b, v14.16b") /* v10 = (T >= 1e-10) ? T/sqrtf(B) : 0 */
+ __ASM_EMIT("and v11.16b, v11.16b, v15.16b")
+ __ASM_EMIT("add %[a_head], %[a_head], #0x20")
+ __ASM_EMIT("add %[b_head], %[b_head], #0x20")
+ __ASM_EMIT("subs %[count], %[count], #8")
+ __ASM_EMIT("stp q10, q11, [%[dst], 0x00]")
+ __ASM_EMIT("add %[a_tail], %[a_tail], #0x20")
+ __ASM_EMIT("add %[b_tail], %[b_tail], #0x20")
+ __ASM_EMIT("add %[dst], %[dst], #0x20")
+ __ASM_EMIT("b.hs 1b")
+ __ASM_EMIT("2:")
+ /* 4x block */
+ __ASM_EMIT("adds %[count], %[count], #4")
+ __ASM_EMIT("b.lt 4f")
+ __ASM_EMIT("ldr q4, [%[a_head], 0x00]") /* v4 = ah0 */
+ __ASM_EMIT("ldr q6, [%[b_head], 0x00]") /* v6 = bh0 */
+ __ASM_EMIT("ldr q8, [%[a_tail], 0x00]") /* v8 = at0 */
+ __ASM_EMIT("ldr q10, [%[b_tail], 0x00]") /* v10 = bt0 */
+ __ASM_EMIT("fmul v12.4s, v4.4s, v6.4s") /* v12 = ah0*bh0 */
+ __ASM_EMIT("fmul v4.4s, v4.4s, v4.4s") /* v4 = ah0*ah0 */
+ __ASM_EMIT("fmul v6.4s, v6.4s, v6.4s") /* v6 = bh0*bh0 */
+ __ASM_EMIT("fmls v12.4s, v8.4s, v10.4s") /* v12 = DV = ah0*bh0 - at0*bt0 */
+ __ASM_EMIT("fmls v4.4s, v8.4s, v8.4s") /* v4 = DA = ah0*ah0 - at0*at0 */
+ __ASM_EMIT("fmls v6.4s, v10.4s, v10.4s") /* v6 = DB = bh0*bh0 - bt0*bt0 */
+
+ __ASM_EMIT("ext v13.16b, v3.16b, v12.16b, #8") /* v13 = 0 0 DV[0] DV[1] */
+ __ASM_EMIT("ext v5.16b, v3.16b, v4.16b, #8") /* v5 = 0 0 DA[0] DA[1] */
+ __ASM_EMIT("ext v7.16b, v3.16b, v6.16b, #8") /* v7 = 0 0 DB[0] DB[1] */
+ __ASM_EMIT("fadd v12.4s, v12.4s, v13.4s") /* v12 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] */
+ __ASM_EMIT("fadd v4.4s, v4.4s, v5.4s") /* v4 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] */
+ __ASM_EMIT("fadd v6.4s, v6.4s, v7.4s") /* v6 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] */
+ __ASM_EMIT("ext v13.16b, v3.16b, v12.16b, #12") /* v13 = 0 DV[0] DV[1] DV[0]+DV[2] */
+ __ASM_EMIT("ext v5.16b, v3.16b, v4.16b, #12") /* v5 = 0 DA[0] DA[1] DA[0]+DA[2] */
+ __ASM_EMIT("ext v7.16b, v3.16b, v6.16b, #12") /* v7 = 0 DB[0] DB[1] DB[0]+DB[2] */
+ __ASM_EMIT("fadd v12.4s, v12.4s, v13.4s") /* v12 = V[0..3] = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] */
+ __ASM_EMIT("fadd v4.4s, v4.4s, v5.4s") /* v4 = A[0..3] = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] */
+ __ASM_EMIT("fadd v6.4s, v6.4s, v7.4s") /* v6 = B[0..3] = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] */
+
+ __ASM_EMIT("fadd v4.4s, v4.4s, v1.4s") /* v4 = BA = xa + A */
+ __ASM_EMIT("fadd v6.4s, v6.4s, v2.4s") /* v6 = BB = xb + B */
+ __ASM_EMIT("fadd v8.4s, v12.4s, v0.4s") /* v8 = T = xv + V */
+ __ASM_EMIT("fmul v10.4s, v4.4s, v6.4s") /* v10 = B = BA * BB */
+ __ASM_EMIT("dup v1.4s, v4.s[3]") /* v1 = xa' = BA[7] */
+ __ASM_EMIT("dup v2.4s, v6.s[3]") /* v2 = xb' = BB[7] */
+ __ASM_EMIT("dup v0.4s, v8.s[3]") /* v0 = xv' = T[7] */
+ __ASM_EMIT("ldr q14, [%[CORR_CC]]") /* v14 = 1e-10 */
+
+ __ASM_EMIT("fcmge v14.4s, v8.4s, v14.4s") /* v14 = T >= 1e-10 */
+ __ASM_EMIT("frsqrte v4.4s, v10.4s") /* v4 = x0 */
+ __ASM_EMIT("fmul v6.4s, v4.4s, v10.4s") /* v6 = R * x0 */
+ __ASM_EMIT("frsqrts v12.4s, v6.4s, v4.4s") /* v12 = (3 - R * x0 * x0) / 2 */
+ __ASM_EMIT("fmul v4.4s, v4.4s, v12.4s") /* v4 = x1 = x0 * (3 - R * x0 * x0) / 2 */
+ __ASM_EMIT("fmul v6.4s, v4.4s, v10.4s") /* v6 = R * x1 */
+ __ASM_EMIT("frsqrts v12.4s, v6.4s, v4.4s") /* v12 = (3 - R * x1 * x1) / 2 */
+ __ASM_EMIT("fmul v10.4s, v4.4s, v12.4s") /* v10 = 1/sqrtf(B) = x2 = x1 * (3 - R * x1 * x1) / 2 */
+ __ASM_EMIT("fmul v10.4s, v8.4s, v10.4s") /* v10 = T/sqrtf(B) */
+ __ASM_EMIT("and v10.16b, v10.16b, v14.16b") /* v10 = (T >= 1e-10) ? T/sqrtf(B) : 0 */
+ __ASM_EMIT("add %[a_head], %[a_head], #0x10")
+ __ASM_EMIT("add %[b_head], %[b_head], #0x10")
+ __ASM_EMIT("sub %[count], %[count], #4")
+ __ASM_EMIT("str q10, [%[dst], 0x00]")
+ __ASM_EMIT("add %[a_tail], %[a_tail], #0x10")
+ __ASM_EMIT("add %[b_tail], %[b_tail], #0x10")
+ __ASM_EMIT("add %[dst], %[dst], #0x10")
+ __ASM_EMIT("4:")
+ /* 1x blocks */
+ __ASM_EMIT("adds %[count], %[count], #3")
+ __ASM_EMIT("blt 6f")
+ __ASM_EMIT("ldr q3, [%[CORR_CC]]") /* v3 = 1e-10 */
+ __ASM_EMIT("5:")
+ __ASM_EMIT("ld1r {v4.4s}, [%[a_head]]") /* v4 = ah0 */
+ __ASM_EMIT("ld1r {v6.4s}, [%[b_head]]") /* v6 = bh0 */
+ __ASM_EMIT("ld1r {v8.4s}, [%[a_tail]]") /* v8 = at0 */
+ __ASM_EMIT("ld1r {v10.4s}, [%[b_tail]]") /* v10 = bt0 */
+ __ASM_EMIT("fmul v12.4s, v4.4s, v6.4s") /* v12 = ah0*bh0 */
+ __ASM_EMIT("fmul v4.4s, v4.4s, v4.4s") /* v4 = ah0*ah0 */
+ __ASM_EMIT("fmul v6.4s, v6.4s, v6.4s") /* v6 = bh0*bh0 */
+ __ASM_EMIT("fmls v12.4s, v8.4s, v10.4s") /* v12 = DV = ah0*bh0 - at0*bt0 */
+ __ASM_EMIT("fmls v4.4s, v8.4s, v8.4s") /* v4 = DA = ah0*ah0 - at0*at0 */
+ __ASM_EMIT("fmls v6.4s, v10.4s, v10.4s") /* v6 = DB = bh0*bh0 - bt0*bt0 */
+
+ __ASM_EMIT("fadd v1.4s, v4.4s, v1.4s") /* v1 = BA = xa + DA */
+ __ASM_EMIT("fadd v2.4s, v6.4s, v2.4s") /* v2 = BB = xb + DB */
+ __ASM_EMIT("fadd v0.4s, v12.4s, v0.4s") /* v0 = T = xv + DV */
+ __ASM_EMIT("fmul v10.4s, v1.4s, v2.4s") /* v10 = B = BA * BB */
+
+ __ASM_EMIT("fcmge v14.4s, v0.4s, v3.4s") /* v14 = T >= 1e-10 */
+ __ASM_EMIT("frsqrte v4.4s, v10.4s") /* v4 = x0 */
+ __ASM_EMIT("fmul v6.4s, v4.4s, v10.4s") /* v6 = R * x0 */
+ __ASM_EMIT("frsqrts v12.4s, v6.4s, v4.4s") /* v12 = (3 - R * x0 * x0) / 2 */
+ __ASM_EMIT("fmul v4.4s, v4.4s, v12.4s") /* v4 = x1 = x0 * (3 - R * x0 * x0) / 2 */
+ __ASM_EMIT("fmul v6.4s, v4.4s, v10.4s") /* v6 = R * x1 */
+ __ASM_EMIT("frsqrts v12.4s, v6.4s, v4.4s") /* v12 = (3 - R * x1 * x1) / 2 */
+ __ASM_EMIT("fmul v10.4s, v4.4s, v12.4s") /* v10 = 1/sqrtf(B) = x2 = x1 * (3 - R * x1 * x1) / 2 */
+ __ASM_EMIT("fmul v10.4s, v0.4s, v10.4s") /* v10 = T/sqrtf(B) */
+ __ASM_EMIT("and v10.16b, v10.16b, v14.16b") /* v10 = (T >= 1e-10) ? T/sqrtf(B) : 0 */
+ __ASM_EMIT("add %[a_head], %[a_head], #0x04")
+ __ASM_EMIT("add %[b_head], %[b_head], #0x04")
+ __ASM_EMIT("subs %[count], %[count], #1")
+ __ASM_EMIT("st1 {v10.s}[0], [%[dst]]")
+ __ASM_EMIT("add %[a_tail], %[a_tail], #0x04")
+ __ASM_EMIT("add %[b_tail], %[b_tail], #0x04")
+ __ASM_EMIT("add %[dst], %[dst], #0x04")
+ __ASM_EMIT("b.ge 5b")
+ __ASM_EMIT("6:")
+ /* Store state */
+ __ASM_EMIT("st3 {v0.s, v1.s, v2.s}[0], [%[corr]]")
+
+ : [dst] "+r" (dst),
+ [a_head] "+r" (a_head), [b_head] "+r" (b_head),
+ [a_tail] "+r" (a_tail), [b_tail] "+r" (b_tail),
+ [count] "+r" (count)
+ : [corr] "r" (corr),
+ [CORR_CC] "r" (&corr_const[0])
+ : "cc", "memory",
+ "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
+ );
+ }
+
+ } /* namespace asimd */
+} /* namespace lsp */
+
+
+
+#endif /* PRIVATE_DSP_ARCH_AARCH64_ASIMD_CORRELATION_H_ */
diff --git a/include/private/dsp/arch/arm/neon-d32/correlation.h b/include/private/dsp/arch/arm/neon-d32/correlation.h
new file mode 100644
index 00000000..0021fdb7
--- /dev/null
+++ b/include/private/dsp/arch/arm/neon-d32/correlation.h
@@ -0,0 +1,337 @@
+/*
+ * Copyright (C) 2024 Linux Studio Plugins Project
+ * (C) 2024 Vladimir Sadovnikov
+ *
+ * This file is part of lsp-dsp-lib
+ * Created on: 9 мар. 2024 г.
+ *
+ * lsp-dsp-lib is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * any later version.
+ *
+ * lsp-dsp-lib is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with lsp-dsp-lib. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#ifndef PRIVATE_DSP_ARCH_ARM_NEON_D32_CORRELATION_H_
+#define PRIVATE_DSP_ARCH_ARM_NEON_D32_CORRELATION_H_
+
+#ifndef PRIVATE_DSP_ARCH_ARM_NEON_D32_IMPL
+ #error "This header should not be included directly"
+#endif /* PRIVATE_DSP_ARCH_ARM_NEON_D32_IMPL */
+
+namespace lsp
+{
+ namespace neon_d32
+ {
+
+ void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count)
+ {
+ ARCH_ARM_ASM(
+ __ASM_EMIT("veor q0, q0, q0") /* xv = 0 */
+ __ASM_EMIT("veor q1, q1, q1") /* xa = 0 */
+ __ASM_EMIT("veor q2, q2, q2") /* xb = 0 */
+ /* 16x blocks */
+ __ASM_EMIT("subs %[count], #16")
+ __ASM_EMIT("blo 2f")
+ __ASM_EMIT("veor q3, q3, q3") /* xv = 0 */
+ __ASM_EMIT("veor q4, q4, q4") /* xa = 0 */
+ __ASM_EMIT("veor q5, q5, q5") /* xb = 0 */
+ __ASM_EMIT("1:")
+ __ASM_EMIT("vldm %[a]!, {q8-q11}") /* q8 = a0, q9 = a1, q10 = a2, q11 = a3 */
+ __ASM_EMIT("vldm %[b]!, {q12-q15}") /* q12 = b0, q13 = b1, q14 = b2, q15 = b3 */
+ __ASM_EMIT("vmla.f32 q0, q8, q12") /* q0 = xv + a0*b0 */
+ __ASM_EMIT("vmla.f32 q3, q9, q13") /* q3 = xv + a1*b1 */
+ __ASM_EMIT("vmla.f32 q1, q8, q8") /* q1 = xa + a0*a0 */
+ __ASM_EMIT("vmla.f32 q4, q9, q9") /* q4 = xa + a1*a1 */
+ __ASM_EMIT("vmla.f32 q2, q12, q12") /* q2 = xb + b0*b0 */
+ __ASM_EMIT("vmla.f32 q5, q13, q13") /* q5 = xb + b1*b1 */
+ __ASM_EMIT("vmla.f32 q0, q10, q14") /* q0 = xv + a2*b2 */
+ __ASM_EMIT("vmla.f32 q3, q11, q15") /* q3 = xv + a3*b3 */
+ __ASM_EMIT("vmla.f32 q1, q10, q10") /* q1 = xa + a2*a2 */
+ __ASM_EMIT("vmla.f32 q4, q11, q11") /* q4 = xa + a3*a3 */
+ __ASM_EMIT("vmla.f32 q2, q14, q14") /* q2 = xb + b2*b2 */
+ __ASM_EMIT("vmla.f32 q5, q15, q15") /* q5 = xb + b3*b3 */
+ __ASM_EMIT("subs %[count], #16")
+ __ASM_EMIT("bhs 1b")
+ __ASM_EMIT("vadd.f32 q0, q0, q3")
+ __ASM_EMIT("vadd.f32 q1, q1, q4")
+ __ASM_EMIT("vadd.f32 q2, q2, q5")
+ __ASM_EMIT("2:")
+ /* 8x block */
+ __ASM_EMIT("adds %[count], #8")
+ __ASM_EMIT("blt 4f")
+ __ASM_EMIT("vldm %[a]!, {q8-q9}") /* q8 = a0, q9 = a1 */
+ __ASM_EMIT("vldm %[b]!, {q12-q13}") /* q12 = b0, q13 = b1 */
+ __ASM_EMIT("vmla.f32 q0, q8, q12") /* q0 = xv + a0*b0 */
+ __ASM_EMIT("vmla.f32 q1, q8, q8") /* q1 = xa + a0*a0 */
+ __ASM_EMIT("vmla.f32 q2, q12, q12") /* q2 = xb + b0*b0 */
+ __ASM_EMIT("vmla.f32 q0, q9, q13") /* q0 = xv + a1*b1 */
+ __ASM_EMIT("vmla.f32 q1, q9, q9") /* q1 = xa + a1*a1 */
+ __ASM_EMIT("vmla.f32 q2, q13, q13") /* q2 = xb + b1*b1 */
+ __ASM_EMIT("sub %[count], #8")
+ __ASM_EMIT("4:")
+ /* 4x block */
+ __ASM_EMIT("adds %[count], #4")
+ __ASM_EMIT("blt 6f")
+ __ASM_EMIT("vldm %[a]!, {q8}") /* q8 = a0 */
+ __ASM_EMIT("vldm %[b]!, {q12}") /* q12 = b0 */
+ __ASM_EMIT("vmla.f32 q0, q8, q12") /* q0 = xv + a0*b0 */
+ __ASM_EMIT("vmla.f32 q1, q8, q8") /* q1 = xa + a0*a0 */
+ __ASM_EMIT("vmla.f32 q2, q12, q12") /* q2 = xb + b0*b0 */
+ __ASM_EMIT("sub %[count], #4")
+ __ASM_EMIT("6:")
+ __ASM_EMIT("veor q6, q6, q6") /* q6 = 0 */
+ __ASM_EMIT("vext.32 q3, q0, q6, #2") /* q3 = xv2 xv3 0 0 */
+ __ASM_EMIT("vext.32 q4, q1, q6, #2") /* q4 = xa2 xa3 0 0 */
+ __ASM_EMIT("vext.32 q5, q2, q6, #2") /* q5 = xb2 xb3 0 0 */
+ __ASM_EMIT("vadd.f32 q0, q0, q3") /* q0 = xv0+xv2 xv1+xv3 xv2 xv3 */
+ __ASM_EMIT("vadd.f32 q1, q1, q4") /* q1 = xa0+xa2 xa1+xa3 xa2 xa3 */
+ __ASM_EMIT("vadd.f32 q2, q2, q5") /* q2 = xb0+xb2 xb1+xb3 xb2 xb3 */
+ __ASM_EMIT("vext.32 q3, q0, q6, #1") /* q3 = xv1+xv3 xv2 xv3 0 */
+ __ASM_EMIT("vext.32 q4, q1, q6, #1") /* q4 = xa1+xa3 xa2 xa3 0 */
+ __ASM_EMIT("vext.32 q5, q2, q6, #1") /* q5 = xb1+xb3 xb2 xb3 0 */
+ __ASM_EMIT("vadd.f32 q0, q0, q3") /* q0 = xv0+xv1+xv2+xv3 xv1+xv2+xv3 xv2+xv3 xv3 */
+ __ASM_EMIT("vadd.f32 q1, q1, q4") /* q1 = xa0+xa1+xa2+xa3 xa1+xa2+xa3 xa2+xa3 xa3 */
+ __ASM_EMIT("vadd.f32 q2, q2, q5") /* q2 = xb0+xb1+xb2+xb3 xb1+xb2+xb3 xb2+xb3 xb3 */
+ /* 1x blocks */
+ __ASM_EMIT("adds %[count], #3")
+ __ASM_EMIT("blt 8f")
+ __ASM_EMIT("7:")
+ __ASM_EMIT("vld1.32 {d16[], d17[]}, [%[a]]!") /* q8 = a0 */
+ __ASM_EMIT("vld1.32 {d24[], d25[]}, [%[b]]!") /* q12 = b0 */
+ __ASM_EMIT("vmla.f32 q0, q8, q12") /* q0 = xv + a0*b0 */
+ __ASM_EMIT("vmla.f32 q1, q8, q8") /* q1 = xa + a0*a0 */
+ __ASM_EMIT("vmla.f32 q2, q12, q12") /* q2 = xb + b0*b0 */
+ __ASM_EMIT("subs %[count], #1")
+ __ASM_EMIT("bge 7b")
+ __ASM_EMIT("8:")
+ /* Store result */
+ __ASM_EMIT("vld3.32 {d6[], d8[], d10[]}, [%[corr]]") /* q3 = v, q4 = a, q5 = b */
+ __ASM_EMIT("vadd.f32 q0, q0, q3")
+ __ASM_EMIT("vadd.f32 q1, q1, q4")
+ __ASM_EMIT("vadd.f32 q2, q2, q5")
+ __ASM_EMIT("vst3.32 {d0[0], d2[0], d4[0]}, [%[corr]]")
+
+ : [a] "+r" (a), [b] "+r" (b), [count] "+r" (count)
+ : [corr] "r" (corr)
+ : "cc", "memory",
+ "q0", "q1", "q2", "q3", "q4", "q5", "q6",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+ }
+
+ static const float corr_const[] __lsp_aligned16 =
+ {
+ LSP_DSP_VEC8(1e-10f)
+ };
+
+ void corr_incr(dsp::correlation_t *corr, float *dst,
+ const float *a_head, const float *b_head,
+ const float *a_tail, const float *b_tail,
+ size_t count)
+ {
+ ARCH_ARM_ASM(
+ __ASM_EMIT("vld3.32 {d0[], d2[], d4[]}, [%[corr]]")
+ __ASM_EMIT("vld3.32 {d1[], d3[], d5[]}, [%[corr]]") /* q0 = xv, q1 = xa, q2 = xb */
+ __ASM_EMIT("veor q3, q3, q3") /* q3 = 0 */
+ /* 8x blocks */
+ __ASM_EMIT("subs %[count], #8")
+ __ASM_EMIT("blo 2f")
+ __ASM_EMIT("1:")
+ __ASM_EMIT("vldm %[a_head]!, {q4-q5}") /* q4 = ah0, q5 = ah1 */
+ __ASM_EMIT("vldm %[b_head]!, {q6-q7}") /* q6 = bh0, q7 = bh1 */
+ __ASM_EMIT("vldm %[a_tail]!, {q8-q9}") /* q8 = at0, q9 = at1 */
+ __ASM_EMIT("vldm %[b_tail]!, {q10-q11}") /* q10 = bt0, q11 = bt1 */
+ __ASM_EMIT("vmul.f32 q12, q4, q6") /* q12 = ah0*bh0 */
+ __ASM_EMIT("vmul.f32 q13, q5, q7")
+ __ASM_EMIT("vmul.f32 q4, q4, q4") /* q4 = ah0*ah0 */
+ __ASM_EMIT("vmul.f32 q5, q5, q5")
+ __ASM_EMIT("vmul.f32 q6, q6, q6") /* q6 = bh0*bh0 */
+ __ASM_EMIT("vmul.f32 q7, q7, q7")
+ __ASM_EMIT("vmls.f32 q12, q8, q10") /* q12 = DV = ah0*bh0 - at0*bt0 */
+ __ASM_EMIT("vmls.f32 q13, q9, q11")
+ __ASM_EMIT("vmls.f32 q4, q8, q8") /* q4 = DA = ah0*ah0 - at0*at0 */
+ __ASM_EMIT("vmls.f32 q5, q9, q9")
+ __ASM_EMIT("vmls.f32 q6, q10, q10") /* q6 = DB = bh0*bh0 - bt0*bt0 */
+ __ASM_EMIT("vmls.f32 q7, q11, q11")
+
+ __ASM_EMIT("vext.32 q14, q3, q12, #2") /* q14 = 0 0 DV[0] DV[1] */
+ __ASM_EMIT("vext.32 q15, q3, q13, #2") /* q15 = 0 0 DV[4] DV[5] */
+ __ASM_EMIT("vadd.f32 q12, q12, q14") /* q12 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] */
+ __ASM_EMIT("vadd.f32 q13, q13, q15") /* q13 = DV[4] DV[5] DV[4]+DV[6] DV[5]+DV[7] */
+ __ASM_EMIT("vext.32 q14, q3, q12, #3") /* q14 = 0 DV[0] DV[1] DV[0]+DV[2] */
+ __ASM_EMIT("vext.32 q15, q3, q13, #3") /* q15 = 0 DV[4] DV[5] DV[4]+DV[6] */
+ __ASM_EMIT("vadd.f32 q12, q12, q14") /* q12 = V[0..3] = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] */
+ __ASM_EMIT("vadd.f32 q13, q13, q15") /* q13 = V[4..7] = DV[4] DV[4]+DV[5] DV[4]+DV[5]+DV[6] DV[4]+DV[5]+DV[6]+DV[7] */
+ __ASM_EMIT("vext.32 q14, q3, q4, #2") /* q14 = 0 0 DA[0] DA[1] */
+ __ASM_EMIT("vext.32 q15, q3, q5, #2") /* q15 = 0 0 DA[4] DA[5] */
+ __ASM_EMIT("vadd.f32 q4, q4, q14") /* q4 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] */
+ __ASM_EMIT("vadd.f32 q5, q5, q15") /* q5 = DA[4] DA[5] DA[4]+DA[6] DA[5]+DA[7] */
+ __ASM_EMIT("vext.32 q14, q3, q4, #3") /* q14 = 0 DA[0] DA[1] DA[0]+DA[2] */
+ __ASM_EMIT("vext.32 q15, q3, q5, #3") /* q15 = 0 DA[4] DA[5] DA[4]+DA[6] */
+ __ASM_EMIT("vadd.f32 q4, q4, q14") /* q4 = A[0..3] = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] */
+ __ASM_EMIT("vadd.f32 q5, q5, q15") /* q5 = A[4..7] = DA[4] DA[4]+DA[5] DA[4]+DA[5]+DA[6] DA[4]+DA[5]+DA[6]+DA[7] */
+ __ASM_EMIT("vext.32 q14, q3, q6, #2") /* q14 = 0 0 DB[0] DB[1] */
+ __ASM_EMIT("vext.32 q15, q3, q7, #2") /* q15 = 0 0 DB[4] DB[5] */
+ __ASM_EMIT("vadd.f32 q6, q6, q14") /* q6 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] */
+ __ASM_EMIT("vadd.f32 q7, q7, q15") /* q7 = DB[4] DB[5] DB[4]+DB[6] DB[5]+DB[7] */
+ __ASM_EMIT("vext.32 q14, q3, q6, #3") /* q14 = 0 DB[0] DB[1] DB[0]+DB[2] */
+ __ASM_EMIT("vext.32 q15, q3, q7, #3") /* q15 = 0 DB[4] DB[5] DB[4]+DB[6] */
+ __ASM_EMIT("vadd.f32 q6, q6, q14") /* q6 = B[0..3] = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] */
+ __ASM_EMIT("vadd.f32 q7, q7, q15") /* q7 = B[4..7] = DB[4] DB[4]+DB[5] DB[4]+DB[5]+DB[6] DB[4]+DB[5]+DB[6]+DB[7] */
+ __ASM_EMIT("vdup.32 q8, d25[1]") /* q8 = V[3] V[3] V[3] V[3] */
+ __ASM_EMIT("vdup.32 q9, d9[1]") /* q9 = A[3] A[3] A[3] A[3] */
+ __ASM_EMIT("vdup.32 q10, d13[1]") /* q10 = B[3] B[3] B[3] B[3] */
+ __ASM_EMIT("vadd.f32 q13, q13, q8") /* q13 = V[3]+V[4] V[3]+V[5] V[3]+V[6] V[3]+V[7] */
+ __ASM_EMIT("vadd.f32 q5, q5, q9") /* q5 = A[3]+A[4] A[3]+A[5] A[3]+A[6] A[3]+A[7] */
+ __ASM_EMIT("vadd.f32 q7, q7, q10") /* q7 = B[3]+B[4] B[3]+B[5] B[3]+B[6] B[3]+B[7] */
+
+ __ASM_EMIT("vadd.f32 q4, q4, q1") /* q4 = BA = xa + A */
+ __ASM_EMIT("vadd.f32 q5, q5, q1")
+ __ASM_EMIT("vadd.f32 q6, q6, q2") /* q6 = BB = xb + B */
+ __ASM_EMIT("vadd.f32 q7, q7, q2")
+ __ASM_EMIT("vadd.f32 q8, q12, q0") /* q8 = T = xv + V */
+ __ASM_EMIT("vadd.f32 q9, q13, q0")
+ __ASM_EMIT("vmul.f32 q10, q4, q6") /* q10 = B = BA * BB */
+ __ASM_EMIT("vmul.f32 q11, q5, q7")
+ __ASM_EMIT("vdup.32 q0, d19[1]") /* q4 = xv' = T[7] */
+ __ASM_EMIT("vdup.32 q1, d11[1]") /* q5 = xa' = BA[7] */
+ __ASM_EMIT("vdup.32 q2, d15[1]") /* q6 = xb' = BB[7] */
+ __ASM_EMIT("vldm %[CORR_CC], {q14-q15}") /* q14 = 1e-10, q15 = 1e-10 */
+
+ __ASM_EMIT("vcge.f32 q14, q8, q14") /* q14 = T >= 1e-10 */
+ __ASM_EMIT("vcge.f32 q15, q9, q15")
+ __ASM_EMIT("vrsqrte.f32 q4, q10") /* q4 = x0 */
+ __ASM_EMIT("vrsqrte.f32 q5, q11")
+ __ASM_EMIT("vmul.f32 q6, q4, q10") /* q6 = R * x0 */
+ __ASM_EMIT("vmul.f32 q7, q5, q11")
+ __ASM_EMIT("vrsqrts.f32 q12, q6, q4") /* q12 = (3 - R * x0 * x0) / 2 */
+ __ASM_EMIT("vrsqrts.f32 q13, q7, q5")
+ __ASM_EMIT("vmul.f32 q4, q4, q12") /* q4 = x1 = x0 * (3 - R * x0 * x0) / 2 */
+ __ASM_EMIT("vmul.f32 q5, q5, q13")
+ __ASM_EMIT("vmul.f32 q6, q4, q10") /* q6 = R * x1 */
+ __ASM_EMIT("vmul.f32 q7, q5, q11")
+ __ASM_EMIT("vrsqrts.f32 q12, q6, q4") /* q12 = (3 - R * x1 * x1) / 2 */
+ __ASM_EMIT("vrsqrts.f32 q13, q7, q5")
+ __ASM_EMIT("vmul.f32 q10, q4, q12") /* q10 = 1/sqrtf(B) = x2 = x1 * (3 - R * x1 * x1) / 2 */
+ __ASM_EMIT("vmul.f32 q11, q5, q13")
+ __ASM_EMIT("vmul.f32 q10, q8, q10") /* q10 = T/sqrtf(B) */
+ __ASM_EMIT("vmul.f32 q11, q9, q11")
+ __ASM_EMIT("vand q10, q10, q14") /* q10 = (T >= 1e-10) ? T/sqrt(B) : 0 */
+ __ASM_EMIT("vand q11, q11, q15")
+ __ASM_EMIT("subs %[count], #8")
+ __ASM_EMIT("vstm %[dst]!, {q10-q11}")
+ __ASM_EMIT("bhs 1b")
+ __ASM_EMIT("2:")
+ /* 4x block */
+ __ASM_EMIT("adds %[count], #4")
+ __ASM_EMIT("blt 4f")
+ __ASM_EMIT("vldm %[a_head]!, {q4}") /* q4 = ah0 */
+ __ASM_EMIT("vldm %[b_head]!, {q6}") /* q6 = bh0 */
+ __ASM_EMIT("vldm %[a_tail]!, {q8}") /* q8 = at0 */
+ __ASM_EMIT("vldm %[b_tail]!, {q10}") /* q10 = bt0 */
+ __ASM_EMIT("vmul.f32 q12, q4, q6") /* q12 = ah0*bh0 */
+ __ASM_EMIT("vmul.f32 q4, q4, q4") /* q4 = ah0*ah0 */
+ __ASM_EMIT("vmul.f32 q6, q6, q6") /* q6 = bh0*bh0 */
+ __ASM_EMIT("vmls.f32 q12, q8, q10") /* q12 = DV = ah0*bh0 - at0*bt0 */
+ __ASM_EMIT("vmls.f32 q4, q8, q8") /* q4 = DA = ah0*ah0 - at0*at0 */
+ __ASM_EMIT("vmls.f32 q6, q10, q10") /* q6 = DB = bh0*bh0 - bt0*bt0 */
+
+ __ASM_EMIT("vext.32 q13, q3, q12, #2") /* q13 = 0 0 DV[0] DV[1] */
+ __ASM_EMIT("vext.32 q5, q3, q4, #2") /* q5 = 0 0 DA[0] DA[1] */
+ __ASM_EMIT("vext.32 q7, q3, q6, #2") /* q7 = 0 0 DB[0] DB[1] */
+ __ASM_EMIT("vadd.f32 q12, q12, q13") /* q12 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] */
+ __ASM_EMIT("vadd.f32 q4, q4, q5") /* q4 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] */
+ __ASM_EMIT("vadd.f32 q6, q6, q7") /* q6 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] */
+ __ASM_EMIT("vext.32 q13, q3, q12, #3") /* q13 = 0 DV[0] DV[1] DV[0]+DV[2] */
+ __ASM_EMIT("vext.32 q5, q3, q4, #3") /* q5 = 0 DA[0] DA[1] DA[0]+DA[2] */
+ __ASM_EMIT("vext.32 q7, q3, q6, #3") /* q7 = 0 DB[0] DB[1] DB[0]+DB[2] */
+ __ASM_EMIT("vadd.f32 q12, q12, q13") /* q12 = V = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] */
+ __ASM_EMIT("vadd.f32 q4, q4, q5") /* q4 = A = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] */
+ __ASM_EMIT("vadd.f32 q6, q6, q7") /* q6 = B = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] */
+
+ __ASM_EMIT("vadd.f32 q4, q4, q1") /* q4 = BA = xa + A */
+ __ASM_EMIT("vadd.f32 q6, q6, q2") /* q6 = BB = xb + B */
+ __ASM_EMIT("vadd.f32 q8, q12, q0") /* q8 = T = xv + V */
+ __ASM_EMIT("vmul.f32 q10, q4, q6") /* q10 = B = BA * BB */
+ __ASM_EMIT("vdup.32 q1, d9[1]") /* q1 = xa' = BA[3] */
+ __ASM_EMIT("vdup.32 q2, d13[1]") /* q2 = xb' = BB[3] */
+ __ASM_EMIT("vdup.32 q0, d17[1]") /* q0 = xv' = T[3] */
+ __ASM_EMIT("vldm %[CORR_CC], {q14}") /* q14 = 1e-10 */
+
+ __ASM_EMIT("vcge.f32 q14, q8, q14") /* q14 = T >= 1e-10 */
+ __ASM_EMIT("vrsqrte.f32 q4, q10") /* q4 = x0 */
+ __ASM_EMIT("vmul.f32 q6, q4, q10") /* q6 = R * x0 */
+ __ASM_EMIT("vrsqrts.f32 q12, q6, q4") /* q12 = (3 - R * x0 * x0) / 2 */
+ __ASM_EMIT("vmul.f32 q4, q4, q12") /* q4 = x1 = x0 * (3 - R * x0 * x0) / 2 */
+ __ASM_EMIT("vmul.f32 q6, q4, q10") /* q6 = R * x1 */
+ __ASM_EMIT("vrsqrts.f32 q12, q6, q4") /* q12 = (3 - R * x1 * x1) / 2 */
+ __ASM_EMIT("vmul.f32 q10, q4, q12") /* q10 = 1/sqrtf(B) = x2 = x1 * (3 - R * x1 * x1) / 2 */
+ __ASM_EMIT("vmul.f32 q10, q8, q10") /* q10 = T/sqrtf(B) */
+ __ASM_EMIT("vand q10, q10, q14") /* q10 = (T >= 1e-10) ? T/sqrt(B) : 0 */
+ __ASM_EMIT("sub %[count], #4")
+ __ASM_EMIT("vstm %[dst]!, {q10}")
+ __ASM_EMIT("4:")
+ /* 1x blocks */
+ __ASM_EMIT("adds %[count], #3")
+ __ASM_EMIT("blt 6f")
+ __ASM_EMIT("vldm %[CORR_CC], {q3}") /* q3 = 1e-10 */
+ __ASM_EMIT("5:")
+ __ASM_EMIT("vld1.32 {d8[], d9[]}, [%[a_head]]!") /* q4 = ah0 */
+ __ASM_EMIT("vld1.32 {d12[], d13[]}, [%[b_head]]!") /* q6 = bh0 */
+ __ASM_EMIT("vld1.32 {d16[], d17[]}, [%[a_tail]]!") /* q8 = at0 */
+ __ASM_EMIT("vld1.32 {d20[], d21[]}, [%[b_tail]]!") /* q10 = bt0 */
+ __ASM_EMIT("vmul.f32 q12, q4, q6") /* q12 = ah0*bh0 */
+ __ASM_EMIT("vmul.f32 q4, q4, q4") /* q4 = ah0*ah0 */
+ __ASM_EMIT("vmul.f32 q6, q6, q6") /* q6 = bh0*bh0 */
+ __ASM_EMIT("vmls.f32 q12, q8, q10") /* q12 = DV = ah0*bh0 - at0*bt0 */
+ __ASM_EMIT("vmls.f32 q4, q8, q8") /* q4 = DA = ah0*ah0 - at0*at0 */
+ __ASM_EMIT("vmls.f32 q6, q10, q10") /* q6 = DB = bh0*bh0 - bt0*bt0 */
+
+ __ASM_EMIT("vadd.f32 q1, q4, q1") /* q1 = BA = xa + DA */
+ __ASM_EMIT("vadd.f32 q2, q6, q2") /* q2 = BB = xb + DB */
+ __ASM_EMIT("vadd.f32 q0, q12, q0") /* q0 = T = xv + DV */
+ __ASM_EMIT("vmul.f32 q10, q1, q2") /* q10 = B = BA * BB */
+
+ __ASM_EMIT("vcge.f32 q14, q0, q3") /* q14 = T >= 1e-10 */
+ __ASM_EMIT("vrsqrte.f32 q4, q10") /* q4 = x0 */
+ __ASM_EMIT("vmul.f32 q6, q4, q10") /* q6 = R * x0 */
+ __ASM_EMIT("vrsqrts.f32 q12, q6, q4") /* q12 = (3 - R * x0 * x0) / 2 */
+ __ASM_EMIT("vmul.f32 q4, q4, q12") /* q4 = x1 = x0 * (3 - R * x0 * x0) / 2 */
+ __ASM_EMIT("vmul.f32 q6, q4, q10") /* q6 = R * x1 */
+ __ASM_EMIT("vrsqrts.f32 q12, q6, q4") /* q12 = (3 - R * x1 * x1) / 2 */
+ __ASM_EMIT("vmul.f32 q10, q4, q12") /* q10 = 1/sqrtf(B) = x2 = x1 * (3 - R * x1 * x1) / 2 */
+ __ASM_EMIT("vmul.f32 q10, q0, q10") /* q10 = T/sqrtf(B) */
+ __ASM_EMIT("vand q10, q10, q14") /* q10 = (T >= 1e-10) ? T/sqrt(B) : 0 */
+ __ASM_EMIT("subs %[count], #1")
+ __ASM_EMIT("vst1.32 {d20[0]}, [%[dst]]!")
+ __ASM_EMIT("bge 5b")
+ __ASM_EMIT("6:")
+ /* Store state */
+ __ASM_EMIT("vst3.32 {d0[0], d2[0], d4[0]}, [%[corr]]")
+
+ : [dst] "+r" (dst),
+ [a_head] "+r" (a_head), [b_head] "+r" (b_head),
+ [a_tail] "+r" (a_tail), [b_tail] "+r" (b_tail),
+ [count] "+r" (count)
+ : [corr] "r" (corr),
+ [CORR_CC] "r" (&corr_const[0])
+ : "cc", "memory",
+ "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+ }
+
+ } /* namespace neon_d32 */
+} /* namespace lsp */
+
+
+
+#endif /* PRIVATE_DSP_ARCH_ARM_NEON_D32_CORRELATION_H_ */
diff --git a/include/private/dsp/arch/generic/correlation.h b/include/private/dsp/arch/generic/correlation.h
new file mode 100644
index 00000000..9b6dc311
--- /dev/null
+++ b/include/private/dsp/arch/generic/correlation.h
@@ -0,0 +1,198 @@
+/*
+ * Copyright (C) 2024 Linux Studio Plugins Project
+ * (C) 2024 Vladimir Sadovnikov
+ *
+ * This file is part of lsp-dsp-lib
+ * Created on: 8 мар. 2024 г.
+ *
+ * lsp-dsp-lib is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * any later version.
+ *
+ * lsp-dsp-lib is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with lsp-dsp-lib. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+
+#ifndef PRIVATE_DSP_ARCH_GENERIC_CORRELATION_H_
+#define PRIVATE_DSP_ARCH_GENERIC_CORRELATION_H_
+
+#ifndef PRIVATE_DSP_ARCH_GENERIC_IMPL
+ #error "This header should not be included directly"
+#endif /* PRIVATE_DSP_ARCH_GENERIC_IMPL */
+
+namespace lsp
+{
+ namespace generic
+ {
+ /* Accumulate initial correlation state over the first `count` samples:
+ * corr->v += sum(a[i]*b[i]); corr->a += sum(a[i]*a[i]); corr->b += sum(b[i]*b[i]).
+ * The state is updated (+=), not overwritten, so repeated calls accumulate.
+ * The main loop is unrolled 4x with independent accumulator lanes to shorten
+ * floating-point dependency chains; the scalar tail handles count % 4. */
+ void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count)
+ {
+ float xv = 0.0f;
+ float xa = 0.0f;
+ float xb = 0.0f;
+
+ if (count >= 4)
+ {
+ /* Four parallel partial sums per accumulator (T: a*b, A: a*a, B: b*b) */
+ float T[4], A[4], B[4];
+
+ T[0] = 0.0f;
+ T[1] = 0.0f;
+ T[2] = 0.0f;
+ T[3] = 0.0f;
+
+ A[0] = 0.0f;
+ A[1] = 0.0f;
+ A[2] = 0.0f;
+ A[3] = 0.0f;
+
+ B[0] = 0.0f;
+ B[1] = 0.0f;
+ B[2] = 0.0f;
+ B[3] = 0.0f;
+
+ for ( ; count >= 4; count -= 4)
+ {
+ T[0] += a[0] * b[0];
+ T[1] += a[1] * b[1];
+ T[2] += a[2] * b[2];
+ T[3] += a[3] * b[3];
+
+ A[0] += a[0] * a[0];
+ A[1] += a[1] * a[1];
+ A[2] += a[2] * a[2];
+ A[3] += a[3] * a[3];
+
+ B[0] += b[0] * b[0];
+ B[1] += b[1] * b[1];
+ B[2] += b[2] * b[2];
+ B[3] += b[3] * b[3];
+
+ a += 4;
+ b += 4;
+ }
+
+ /* Reduce the four lanes to scalars */
+ xv = T[0] + T[1] + T[2] + T[3];
+ xa = A[0] + A[1] + A[2] + A[3];
+ xb = B[0] + B[1] + B[2] + B[3];
+ }
+
+ /* Scalar tail: remaining 0..3 samples */
+ for ( ; count > 0; --count)
+ {
+ xv += a[0] * b[0];
+ xa += a[0] * a[0];
+ xb += b[0] * b[0];
+
+ a += 1;
+ b += 1;
+ }
+
+ corr->v += xv;
+ corr->a += xa;
+ corr->b += xb;
+ }
+
+ /* Incrementally compute the normalized correlation of a sliding window.
+ * For each output sample the window advances by one position: the head
+ * pointers contribute new samples entering the window and the tail
+ * pointers remove samples leaving it, so the running sums (v, a, b) are
+ * updated with the delta DV/DA/DB instead of being recomputed.
+ * dst[i] = T / sqrt(BA*BB), where T/BA/BB are the updated running sums;
+ * the result is forced to 0 when the denominator product is below 1e-10
+ * to avoid division by (near-)zero. The final sums are stored back to
+ * `corr` so the next call continues from the same state.
+ * The 4x block builds prefix sums of the deltas so four consecutive
+ * window positions are produced per iteration. */
+ void corr_incr(dsp::correlation_t *corr, float *dst,
+ const float *a_head, const float *b_head,
+ const float *a_tail, const float *b_tail,
+ size_t count)
+ {
+ float T[4], BA[4], BB[4], B[4], DV[4], DA[4], DB[4];
+
+ float vv = corr->v;
+ float va = corr->a;
+ float vb = corr->b;
+
+ for ( ; count >= 4; count -= 4)
+ {
+ /* Deltas: contribution of the entering sample minus the leaving one */
+ DV[0] = a_head[0]*b_head[0] - a_tail[0]*b_tail[0];
+ DV[1] = a_head[1]*b_head[1] - a_tail[1]*b_tail[1];
+ DV[2] = a_head[2]*b_head[2] - a_tail[2]*b_tail[2];
+ DV[3] = a_head[3]*b_head[3] - a_tail[3]*b_tail[3];
+
+ DA[0] = a_head[0]*a_head[0] - a_tail[0]*a_tail[0];
+ DA[1] = a_head[1]*a_head[1] - a_tail[1]*a_tail[1];
+ DA[2] = a_head[2]*a_head[2] - a_tail[2]*a_tail[2];
+ DA[3] = a_head[3]*a_head[3] - a_tail[3]*a_tail[3];
+
+ DB[0] = b_head[0]*b_head[0] - b_tail[0]*b_tail[0];
+ DB[1] = b_head[1]*b_head[1] - b_tail[1]*b_tail[1];
+ DB[2] = b_head[2]*b_head[2] - b_tail[2]*b_tail[2];
+ DB[3] = b_head[3]*b_head[3] - b_tail[3]*b_tail[3];
+
+ /* Prefix sums: running totals after each of the four window steps */
+ T[0] = vv + DV[0];
+ T[1] = T[0] + DV[1];
+ T[2] = T[1] + DV[2];
+ T[3] = T[2] + DV[3];
+
+ BA[0] = va + DA[0];
+ BA[1] = BA[0] + DA[1];
+ BA[2] = BA[1] + DA[2];
+ BA[3] = BA[2] + DA[3];
+
+ BB[0] = vb + DB[0];
+ BB[1] = BB[0] + DB[1];
+ BB[2] = BB[1] + DB[2];
+ BB[3] = BB[2] + DB[3];
+
+ B[0] = BA[0] * BB[0];
+ B[1] = BA[1] * BB[1];
+ B[2] = BA[2] * BB[2];
+ B[3] = BA[3] * BB[3];
+
+ /* Guard against a degenerate (near-zero energy) denominator */
+ dst[0] = (B[0] >= 1e-10f) ? T[0] / sqrtf(B[0]) : 0.0f;
+ dst[1] = (B[1] >= 1e-10f) ? T[1] / sqrtf(B[1]) : 0.0f;
+ dst[2] = (B[2] >= 1e-10f) ? T[2] / sqrtf(B[2]) : 0.0f;
+ dst[3] = (B[3] >= 1e-10f) ? T[3] / sqrtf(B[3]) : 0.0f;
+
+ /* Carry the last running sums into the next iteration */
+ vv = T[3];
+ va = BA[3];
+ vb = BB[3];
+
+ a_head += 4;
+ b_head += 4;
+ a_tail += 4;
+ b_tail += 4;
+ dst += 4;
+ }
+
+ /* Scalar tail: remaining 0..3 window positions */
+ for (; count > 0; --count)
+ {
+ DV[0] = a_head[0]*b_head[0] - a_tail[0]*b_tail[0];
+ DA[0] = a_head[0]*a_head[0] - a_tail[0]*a_tail[0];
+ DB[0] = b_head[0]*b_head[0] - b_tail[0]*b_tail[0];
+
+ T[0] = vv + DV[0];
+ BA[0] = va + DA[0];
+ BB[0] = vb + DB[0];
+ B[0] = BA[0] * BB[0];
+
+ dst[0] = (B[0] >= 1e-10f) ? T[0] / sqrtf(B[0]) : 0.0f;
+
+ vv = T[0];
+ va = BA[0];
+ vb = BB[0];
+
+ a_head += 1;
+ b_head += 1;
+ a_tail += 1;
+ b_tail += 1;
+ dst += 1;
+ }
+
+ /* Persist updated state for the next call */
+ corr->v = vv;
+ corr->a = va;
+ corr->b = vb;
+ }
+
+ } /* namespace generic */
+} /* namespace lsp */
+
+
+
+#endif /* PRIVATE_DSP_ARCH_GENERIC_CORRELATION_H_ */
diff --git a/include/private/dsp/arch/x86/avx/correlation.h b/include/private/dsp/arch/x86/avx/correlation.h
new file mode 100644
index 00000000..3aa8c2ff
--- /dev/null
+++ b/include/private/dsp/arch/x86/avx/correlation.h
@@ -0,0 +1,752 @@
+/*
+ * Copyright (C) 2024 Linux Studio Plugins Project
+ * (C) 2024 Vladimir Sadovnikov
+ *
+ * This file is part of lsp-dsp-lib
+ * Created on: 9 мар. 2024 г.
+ *
+ * lsp-dsp-lib is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * any later version.
+ *
+ * lsp-dsp-lib is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with lsp-dsp-lib. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#ifndef PRIVATE_DSP_ARCH_X86_AVX_CORRELATION_H_
+#define PRIVATE_DSP_ARCH_X86_AVX_CORRELATION_H_
+
+#ifndef PRIVATE_DSP_ARCH_X86_AVX_IMPL
+ #error "This header should not be included directly"
+#endif /* PRIVATE_DSP_ARCH_X86_AVX_IMPL */
+
+namespace lsp
+{
+ namespace avx
+ {
+
+ /* AVX implementation of corr_init: accumulates sum(a*b), sum(a*a) and
+ * sum(b*b) over `count` samples and adds the results to corr->v/a/b.
+ * Processed as 16x ymm blocks, then 8x/4x xmm blocks, then a scalar tail. */
+ void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count)
+ {
+ IF_ARCH_X86(
+ size_t off;
+ );
+
+ ARCH_X86_ASM
+ (
+ __ASM_EMIT("xor %[off], %[off]")
+ __ASM_EMIT("vxorps %%ymm0, %%ymm0, %%ymm0") /* xv = 0 */
+ __ASM_EMIT("vxorps %%ymm1, %%ymm1, %%ymm1") /* xa = 0 */
+ __ASM_EMIT("vxorps %%ymm2, %%ymm2, %%ymm2") /* xb = 0 */
+ /* 16x blocks */
+ __ASM_EMIT("sub $16, %[count]")
+ __ASM_EMIT("jb 2f")
+ __ASM_EMIT("1:")
+ __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%ymm3") /* ymm3 = a0 */
+ __ASM_EMIT("vmovups 0x20(%[a], %[off]), %%ymm4") /* ymm4 = a1 */
+ __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%ymm5") /* ymm5 = b0 */
+ __ASM_EMIT("vmovups 0x20(%[b], %[off]), %%ymm6") /* ymm6 = b1 */
+ __ASM_EMIT("vmulps %%ymm3, %%ymm5, %%ymm7") /* ymm7 = a0*b0 */
+ __ASM_EMIT("vmulps %%ymm3, %%ymm3, %%ymm3") /* ymm3 = a0*a0 */
+ __ASM_EMIT("vmulps %%ymm5, %%ymm5, %%ymm5") /* ymm5 = b0*b0 */
+ __ASM_EMIT("vaddps %%ymm7, %%ymm0, %%ymm0") /* ymm0 = xv + a0*b0 */
+ __ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1") /* ymm1 = xa + a0*a0 */
+ __ASM_EMIT("vaddps %%ymm5, %%ymm2, %%ymm2") /* ymm2 = xb + b0*b0 */
+ __ASM_EMIT("vmulps %%ymm4, %%ymm6, %%ymm7") /* ymm7 = a1*b1 */
+ __ASM_EMIT("vmulps %%ymm4, %%ymm4, %%ymm4") /* ymm4 = a1*a1 */
+ __ASM_EMIT("vmulps %%ymm6, %%ymm6, %%ymm6") /* ymm6 = b1*b1 */
+ __ASM_EMIT("vaddps %%ymm7, %%ymm0, %%ymm0") /* ymm0 = xv + a1*b1 */
+ __ASM_EMIT("vaddps %%ymm4, %%ymm1, %%ymm1") /* ymm1 = xa + a1*a1 */
+ __ASM_EMIT("vaddps %%ymm6, %%ymm2, %%ymm2") /* ymm2 = xb + b1*b1 */
+ __ASM_EMIT("add $0x40, %[off]") /* ++off */
+ __ASM_EMIT("sub $16, %[count]")
+ __ASM_EMIT("jae 1b")
+ /* Fold upper 128-bit lanes into the lower ones */
+ __ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm4")
+ __ASM_EMIT("vextractf128 $1, %%ymm1, %%xmm5")
+ __ASM_EMIT("vextractf128 $1, %%ymm2, %%xmm6")
+ __ASM_EMIT("vaddps %%xmm4, %%xmm0, %%xmm0")
+ __ASM_EMIT("vaddps %%xmm5, %%xmm1, %%xmm1")
+ __ASM_EMIT("vaddps %%xmm6, %%xmm2, %%xmm2")
+ __ASM_EMIT("2:")
+ /* 8x block */
+ __ASM_EMIT("add $8, %[count]")
+ __ASM_EMIT("jl 4f")
+ __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */
+ __ASM_EMIT("vmovups 0x10(%[a], %[off]), %%xmm4") /* xmm4 = a1 */
+ __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */
+ __ASM_EMIT("vmovups 0x10(%[b], %[off]), %%xmm6") /* xmm6 = b1 */
+ __ASM_EMIT("vmulps %%xmm3, %%xmm5, %%xmm7") /* xmm7 = a0*b0 */
+ __ASM_EMIT("vmulps %%xmm3, %%xmm3, %%xmm3") /* xmm3 = a0*a0 */
+ __ASM_EMIT("vmulps %%xmm5, %%xmm5, %%xmm5") /* xmm5 = b0*b0 */
+ __ASM_EMIT("vaddps %%xmm7, %%xmm0, %%xmm0") /* xmm0 = xv + a0*b0 */
+ __ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1") /* xmm1 = xa + a0*a0 */
+ __ASM_EMIT("vaddps %%xmm5, %%xmm2, %%xmm2") /* xmm2 = xb + b0*b0 */
+ __ASM_EMIT("vmulps %%xmm4, %%xmm6, %%xmm7") /* xmm7 = a1*b1 */
+ __ASM_EMIT("vmulps %%xmm4, %%xmm4, %%xmm4") /* xmm4 = a1*a1 */
+ __ASM_EMIT("vmulps %%xmm6, %%xmm6, %%xmm6") /* xmm6 = b1*b1 */
+ __ASM_EMIT("vaddps %%xmm7, %%xmm0, %%xmm0") /* xmm0 = xv + a1*b1 */
+ __ASM_EMIT("vaddps %%xmm4, %%xmm1, %%xmm1") /* xmm1 = xa + a1*a1 */
+ __ASM_EMIT("vaddps %%xmm6, %%xmm2, %%xmm2") /* xmm2 = xb + b1*b1 */
+ __ASM_EMIT("sub $8, %[count]")
+ __ASM_EMIT("add $0x20, %[off]") /* ++off */
+ __ASM_EMIT("4:")
+ /* 4x block */
+ __ASM_EMIT("add $4, %[count]")
+ __ASM_EMIT("jl 6f")
+ __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */
+ __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */
+ __ASM_EMIT("vmulps %%xmm3, %%xmm5, %%xmm7") /* xmm7 = a0*b0 */
+ __ASM_EMIT("vmulps %%xmm3, %%xmm3, %%xmm3") /* xmm3 = a0*a0 */
+ __ASM_EMIT("vmulps %%xmm5, %%xmm5, %%xmm5") /* xmm5 = b0*b0 */
+ __ASM_EMIT("vaddps %%xmm7, %%xmm0, %%xmm0") /* xmm0 = xv + a0*b0 */
+ __ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1") /* xmm1 = xa + a0*a0 */
+ __ASM_EMIT("vaddps %%xmm5, %%xmm2, %%xmm2") /* xmm2 = xb + b0*b0 */
+ __ASM_EMIT("sub $4, %[count]")
+ __ASM_EMIT("add $0x10, %[off]") /* ++off */
+ __ASM_EMIT("6:")
+ /* Do horizontal sum */
+ __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0") /* xmm0 = xv0+xv1 xv2+xv3 xv0+xv1 xv2+xv3 */
+ __ASM_EMIT("vhaddps %%xmm1, %%xmm1, %%xmm1") /* xmm1 = xa0+xa1 xa2+xa3 xa0+xa1 xa2+xa3 */
+ __ASM_EMIT("vhaddps %%xmm2, %%xmm2, %%xmm2") /* xmm2 = xb0+xb1 xb2+xb3 xb0+xb1 xb2+xb3 */
+ __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0") /* xmm0 = xv0+xv1+xv2+xv3 */
+ __ASM_EMIT("vhaddps %%xmm1, %%xmm1, %%xmm1") /* xmm1 = xa0+xa1+xa2+xa3 */
+ __ASM_EMIT("vhaddps %%xmm2, %%xmm2, %%xmm2") /* xmm2 = xb0+xb1+xb2+xb3 */
+ /* 1x blocks */
+ __ASM_EMIT("add $3, %[count]")
+ __ASM_EMIT("jl 8f")
+ __ASM_EMIT("7:")
+ __ASM_EMIT("vmovss 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */
+ __ASM_EMIT("vmovss 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */
+ __ASM_EMIT("vmulss %%xmm3, %%xmm5, %%xmm7") /* xmm7 = a0*b0 */
+ __ASM_EMIT("vmulss %%xmm3, %%xmm3, %%xmm3") /* xmm3 = a0*a0 */
+ __ASM_EMIT("vmulss %%xmm5, %%xmm5, %%xmm5") /* xmm5 = b0*b0 */
+ __ASM_EMIT("vaddss %%xmm7, %%xmm0, %%xmm0") /* xmm0 = xv + a0*b0 */
+ __ASM_EMIT("vaddss %%xmm3, %%xmm1, %%xmm1") /* xmm1 = xa + a0*a0 */
+ __ASM_EMIT("vaddss %%xmm5, %%xmm2, %%xmm2") /* xmm2 = xb + b0*b0 */
+ __ASM_EMIT("add $0x04, %[off]") /* ++off */
+ __ASM_EMIT("dec %[count]")
+ __ASM_EMIT("jge 7b")
+ __ASM_EMIT("8:")
+ /* Store result (accumulate into existing state) */
+ __ASM_EMIT("vaddss 0x00(%[corr]), %%xmm0, %%xmm0")
+ __ASM_EMIT("vaddss 0x04(%[corr]), %%xmm1, %%xmm1")
+ __ASM_EMIT("vaddss 0x08(%[corr]), %%xmm2, %%xmm2")
+ __ASM_EMIT("vmovss %%xmm0, 0x00(%[corr])")
+ __ASM_EMIT("vmovss %%xmm1, 0x04(%[corr])")
+ __ASM_EMIT("vmovss %%xmm2, 0x08(%[corr])")
+
+ : [corr] "+r" (corr), [off] "=&r" (off), [count] "+r" (count)
+ : [a] "r" (a), [b] "r" (b)
+ : "cc", "memory",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7"
+ );
+ }
+
+ /* FMA3 variant of corr_init: same contract as avx::corr_init but uses
+ * vfmadd231 fused multiply-add to accumulate the three sums in fewer
+ * instructions. Results are added to corr->v/a/b. */
+ void corr_init_fma3(dsp::correlation_t *corr, const float *a, const float *b, size_t count)
+ {
+ IF_ARCH_X86(
+ size_t off;
+ );
+
+ ARCH_X86_ASM
+ (
+ __ASM_EMIT("xor %[off], %[off]")
+ __ASM_EMIT("vxorps %%ymm0, %%ymm0, %%ymm0") /* xv = 0 */
+ __ASM_EMIT("vxorps %%ymm1, %%ymm1, %%ymm1") /* xa = 0 */
+ __ASM_EMIT("vxorps %%ymm2, %%ymm2, %%ymm2") /* xb = 0 */
+ /* 16x blocks */
+ __ASM_EMIT("sub $16, %[count]")
+ __ASM_EMIT("jb 2f")
+ __ASM_EMIT("1:")
+ __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%ymm3") /* ymm3 = a0 */
+ __ASM_EMIT("vmovups 0x20(%[a], %[off]), %%ymm4") /* ymm4 = a1 */
+ __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%ymm5") /* ymm5 = b0 */
+ __ASM_EMIT("vmovups 0x20(%[b], %[off]), %%ymm6") /* ymm6 = b1 */
+ __ASM_EMIT("vfmadd231ps %%ymm3, %%ymm5, %%ymm0") /* ymm0 = xv + a0*b0 */
+ __ASM_EMIT("vfmadd231ps %%ymm3, %%ymm3, %%ymm1") /* ymm1 = xa + a0*a0 */
+ __ASM_EMIT("vfmadd231ps %%ymm5, %%ymm5, %%ymm2") /* ymm2 = xb + b0*b0 */
+ __ASM_EMIT("vfmadd231ps %%ymm4, %%ymm6, %%ymm0") /* ymm0 = xv + a1*b1 */
+ __ASM_EMIT("vfmadd231ps %%ymm4, %%ymm4, %%ymm1") /* ymm1 = xa + a1*a1 */
+ __ASM_EMIT("vfmadd231ps %%ymm6, %%ymm6, %%ymm2") /* ymm2 = xb + b1*b1 */
+ __ASM_EMIT("add $0x40, %[off]") /* ++off */
+ __ASM_EMIT("sub $16, %[count]")
+ __ASM_EMIT("jae 1b")
+ /* Fold upper 128-bit lanes into the lower ones */
+ __ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm4")
+ __ASM_EMIT("vextractf128 $1, %%ymm1, %%xmm5")
+ __ASM_EMIT("vextractf128 $1, %%ymm2, %%xmm6")
+ __ASM_EMIT("vaddps %%xmm4, %%xmm0, %%xmm0")
+ __ASM_EMIT("vaddps %%xmm5, %%xmm1, %%xmm1")
+ __ASM_EMIT("vaddps %%xmm6, %%xmm2, %%xmm2")
+ __ASM_EMIT("2:")
+ /* 8x block */
+ __ASM_EMIT("add $8, %[count]")
+ __ASM_EMIT("jl 4f")
+ __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */
+ __ASM_EMIT("vmovups 0x10(%[a], %[off]), %%xmm4") /* xmm4 = a1 */
+ __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */
+ __ASM_EMIT("vmovups 0x10(%[b], %[off]), %%xmm6") /* xmm6 = b1 */
+ __ASM_EMIT("vfmadd231ps %%xmm3, %%xmm5, %%xmm0") /* xmm0 = xv + a0*b0 */
+ __ASM_EMIT("vfmadd231ps %%xmm3, %%xmm3, %%xmm1") /* xmm1 = xa + a0*a0 */
+ __ASM_EMIT("vfmadd231ps %%xmm5, %%xmm5, %%xmm2") /* xmm2 = xb + b0*b0 */
+ __ASM_EMIT("vfmadd231ps %%xmm4, %%xmm6, %%xmm0") /* xmm0 = xv + a1*b1 */
+ __ASM_EMIT("vfmadd231ps %%xmm4, %%xmm4, %%xmm1") /* xmm1 = xa + a1*a1 */
+ __ASM_EMIT("vfmadd231ps %%xmm6, %%xmm6, %%xmm2") /* xmm2 = xb + b1*b1 */
+ __ASM_EMIT("sub $8, %[count]")
+ __ASM_EMIT("add $0x20, %[off]") /* ++off */
+ __ASM_EMIT("4:")
+ /* 4x block */
+ __ASM_EMIT("add $4, %[count]")
+ __ASM_EMIT("jl 6f")
+ __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */
+ __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */
+ __ASM_EMIT("vfmadd231ps %%xmm3, %%xmm5, %%xmm0") /* xmm0 = xv + a0*b0 */
+ __ASM_EMIT("vfmadd231ps %%xmm3, %%xmm3, %%xmm1") /* xmm1 = xa + a0*a0 */
+ __ASM_EMIT("vfmadd231ps %%xmm5, %%xmm5, %%xmm2") /* xmm2 = xb + b0*b0 */
+ __ASM_EMIT("sub $4, %[count]")
+ __ASM_EMIT("add $0x10, %[off]") /* ++off */
+ __ASM_EMIT("6:")
+ /* Do horizontal sum */
+ __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0") /* xmm0 = xv0+xv1 xv2+xv3 xv0+xv1 xv2+xv3 */
+ __ASM_EMIT("vhaddps %%xmm1, %%xmm1, %%xmm1") /* xmm1 = xa0+xa1 xa2+xa3 xa0+xa1 xa2+xa3 */
+ __ASM_EMIT("vhaddps %%xmm2, %%xmm2, %%xmm2") /* xmm2 = xb0+xb1 xb2+xb3 xb0+xb1 xb2+xb3 */
+ __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0") /* xmm0 = xv0+xv1+xv2+xv3 */
+ __ASM_EMIT("vhaddps %%xmm1, %%xmm1, %%xmm1") /* xmm1 = xa0+xa1+xa2+xa3 */
+ __ASM_EMIT("vhaddps %%xmm2, %%xmm2, %%xmm2") /* xmm2 = xb0+xb1+xb2+xb3 */
+ /* 1x blocks */
+ __ASM_EMIT("add $3, %[count]")
+ __ASM_EMIT("jl 8f")
+ __ASM_EMIT("7:")
+ __ASM_EMIT("vmovss 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */
+ __ASM_EMIT("vmovss 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */
+ __ASM_EMIT("vfmadd231ss %%xmm3, %%xmm5, %%xmm0") /* xmm0 = xv + a0*b0 */
+ __ASM_EMIT("vfmadd231ss %%xmm3, %%xmm3, %%xmm1") /* xmm1 = xa + a0*a0 */
+ __ASM_EMIT("vfmadd231ss %%xmm5, %%xmm5, %%xmm2") /* xmm2 = xb + b0*b0 */
+ __ASM_EMIT("add $0x04, %[off]") /* ++off */
+ __ASM_EMIT("dec %[count]")
+ __ASM_EMIT("jge 7b")
+ __ASM_EMIT("8:")
+ /* Store result (accumulate into existing state) */
+ __ASM_EMIT("vaddss 0x00(%[corr]), %%xmm0, %%xmm0")
+ __ASM_EMIT("vaddss 0x04(%[corr]), %%xmm1, %%xmm1")
+ __ASM_EMIT("vaddss 0x08(%[corr]), %%xmm2, %%xmm2")
+ __ASM_EMIT("vmovss %%xmm0, 0x00(%[corr])")
+ __ASM_EMIT("vmovss %%xmm1, 0x04(%[corr])")
+ __ASM_EMIT("vmovss %%xmm2, 0x08(%[corr])")
+
+ : [corr] "+r" (corr), [off] "=&r" (off), [count] "+r" (count)
+ : [a] "r" (a), [b] "r" (b)
+ : "cc", "memory",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7"
+ );
+ }
+
+ /* Denominator threshold replicated across a full YMM register:
+ * products below 1e-10 are treated as zero correlation to avoid
+ * division by (near-)zero in corr_incr. */
+ static const float corr_const[] __lsp_aligned32 =
+ {
+ LSP_DSP_VEC8(1e-10f)
+ };
+
+ void corr_incr(dsp::correlation_t *corr, float *dst,
+ const float *a_head, const float *b_head,
+ const float *a_tail, const float *b_tail,
+ size_t count)
+ {
+ IF_ARCH_I386(
+ void *ptr;
+ );
+
+ ARCH_X86_ASM
+ (
+ /* 8x blocks */
+ __ASM_EMIT32("subl $8, %[count]")
+ __ASM_EMIT64("sub $8, %[count]")
+ __ASM_EMIT("jb 2f")
+ __ASM_EMIT("1:")
+ __ASM_EMIT("vmovups 0x00(%[a_head]), %%ymm0") /* ymm0 = ah */
+ __ASM_EMIT("vmovups 0x00(%[b_head]), %%ymm1") /* ymm1 = bh */
+ __ASM_EMIT("vmovups 0x00(%[a_tail]), %%ymm3") /* ymm3 = at */
+ __ASM_EMIT("vmovups 0x00(%[b_tail]), %%ymm4") /* ymm4 = bt */
+ __ASM_EMIT("vmulps %%ymm1, %%ymm0, %%ymm2") /* ymm2 = ah*bh */
+ __ASM_EMIT("vmulps %%ymm4, %%ymm3, %%ymm5") /* ymm5 = at*bt */
+ __ASM_EMIT("vmulps %%ymm0, %%ymm0, %%ymm0") /* ymm0 = ah*ah */
+ __ASM_EMIT("vmulps %%ymm1, %%ymm1, %%ymm1") /* ymm1 = bh*bh */
+ __ASM_EMIT("vmulps %%ymm3, %%ymm3, %%ymm3") /* ymm3 = at*at */
+ __ASM_EMIT("vmulps %%ymm4, %%ymm4, %%ymm4") /* ymm4 = bt*bt */
+ __ASM_EMIT("vxorps %%ymm6, %%ymm6, %%ymm6") /* ymm6 = 0 0 0 0 0 0 0 0 */
+ __ASM_EMIT("vsubps %%ymm5, %%ymm2, %%ymm2") /* ymm2 = DV = ah*bh - at*bt */
+ __ASM_EMIT("vsubps %%ymm3, %%ymm0, %%ymm0") /* ymm0 = DA = ah*ah - at*at */
+ __ASM_EMIT("vsubps %%ymm4, %%ymm1, %%ymm1") /* ymm1 = DB = bh*bh - bt*bt */
+
+ __ASM_EMIT("vshufps $0x44, %%ymm0, %%ymm6, %%ymm3") /* ymm3 = 0 0 DA[0] DA[1] 0 0 DA[4] DA[5] */
+ __ASM_EMIT("vshufps $0x44, %%ymm1, %%ymm6, %%ymm4") /* ymm4 = 0 0 DB[0] DB[1] 0 0 DB[4] DB[5] */
+ __ASM_EMIT("vshufps $0x44, %%ymm2, %%ymm6, %%ymm5") /* ymm5 = 0 0 DV[0] DV[1] 0 0 DV[4] DV[5] */
+ __ASM_EMIT("vaddps %%ymm3, %%ymm0, %%ymm0") /* ymm0 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] DA[4] DA[5] DA[4]+DA[6] DA[5]+DA[7] */
+ __ASM_EMIT("vaddps %%ymm4, %%ymm1, %%ymm1") /* ymm1 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] DB[4] DB[5] DB[4]+DB[6] DB[5]+DB[7] */
+ __ASM_EMIT("vaddps %%ymm5, %%ymm2, %%ymm2") /* ymm2 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] DV[4] DV[5] DV[4]+DV[6] DV[5]+DV[7] */
+ __ASM_EMIT("vshufps $0x99, %%ymm0, %%ymm3, %%ymm3") /* ymm3 = 0 DA[0] DA[1] DA[0]+DA[2] 0 DA[4] DA[5] DA[4]+DA[6] */
+ __ASM_EMIT("vshufps $0x99, %%ymm1, %%ymm4, %%ymm4") /* ymm4 = 0 DB[0] DB[1] DB[0]+DB[2] 0 DB[4] DB[5] DB[4]+DB[6] */
+ __ASM_EMIT("vshufps $0x99, %%ymm2, %%ymm5, %%ymm5") /* ymm5 = 0 DV[0] DV[1] DV[0]+DV[2] 0 DV[4] DV[5] DV[4]+DV[6] */
+ __ASM_EMIT("vaddps %%ymm0, %%ymm3, %%ymm3") /* ymm3 = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] DA[4] DA[4]+DA[5] DA[4]+DA[5]+DA[6] DA[4]+DA[5]+DA[6]+DA[7] */
+ __ASM_EMIT("vaddps %%ymm1, %%ymm4, %%ymm4") /* ymm4 = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] DB[4] DB[4]+DB[5] DB[4]+DB[5]+DB[6] DB[4]+DB[5]+DB[6]+DB[7] */
+ __ASM_EMIT("vaddps %%ymm2, %%ymm5, %%ymm5") /* ymm5 = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] DV[4] DV[4]+DV[5] DV[4]+DV[5]+DV[6] DV[4]+DV[5]+DV[6]+DV[7] */
+ __ASM_EMIT("vshufps $0xff, %%xmm3, %%xmm3, %%xmm0") /* xmm0 = DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3] */
+ __ASM_EMIT("vshufps $0xff, %%xmm4, %%xmm4, %%xmm1") /* xmm1 = DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3] */
+ __ASM_EMIT("vshufps $0xff, %%xmm5, %%xmm5, %%xmm2") /* xmm2 = DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3] */
+ __ASM_EMIT("vextractf128 $1, %%ymm3, %%xmm6") /* xmm6 = DA[4] DA[4]+DA[5] DA[4]+DA[5]+DA[6] DA[4]+DA[5]+DA[6]+DA[7] */
+ __ASM_EMIT("vextractf128 $1, %%ymm4, %%xmm7") /* xmm7 = DB[4] DB[4]+DB[5] DB[4]+DB[5]+DB[6] DB[4]+DB[5]+DB[6]+DB[7] */
+ __ASM_EMIT("vaddps %%xmm0, %%xmm6, %%xmm6") /* xmm6 = DA[0]+DA[1]+DA[2]+DA[3]+DA[4] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6]+DA[7] */
+ __ASM_EMIT("vaddps %%xmm1, %%xmm7, %%xmm7") /* xmm7 = DB[0]+DB[1]+DB[2]+DB[3]+DB[4] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6]+DB[7] */
+ __ASM_EMIT("vextractf128 $1, %%ymm5, %%xmm0") /* xmm0 = DV[4] DV[4]+DV[5] DV[4]+DV[5]+DV[6] DV[4]+DV[5]+DV[6]+DV[7] */
+ __ASM_EMIT("vinsertf128 $1, %%xmm6, %%ymm3, %%ymm3") /* ymm3 = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3]+DA[4] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6]+DA[7] */
+ __ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0") /* xmm0 = DV[0]+DV[1]+DV[2]+DV[3]+DV[4] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6]+DV[7] */
+ __ASM_EMIT("vinsertf128 $1, %%xmm7, %%ymm4, %%ymm4") /* ymm4 = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3]+DB[4] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6]+DB[7] */
+ __ASM_EMIT("vinsertf128 $1, %%xmm0, %%ymm5, %%ymm5") /* ymm5 = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3]+DV[4] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6]+DV[7] */
+
+ __ASM_EMIT32("mov %[corr], %[ptr]")
+ __ASM_EMIT32("vbroadcastss 0x00(%[ptr]), %%ymm0")
+ __ASM_EMIT32("vbroadcastss 0x04(%[ptr]), %%ymm1")
+ __ASM_EMIT32("vbroadcastss 0x08(%[ptr]), %%ymm2")
+ __ASM_EMIT64("vbroadcastss 0x00(%[corr]), %%ymm0")
+ __ASM_EMIT64("vbroadcastss 0x04(%[corr]), %%ymm1")
+ __ASM_EMIT64("vbroadcastss 0x08(%[corr]), %%ymm2")
+
+ __ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1") /* ymm1 = BA = xa+DA[0] xa+DA[0]+DA[1] xa+DA[0]+DA[1]+DA[2] xa+DA[0]+DA[1]+DA[2]+DA[3] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6]+DA[7] */
+ __ASM_EMIT("vaddps %%ymm4, %%ymm2, %%ymm2") /* ymm2 = BB = xb+DB[0] xb+DB[0]+DB[1] xb+DB[0]+DB[1]+DB[2] xb+DB[0]+DB[1]+DB[2]+DB[3] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6]+DB[7] */
+ __ASM_EMIT("vaddps %%ymm5, %%ymm0, %%ymm0") /* ymm0 = T = xv+DV[0] xv+DV[0]+DV[1] xv+DV[0]+DV[1]+DV[2] xv+DV[0]+DV[1]+DV[2]+DV[3] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6]+DV[7] */
+ __ASM_EMIT("vmulps %%ymm2, %%ymm1, %%ymm7") /* ymm7 = B = BA*BB */
+ __ASM_EMIT("vextractf128 $1, %%ymm1, %%xmm4") /* xmm4 = BA[4] BA[5] BA[6] BA[7] */
+ __ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm3") /* xmm3 = T[4] T[5] T[6] T[7] */
+ __ASM_EMIT("vextractf128 $1, %%ymm2, %%xmm5") /* xmm5 = BB[4] BB[5] BB[6] BB[7] */
+ __ASM_EMIT("vshufps $0xff, %%xmm3, %%xmm3, %%xmm3") /* xmm3 = T[7] T[7] T[7] T[7] */
+ __ASM_EMIT("vshufps $0xff, %%xmm4, %%xmm4, %%xmm4") /* xmm4 = BA[7] BA[7] BA[7] BA[7] */
+ __ASM_EMIT("vshufps $0xff, %%xmm5, %%xmm5, %%xmm5") /* xmm5 = BB[7] BB[7] BB[7] BB[7] */
+
+ __ASM_EMIT32("vmovss %%xmm3, 0x00(%[ptr])")
+ __ASM_EMIT32("vmovss %%xmm4, 0x04(%[ptr])")
+ __ASM_EMIT32("vmovss %%xmm5, 0x08(%[ptr])")
+ __ASM_EMIT64("vmovss %%xmm3, 0x00(%[corr])")
+ __ASM_EMIT64("vmovss %%xmm4, 0x04(%[corr])")
+ __ASM_EMIT64("vmovss %%xmm5, 0x08(%[corr])")
+
+ __ASM_EMIT("vsqrtps %%ymm7, %%ymm6") /* ymm6 = sqrtf(B) */
+ __ASM_EMIT("vcmpps $5, %[CORR_CC], %%ymm7, %%ymm1")/* ymm1 = B >= 1e-10f */
+ __ASM_EMIT("vdivps %%ymm6, %%ymm0, %%ymm0") /* ymm0 = T/sqrtf(B) */
+ __ASM_EMIT32("mov %[dst], %[ptr]")
+ __ASM_EMIT("vandps %%ymm1, %%ymm0, %%ymm0") /* ymm0 = (B >= 1e-10f) ? T/sqrtf(B) : 0 */
+ __ASM_EMIT("add $0x20, %[a_head]") /* ++a_head */
+ __ASM_EMIT("add $0x20, %[b_head]") /* ++b_head */
+ __ASM_EMIT32("vmovups %%ymm0, 0x00(%[ptr])")
+ __ASM_EMIT64("vmovups %%ymm0, 0x00(%[dst])")
+ __ASM_EMIT("add $0x20, %[a_tail]") /* ++a_tail */
+ __ASM_EMIT("add $0x20, %[b_tail]") /* ++b_tail */
+ __ASM_EMIT32("add $0x20, %[ptr]")
+ __ASM_EMIT64("add $0x20, %[dst]")
+ __ASM_EMIT32("mov %[ptr], %[dst]")
+ __ASM_EMIT32("subl $8, %[count]")
+ __ASM_EMIT64("sub $8, %[count]")
+ __ASM_EMIT("jae 1b")
+ __ASM_EMIT("2:")
+ /* 4x block */
+ __ASM_EMIT("add $4, %[count]")
+ __ASM_EMIT("jl 4f")
+ __ASM_EMIT("vmovups 0x00(%[a_head]), %%xmm0") /* xmm0 = ah */
+ __ASM_EMIT("vmovups 0x00(%[b_head]), %%xmm1") /* xmm1 = bh */
+ __ASM_EMIT("vmovups 0x00(%[a_tail]), %%xmm3") /* xmm3 = at */
+ __ASM_EMIT("vmovups 0x00(%[b_tail]), %%xmm4") /* xmm4 = bt */
+ __ASM_EMIT("vmulps %%xmm1, %%xmm0, %%xmm2") /* xmm2 = ah*bh */
+ __ASM_EMIT("vmulps %%xmm4, %%xmm3, %%xmm5") /* xmm5 = at*bt */
+ __ASM_EMIT("vmulps %%xmm0, %%xmm0, %%xmm0") /* xmm0 = ah*ah */
+ __ASM_EMIT("vmulps %%xmm1, %%xmm1, %%xmm1") /* xmm1 = bh*bh */
+ __ASM_EMIT("vmulps %%xmm3, %%xmm3, %%xmm3") /* xmm3 = at*at */
+ __ASM_EMIT("vmulps %%xmm4, %%xmm4, %%xmm4") /* xmm4 = bt*bt */
+ __ASM_EMIT("vxorps %%xmm6, %%xmm6, %%xmm6") /* xmm6 = 0 0 0 0 0 0 0 0 */
+ __ASM_EMIT("vsubps %%xmm5, %%xmm2, %%xmm2") /* xmm2 = DV = ah*bh - at*bt */
+ __ASM_EMIT("vsubps %%xmm3, %%xmm0, %%xmm0") /* xmm0 = DA = ah*ah - at*at */
+ __ASM_EMIT("vsubps %%xmm4, %%xmm1, %%xmm1") /* xmm1 = DB = bh*bh - bt*bt */
+
+ __ASM_EMIT("vmovlhps %%xmm0, %%xmm6, %%xmm3") /* xmm3 = 0 0 DA[0] DA[1] 0 0 DA[4] DA[5] */
+ __ASM_EMIT("vmovlhps %%xmm1, %%xmm6, %%xmm4") /* xmm4 = 0 0 DB[0] DB[1] 0 0 DB[4] DB[5] */
+ __ASM_EMIT("vmovlhps %%xmm2, %%xmm6, %%xmm5") /* xmm5 = 0 0 DV[0] DV[1] 0 0 DV[4] DV[5] */
+ __ASM_EMIT("vaddps %%xmm3, %%xmm0, %%xmm0") /* xmm0 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] DA[4] DA[5] DA[4]+DA[6] DA[5]+DA[7] */
+ __ASM_EMIT("vaddps %%xmm4, %%xmm1, %%xmm1") /* xmm1 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] DB[4] DB[5] DB[4]+DB[6] DB[5]+DB[7] */
+ __ASM_EMIT("vaddps %%xmm5, %%xmm2, %%xmm2") /* xmm2 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] DV[4] DV[5] DV[4]+DV[6] DV[5]+DV[7] */
+ __ASM_EMIT("vshufps $0x99, %%xmm0, %%xmm3, %%xmm3") /* xmm3 = 0 DA[0] DA[1] DA[0]+DA[2] 0 DA[4] DA[5] DA[4]+DA[6] */
+ __ASM_EMIT("vshufps $0x99, %%xmm1, %%xmm4, %%xmm4") /* xmm4 = 0 DB[0] DB[1] DB[0]+DB[2] 0 DB[4] DB[5] DB[4]+DB[6] */
+ __ASM_EMIT("vshufps $0x99, %%xmm2, %%xmm5, %%xmm5") /* xmm5 = 0 DV[0] DV[1] DV[0]+DV[2] 0 DV[4] DV[5] DV[4]+DV[6] */
+ __ASM_EMIT("vaddps %%xmm0, %%xmm3, %%xmm3") /* xmm3 = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] DA[4] DA[4]+DA[5] DA[4]+DA[5]+DA[6] DA[4]+DA[5]+DA[6]+DA[7] */
+ __ASM_EMIT("vaddps %%xmm1, %%xmm4, %%xmm4") /* xmm4 = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] DB[4] DB[4]+DB[5] DB[4]+DB[5]+DB[6] DB[4]+DB[5]+DB[6]+DB[7] */
+ __ASM_EMIT("vaddps %%xmm2, %%xmm5, %%xmm5") /* xmm5 = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] DV[4] DV[4]+DV[5] DV[4]+DV[5]+DV[6] DV[4]+DV[5]+DV[6]+DV[7] */
+
+ __ASM_EMIT32("mov %[corr], %[ptr]")
+ __ASM_EMIT32("vbroadcastss 0x00(%[ptr]), %%xmm0")
+ __ASM_EMIT32("vbroadcastss 0x04(%[ptr]), %%xmm1")
+ __ASM_EMIT32("vbroadcastss 0x08(%[ptr]), %%xmm2")
+ __ASM_EMIT64("vbroadcastss 0x00(%[corr]), %%xmm0")
+ __ASM_EMIT64("vbroadcastss 0x04(%[corr]), %%xmm1")
+ __ASM_EMIT64("vbroadcastss 0x08(%[corr]), %%xmm2")
+
+ __ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1") /* xmm1 = BA = xa+DA[0] xa+DA[0]+DA[1] xa+DA[0]+DA[1]+DA[2] xa+DA[0]+DA[1]+DA[2]+DA[3] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6]+DA[7] */
+ __ASM_EMIT("vaddps %%xmm4, %%xmm2, %%xmm2") /* xmm2 = BB = xb+DB[0] xb+DB[0]+DB[1] xb+DB[0]+DB[1]+DB[2] xb+DB[0]+DB[1]+DB[2]+DB[3] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6]+DB[7] */
+ __ASM_EMIT("vaddps %%xmm5, %%xmm0, %%xmm0") /* xmm0 = T = xv+DV[0] xv+DV[0]+DV[1] xv+DV[0]+DV[1]+DV[2] xv+DV[0]+DV[1]+DV[2]+DV[3] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6]+DV[7] */
+ __ASM_EMIT("vmulps %%xmm2, %%xmm1, %%xmm7") /* xmm7 = B = BA*BB */
+ __ASM_EMIT("vshufps $0xff, %%xmm0, %%xmm0, %%xmm3") /* xmm3 = T[7] T[7] T[7] T[7] */
+ __ASM_EMIT("vshufps $0xff, %%xmm1, %%xmm1, %%xmm4") /* xmm4 = BA[7] BA[7] BA[7] BA[7] */
+ __ASM_EMIT("vshufps $0xff, %%xmm2, %%xmm2, %%xmm5") /* xmm5 = BB[7] BB[7] BB[7] BB[7] */
+
+ __ASM_EMIT32("vmovss %%xmm3, 0x00(%[ptr])")
+ __ASM_EMIT32("vmovss %%xmm4, 0x04(%[ptr])")
+ __ASM_EMIT32("vmovss %%xmm5, 0x08(%[ptr])")
+ __ASM_EMIT64("vmovss %%xmm3, 0x00(%[corr])")
+ __ASM_EMIT64("vmovss %%xmm4, 0x04(%[corr])")
+ __ASM_EMIT64("vmovss %%xmm5, 0x08(%[corr])")
+
+ __ASM_EMIT("vsqrtps %%xmm7, %%xmm6") /* xmm6 = sqrtf(B) */
+ __ASM_EMIT("vcmpps $5, %[CORR_CC], %%xmm7, %%xmm1")/* xmm1 = B >= 1e-10f */
+ __ASM_EMIT("vdivps %%xmm6, %%xmm0, %%xmm0") /* xmm0 = T/sqrtf(B) */
+ __ASM_EMIT32("mov %[dst], %[ptr]")
+ __ASM_EMIT("vandps %%xmm1, %%xmm0, %%xmm0") /* xmm0 = (B >= 1e-10f) ? T/sqrtf(B) : 0 */
+ __ASM_EMIT("add $0x10, %[a_head]") /* ++a_head */
+ __ASM_EMIT("add $0x10, %[b_head]") /* ++b_head */
+ __ASM_EMIT32("vmovups %%xmm0, 0x00(%[ptr])")
+ __ASM_EMIT64("vmovups %%xmm0, 0x00(%[dst])")
+ __ASM_EMIT("add $0x10, %[a_tail]") /* ++a_tail */
+ __ASM_EMIT("add $0x10, %[b_tail]") /* ++b_tail */
+ __ASM_EMIT32("add $0x10, %[ptr]")
+ __ASM_EMIT64("add $0x10, %[dst]")
+ __ASM_EMIT32("mov %[ptr], %[dst]")
+ __ASM_EMIT32("subl $4, %[count]")
+ __ASM_EMIT64("sub $4, %[count]")
+ __ASM_EMIT("4:")
+
+ /* 1x blocks */
+ __ASM_EMIT("add $3, %[count]")
+ __ASM_EMIT("jl 6f")
+ __ASM_EMIT("5:")
+ __ASM_EMIT("vmovss 0x00(%[a_head]), %%xmm0") /* xmm0 = ah */
+ __ASM_EMIT("vmovss 0x00(%[b_head]), %%xmm1") /* xmm1 = bh */
+ __ASM_EMIT("vmovss 0x00(%[a_tail]), %%xmm3") /* xmm3 = at */
+ __ASM_EMIT("vmovss 0x00(%[b_tail]), %%xmm4") /* xmm4 = bt */
+ __ASM_EMIT("vmulss %%xmm1, %%xmm0, %%xmm2") /* xmm2 = ah*bh */
+ __ASM_EMIT("vmulss %%xmm4, %%xmm3, %%xmm5") /* xmm5 = at*bt */
+ __ASM_EMIT("vmulss %%xmm0, %%xmm0, %%xmm0") /* xmm0 = ah*ah */
+ __ASM_EMIT("vmulss %%xmm1, %%xmm1, %%xmm1") /* xmm1 = bh*bh */
+ __ASM_EMIT("vmulss %%xmm3, %%xmm3, %%xmm3") /* xmm3 = at*at */
+ __ASM_EMIT("vmulss %%xmm4, %%xmm4, %%xmm4") /* xmm4 = bt*bt */
+ __ASM_EMIT("vsubss %%xmm5, %%xmm2, %%xmm5") /* xmm5 = DV = ah*bh - at*bt */
+ __ASM_EMIT("vsubss %%xmm3, %%xmm0, %%xmm3") /* xmm3 = DA = ah*ah - at*at */
+ __ASM_EMIT("vsubss %%xmm4, %%xmm1, %%xmm4") /* xmm4 = DB = bh*bh - bt*bt */
+
+ __ASM_EMIT32("mov %[corr], %[ptr]")
+ __ASM_EMIT32("vmovss 0x00(%[ptr]), %%xmm0")
+ __ASM_EMIT32("vmovss 0x04(%[ptr]), %%xmm1")
+ __ASM_EMIT32("vmovss 0x08(%[ptr]), %%xmm2")
+ __ASM_EMIT64("vmovss 0x00(%[corr]), %%xmm0")
+ __ASM_EMIT64("vmovss 0x04(%[corr]), %%xmm1")
+ __ASM_EMIT64("vmovss 0x08(%[corr]), %%xmm2")
+
+ __ASM_EMIT("vaddss %%xmm3, %%xmm1, %%xmm1") /* xmm1 = BA = xa+DA */
+ __ASM_EMIT("vaddss %%xmm4, %%xmm2, %%xmm2") /* xmm2 = BB = xb+DB */
+ __ASM_EMIT("vaddss %%xmm5, %%xmm0, %%xmm0") /* xmm0 = T = xv+DV */
+ __ASM_EMIT("vmulss %%xmm2, %%xmm1, %%xmm7") /* xmm7 = B = BA*BB */
+
+ __ASM_EMIT32("vmovss %%xmm0, 0x00(%[ptr])")
+ __ASM_EMIT32("vmovss %%xmm1, 0x04(%[ptr])")
+ __ASM_EMIT32("vmovss %%xmm2, 0x08(%[ptr])")
+ __ASM_EMIT64("vmovss %%xmm0, 0x00(%[corr])")
+ __ASM_EMIT64("vmovss %%xmm1, 0x04(%[corr])")
+ __ASM_EMIT64("vmovss %%xmm2, 0x08(%[corr])")
+
+ __ASM_EMIT("vsqrtss %%xmm7, %%xmm7, %%xmm6") /* xmm6 = sqrtf(B) */
+ __ASM_EMIT("vcmpss $5, %[CORR_CC], %%xmm7, %%xmm1")/* xmm1 = B >= 1e-10f */
+ __ASM_EMIT("vdivss %%xmm6, %%xmm0, %%xmm0") /* xmm0 = T/sqrtf(B) */
+ __ASM_EMIT32("mov %[dst], %[ptr]")
+ __ASM_EMIT("vandps %%xmm1, %%xmm0, %%xmm0") /* xmm0 = (B >= 1e-10f) ? T/sqrtf(B) : 0 */
+ __ASM_EMIT("add $0x04, %[a_head]") /* ++a_head */
+ __ASM_EMIT("add $0x04, %[b_head]") /* ++b_head */
+ __ASM_EMIT32("vmovss %%xmm0, 0x00(%[ptr])")
+ __ASM_EMIT64("vmovss %%xmm0, 0x00(%[dst])")
+ __ASM_EMIT("add $0x04, %[a_tail]") /* ++a_tail */
+ __ASM_EMIT("add $0x04, %[b_tail]") /* ++b_tail */
+ __ASM_EMIT32("add $0x04, %[ptr]")
+ __ASM_EMIT64("add $0x04, %[dst]")
+ __ASM_EMIT32("mov %[ptr], %[dst]")
+ __ASM_EMIT32("decl %[count]")
+ __ASM_EMIT64("dec %[count]")
+ __ASM_EMIT("jge 5b")
+ __ASM_EMIT("6:")
+
+ : __IF_32(
+ [ptr] "=&r" (ptr),
+ [corr] "+m" (corr), [dst] "+m" (dst),
+ [a_head] "+r" (a_head), [b_head] "+r" (b_head),
+ [a_tail] "+r" (a_tail), [b_tail] "+r" (b_tail),
+ [count] "+g" (count)
+ )
+ __IF_64(
+ [dst] "+r" (dst),
+ [a_head] "+r" (a_head), [b_head] "+r" (b_head),
+ [a_tail] "+r" (a_tail), [b_tail] "+r" (b_tail),
+ [count] "+r" (count)
+ )
+ : __IF_64( [corr] "r" (corr), )
+ [CORR_CC] "o" (corr_const)
+ : "cc", "memory",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7"
+ );
+ }
+
+ void corr_incr_fma3(dsp::correlation_t *corr, float *dst,
+ const float *a_head, const float *b_head,
+ const float *a_tail, const float *b_tail,
+ size_t count)
+ { /* Incremental sliding-window correlation, FMA3 variant: per step, head samples are added to and tail samples subtracted from the running sums (xv, xa, xb at offsets 0x00/0x04/0x08 of corr), and dst[i] = T/sqrtf(BA*BB), forced to 0 when BA*BB < 1e-10f. */
+ IF_ARCH_I386(
+ void *ptr;
+ );
+
+ ARCH_X86_ASM
+ (
+ /* 8x blocks */
+ __ASM_EMIT32("subl $8, %[count]")
+ __ASM_EMIT64("sub $8, %[count]")
+ __ASM_EMIT("jb 2f")
+ __ASM_EMIT("1:")
+ __ASM_EMIT("vmovups 0x00(%[a_head]), %%ymm0") /* ymm0 = ah */
+ __ASM_EMIT("vmovups 0x00(%[b_head]), %%ymm1") /* ymm1 = bh */
+ __ASM_EMIT("vmovups 0x00(%[a_tail]), %%ymm3") /* ymm3 = at */
+ __ASM_EMIT("vmovups 0x00(%[b_tail]), %%ymm4") /* ymm4 = bt */
+ __ASM_EMIT("vmulps %%ymm1, %%ymm0, %%ymm2") /* ymm2 = ah*bh */
+ __ASM_EMIT("vmulps %%ymm0, %%ymm0, %%ymm0") /* ymm0 = ah*ah */
+ __ASM_EMIT("vmulps %%ymm1, %%ymm1, %%ymm1") /* ymm1 = bh*bh */
+ __ASM_EMIT("vxorps %%ymm6, %%ymm6, %%ymm6") /* ymm6 = 0 0 0 0 0 0 0 0 */
+ __ASM_EMIT("vfnmadd231ps %%ymm4, %%ymm3, %%ymm2") /* ymm2 = DV = ah*bh - at*bt */
+ __ASM_EMIT("vfnmadd231ps %%ymm3, %%ymm3, %%ymm0") /* ymm0 = DA = ah*ah - at*at */
+ __ASM_EMIT("vfnmadd231ps %%ymm4, %%ymm4, %%ymm1") /* ymm1 = DB = bh*bh - bt*bt */
+
+ __ASM_EMIT("vshufps $0x44, %%ymm0, %%ymm6, %%ymm3") /* ymm3 = 0 0 DA[0] DA[1] 0 0 DA[4] DA[5] */
+ __ASM_EMIT("vshufps $0x44, %%ymm1, %%ymm6, %%ymm4") /* ymm4 = 0 0 DB[0] DB[1] 0 0 DB[4] DB[5] */
+ __ASM_EMIT("vshufps $0x44, %%ymm2, %%ymm6, %%ymm5") /* ymm5 = 0 0 DV[0] DV[1] 0 0 DV[4] DV[5] */
+ __ASM_EMIT("vaddps %%ymm3, %%ymm0, %%ymm0") /* ymm0 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] DA[4] DA[5] DA[4]+DA[6] DA[5]+DA[7] */
+ __ASM_EMIT("vaddps %%ymm4, %%ymm1, %%ymm1") /* ymm1 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] DB[4] DB[5] DB[4]+DB[6] DB[5]+DB[7] */
+ __ASM_EMIT("vaddps %%ymm5, %%ymm2, %%ymm2") /* ymm2 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] DV[4] DV[5] DV[4]+DV[6] DV[5]+DV[7] */
+ __ASM_EMIT("vshufps $0x99, %%ymm0, %%ymm3, %%ymm3") /* ymm3 = 0 DA[0] DA[1] DA[0]+DA[2] 0 DA[4] DA[5] DA[4]+DA[6] */
+ __ASM_EMIT("vshufps $0x99, %%ymm1, %%ymm4, %%ymm4") /* ymm4 = 0 DB[0] DB[1] DB[0]+DB[2] 0 DB[4] DB[5] DB[4]+DB[6] */
+ __ASM_EMIT("vshufps $0x99, %%ymm2, %%ymm5, %%ymm5") /* ymm5 = 0 DV[0] DV[1] DV[0]+DV[2] 0 DV[4] DV[5] DV[4]+DV[6] */
+ __ASM_EMIT("vaddps %%ymm0, %%ymm3, %%ymm3") /* ymm3 = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] DA[4] DA[4]+DA[5] DA[4]+DA[5]+DA[6] DA[4]+DA[5]+DA[6]+DA[7] */
+ __ASM_EMIT("vaddps %%ymm1, %%ymm4, %%ymm4") /* ymm4 = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] DB[4] DB[4]+DB[5] DB[4]+DB[5]+DB[6] DB[4]+DB[5]+DB[6]+DB[7] */
+ __ASM_EMIT("vaddps %%ymm2, %%ymm5, %%ymm5") /* ymm5 = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] DV[4] DV[4]+DV[5] DV[4]+DV[5]+DV[6] DV[4]+DV[5]+DV[6]+DV[7] */
+ __ASM_EMIT("vshufps $0xff, %%xmm3, %%xmm3, %%xmm0") /* xmm0 = DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3] */
+ __ASM_EMIT("vshufps $0xff, %%xmm4, %%xmm4, %%xmm1") /* xmm1 = DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3] */
+ __ASM_EMIT("vshufps $0xff, %%xmm5, %%xmm5, %%xmm2") /* xmm2 = DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3] */
+ __ASM_EMIT("vextractf128 $1, %%ymm3, %%xmm6") /* xmm6 = DA[4] DA[4]+DA[5] DA[4]+DA[5]+DA[6] DA[4]+DA[5]+DA[6]+DA[7] */
+ __ASM_EMIT("vextractf128 $1, %%ymm4, %%xmm7") /* xmm7 = DB[4] DB[4]+DB[5] DB[4]+DB[5]+DB[6] DB[4]+DB[5]+DB[6]+DB[7] */
+ __ASM_EMIT("vaddps %%xmm0, %%xmm6, %%xmm6") /* xmm6 = DA[0]+DA[1]+DA[2]+DA[3]+DA[4] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6]+DA[7] */
+ __ASM_EMIT("vaddps %%xmm1, %%xmm7, %%xmm7") /* xmm7 = DB[0]+DB[1]+DB[2]+DB[3]+DB[4] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6]+DB[7] */
+ __ASM_EMIT("vextractf128 $1, %%ymm5, %%xmm0") /* xmm0 = DV[4] DV[4]+DV[5] DV[4]+DV[5]+DV[6] DV[4]+DV[5]+DV[6]+DV[7] */
+ __ASM_EMIT("vinsertf128 $1, %%xmm6, %%ymm3, %%ymm3") /* ymm3 = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3]+DA[4] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6]+DA[7] */
+ __ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0") /* xmm0 = DV[0]+DV[1]+DV[2]+DV[3]+DV[4] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6]+DV[7] */
+ __ASM_EMIT("vinsertf128 $1, %%xmm7, %%ymm4, %%ymm4") /* ymm4 = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3]+DB[4] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6]+DB[7] */
+ __ASM_EMIT("vinsertf128 $1, %%xmm0, %%ymm5, %%ymm5") /* ymm5 = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3]+DV[4] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6]+DV[7] */
+
+ __ASM_EMIT32("mov %[corr], %[ptr]")
+ __ASM_EMIT32("vbroadcastss 0x00(%[ptr]), %%ymm0")
+ __ASM_EMIT32("vbroadcastss 0x04(%[ptr]), %%ymm1")
+ __ASM_EMIT32("vbroadcastss 0x08(%[ptr]), %%ymm2")
+ __ASM_EMIT64("vbroadcastss 0x00(%[corr]), %%ymm0")
+ __ASM_EMIT64("vbroadcastss 0x04(%[corr]), %%ymm1")
+ __ASM_EMIT64("vbroadcastss 0x08(%[corr]), %%ymm2")
+
+ __ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1") /* ymm1 = BA = xa+DA[0] xa+DA[0]+DA[1] xa+DA[0]+DA[1]+DA[2] xa+DA[0]+DA[1]+DA[2]+DA[3] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6]+DA[7] */
+ __ASM_EMIT("vaddps %%ymm4, %%ymm2, %%ymm2") /* ymm2 = BB = xb+DB[0] xb+DB[0]+DB[1] xb+DB[0]+DB[1]+DB[2] xb+DB[0]+DB[1]+DB[2]+DB[3] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6]+DB[7] */
+ __ASM_EMIT("vaddps %%ymm5, %%ymm0, %%ymm0") /* ymm0 = T = xv+DV[0] xv+DV[0]+DV[1] xv+DV[0]+DV[1]+DV[2] xv+DV[0]+DV[1]+DV[2]+DV[3] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6]+DV[7] */
+ __ASM_EMIT("vmulps %%ymm2, %%ymm1, %%ymm7") /* ymm7 = B = BA*BB */
+ __ASM_EMIT("vextractf128 $1, %%ymm1, %%xmm4") /* xmm4 = BA[4] BA[5] BA[6] BA[7] */
+ __ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm3") /* xmm3 = T[4] T[5] T[6] T[7] */
+ __ASM_EMIT("vextractf128 $1, %%ymm2, %%xmm5") /* xmm5 = BB[4] BB[5] BB[6] BB[7] */
+ __ASM_EMIT("vshufps $0xff, %%xmm3, %%xmm3, %%xmm3") /* xmm3 = T[7] T[7] T[7] T[7] */
+ __ASM_EMIT("vshufps $0xff, %%xmm4, %%xmm4, %%xmm4") /* xmm4 = BA[7] BA[7] BA[7] BA[7] */
+ __ASM_EMIT("vshufps $0xff, %%xmm5, %%xmm5, %%xmm5") /* xmm5 = BB[7] BB[7] BB[7] BB[7] */
+
+ __ASM_EMIT32("vmovss %%xmm3, 0x00(%[ptr])")
+ __ASM_EMIT32("vmovss %%xmm4, 0x04(%[ptr])")
+ __ASM_EMIT32("vmovss %%xmm5, 0x08(%[ptr])")
+ __ASM_EMIT64("vmovss %%xmm3, 0x00(%[corr])")
+ __ASM_EMIT64("vmovss %%xmm4, 0x04(%[corr])")
+ __ASM_EMIT64("vmovss %%xmm5, 0x08(%[corr])")
+
+ __ASM_EMIT("vsqrtps %%ymm7, %%ymm6") /* ymm6 = sqrtf(B) */
+ __ASM_EMIT("vcmpps $5, %[CORR_CC], %%ymm7, %%ymm1")/* ymm1 = B >= 1e-10f */
+ __ASM_EMIT("vdivps %%ymm6, %%ymm0, %%ymm0") /* ymm0 = T/sqrtf(B) */
+ __ASM_EMIT32("mov %[dst], %[ptr]")
+ __ASM_EMIT("vandps %%ymm1, %%ymm0, %%ymm0") /* ymm0 = (B >= 1e-10f) ? T/sqrtf(B) : 0 */
+ __ASM_EMIT("add $0x20, %[a_head]") /* ++a_head */
+ __ASM_EMIT("add $0x20, %[b_head]") /* ++b_head */
+ __ASM_EMIT32("vmovups %%ymm0, 0x00(%[ptr])")
+ __ASM_EMIT64("vmovups %%ymm0, 0x00(%[dst])")
+ __ASM_EMIT("add $0x20, %[a_tail]") /* ++a_tail */
+ __ASM_EMIT("add $0x20, %[b_tail]") /* ++b_tail */
+ __ASM_EMIT32("add $0x20, %[ptr]")
+ __ASM_EMIT64("add $0x20, %[dst]")
+ __ASM_EMIT32("mov %[ptr], %[dst]")
+ __ASM_EMIT32("subl $8, %[count]")
+ __ASM_EMIT64("sub $8, %[count]")
+ __ASM_EMIT("jae 1b")
+ __ASM_EMIT("2:")
+ /* 4x block */
+ __ASM_EMIT("add $4, %[count]")
+ __ASM_EMIT("jl 4f")
+ __ASM_EMIT("vmovups 0x00(%[a_head]), %%xmm0") /* xmm0 = ah */
+ __ASM_EMIT("vmovups 0x00(%[b_head]), %%xmm1") /* xmm1 = bh */
+ __ASM_EMIT("vmovups 0x00(%[a_tail]), %%xmm3") /* xmm3 = at */
+ __ASM_EMIT("vmovups 0x00(%[b_tail]), %%xmm4") /* xmm4 = bt */
+ __ASM_EMIT("vmulps %%xmm1, %%xmm0, %%xmm2") /* xmm2 = ah*bh */
+ __ASM_EMIT("vmulps %%xmm0, %%xmm0, %%xmm0") /* xmm0 = ah*ah */
+ __ASM_EMIT("vmulps %%xmm1, %%xmm1, %%xmm1") /* xmm1 = bh*bh */
+ __ASM_EMIT("vxorps %%xmm6, %%xmm6, %%xmm6") /* xmm6 = 0 0 0 0 */
+ __ASM_EMIT("vfnmadd231ps %%xmm4, %%xmm3, %%xmm2") /* xmm2 = DV = ah*bh - at*bt */
+ __ASM_EMIT("vfnmadd231ps %%xmm3, %%xmm3, %%xmm0") /* xmm0 = DA = ah*ah - at*at */
+ __ASM_EMIT("vfnmadd231ps %%xmm4, %%xmm4, %%xmm1") /* xmm1 = DB = bh*bh - bt*bt */
+
+ __ASM_EMIT("vmovlhps %%xmm0, %%xmm6, %%xmm3") /* xmm3 = 0 0 DA[0] DA[1] */
+ __ASM_EMIT("vmovlhps %%xmm1, %%xmm6, %%xmm4") /* xmm4 = 0 0 DB[0] DB[1] */
+ __ASM_EMIT("vmovlhps %%xmm2, %%xmm6, %%xmm5") /* xmm5 = 0 0 DV[0] DV[1] */
+ __ASM_EMIT("vaddps %%xmm3, %%xmm0, %%xmm0") /* xmm0 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] */
+ __ASM_EMIT("vaddps %%xmm4, %%xmm1, %%xmm1") /* xmm1 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] */
+ __ASM_EMIT("vaddps %%xmm5, %%xmm2, %%xmm2") /* xmm2 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] */
+ __ASM_EMIT("vshufps $0x99, %%xmm0, %%xmm3, %%xmm3") /* xmm3 = 0 DA[0] DA[1] DA[0]+DA[2] */
+ __ASM_EMIT("vshufps $0x99, %%xmm1, %%xmm4, %%xmm4") /* xmm4 = 0 DB[0] DB[1] DB[0]+DB[2] */
+ __ASM_EMIT("vshufps $0x99, %%xmm2, %%xmm5, %%xmm5") /* xmm5 = 0 DV[0] DV[1] DV[0]+DV[2] */
+ __ASM_EMIT("vaddps %%xmm0, %%xmm3, %%xmm3") /* xmm3 = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] */
+ __ASM_EMIT("vaddps %%xmm1, %%xmm4, %%xmm4") /* xmm4 = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] */
+ __ASM_EMIT("vaddps %%xmm2, %%xmm5, %%xmm5") /* xmm5 = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] */
+
+ __ASM_EMIT32("mov %[corr], %[ptr]")
+ __ASM_EMIT32("vbroadcastss 0x00(%[ptr]), %%xmm0")
+ __ASM_EMIT32("vbroadcastss 0x04(%[ptr]), %%xmm1")
+ __ASM_EMIT32("vbroadcastss 0x08(%[ptr]), %%xmm2")
+ __ASM_EMIT64("vbroadcastss 0x00(%[corr]), %%xmm0")
+ __ASM_EMIT64("vbroadcastss 0x04(%[corr]), %%xmm1")
+ __ASM_EMIT64("vbroadcastss 0x08(%[corr]), %%xmm2")
+
+ __ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1") /* xmm1 = BA = xa+DA[0] xa+DA[0]+DA[1] xa+DA[0]+DA[1]+DA[2] xa+DA[0]+DA[1]+DA[2]+DA[3] */
+ __ASM_EMIT("vaddps %%xmm4, %%xmm2, %%xmm2") /* xmm2 = BB = xb+DB[0] xb+DB[0]+DB[1] xb+DB[0]+DB[1]+DB[2] xb+DB[0]+DB[1]+DB[2]+DB[3] */
+ __ASM_EMIT("vaddps %%xmm5, %%xmm0, %%xmm0") /* xmm0 = T = xv+DV[0] xv+DV[0]+DV[1] xv+DV[0]+DV[1]+DV[2] xv+DV[0]+DV[1]+DV[2]+DV[3] */
+ __ASM_EMIT("vmulps %%xmm2, %%xmm1, %%xmm7") /* xmm7 = B = BA*BB */
+ __ASM_EMIT("vshufps $0xff, %%xmm0, %%xmm0, %%xmm3") /* xmm3 = T[3] T[3] T[3] T[3] */
+ __ASM_EMIT("vshufps $0xff, %%xmm1, %%xmm1, %%xmm4") /* xmm4 = BA[3] BA[3] BA[3] BA[3] */
+ __ASM_EMIT("vshufps $0xff, %%xmm2, %%xmm2, %%xmm5") /* xmm5 = BB[3] BB[3] BB[3] BB[3] */
+
+ __ASM_EMIT32("vmovss %%xmm3, 0x00(%[ptr])")
+ __ASM_EMIT32("vmovss %%xmm4, 0x04(%[ptr])")
+ __ASM_EMIT32("vmovss %%xmm5, 0x08(%[ptr])")
+ __ASM_EMIT64("vmovss %%xmm3, 0x00(%[corr])")
+ __ASM_EMIT64("vmovss %%xmm4, 0x04(%[corr])")
+ __ASM_EMIT64("vmovss %%xmm5, 0x08(%[corr])")
+
+ __ASM_EMIT("vsqrtps %%xmm7, %%xmm6") /* xmm6 = sqrtf(B) */
+ __ASM_EMIT("vcmpps $5, %[CORR_CC], %%xmm7, %%xmm1")/* xmm1 = B >= 1e-10f */
+ __ASM_EMIT("vdivps %%xmm6, %%xmm0, %%xmm0") /* xmm0 = T/sqrtf(B) */
+ __ASM_EMIT32("mov %[dst], %[ptr]")
+ __ASM_EMIT("vandps %%xmm1, %%xmm0, %%xmm0") /* xmm0 = (B >= 1e-10f) ? T/sqrtf(B) : 0 */
+ __ASM_EMIT("add $0x10, %[a_head]") /* ++a_head */
+ __ASM_EMIT("add $0x10, %[b_head]") /* ++b_head */
+ __ASM_EMIT32("vmovups %%xmm0, 0x00(%[ptr])")
+ __ASM_EMIT64("vmovups %%xmm0, 0x00(%[dst])")
+ __ASM_EMIT("add $0x10, %[a_tail]") /* ++a_tail */
+ __ASM_EMIT("add $0x10, %[b_tail]") /* ++b_tail */
+ __ASM_EMIT32("add $0x10, %[ptr]")
+ __ASM_EMIT64("add $0x10, %[dst]")
+ __ASM_EMIT32("mov %[ptr], %[dst]")
+ __ASM_EMIT32("subl $4, %[count]")
+ __ASM_EMIT64("sub $4, %[count]")
+ __ASM_EMIT("4:")
+
+ /* 1x blocks */
+ __ASM_EMIT("add $3, %[count]")
+ __ASM_EMIT("jl 6f")
+ __ASM_EMIT("5:")
+ __ASM_EMIT("vmovss 0x00(%[a_head]), %%xmm0") /* xmm0 = ah */
+ __ASM_EMIT("vmovss 0x00(%[b_head]), %%xmm1") /* xmm1 = bh */
+ __ASM_EMIT("vmovss 0x00(%[a_tail]), %%xmm3") /* xmm3 = at */
+ __ASM_EMIT("vmovss 0x00(%[b_tail]), %%xmm4") /* xmm4 = bt */
+ __ASM_EMIT("vmulss %%xmm1, %%xmm0, %%xmm2") /* xmm2 = ah*bh */
+ __ASM_EMIT("vmulss %%xmm0, %%xmm0, %%xmm0") /* xmm0 = ah*ah */
+ __ASM_EMIT("vmulss %%xmm1, %%xmm1, %%xmm1") /* xmm1 = bh*bh */
+ __ASM_EMIT("vfnmadd231ss %%xmm4, %%xmm3, %%xmm2") /* xmm2 = DV = ah*bh - at*bt */
+ __ASM_EMIT("vfnmadd231ss %%xmm3, %%xmm3, %%xmm0") /* xmm0 = DA = ah*ah - at*at */
+ __ASM_EMIT("vfnmadd231ss %%xmm4, %%xmm4, %%xmm1") /* xmm1 = DB = bh*bh - bt*bt */
+
+ __ASM_EMIT32("mov %[corr], %[ptr]")
+ __ASM_EMIT32("vmovss 0x00(%[ptr]), %%xmm3")
+ __ASM_EMIT32("vmovss 0x04(%[ptr]), %%xmm4")
+ __ASM_EMIT32("vmovss 0x08(%[ptr]), %%xmm5")
+ __ASM_EMIT64("vmovss 0x00(%[corr]), %%xmm3")
+ __ASM_EMIT64("vmovss 0x04(%[corr]), %%xmm4")
+ __ASM_EMIT64("vmovss 0x08(%[corr]), %%xmm5")
+
+ __ASM_EMIT("vaddss %%xmm3, %%xmm2, %%xmm2") /* xmm2 = T = xv+DV */
+ __ASM_EMIT("vaddss %%xmm4, %%xmm0, %%xmm0") /* xmm0 = BA = xa+DA */
+ __ASM_EMIT("vaddss %%xmm5, %%xmm1, %%xmm1") /* xmm1 = BB = xb+DB */
+ __ASM_EMIT("vmulss %%xmm1, %%xmm0, %%xmm7") /* xmm7 = B = BA*BB */
+
+ __ASM_EMIT32("vmovss %%xmm2, 0x00(%[ptr])")
+ __ASM_EMIT32("vmovss %%xmm0, 0x04(%[ptr])")
+ __ASM_EMIT32("vmovss %%xmm1, 0x08(%[ptr])")
+ __ASM_EMIT64("vmovss %%xmm2, 0x00(%[corr])")
+ __ASM_EMIT64("vmovss %%xmm0, 0x04(%[corr])")
+ __ASM_EMIT64("vmovss %%xmm1, 0x08(%[corr])")
+
+ __ASM_EMIT("vsqrtss %%xmm7, %%xmm7, %%xmm6") /* xmm6 = sqrtf(B) */
+ __ASM_EMIT("vcmpss $5, %[CORR_CC], %%xmm7, %%xmm1")/* xmm1 = B >= 1e-10f */
+ __ASM_EMIT("vdivss %%xmm6, %%xmm2, %%xmm0") /* xmm0 = T/sqrtf(B) */
+ __ASM_EMIT32("mov %[dst], %[ptr]")
+ __ASM_EMIT("vandps %%xmm1, %%xmm0, %%xmm0") /* xmm0 = (B >= 1e-10f) ? T/sqrtf(B) : 0 */
+ __ASM_EMIT("add $0x04, %[a_head]") /* ++a_head */
+ __ASM_EMIT("add $0x04, %[b_head]") /* ++b_head */
+ __ASM_EMIT32("vmovss %%xmm0, 0x00(%[ptr])")
+ __ASM_EMIT64("vmovss %%xmm0, 0x00(%[dst])")
+ __ASM_EMIT("add $0x04, %[a_tail]") /* ++a_tail */
+ __ASM_EMIT("add $0x04, %[b_tail]") /* ++b_tail */
+ __ASM_EMIT32("add $0x04, %[ptr]")
+ __ASM_EMIT64("add $0x04, %[dst]")
+ __ASM_EMIT32("mov %[ptr], %[dst]")
+ __ASM_EMIT32("decl %[count]")
+ __ASM_EMIT64("dec %[count]")
+ __ASM_EMIT("jge 5b")
+ __ASM_EMIT("6:")
+
+ : __IF_32(
+ [ptr] "=&r" (ptr),
+ [corr] "+m" (corr), [dst] "+m" (dst),
+ [a_head] "+r" (a_head), [b_head] "+r" (b_head),
+ [a_tail] "+r" (a_tail), [b_tail] "+r" (b_tail),
+ [count] "+g" (count)
+ )
+ __IF_64(
+ [dst] "+r" (dst),
+ [a_head] "+r" (a_head), [b_head] "+r" (b_head),
+ [a_tail] "+r" (a_tail), [b_tail] "+r" (b_tail),
+ [count] "+r" (count)
+ )
+ : __IF_64( [corr] "r" (corr), )
+ [CORR_CC] "o" (corr_const)
+ : "cc", "memory",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7"
+ );
+ }
+
+ } /* namespace avx */
+} /* namespace lsp */
+
+
+
+
+#endif /* PRIVATE_DSP_ARCH_X86_AVX_CORRELATION_H_ */
diff --git a/include/private/dsp/arch/x86/avx/msmatrix.h b/include/private/dsp/arch/x86/avx/msmatrix.h
index 5d158b82..bca9b985 100644
--- a/include/private/dsp/arch/x86/avx/msmatrix.h
+++ b/include/private/dsp/arch/x86/avx/msmatrix.h
@@ -1,6 +1,6 @@
/*
- * Copyright (C) 2020 Linux Studio Plugins Project
- * (C) 2020 Vladimir Sadovnikov
+ * Copyright (C) 2024 Linux Studio Plugins Project
+ * (C) 2024 Vladimir Sadovnikov
*
* This file is part of lsp-dsp-lib
* Created on: 31 мар. 2020 г.
@@ -91,8 +91,8 @@ namespace lsp
__ASM_EMIT("vmovups %%xmm0, 0x00(%[mid], %[off])")
__ASM_EMIT("vmovups %%xmm2, 0x00(%[side], %[off])")
__ASM_EMIT("add $0x10, %[off]")
- __ASM_EMIT32("subl $8, %[count]")
- __ASM_EMIT64("sub $8, %[count]")
+ __ASM_EMIT32("subl $4, %[count]")
+ __ASM_EMIT64("sub $4, %[count]")
__ASM_EMIT("6:")
// 1x blocks
__ASM_EMIT32("addl $3, %[count]")
diff --git a/include/private/dsp/arch/x86/avx512/correlation.h b/include/private/dsp/arch/x86/avx512/correlation.h
new file mode 100644
index 00000000..3321bf26
--- /dev/null
+++ b/include/private/dsp/arch/x86/avx512/correlation.h
@@ -0,0 +1,509 @@
+/*
+ * Copyright (C) 2024 Linux Studio Plugins Project
+ * (C) 2024 Vladimir Sadovnikov
+ *
+ * This file is part of lsp-dsp-lib
+ * Created on: 9 мар. 2024 г.
+ *
+ * lsp-dsp-lib is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * any later version.
+ *
+ * lsp-dsp-lib is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with lsp-dsp-lib. If not, see .
+ */
+
+#ifndef PRIVATE_DSP_ARCH_X86_AVX512_CORRELATION_H_
+#define PRIVATE_DSP_ARCH_X86_AVX512_CORRELATION_H_
+
+#ifndef PRIVATE_DSP_ARCH_X86_AVX512_IMPL
+ #error "This header should not be included directly"
+#endif /* PRIVATE_DSP_ARCH_X86_AVX512_IMPL */
+
+namespace lsp
+{
+ namespace avx512
+ {
+ void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count)
+ {
+ IF_ARCH_X86(
+ size_t off;
+ );
+
+ ARCH_X86_ASM
+ (
+ __ASM_EMIT("xor %[off], %[off]")
+ __ASM_EMIT("vxorps %%zmm0, %%zmm0, %%zmm0") /* xv = 0 */
+ __ASM_EMIT("vxorps %%zmm1, %%zmm1, %%zmm1") /* xa = 0 */
+ __ASM_EMIT("vxorps %%zmm2, %%zmm2, %%zmm2") /* xb = 0 */
+ /* 32x blocks */
+ __ASM_EMIT("sub $32, %[count]")
+ __ASM_EMIT("jb 2f")
+ __ASM_EMIT("1:")
+ __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%zmm3") /* zmm3 = a0 */
+ __ASM_EMIT("vmovups 0x40(%[a], %[off]), %%zmm4") /* zmm4 = a1 */
+ __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%zmm5") /* zmm5 = b0 */
+ __ASM_EMIT("vmovups 0x40(%[b], %[off]), %%zmm6") /* zmm6 = b1 */
+ __ASM_EMIT("vfmadd231ps %%zmm3, %%zmm5, %%zmm0") /* zmm0 = xv + a0*b0 */
+ __ASM_EMIT("vfmadd231ps %%zmm3, %%zmm3, %%zmm1") /* zmm1 = xa + a0*a0 */
+ __ASM_EMIT("vfmadd231ps %%zmm5, %%zmm5, %%zmm2") /* zmm2 = xb + b0*b0 */
+ __ASM_EMIT("vfmadd231ps %%zmm4, %%zmm6, %%zmm0") /* zmm0 = xv + a1*b1 */
+ __ASM_EMIT("vfmadd231ps %%zmm4, %%zmm4, %%zmm1") /* zmm1 = xa + a1*a1 */
+ __ASM_EMIT("vfmadd231ps %%zmm6, %%zmm6, %%zmm2") /* zmm2 = xb + b1*b1 */
+ __ASM_EMIT("add $0x80, %[off]") /* ++off */
+ __ASM_EMIT("sub $32, %[count]")
+ __ASM_EMIT("jae 1b")
+ __ASM_EMIT("vextractf64x4 $1, %%zmm0, %%ymm4")
+ __ASM_EMIT("vextractf64x4 $1, %%zmm1, %%ymm5")
+ __ASM_EMIT("vextractf64x4 $1, %%zmm2, %%ymm6")
+ __ASM_EMIT("vaddps %%ymm4, %%ymm0, %%ymm0")
+ __ASM_EMIT("vaddps %%ymm5, %%ymm1, %%ymm1")
+ __ASM_EMIT("vaddps %%ymm6, %%ymm2, %%ymm2")
+ __ASM_EMIT("2:")
+ /* 16x blocks */
+ __ASM_EMIT("add $16, %[count]")
+ __ASM_EMIT("jl 4f")
+ __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%ymm3") /* ymm3 = a0 */
+ __ASM_EMIT("vmovups 0x20(%[a], %[off]), %%ymm4") /* ymm4 = a1 */
+ __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%ymm5") /* ymm5 = b0 */
+ __ASM_EMIT("vmovups 0x20(%[b], %[off]), %%ymm6") /* ymm6 = b1 */
+ __ASM_EMIT("vfmadd231ps %%ymm3, %%ymm5, %%ymm0") /* ymm0 = xv + a0*b0 */
+ __ASM_EMIT("vfmadd231ps %%ymm3, %%ymm3, %%ymm1") /* ymm1 = xa + a0*a0 */
+ __ASM_EMIT("vfmadd231ps %%ymm5, %%ymm5, %%ymm2") /* ymm2 = xb + b0*b0 */
+ __ASM_EMIT("vfmadd231ps %%ymm4, %%ymm6, %%ymm0") /* ymm0 = xv + a1*b1 */
+ __ASM_EMIT("vfmadd231ps %%ymm4, %%ymm4, %%ymm1") /* ymm1 = xa + a1*a1 */
+ __ASM_EMIT("vfmadd231ps %%ymm6, %%ymm6, %%ymm2") /* ymm2 = xb + b1*b1 */
+ __ASM_EMIT("sub $16, %[count]")
+ __ASM_EMIT("add $0x40, %[off]") /* ++off */
+ __ASM_EMIT("4:")
+ __ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm4")
+ __ASM_EMIT("vextractf128 $1, %%ymm1, %%xmm5")
+ __ASM_EMIT("vextractf128 $1, %%ymm2, %%xmm6")
+ __ASM_EMIT("vaddps %%xmm4, %%xmm0, %%xmm0")
+ __ASM_EMIT("vaddps %%xmm5, %%xmm1, %%xmm1")
+ __ASM_EMIT("vaddps %%xmm6, %%xmm2, %%xmm2")
+ /* 8x block */
+ __ASM_EMIT("add $8, %[count]")
+ __ASM_EMIT("jl 6f")
+ __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */
+ __ASM_EMIT("vmovups 0x10(%[a], %[off]), %%xmm4") /* xmm4 = a1 */
+ __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */
+ __ASM_EMIT("vmovups 0x10(%[b], %[off]), %%xmm6") /* xmm6 = b1 */
+ __ASM_EMIT("vfmadd231ps %%xmm3, %%xmm5, %%xmm0") /* xmm0 = xv + a0*b0 */
+ __ASM_EMIT("vfmadd231ps %%xmm3, %%xmm3, %%xmm1") /* xmm1 = xa + a0*a0 */
+ __ASM_EMIT("vfmadd231ps %%xmm5, %%xmm5, %%xmm2") /* xmm2 = xb + b0*b0 */
+ __ASM_EMIT("vfmadd231ps %%xmm4, %%xmm6, %%xmm0") /* xmm0 = xv + a1*b1 */
+ __ASM_EMIT("vfmadd231ps %%xmm4, %%xmm4, %%xmm1") /* xmm1 = xa + a1*a1 */
+ __ASM_EMIT("vfmadd231ps %%xmm6, %%xmm6, %%xmm2") /* xmm2 = xb + b1*b1 */
+ __ASM_EMIT("sub $8, %[count]")
+ __ASM_EMIT("add $0x20, %[off]") /* ++off */
+ __ASM_EMIT("6:")
+ /* 4x block */
+ __ASM_EMIT("add $4, %[count]")
+ __ASM_EMIT("jl 8f")
+ __ASM_EMIT("vmovups 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */
+ __ASM_EMIT("vmovups 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */
+ __ASM_EMIT("vfmadd231ps %%xmm3, %%xmm5, %%xmm0") /* xmm0 = xv + a0*b0 */
+ __ASM_EMIT("vfmadd231ps %%xmm3, %%xmm3, %%xmm1") /* xmm1 = xa + a0*a0 */
+ __ASM_EMIT("vfmadd231ps %%xmm5, %%xmm5, %%xmm2") /* xmm2 = xb + b0*b0 */
+ __ASM_EMIT("sub $4, %[count]")
+ __ASM_EMIT("add $0x10, %[off]") /* ++off */
+ __ASM_EMIT("8:")
+ /* Do horizontal sum */
+                __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0") /* xmm0 = xv0+xv1 xv2+xv3 xv0+xv1 xv2+xv3 */
+                __ASM_EMIT("vhaddps %%xmm1, %%xmm1, %%xmm1") /* xmm1 = xa0+xa1 xa2+xa3 xa0+xa1 xa2+xa3 */
+                __ASM_EMIT("vhaddps %%xmm2, %%xmm2, %%xmm2") /* xmm2 = xb0+xb1 xb2+xb3 xb0+xb1 xb2+xb3 */
+                __ASM_EMIT("vhaddps %%xmm0, %%xmm0, %%xmm0") /* xmm0 = xv0+xv1+xv2+xv3 */
+                __ASM_EMIT("vhaddps %%xmm1, %%xmm1, %%xmm1") /* xmm1 = xa0+xa1+xa2+xa3 */
+                __ASM_EMIT("vhaddps %%xmm2, %%xmm2, %%xmm2") /* xmm2 = xb0+xb1+xb2+xb3 */
+ /* 1x blocks */
+ __ASM_EMIT("add $3, %[count]")
+ __ASM_EMIT("jl 10f")
+ __ASM_EMIT("9:")
+ __ASM_EMIT("vmovss 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */
+ __ASM_EMIT("vmovss 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */
+ __ASM_EMIT("vfmadd231ss %%xmm3, %%xmm5, %%xmm0") /* xmm0 = xv + a0*b0 */
+ __ASM_EMIT("vfmadd231ss %%xmm3, %%xmm3, %%xmm1") /* xmm1 = xa + a0*a0 */
+ __ASM_EMIT("vfmadd231ss %%xmm5, %%xmm5, %%xmm2") /* xmm2 = xb + b0*b0 */
+ __ASM_EMIT("add $0x04, %[off]") /* ++off */
+ __ASM_EMIT("dec %[count]")
+ __ASM_EMIT("jge 9b")
+ __ASM_EMIT("10:")
+ /* Store result */
+ __ASM_EMIT("vaddss 0x00(%[corr]), %%xmm0, %%xmm0")
+ __ASM_EMIT("vaddss 0x04(%[corr]), %%xmm1, %%xmm1")
+ __ASM_EMIT("vaddss 0x08(%[corr]), %%xmm2, %%xmm2")
+ __ASM_EMIT("vmovss %%xmm0, 0x00(%[corr])")
+ __ASM_EMIT("vmovss %%xmm1, 0x04(%[corr])")
+ __ASM_EMIT("vmovss %%xmm2, 0x08(%[corr])")
+
+ : [corr] "+r" (corr), [off] "=&r" (off), [count] "+r" (count)
+ : [a] "r" (a), [b] "r" (b)
+ : "cc", "memory",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7"
+ );
+ }
+
+ static const float corr_const[] __lsp_aligned64 =
+ {
+ LSP_DSP_VEC16(1e-10f)
+ };
+
+ static const uint32_t corr_idx[] __lsp_aligned64 =
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 7, 7, 7, 7, // 0 0 A0 A1
+ 0, 0, 0, 0, 3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, // 0 A0 A1 A2
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15
+ };
+
+ static const uint16_t corr_kmask[] =
+ {
+ 0xff00,
+ 0xfff0
+ };
+
+ void corr_incr(dsp::correlation_t *corr, float *dst,
+ const float *a_head, const float *b_head,
+ const float *a_tail, const float *b_tail,
+ size_t count)
+ {
+ IF_ARCH_I386(
+ void *ptr;
+ );
+
+ ARCH_X86_ASM
+ (
+ /* 16x blocks */
+ __ASM_EMIT32("subl $16, %[count]")
+ __ASM_EMIT64("sub $16, %[count]")
+ __ASM_EMIT("jb 2f")
+ __ASM_EMIT("kmovw 0x00 + %[CORR_KMASK], %%k1")
+ __ASM_EMIT("kmovw 0x02 + %[CORR_KMASK], %%k2")
+ __ASM_EMIT("1:")
+ __ASM_EMIT("vmovups 0x00(%[a_head]), %%zmm0") /* zmm0 = ah */
+ __ASM_EMIT("vmovups 0x00(%[b_head]), %%zmm1") /* zmm1 = bh */
+ __ASM_EMIT("vmovups 0x00(%[a_tail]), %%zmm3") /* zmm3 = at */
+ __ASM_EMIT("vmovups 0x00(%[b_tail]), %%zmm4") /* zmm4 = bt */
+ __ASM_EMIT("vmulps %%zmm1, %%zmm0, %%zmm2") /* zmm2 = ah*bh */
+ __ASM_EMIT("vmulps %%zmm0, %%zmm0, %%zmm0") /* zmm0 = ah*ah */
+ __ASM_EMIT("vmulps %%zmm1, %%zmm1, %%zmm1") /* zmm1 = bh*bh */
+                __ASM_EMIT("vxorps %%zmm6, %%zmm6, %%zmm6") /* zmm6 = 0 (all 16 lanes) */
+ __ASM_EMIT("vfnmadd231ps %%zmm4, %%zmm3, %%zmm2") /* zmm2 = DV = ah*bh - at*bt */
+ __ASM_EMIT("vfnmadd231ps %%zmm3, %%zmm3, %%zmm0") /* zmm0 = DA = ah*ah - at*at */
+ __ASM_EMIT("vfnmadd231ps %%zmm4, %%zmm4, %%zmm1") /* zmm1 = DB = bh*bh - bt*bt */
+
+ __ASM_EMIT("vshufps $0x44, %%zmm0, %%zmm6, %%zmm3") /* zmm3 = 0 0 DA[0] DA[1] ... */
+ __ASM_EMIT("vshufps $0x44, %%zmm1, %%zmm6, %%zmm4") /* zmm4 = 0 0 DB[0] DB[1] ... */
+ __ASM_EMIT("vshufps $0x44, %%zmm2, %%zmm6, %%zmm5") /* zmm5 = 0 0 DV[0] DV[1] ... */
+ __ASM_EMIT("vaddps %%zmm3, %%zmm0, %%zmm0") /* zmm0 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] ... */
+ __ASM_EMIT("vaddps %%zmm4, %%zmm1, %%zmm1") /* zmm1 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] ... */
+ __ASM_EMIT("vaddps %%zmm5, %%zmm2, %%zmm2") /* zmm2 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] ... */
+ __ASM_EMIT("vmovaps 0x00 + %[CORR_IDX], %%zmm6") /* zmm6 = permute mask */
+ __ASM_EMIT("vmovaps 0x40 + %[CORR_IDX], %%zmm7") /* zmm7 = permute mask */
+ __ASM_EMIT("vshufps $0x99, %%zmm0, %%zmm3, %%zmm3") /* zmm3 = 0 DA[0] DA[1] DA[0]+DA[2] ... */
+ __ASM_EMIT("vshufps $0x99, %%zmm1, %%zmm4, %%zmm4") /* zmm4 = 0 DB[0] DB[1] DB[0]+DB[2] ... */
+ __ASM_EMIT("vshufps $0x99, %%zmm2, %%zmm5, %%zmm5") /* zmm5 = 0 DV[0] DV[1] DV[0]+DV[2] ... */
+ __ASM_EMIT("vaddps %%zmm0, %%zmm3, %%zmm3") /* zmm3 = A = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] ... */
+ __ASM_EMIT("vaddps %%zmm1, %%zmm4, %%zmm4") /* zmm4 = B = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] ... */
+ __ASM_EMIT("vaddps %%zmm2, %%zmm5, %%zmm5") /* zmm5 = V = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] ... */
+ __ASM_EMIT("vpermps %%zmm3, %%zmm6, %%zmm0") /* zmm0 = A0 A0 A0 A1 */
+ __ASM_EMIT("vpermps %%zmm4, %%zmm6, %%zmm1") /* zmm1 = B0 B0 B0 B1 */
+ __ASM_EMIT("vpermps %%zmm5, %%zmm6, %%zmm2") /* zmm2 = V0 V0 V0 V1 */
+ __ASM_EMIT("vaddps %%zmm0, %%zmm3, %%zmm3 %{%%k1%}") /* zmm3 = A0 A1 A0+A2 A1+A3 */
+ __ASM_EMIT("vaddps %%zmm1, %%zmm4, %%zmm4 %{%%k1%}") /* zmm4 = B0 B1 B0+B2 B1+B3 */
+ __ASM_EMIT("vaddps %%zmm2, %%zmm5, %%zmm5 %{%%k1%}") /* zmm5 = V0 V1 V0+V2 V1+V3 */
+ __ASM_EMIT("vpermps %%zmm3, %%zmm7, %%zmm0") /* zmm0 = A0 A0 A1 A0+A2 */
+ __ASM_EMIT("vpermps %%zmm4, %%zmm7, %%zmm1") /* zmm1 = B0 B0 B1 B0+B2 */
+ __ASM_EMIT("vpermps %%zmm5, %%zmm7, %%zmm2") /* zmm2 = V0 V0 V1 V0+V2 */
+ __ASM_EMIT("vaddps %%zmm0, %%zmm3, %%zmm3 %{%%k2%}") /* zmm3 = A0 A0+A1 A0+A1+A2 A0+A1+A2+A3 */
+ __ASM_EMIT("vaddps %%zmm1, %%zmm4, %%zmm4 %{%%k2%}") /* zmm4 = B0 B0+B1 B0+B1+B2 B0+B1+B2+B3 */
+ __ASM_EMIT("vaddps %%zmm2, %%zmm5, %%zmm5 %{%%k2%}") /* zmm5 = V0 V0+V1 V0+V1+V2 V0+V1+V2+V3 */
+
+ __ASM_EMIT32("mov %[corr], %[ptr]")
+ __ASM_EMIT32("vbroadcastss 0x00(%[ptr]), %%zmm0")
+ __ASM_EMIT32("vbroadcastss 0x04(%[ptr]), %%zmm1")
+ __ASM_EMIT32("vbroadcastss 0x08(%[ptr]), %%zmm2")
+ __ASM_EMIT64("vbroadcastss 0x00(%[corr]), %%zmm0")
+ __ASM_EMIT64("vbroadcastss 0x04(%[corr]), %%zmm1")
+ __ASM_EMIT64("vbroadcastss 0x08(%[corr]), %%zmm2")
+
+ __ASM_EMIT("vaddps %%zmm3, %%zmm1, %%zmm1") /* zmm1 = BA = xa + A */
+ __ASM_EMIT("vaddps %%zmm4, %%zmm2, %%zmm2") /* zmm2 = BB = xb + B */
+ __ASM_EMIT("vaddps %%zmm5, %%zmm0, %%zmm0") /* zmm0 = T = xv + V */
+ __ASM_EMIT("vmovaps 0x80 + %[CORR_IDX], %%zmm6") /* zmm6 = permute mask */
+ __ASM_EMIT("vmulps %%zmm2, %%zmm1, %%zmm7") /* zmm7 = B = BA*BB */
+ __ASM_EMIT("vpermps %%zmm0, %%zmm6, %%zmm3") /* zmm3 = xv' */
+ __ASM_EMIT("vpermps %%zmm1, %%zmm6, %%zmm4") /* zmm4 = xa' */
+ __ASM_EMIT("vpermps %%zmm2, %%zmm6, %%zmm5") /* zmm5 = xb' */
+
+ __ASM_EMIT32("vmovss %%xmm3, 0x00(%[ptr])")
+ __ASM_EMIT32("vmovss %%xmm4, 0x04(%[ptr])")
+ __ASM_EMIT32("vmovss %%xmm5, 0x08(%[ptr])")
+ __ASM_EMIT64("vmovss %%xmm3, 0x00(%[corr])")
+ __ASM_EMIT64("vmovss %%xmm4, 0x04(%[corr])")
+ __ASM_EMIT64("vmovss %%xmm5, 0x08(%[corr])")
+
+ __ASM_EMIT("vsqrtps %%zmm7, %%zmm6") /* zmm6 = sqrtf(B) */
+ __ASM_EMIT("vcmpps $1, %[CORR_CC], %%zmm7, %%k3") /* k3 = B < 1e-10f */
+ __ASM_EMIT("vdivps %%zmm6, %%zmm0, %%zmm0") /* zmm0 = T/sqrtf(B) */
+ __ASM_EMIT32("mov %[dst], %[ptr]")
+ __ASM_EMIT("vxorps %%zmm0, %%zmm0, %%zmm0 %{%%k3%}") /* zmm0 = (B < 1e-10f) ? 0 : T/sqrtf(B) */
+ __ASM_EMIT("add $0x40, %[a_head]") /* ++a_head */
+ __ASM_EMIT("add $0x40, %[b_head]") /* ++b_head */
+ __ASM_EMIT32("vmovups %%zmm0, 0x00(%[ptr])")
+ __ASM_EMIT64("vmovups %%zmm0, 0x00(%[dst])")
+ __ASM_EMIT("add $0x40, %[a_tail]") /* ++a_tail */
+ __ASM_EMIT("add $0x40, %[b_tail]") /* ++b_tail */
+ __ASM_EMIT32("add $0x40, %[ptr]")
+ __ASM_EMIT64("add $0x40, %[dst]")
+ __ASM_EMIT32("mov %[ptr], %[dst]")
+ __ASM_EMIT32("subl $16, %[count]")
+ __ASM_EMIT64("sub $16, %[count]")
+ __ASM_EMIT("jae 1b")
+ __ASM_EMIT("2:")
+ /* 8x block */
+ __ASM_EMIT32("addl $8, %[count]")
+ __ASM_EMIT64("add $8, %[count]")
+ __ASM_EMIT("jl 4f")
+ __ASM_EMIT("vmovups 0x00(%[a_head]), %%ymm0") /* ymm0 = ah */
+ __ASM_EMIT("vmovups 0x00(%[b_head]), %%ymm1") /* ymm1 = bh */
+ __ASM_EMIT("vmovups 0x00(%[a_tail]), %%ymm3") /* ymm3 = at */
+ __ASM_EMIT("vmovups 0x00(%[b_tail]), %%ymm4") /* ymm4 = bt */
+ __ASM_EMIT("vmulps %%ymm1, %%ymm0, %%ymm2") /* ymm2 = ah*bh */
+ __ASM_EMIT("vmulps %%ymm0, %%ymm0, %%ymm0") /* ymm0 = ah*ah */
+ __ASM_EMIT("vmulps %%ymm1, %%ymm1, %%ymm1") /* ymm1 = bh*bh */
+ __ASM_EMIT("vxorps %%ymm6, %%ymm6, %%ymm6") /* ymm6 = 0 0 0 0 0 0 0 0 */
+ __ASM_EMIT("vfnmadd231ps %%ymm4, %%ymm3, %%ymm2") /* ymm2 = DV = ah*bh - at*bt */
+ __ASM_EMIT("vfnmadd231ps %%ymm3, %%ymm3, %%ymm0") /* ymm0 = DA = ah*ah - at*at */
+ __ASM_EMIT("vfnmadd231ps %%ymm4, %%ymm4, %%ymm1") /* ymm1 = DB = bh*bh - bt*bt */
+
+ __ASM_EMIT("vshufps $0x44, %%ymm0, %%ymm6, %%ymm3") /* ymm3 = 0 0 DA[0] DA[1] 0 0 DA[4] DA[5] */
+ __ASM_EMIT("vshufps $0x44, %%ymm1, %%ymm6, %%ymm4") /* ymm4 = 0 0 DB[0] DB[1] 0 0 DB[4] DB[5] */
+ __ASM_EMIT("vshufps $0x44, %%ymm2, %%ymm6, %%ymm5") /* ymm5 = 0 0 DV[0] DV[1] 0 0 DV[4] DV[5] */
+ __ASM_EMIT("vaddps %%ymm3, %%ymm0, %%ymm0") /* ymm0 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] DA[4] DA[5] DA[4]+DA[6] DA[5]+DA[7] */
+ __ASM_EMIT("vaddps %%ymm4, %%ymm1, %%ymm1") /* ymm1 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] DB[4] DB[5] DB[4]+DB[6] DB[5]+DB[7] */
+ __ASM_EMIT("vaddps %%ymm5, %%ymm2, %%ymm2") /* ymm2 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] DV[4] DV[5] DV[4]+DV[6] DV[5]+DV[7] */
+ __ASM_EMIT("vshufps $0x99, %%ymm0, %%ymm3, %%ymm3") /* ymm3 = 0 DA[0] DA[1] DA[0]+DA[2] 0 DA[4] DA[5] DA[4]+DA[6] */
+ __ASM_EMIT("vshufps $0x99, %%ymm1, %%ymm4, %%ymm4") /* ymm4 = 0 DB[0] DB[1] DB[0]+DB[2] 0 DB[4] DB[5] DB[4]+DB[6] */
+ __ASM_EMIT("vshufps $0x99, %%ymm2, %%ymm5, %%ymm5") /* ymm5 = 0 DV[0] DV[1] DV[0]+DV[2] 0 DV[4] DV[5] DV[4]+DV[6] */
+ __ASM_EMIT("vaddps %%ymm0, %%ymm3, %%ymm3") /* ymm3 = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] DA[4] DA[4]+DA[5] DA[4]+DA[5]+DA[6] DA[4]+DA[5]+DA[6]+DA[7] */
+ __ASM_EMIT("vaddps %%ymm1, %%ymm4, %%ymm4") /* ymm4 = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] DB[4] DB[4]+DB[5] DB[4]+DB[5]+DB[6] DB[4]+DB[5]+DB[6]+DB[7] */
+ __ASM_EMIT("vaddps %%ymm2, %%ymm5, %%ymm5") /* ymm5 = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] DV[4] DV[4]+DV[5] DV[4]+DV[5]+DV[6] DV[4]+DV[5]+DV[6]+DV[7] */
+ __ASM_EMIT("vshufps $0xff, %%xmm3, %%xmm3, %%xmm0") /* xmm0 = DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3] */
+ __ASM_EMIT("vshufps $0xff, %%xmm4, %%xmm4, %%xmm1") /* xmm1 = DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3] */
+ __ASM_EMIT("vshufps $0xff, %%xmm5, %%xmm5, %%xmm2") /* xmm2 = DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3] */
+ __ASM_EMIT("vextractf128 $1, %%ymm3, %%xmm6") /* xmm6 = DA[4] DA[4]+DA[5] DA[4]+DA[5]+DA[6] DA[4]+DA[5]+DA[6]+DA[7] */
+ __ASM_EMIT("vextractf128 $1, %%ymm4, %%xmm7") /* xmm7 = DB[4] DB[4]+DB[5] DB[4]+DB[5]+DB[6] DB[4]+DB[5]+DB[6]+DB[7] */
+ __ASM_EMIT("vaddps %%xmm0, %%xmm6, %%xmm6") /* xmm6 = DA[0]+DA[1]+DA[2]+DA[3]+DA[4] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6]+DA[7] */
+ __ASM_EMIT("vaddps %%xmm1, %%xmm7, %%xmm7") /* xmm7 = DB[0]+DB[1]+DB[2]+DB[3]+DB[4] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6]+DB[7] */
+ __ASM_EMIT("vextractf128 $1, %%ymm5, %%xmm0") /* xmm0 = DV[4] DV[4]+DV[5] DV[4]+DV[5]+DV[6] DV[4]+DV[5]+DV[6]+DV[7] */
+ __ASM_EMIT("vinsertf128 $1, %%xmm6, %%ymm3, %%ymm3") /* ymm3 = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] DA[0]+DA[1]+DA[2]+DA[3]+DA[4] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6] DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6]+DA[7] */
+ __ASM_EMIT("vaddps %%xmm2, %%xmm0, %%xmm0") /* xmm0 = DV[0]+DV[1]+DV[2]+DV[3]+DV[4] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6]+DV[7] */
+ __ASM_EMIT("vinsertf128 $1, %%xmm7, %%ymm4, %%ymm4") /* ymm4 = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] DB[0]+DB[1]+DB[2]+DB[3]+DB[4] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6] DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6]+DB[7] */
+ __ASM_EMIT("vinsertf128 $1, %%xmm0, %%ymm5, %%ymm5") /* ymm5 = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] DV[0]+DV[1]+DV[2]+DV[3]+DV[4] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6] DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6]+DV[7] */
+
+ __ASM_EMIT32("mov %[corr], %[ptr]")
+ __ASM_EMIT32("vbroadcastss 0x00(%[ptr]), %%ymm0")
+ __ASM_EMIT32("vbroadcastss 0x04(%[ptr]), %%ymm1")
+ __ASM_EMIT32("vbroadcastss 0x08(%[ptr]), %%ymm2")
+ __ASM_EMIT64("vbroadcastss 0x00(%[corr]), %%ymm0")
+ __ASM_EMIT64("vbroadcastss 0x04(%[corr]), %%ymm1")
+ __ASM_EMIT64("vbroadcastss 0x08(%[corr]), %%ymm2")
+
+ __ASM_EMIT("vaddps %%ymm3, %%ymm1, %%ymm1") /* ymm1 = BA = xa+DA[0] xa+DA[0]+DA[1] xa+DA[0]+DA[1]+DA[2] xa+DA[0]+DA[1]+DA[2]+DA[3] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6] xa+DA[0]+DA[1]+DA[2]+DA[3]+DA[4]+DA[5]+DA[6]+DA[7] */
+ __ASM_EMIT("vaddps %%ymm4, %%ymm2, %%ymm2") /* ymm2 = BB = xb+DB[0] xb+DB[0]+DB[1] xb+DB[0]+DB[1]+DB[2] xb+DB[0]+DB[1]+DB[2]+DB[3] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6] xb+DB[0]+DB[1]+DB[2]+DB[3]+DB[4]+DB[5]+DB[6]+DB[7] */
+ __ASM_EMIT("vaddps %%ymm5, %%ymm0, %%ymm0") /* ymm0 = T = xv+DV[0] xv+DV[0]+DV[1] xv+DV[0]+DV[1]+DV[2] xv+DV[0]+DV[1]+DV[2]+DV[3] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6] xv+DV[0]+DV[1]+DV[2]+DV[3]+DV[4]+DV[5]+DV[6]+DV[7] */
+ __ASM_EMIT("vmulps %%ymm2, %%ymm1, %%ymm7") /* ymm7 = B = BA*BB */
+ __ASM_EMIT("vextractf128 $1, %%ymm1, %%xmm4") /* xmm4 = BA[4] BA[5] BA[6] BA[7] */
+ __ASM_EMIT("vextractf128 $1, %%ymm0, %%xmm3") /* xmm3 = T[4] T[5] T[6] T[7] */
+ __ASM_EMIT("vextractf128 $1, %%ymm2, %%xmm5") /* xmm5 = BB[4] BB[5] BB[6] BB[7] */
+ __ASM_EMIT("vshufps $0xff, %%xmm3, %%xmm3, %%xmm3") /* xmm3 = T[7] T[7] T[7] T[7] */
+ __ASM_EMIT("vshufps $0xff, %%xmm4, %%xmm4, %%xmm4") /* xmm4 = BA[7] BA[7] BA[7] BA[7] */
+ __ASM_EMIT("vshufps $0xff, %%xmm5, %%xmm5, %%xmm5") /* xmm5 = BB[7] BB[7] BB[7] BB[7] */
+
+ __ASM_EMIT32("vmovss %%xmm3, 0x00(%[ptr])")
+ __ASM_EMIT32("vmovss %%xmm4, 0x04(%[ptr])")
+ __ASM_EMIT32("vmovss %%xmm5, 0x08(%[ptr])")
+ __ASM_EMIT64("vmovss %%xmm3, 0x00(%[corr])")
+ __ASM_EMIT64("vmovss %%xmm4, 0x04(%[corr])")
+ __ASM_EMIT64("vmovss %%xmm5, 0x08(%[corr])")
+
+ __ASM_EMIT("vsqrtps %%ymm7, %%ymm6") /* ymm6 = sqrtf(B) */
+ __ASM_EMIT("vcmpps $5, %[CORR_CC], %%ymm7, %%ymm1") /* ymm1 = B >= 1e-10f */
+ __ASM_EMIT("vdivps %%ymm6, %%ymm0, %%ymm0") /* ymm0 = T/sqrtf(B) */
+ __ASM_EMIT32("mov %[dst], %[ptr]")
+ __ASM_EMIT("vandps %%ymm1, %%ymm0, %%ymm0") /* ymm0 = (B >= 1e-10f) ? T/sqrtf(B) : 0 */
+ __ASM_EMIT("add $0x20, %[a_head]") /* ++a_head */
+ __ASM_EMIT("add $0x20, %[b_head]") /* ++b_head */
+ __ASM_EMIT32("vmovups %%ymm0, 0x00(%[ptr])")
+ __ASM_EMIT64("vmovups %%ymm0, 0x00(%[dst])")
+ __ASM_EMIT("add $0x20, %[a_tail]") /* ++a_tail */
+ __ASM_EMIT("add $0x20, %[b_tail]") /* ++b_tail */
+ __ASM_EMIT32("add $0x20, %[ptr]")
+ __ASM_EMIT64("add $0x20, %[dst]")
+ __ASM_EMIT32("mov %[ptr], %[dst]")
+ __ASM_EMIT32("subl $8, %[count]")
+ __ASM_EMIT64("sub $8, %[count]")
+ __ASM_EMIT("4:")
+ /* 4x block */
+ __ASM_EMIT("add $4, %[count]")
+ __ASM_EMIT("jl 6f")
+ __ASM_EMIT("vmovups 0x00(%[a_head]), %%xmm0") /* xmm0 = ah */
+ __ASM_EMIT("vmovups 0x00(%[b_head]), %%xmm1") /* xmm1 = bh */
+ __ASM_EMIT("vmovups 0x00(%[a_tail]), %%xmm3") /* xmm3 = at */
+ __ASM_EMIT("vmovups 0x00(%[b_tail]), %%xmm4") /* xmm4 = bt */
+ __ASM_EMIT("vmulps %%xmm1, %%xmm0, %%xmm2") /* xmm2 = ah*bh */
+ __ASM_EMIT("vmulps %%xmm0, %%xmm0, %%xmm0") /* xmm0 = ah*ah */
+ __ASM_EMIT("vmulps %%xmm1, %%xmm1, %%xmm1") /* xmm1 = bh*bh */
+                __ASM_EMIT("vxorps %%xmm6, %%xmm6, %%xmm6") /* xmm6 = 0 0 0 0 */
+ __ASM_EMIT("vfnmadd231ps %%xmm4, %%xmm3, %%xmm2") /* xmm2 = DV = ah*bh - at*bt */
+ __ASM_EMIT("vfnmadd231ps %%xmm3, %%xmm3, %%xmm0") /* xmm0 = DA = ah*ah - at*at */
+ __ASM_EMIT("vfnmadd231ps %%xmm4, %%xmm4, %%xmm1") /* xmm1 = DB = bh*bh - bt*bt */
+
+                __ASM_EMIT("vmovlhps %%xmm0, %%xmm6, %%xmm3") /* xmm3 = 0 0 DA[0] DA[1] */
+                __ASM_EMIT("vmovlhps %%xmm1, %%xmm6, %%xmm4") /* xmm4 = 0 0 DB[0] DB[1] */
+                __ASM_EMIT("vmovlhps %%xmm2, %%xmm6, %%xmm5") /* xmm5 = 0 0 DV[0] DV[1] */
+                __ASM_EMIT("vaddps %%xmm3, %%xmm0, %%xmm0") /* xmm0 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] */
+                __ASM_EMIT("vaddps %%xmm4, %%xmm1, %%xmm1") /* xmm1 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] */
+                __ASM_EMIT("vaddps %%xmm5, %%xmm2, %%xmm2") /* xmm2 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] */
+                __ASM_EMIT("vshufps $0x99, %%xmm0, %%xmm3, %%xmm3") /* xmm3 = 0 DA[0] DA[1] DA[0]+DA[2] */
+                __ASM_EMIT("vshufps $0x99, %%xmm1, %%xmm4, %%xmm4") /* xmm4 = 0 DB[0] DB[1] DB[0]+DB[2] */
+                __ASM_EMIT("vshufps $0x99, %%xmm2, %%xmm5, %%xmm5") /* xmm5 = 0 DV[0] DV[1] DV[0]+DV[2] */
+                __ASM_EMIT("vaddps %%xmm0, %%xmm3, %%xmm3") /* xmm3 = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] */
+                __ASM_EMIT("vaddps %%xmm1, %%xmm4, %%xmm4") /* xmm4 = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] */
+                __ASM_EMIT("vaddps %%xmm2, %%xmm5, %%xmm5") /* xmm5 = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] */
+
+ __ASM_EMIT32("mov %[corr], %[ptr]")
+ __ASM_EMIT32("vbroadcastss 0x00(%[ptr]), %%xmm0")
+ __ASM_EMIT32("vbroadcastss 0x04(%[ptr]), %%xmm1")
+ __ASM_EMIT32("vbroadcastss 0x08(%[ptr]), %%xmm2")
+ __ASM_EMIT64("vbroadcastss 0x00(%[corr]), %%xmm0")
+ __ASM_EMIT64("vbroadcastss 0x04(%[corr]), %%xmm1")
+ __ASM_EMIT64("vbroadcastss 0x08(%[corr]), %%xmm2")
+
+                __ASM_EMIT("vaddps %%xmm3, %%xmm1, %%xmm1") /* xmm1 = BA = xa+DA[0] xa+DA[0]+DA[1] xa+DA[0]+DA[1]+DA[2] xa+DA[0]+DA[1]+DA[2]+DA[3] */
+                __ASM_EMIT("vaddps %%xmm4, %%xmm2, %%xmm2") /* xmm2 = BB = xb+DB[0] xb+DB[0]+DB[1] xb+DB[0]+DB[1]+DB[2] xb+DB[0]+DB[1]+DB[2]+DB[3] */
+                __ASM_EMIT("vaddps %%xmm5, %%xmm0, %%xmm0") /* xmm0 = T = xv+DV[0] xv+DV[0]+DV[1] xv+DV[0]+DV[1]+DV[2] xv+DV[0]+DV[1]+DV[2]+DV[3] */
+                __ASM_EMIT("vmulps %%xmm2, %%xmm1, %%xmm7") /* xmm7 = B = BA*BB */
+                __ASM_EMIT("vshufps $0xff, %%xmm0, %%xmm0, %%xmm3") /* xmm3 = T[3] T[3] T[3] T[3] */
+                __ASM_EMIT("vshufps $0xff, %%xmm1, %%xmm1, %%xmm4") /* xmm4 = BA[3] BA[3] BA[3] BA[3] */
+                __ASM_EMIT("vshufps $0xff, %%xmm2, %%xmm2, %%xmm5") /* xmm5 = BB[3] BB[3] BB[3] BB[3] */
+
+ __ASM_EMIT32("vmovss %%xmm3, 0x00(%[ptr])")
+ __ASM_EMIT32("vmovss %%xmm4, 0x04(%[ptr])")
+ __ASM_EMIT32("vmovss %%xmm5, 0x08(%[ptr])")
+ __ASM_EMIT64("vmovss %%xmm3, 0x00(%[corr])")
+ __ASM_EMIT64("vmovss %%xmm4, 0x04(%[corr])")
+ __ASM_EMIT64("vmovss %%xmm5, 0x08(%[corr])")
+
+ __ASM_EMIT("vsqrtps %%xmm7, %%xmm6") /* xmm6 = sqrtf(B) */
+ __ASM_EMIT("vcmpps $5, %[CORR_CC], %%xmm7, %%xmm1") /* xmm1 = B >= 1e-10f */
+ __ASM_EMIT("vdivps %%xmm6, %%xmm0, %%xmm0") /* xmm0 = T/sqrtf(B) */
+ __ASM_EMIT32("mov %[dst], %[ptr]")
+ __ASM_EMIT("vandps %%xmm1, %%xmm0, %%xmm0") /* xmm0 = (B >= 1e-10f) ? T/sqrtf(B) : 0 */
+ __ASM_EMIT("add $0x10, %[a_head]") /* ++a_head */
+ __ASM_EMIT("add $0x10, %[b_head]") /* ++b_head */
+ __ASM_EMIT32("vmovups %%xmm0, 0x00(%[ptr])")
+ __ASM_EMIT64("vmovups %%xmm0, 0x00(%[dst])")
+ __ASM_EMIT("add $0x10, %[a_tail]") /* ++a_tail */
+ __ASM_EMIT("add $0x10, %[b_tail]") /* ++b_tail */
+ __ASM_EMIT32("add $0x10, %[ptr]")
+ __ASM_EMIT64("add $0x10, %[dst]")
+ __ASM_EMIT32("mov %[ptr], %[dst]")
+ __ASM_EMIT32("subl $4, %[count]")
+ __ASM_EMIT64("sub $4, %[count]")
+ __ASM_EMIT("6:")
+
+ /* 1x blocks */
+ __ASM_EMIT("add $3, %[count]")
+ __ASM_EMIT("jl 8f")
+ __ASM_EMIT("7:")
+ __ASM_EMIT("vmovss 0x00(%[a_head]), %%xmm0") /* xmm0 = ah */
+ __ASM_EMIT("vmovss 0x00(%[b_head]), %%xmm1") /* xmm1 = bh */
+ __ASM_EMIT("vmovss 0x00(%[a_tail]), %%xmm3") /* xmm3 = at */
+ __ASM_EMIT("vmovss 0x00(%[b_tail]), %%xmm4") /* xmm4 = bt */
+ __ASM_EMIT("vmulss %%xmm1, %%xmm0, %%xmm2") /* xmm2 = ah*bh */
+ __ASM_EMIT("vmulss %%xmm0, %%xmm0, %%xmm0") /* xmm0 = ah*ah */
+ __ASM_EMIT("vmulss %%xmm1, %%xmm1, %%xmm1") /* xmm1 = bh*bh */
+ __ASM_EMIT("vfnmadd231ss %%xmm4, %%xmm3, %%xmm2") /* xmm2 = DV = ah*bh - at*bt */
+ __ASM_EMIT("vfnmadd231ss %%xmm3, %%xmm3, %%xmm0") /* xmm0 = DA = ah*ah - at*at */
+ __ASM_EMIT("vfnmadd231ss %%xmm4, %%xmm4, %%xmm1") /* xmm1 = DB = bh*bh - bt*bt */
+
+ __ASM_EMIT32("mov %[corr], %[ptr]")
+ __ASM_EMIT32("vmovss 0x00(%[ptr]), %%xmm3")
+ __ASM_EMIT32("vmovss 0x04(%[ptr]), %%xmm4")
+ __ASM_EMIT32("vmovss 0x08(%[ptr]), %%xmm5")
+ __ASM_EMIT64("vmovss 0x00(%[corr]), %%xmm3")
+ __ASM_EMIT64("vmovss 0x04(%[corr]), %%xmm4")
+ __ASM_EMIT64("vmovss 0x08(%[corr]), %%xmm5")
+
+ __ASM_EMIT("vaddss %%xmm3, %%xmm2, %%xmm2") /* xmm2 = T = xv+DV */
+ __ASM_EMIT("vaddss %%xmm4, %%xmm0, %%xmm0") /* xmm0 = BA = xa+DA */
+ __ASM_EMIT("vaddss %%xmm5, %%xmm1, %%xmm1") /* xmm1 = BB = xb+DB */
+ __ASM_EMIT("vmulss %%xmm1, %%xmm0, %%xmm7") /* xmm7 = B = BA*BB */
+
+ __ASM_EMIT32("vmovss %%xmm2, 0x00(%[ptr])")
+ __ASM_EMIT32("vmovss %%xmm0, 0x04(%[ptr])")
+ __ASM_EMIT32("vmovss %%xmm1, 0x08(%[ptr])")
+ __ASM_EMIT64("vmovss %%xmm2, 0x00(%[corr])")
+ __ASM_EMIT64("vmovss %%xmm0, 0x04(%[corr])")
+ __ASM_EMIT64("vmovss %%xmm1, 0x08(%[corr])")
+
+ __ASM_EMIT("vsqrtss %%xmm7, %%xmm7, %%xmm6") /* xmm6 = sqrtf(B) */
+ __ASM_EMIT("vcmpss $5, %[CORR_CC], %%xmm7, %%xmm1") /* xmm1 = B >= 1e-10f */
+ __ASM_EMIT("vdivss %%xmm6, %%xmm2, %%xmm0") /* xmm0 = T/sqrtf(B) */
+ __ASM_EMIT32("mov %[dst], %[ptr]")
+ __ASM_EMIT("vandps %%xmm1, %%xmm0, %%xmm0") /* xmm0 = (B >= 1e-10f) ? T/sqrtf(B) : 0 */
+ __ASM_EMIT("add $0x04, %[a_head]") /* ++a_head */
+ __ASM_EMIT("add $0x04, %[b_head]") /* ++b_head */
+ __ASM_EMIT32("vmovss %%xmm0, 0x00(%[ptr])")
+ __ASM_EMIT64("vmovss %%xmm0, 0x00(%[dst])")
+ __ASM_EMIT("add $0x04, %[a_tail]") /* ++a_tail */
+ __ASM_EMIT("add $0x04, %[b_tail]") /* ++b_tail */
+ __ASM_EMIT32("add $0x04, %[ptr]")
+ __ASM_EMIT64("add $0x04, %[dst]")
+ __ASM_EMIT32("mov %[ptr], %[dst]")
+ __ASM_EMIT32("decl %[count]")
+ __ASM_EMIT64("dec %[count]")
+ __ASM_EMIT("jge 7b")
+ __ASM_EMIT("8:")
+
+ : __IF_32(
+ [ptr] "=&r" (ptr),
+ [corr] "+m" (corr), [dst] "+m" (dst),
+ [a_head] "+r" (a_head), [b_head] "+r" (b_head),
+ [a_tail] "+r" (a_tail), [b_tail] "+r" (b_tail),
+ [count] "+g" (count)
+ )
+ __IF_64(
+ [dst] "+r" (dst),
+ [a_head] "+r" (a_head), [b_head] "+r" (b_head),
+ [a_tail] "+r" (a_tail), [b_tail] "+r" (b_tail),
+ [count] "+r" (count)
+ )
+ : __IF_64( [corr] "r" (corr), )
+ [CORR_CC] "o" (corr_const),
+ [CORR_IDX] "o" (corr_idx),
+ [CORR_KMASK] "o" (corr_kmask)
+ : "cc", "memory",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%k1", "%k2", "%k3"
+ );
+ }
+
+ } /* namespace avx512 */
+} /* namespace lsp */
+
+
+
+
+
+#endif /* PRIVATE_DSP_ARCH_X86_AVX512_CORRELATION_H_ */
diff --git a/include/private/dsp/arch/x86/avx512/msmatrix.h b/include/private/dsp/arch/x86/avx512/msmatrix.h
index c8798e4f..cfb1be5c 100644
--- a/include/private/dsp/arch/x86/avx512/msmatrix.h
+++ b/include/private/dsp/arch/x86/avx512/msmatrix.h
@@ -1,6 +1,6 @@
/*
- * Copyright (C) 2023 Linux Studio Plugins Project
- * (C) 2023 Vladimir Sadovnikov
+ * Copyright (C) 2024 Linux Studio Plugins Project
+ * (C) 2024 Vladimir Sadovnikov
*
* This file is part of lsp-dsp-lib
* Created on: 9 сент. 2023 г.
@@ -107,8 +107,8 @@ namespace lsp
__ASM_EMIT("vmovups %%xmm0, 0x00(%[mid], %[off])")
__ASM_EMIT("vmovups %%xmm2, 0x00(%[side], %[off])")
__ASM_EMIT("add $0x10, %[off]")
- __ASM_EMIT32("subl $8, %[count]")
- __ASM_EMIT64("sub $8, %[count]")
+ __ASM_EMIT32("subl $4, %[count]")
+ __ASM_EMIT64("sub $4, %[count]")
__ASM_EMIT("8:")
// 1x blocks
__ASM_EMIT32("addl $3, %[count]")
@@ -273,8 +273,8 @@ namespace lsp
ARCH_X86_ASM(
__ASM_EMIT("xor %[off], %[off]")
// 64x blocks
- __ASM_EMIT32("subl $32, %[count]")
- __ASM_EMIT64("sub $32, %[count]")
+ __ASM_EMIT32("subl $64, %[count]")
+ __ASM_EMIT64("sub $64, %[count]")
__ASM_EMIT("jb 2f")
__ASM_EMIT("1:")
__ASM_EMIT("vmovups 0x00(%[mid], %[off]), %%zmm0") // zmm0 = m
diff --git a/include/private/dsp/arch/x86/sse/correlation.h b/include/private/dsp/arch/x86/sse/correlation.h
new file mode 100644
index 00000000..e40d11a3
--- /dev/null
+++ b/include/private/dsp/arch/x86/sse/correlation.h
@@ -0,0 +1,324 @@
+/*
+ * Copyright (C) 2024 Linux Studio Plugins Project
+ * (C) 2024 Vladimir Sadovnikov
+ *
+ * This file is part of lsp-dsp-lib
+ * Created on: 9 мар. 2024 г.
+ *
+ * lsp-dsp-lib is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * any later version.
+ *
+ * lsp-dsp-lib is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with lsp-dsp-lib. If not, see .
+ */
+
+#ifndef PRIVATE_DSP_ARCH_X86_SSE_CORRELATION_H_
+#define PRIVATE_DSP_ARCH_X86_SSE_CORRELATION_H_
+
+#ifndef PRIVATE_DSP_ARCH_X86_SSE_IMPL
+ #error "This header should not be included directly"
+#endif /* PRIVATE_DSP_ARCH_X86_SSE_IMPL */
+
+namespace lsp
+{
+ namespace sse
+ {
+
+ void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count)
+ {
+ IF_ARCH_X86(
+ size_t off;
+ );
+
+ ARCH_X86_ASM
+ (
+ __ASM_EMIT("xor %[off], %[off]")
+ __ASM_EMIT("xorps %%xmm0, %%xmm0") /* xv = 0 */
+ __ASM_EMIT("xorps %%xmm1, %%xmm1") /* xa = 0 */
+ __ASM_EMIT("xorps %%xmm2, %%xmm2") /* xb = 0 */
+ /* 8x blocks */
+ __ASM_EMIT("sub $8, %[count]")
+ __ASM_EMIT("jb 2f")
+ __ASM_EMIT("1:")
+ __ASM_EMIT("movups 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */
+ __ASM_EMIT("movups 0x10(%[a], %[off]), %%xmm4") /* xmm4 = a1 */
+ __ASM_EMIT("movaps %%xmm3, %%xmm7") /* xmm7 = a0 */
+ __ASM_EMIT("movups 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */
+ __ASM_EMIT("movups 0x10(%[b], %[off]), %%xmm6") /* xmm6 = b1 */
+ __ASM_EMIT("mulps %%xmm5, %%xmm7") /* xmm7 = a0*b0 */
+ __ASM_EMIT("mulps %%xmm3, %%xmm3") /* xmm3 = a0*a0 */
+ __ASM_EMIT("mulps %%xmm5, %%xmm5") /* xmm5 = b0*b0 */
+ __ASM_EMIT("addps %%xmm7, %%xmm0") /* xmm0 = xv + a0*b0 */
+ __ASM_EMIT("addps %%xmm3, %%xmm1") /* xmm1 = xa + a0*a0 */
+ __ASM_EMIT("movaps %%xmm4, %%xmm7") /* xmm7 = a1 */
+ __ASM_EMIT("addps %%xmm5, %%xmm2") /* xmm2 = xb + b0*b0 */
+ __ASM_EMIT("mulps %%xmm6, %%xmm7") /* xmm7 = a1*b1 */
+ __ASM_EMIT("mulps %%xmm4, %%xmm4") /* xmm4 = a1*a1 */
+ __ASM_EMIT("mulps %%xmm6, %%xmm6") /* xmm6 = b1*b1 */
+ __ASM_EMIT("addps %%xmm7, %%xmm0") /* xmm0 = xv + a1*b1 */
+ __ASM_EMIT("addps %%xmm4, %%xmm1") /* xmm1 = xa + a1*a1 */
+ __ASM_EMIT("addps %%xmm6, %%xmm2") /* xmm2 = xb + b1*b1 */
+ __ASM_EMIT("add $0x20, %[off]") /* ++off */
+ __ASM_EMIT("sub $8, %[count]")
+ __ASM_EMIT("jae 1b")
+ __ASM_EMIT("2:")
+ /* 4x block */
+ __ASM_EMIT("add $4, %[count]")
+ __ASM_EMIT("jl 4f")
+ __ASM_EMIT("movups 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */
+ __ASM_EMIT("movups 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */
+ __ASM_EMIT("movaps %%xmm3, %%xmm7") /* xmm7 = a0 */
+ __ASM_EMIT("mulps %%xmm3, %%xmm3") /* xmm3 = a0*a0 */
+ __ASM_EMIT("mulps %%xmm5, %%xmm7") /* xmm7 = a0*b0 */
+ __ASM_EMIT("mulps %%xmm5, %%xmm5") /* xmm5 = b0*b0 */
+ __ASM_EMIT("addps %%xmm7, %%xmm0") /* xmm0 = xv + a0*b0 */
+ __ASM_EMIT("addps %%xmm3, %%xmm1") /* xmm1 = xa + a0*a0 */
+ __ASM_EMIT("addps %%xmm5, %%xmm2") /* xmm2 = xb + b0*b0 */
+ __ASM_EMIT("add $0x10, %[off]") /* ++off */
+ __ASM_EMIT("sub $4, %[count]")
+ __ASM_EMIT("4:")
+ /* Do horizontal sum */
+ __ASM_EMIT("movhlps %%xmm0, %%xmm4") /* xmm4 = xv2 xv3 ? ? */
+ __ASM_EMIT("movhlps %%xmm1, %%xmm5") /* xmm5 = xa2 xa3 ? ? */
+ __ASM_EMIT("movhlps %%xmm2, %%xmm6") /* xmm6 = xb2 xb3 ? ? */
+ __ASM_EMIT("addps %%xmm4, %%xmm0") /* xmm0 = xv0+xv2 xv1+xv3 ? ? */
+ __ASM_EMIT("addps %%xmm5, %%xmm1") /* xmm1 = xa0+xa2 xa1+xa3 ? ? */
+ __ASM_EMIT("addps %%xmm6, %%xmm2") /* xmm2 = xb0+xb2 xb1+xb3 ? ? */
+ __ASM_EMIT("unpcklps %%xmm4, %%xmm0") /* xmm0 = xv0+xv2 xv2 xv1+xv3 xv3 */
+ __ASM_EMIT("unpcklps %%xmm5, %%xmm1") /* xmm1 = xa0+xa2 xa2 xa1+xa3 xa3 */
+ __ASM_EMIT("unpcklps %%xmm6, %%xmm2") /* xmm2 = xb0+xb2 xb2 xb1+xb3 xb3 */
+ __ASM_EMIT("movhlps %%xmm0, %%xmm4") /* xmm4 = xv1+xv3 xv3 ? ? */
+ __ASM_EMIT("movhlps %%xmm1, %%xmm5") /* xmm5 = xa1+xa3 xa3 ? ? */
+ __ASM_EMIT("movhlps %%xmm2, %%xmm6") /* xmm6 = xb1+xb3 xb3 ? ? */
+ __ASM_EMIT("addss %%xmm4, %%xmm0") /* xmm0 = xv0+xv1+xv2+xv3 xv2+xv3 ? ? */
+ __ASM_EMIT("addss %%xmm5, %%xmm1") /* xmm1 = xa0+xa1+xa2+xa3 xa2+xa3 ? ? */
+ __ASM_EMIT("addss %%xmm6, %%xmm2") /* xmm2 = xb0+xb1+xb2+xb3 xb2+xb3 ? ? */
+ /* 1x blocks */
+ __ASM_EMIT("add $3, %[count]")
+ __ASM_EMIT("jl 6f")
+ __ASM_EMIT("5:")
+ __ASM_EMIT("movss 0x00(%[a], %[off]), %%xmm3") /* xmm3 = a0 */
+ __ASM_EMIT("movss 0x00(%[b], %[off]), %%xmm5") /* xmm5 = b0 */
+ __ASM_EMIT("movaps %%xmm3, %%xmm7") /* xmm7 = a0 */
+ __ASM_EMIT("mulss %%xmm3, %%xmm3") /* xmm3 = a0*a0 */
+ __ASM_EMIT("mulss %%xmm5, %%xmm7") /* xmm7 = a0*b0 */
+ __ASM_EMIT("mulss %%xmm5, %%xmm5") /* xmm5 = b0*b0 */
+ __ASM_EMIT("addss %%xmm7, %%xmm0") /* xmm0 = xv + a0*b0 */
+ __ASM_EMIT("addss %%xmm3, %%xmm1") /* xmm1 = xa + a0*a0 */
+ __ASM_EMIT("addss %%xmm5, %%xmm2") /* xmm2 = xb + b0*b0 */
+ __ASM_EMIT("add $0x04, %[off]") /* ++off */
+ __ASM_EMIT("dec %[count]")
+ __ASM_EMIT("jge 5b")
+ __ASM_EMIT("6:")
+ /* Store result */
+ __ASM_EMIT("movss 0x00(%[corr]), %%xmm4")
+ __ASM_EMIT("movss 0x04(%[corr]), %%xmm5")
+ __ASM_EMIT("movss 0x08(%[corr]), %%xmm6")
+ __ASM_EMIT("addss %%xmm4, %%xmm0")
+ __ASM_EMIT("addss %%xmm5, %%xmm1")
+ __ASM_EMIT("addss %%xmm6, %%xmm2")
+ __ASM_EMIT("movss %%xmm0, 0x00(%[corr])")
+ __ASM_EMIT("movss %%xmm1, 0x04(%[corr])")
+ __ASM_EMIT("movss %%xmm2, 0x08(%[corr])")
+
+ : [corr] "+r" (corr), [off] "=&r" (off), [count] "+r" (count)
+ : [a] "r" (a), [b] "r" (b)
+ : "cc", "memory",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7"
+ );
+ }
+
+ static const float corr_const[] __lsp_aligned16 =
+ {
+ LSP_DSP_VEC4(1e-10f)
+ };
+
+ void corr_incr(dsp::correlation_t *corr, float *dst,
+ const float *a_head, const float *b_head,
+ const float *a_tail, const float *b_tail,
+ size_t count)
+ {
+ IF_ARCH_I386(
+ void *ptr;
+ );
+
+ ARCH_X86_ASM
+ (
+ /* 4x blocks */
+ __ASM_EMIT32("subl $4, %[count]")
+ __ASM_EMIT64("sub $4, %[count]")
+ __ASM_EMIT("jb 2f")
+ __ASM_EMIT("1:")
+ __ASM_EMIT("movups 0x00(%[a_head]), %%xmm0") /* xmm0 = ah */
+ __ASM_EMIT("movups 0x00(%[b_head]), %%xmm1") /* xmm1 = bh */
+ __ASM_EMIT("movaps %%xmm0, %%xmm2") /* xmm2 = ah */
+ __ASM_EMIT("movups 0x00(%[a_tail]), %%xmm3") /* xmm3 = at */
+ __ASM_EMIT("movups 0x00(%[b_tail]), %%xmm4") /* xmm4 = bt */
+ __ASM_EMIT("movaps %%xmm3, %%xmm5") /* xmm5 = at */
+ __ASM_EMIT("mulps %%xmm1, %%xmm2") /* xmm2 = ah*bh */
+ __ASM_EMIT("mulps %%xmm0, %%xmm0") /* xmm0 = ah*ah */
+ __ASM_EMIT("mulps %%xmm1, %%xmm1") /* xmm1 = bh*bh */
+ __ASM_EMIT("mulps %%xmm4, %%xmm5") /* xmm5 = at*bt */
+ __ASM_EMIT("mulps %%xmm3, %%xmm3") /* xmm3 = at*at */
+ __ASM_EMIT("mulps %%xmm4, %%xmm4") /* xmm4 = bt*bt */
+ __ASM_EMIT("subps %%xmm3, %%xmm0") /* xmm0 = DA = ah*ah - at*at */
+ __ASM_EMIT("subps %%xmm4, %%xmm1") /* xmm1 = DB = bh*bh - bt*bt */
+ __ASM_EMIT("subps %%xmm5, %%xmm2") /* xmm2 = DV = ah*bh - at*bt */
+
+ __ASM_EMIT("xorps %%xmm3, %%xmm3") /* xmm3 = 0 0 0 0 */
+ __ASM_EMIT("xorps %%xmm4, %%xmm4") /* xmm4 = 0 0 0 0 */
+ __ASM_EMIT("xorps %%xmm5, %%xmm5") /* xmm5 = 0 0 0 0 */
+ __ASM_EMIT("movlhps %%xmm0, %%xmm3") /* xmm3 = 0 0 DA[0] DA[1] */
+ __ASM_EMIT("movlhps %%xmm1, %%xmm4") /* xmm4 = 0 0 DB[0] DB[1] */
+ __ASM_EMIT("movlhps %%xmm2, %%xmm5") /* xmm5 = 0 0 DV[0] DV[1] */
+ __ASM_EMIT("addps %%xmm3, %%xmm0") /* xmm0 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] */
+ __ASM_EMIT("addps %%xmm4, %%xmm1") /* xmm1 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] */
+ __ASM_EMIT("addps %%xmm5, %%xmm2") /* xmm2 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] */
+ __ASM_EMIT("shufps $0x99, %%xmm0, %%xmm3") /* xmm3 = 0 DA[0] DA[1] DA[0]+DA[2] */
+ __ASM_EMIT("shufps $0x99, %%xmm1, %%xmm4") /* xmm4 = 0 DB[0] DB[1] DB[0]+DB[2] */
+ __ASM_EMIT("shufps $0x99, %%xmm2, %%xmm5") /* xmm5 = 0 DV[0] DV[1] DV[0]+DV[2] */
+ __ASM_EMIT("addps %%xmm0, %%xmm3") /* xmm3 = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] */
+ __ASM_EMIT("addps %%xmm1, %%xmm4") /* xmm4 = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] */
+ __ASM_EMIT("addps %%xmm2, %%xmm5") /* xmm5 = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] */
+
+ __ASM_EMIT32("mov %[corr], %[ptr]")
+ __ASM_EMIT32("movss 0x00(%[ptr]), %%xmm0")
+ __ASM_EMIT32("movss 0x04(%[ptr]), %%xmm1")
+ __ASM_EMIT32("movss 0x08(%[ptr]), %%xmm2")
+ __ASM_EMIT64("movss 0x00(%[corr]), %%xmm0")
+ __ASM_EMIT64("movss 0x04(%[corr]), %%xmm1")
+ __ASM_EMIT64("movss 0x08(%[corr]), %%xmm2")
+
+ __ASM_EMIT("shufps $0x00, %%xmm0, %%xmm0") /* xmm0 = xv xv xv xv */
+ __ASM_EMIT("shufps $0x00, %%xmm1, %%xmm1") /* xmm1 = xa xa xa xa */
+ __ASM_EMIT("shufps $0x00, %%xmm2, %%xmm2") /* xmm2 = xb xb xb xb */
+ __ASM_EMIT("addps %%xmm3, %%xmm1") /* xmm1 = BA = xa+DA[0] xa+DA[0]+DA[1] xa+DA[0]+DA[1]+DA[2] xa+DA[0]+DA[1]+DA[2]+DA[3] */
+ __ASM_EMIT("addps %%xmm4, %%xmm2") /* xmm2 = BB = xb+DB[0] xb+DB[0]+DB[1] xb+DB[0]+DB[1]+DB[2] xb+DB[0]+DB[1]+DB[2]+DB[3] */
+ __ASM_EMIT("movaps %%xmm1, %%xmm6") /* xmm6 = BA */
+ __ASM_EMIT("addps %%xmm5, %%xmm0") /* xmm0 = T = xv+DV[0] xv+DV[0]+DV[1] xv+DV[0]+DV[1]+DV[2] xv+DV[0]+DV[1]+DV[2]+DV[3] */
+ __ASM_EMIT("mulps %%xmm2, %%xmm1") /* xmm1 = B = BA*BB */
+ __ASM_EMIT("movaps %%xmm0, %%xmm5") /* xmm5 = T */
+ __ASM_EMIT("shufps $0xff, %%xmm6, %%xmm6") /* xmm6 = BA[3] BA[3] BA[3] BA[3] */
+ __ASM_EMIT("shufps $0xff, %%xmm5, %%xmm5") /* xmm5 = T[3] T[3] T[3] T[3] */
+ __ASM_EMIT("shufps $0xff, %%xmm2, %%xmm2") /* xmm2 = BB[3] BB[3] BB[3] BB[3] */
+
+ __ASM_EMIT32("movss %%xmm5, 0x00(%[ptr])")
+ __ASM_EMIT32("movss %%xmm6, 0x04(%[ptr])")
+ __ASM_EMIT32("movss %%xmm2, 0x08(%[ptr])")
+ __ASM_EMIT64("movss %%xmm5, 0x00(%[corr])")
+ __ASM_EMIT64("movss %%xmm6, 0x04(%[corr])")
+ __ASM_EMIT64("movss %%xmm2, 0x08(%[corr])")
+
+ __ASM_EMIT("sqrtps %%xmm1, %%xmm7") /* xmm7 = sqrtf(B) */
+ __ASM_EMIT("cmpps $5, %[CORR_CC], %%xmm1") /* xmm1 = B >= 1e-10f */
+ __ASM_EMIT("divps %%xmm7, %%xmm0") /* xmm0 = T/sqrtf(B) */
+ __ASM_EMIT32("mov %[dst], %[ptr]")
+ __ASM_EMIT("andps %%xmm1, %%xmm0") /* xmm0 = (B >= 1e-10f) ? T/sqrtf(B) : 0 */
+ __ASM_EMIT("add $0x10, %[a_head]") /* ++a_head */
+ __ASM_EMIT("add $0x10, %[b_head]") /* ++b_head */
+ __ASM_EMIT32("movups %%xmm0, 0x00(%[ptr])")
+ __ASM_EMIT64("movups %%xmm0, 0x00(%[dst])")
+ __ASM_EMIT("add $0x10, %[a_tail]") /* ++a_tail */
+ __ASM_EMIT("add $0x10, %[b_tail]") /* ++b_tail */
+ __ASM_EMIT32("add $0x10, %[ptr]")
+ __ASM_EMIT64("add $0x10, %[dst]")
+ __ASM_EMIT32("mov %[ptr], %[dst]")
+ __ASM_EMIT32("subl $4, %[count]")
+ __ASM_EMIT64("sub $4, %[count]")
+ __ASM_EMIT("jae 1b")
+ __ASM_EMIT("2:")
+ /* 1x blocks */
+ __ASM_EMIT("add $3, %[count]")
+ __ASM_EMIT("jl 4f")
+ __ASM_EMIT("3:")
+ __ASM_EMIT("movss 0x00(%[a_head]), %%xmm0") /* xmm0 = ah */
+ __ASM_EMIT("movss 0x00(%[b_head]), %%xmm1") /* xmm1 = bh */
+ __ASM_EMIT("movss %%xmm0, %%xmm2") /* xmm2 = ah */
+ __ASM_EMIT("movss 0x00(%[a_tail]), %%xmm3") /* xmm3 = at */
+ __ASM_EMIT("movss 0x00(%[b_tail]), %%xmm4") /* xmm4 = bt */
+ __ASM_EMIT("movss %%xmm3, %%xmm5") /* xmm5 = at */
+ __ASM_EMIT("mulss %%xmm1, %%xmm2") /* xmm2 = ah*bh */
+ __ASM_EMIT("mulss %%xmm0, %%xmm0") /* xmm0 = ah*ah */
+ __ASM_EMIT("mulss %%xmm1, %%xmm1") /* xmm1 = bh*bh */
+ __ASM_EMIT("mulss %%xmm4, %%xmm5") /* xmm5 = at*bt */
+ __ASM_EMIT("mulss %%xmm3, %%xmm3") /* xmm3 = at*at */
+ __ASM_EMIT("mulss %%xmm4, %%xmm4") /* xmm4 = bt*bt */
+ __ASM_EMIT("subss %%xmm3, %%xmm0") /* xmm0 = DA = ah*ah - at*at */
+ __ASM_EMIT("subss %%xmm4, %%xmm1") /* xmm1 = DB = bh*bh - bt*bt */
+ __ASM_EMIT("subss %%xmm5, %%xmm2") /* xmm2 = DV = ah*bh - at*bt */
+
+ __ASM_EMIT32("mov %[corr], %[ptr]")
+ __ASM_EMIT32("movss 0x00(%[ptr]), %%xmm3")
+ __ASM_EMIT32("movss 0x04(%[ptr]), %%xmm4")
+ __ASM_EMIT32("movss 0x08(%[ptr]), %%xmm5")
+ __ASM_EMIT64("movss 0x00(%[corr]), %%xmm3")
+ __ASM_EMIT64("movss 0x04(%[corr]), %%xmm4")
+ __ASM_EMIT64("movss 0x08(%[corr]), %%xmm5")
+
+ __ASM_EMIT("addss %%xmm4, %%xmm0") /* xmm0 = BA = xa+DA */
+ __ASM_EMIT("addss %%xmm3, %%xmm2") /* xmm2 = T = xv+DV */
+ __ASM_EMIT("movaps %%xmm0, %%xmm3") /* xmm3 = BA */
+ __ASM_EMIT("addss %%xmm5, %%xmm1") /* xmm1 = BB = xb+DB */
+ __ASM_EMIT("mulss %%xmm1, %%xmm3") /* xmm3 = B = BA*BB */
+
+ __ASM_EMIT32("movss %%xmm2, 0x00(%[ptr])")
+ __ASM_EMIT32("movss %%xmm0, 0x04(%[ptr])")
+ __ASM_EMIT32("movss %%xmm1, 0x08(%[ptr])")
+ __ASM_EMIT64("movss %%xmm2, 0x00(%[corr])")
+ __ASM_EMIT64("movss %%xmm0, 0x04(%[corr])")
+ __ASM_EMIT64("movss %%xmm1, 0x08(%[corr])")
+
+ __ASM_EMIT("sqrtss %%xmm3, %%xmm7") /* xmm7 = sqrtf(B) */
+ __ASM_EMIT("cmpss $5, %[CORR_CC], %%xmm3") /* xmm3 = B >= 1e-10f */
+ __ASM_EMIT("divss %%xmm7, %%xmm2") /* xmm2 = T/sqrtf(B) */
+ __ASM_EMIT32("mov %[dst], %[ptr]")
+ __ASM_EMIT("andps %%xmm3, %%xmm2") /* xmm2 = (B >= 1e-10f) ? T/sqrtf(B) : 0 */
+ __ASM_EMIT("add $0x04, %[a_head]") /* ++a_head */
+ __ASM_EMIT("add $0x04, %[b_head]") /* ++b_head */
+ __ASM_EMIT32("movss %%xmm2, 0x00(%[ptr])")
+ __ASM_EMIT64("movss %%xmm2, 0x00(%[dst])")
+ __ASM_EMIT32("add $0x04, %[ptr]")
+ __ASM_EMIT64("add $0x04, %[dst]")
+ __ASM_EMIT("add $0x04, %[a_tail]") /* ++a_tail */
+ __ASM_EMIT("add $0x04, %[b_tail]") /* ++b_tail */
+ __ASM_EMIT32("mov %[ptr], %[dst]")
+ __ASM_EMIT32("decl %[count]")
+ __ASM_EMIT64("dec %[count]")
+ __ASM_EMIT("jge 3b")
+ __ASM_EMIT("4:")
+
+ : __IF_32(
+ [ptr] "=&r" (ptr),
+ [corr] "+m" (corr), [dst] "+m" (dst),
+ [a_head] "+r" (a_head), [b_head] "+r" (b_head),
+ [a_tail] "+r" (a_tail), [b_tail] "+r" (b_tail),
+ [count] "+g" (count)
+ )
+ __IF_64(
+ [dst] "+r" (dst),
+ [a_head] "+r" (a_head), [b_head] "+r" (b_head),
+ [a_tail] "+r" (a_tail), [b_tail] "+r" (b_tail),
+ [count] "+r" (count)
+ )
+ : __IF_64( [corr] "r" (corr), )
+ [CORR_CC] "o" (corr_const)
+ : "cc", "memory",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7"
+ );
+ }
+
+ } /* namespace sse */
+} /* namespace lsp */
+
+
+
+#endif /* PRIVATE_DSP_ARCH_X86_SSE_CORRELATION_H_ */
diff --git a/include/private/dsp/arch/x86/sse3/correlation.h b/include/private/dsp/arch/x86/sse3/correlation.h
new file mode 100644
index 00000000..14be34ed
--- /dev/null
+++ b/include/private/dsp/arch/x86/sse3/correlation.h
@@ -0,0 +1,179 @@
+/*
+ * Copyright (C) 2024 Linux Studio Plugins Project
+ * (C) 2024 Vladimir Sadovnikov
+ *
+ * This file is part of lsp-dsp-lib
+ * Created on: 10 мар. 2024 г.
+ *
+ * lsp-dsp-lib is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * any later version.
+ *
+ * lsp-dsp-lib is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with lsp-dsp-lib. If not, see .
+ */
+
+#ifndef PRIVATE_DSP_ARCH_X86_SSE3_CORRELATION_H_
+#define PRIVATE_DSP_ARCH_X86_SSE3_CORRELATION_H_
+
+#ifndef PRIVATE_DSP_ARCH_X86_SSE3_IMPL
+ #error "This header should not be included directly"
+#endif /* PRIVATE_DSP_ARCH_X86_SSE3_IMPL */
+
+namespace lsp
+{
+ namespace sse3
+ {
+ static const float corr_const[] __lsp_aligned16 =
+ {
+ LSP_DSP_VEC4(1e-10f)
+ };
+
+ void x64_corr_incr(dsp::correlation_t *corr, float *dst,
+ const float *a_head, const float *b_head,
+ const float *a_tail, const float *b_tail,
+ size_t count)
+ {
+ ARCH_X86_64_ASM
+ (
+ /* load data */
+ __ASM_EMIT("movss 0x00(%[corr]), %%xmm8") /* xmm8 = xv */
+ __ASM_EMIT("movss 0x04(%[corr]), %%xmm9") /* xmm9 = xa */
+ __ASM_EMIT("movss 0x08(%[corr]), %%xmm10") /* xmm10 = xb */
+ __ASM_EMIT("movaps %[CORR_CC], %%xmm11") /* xmm11 = 1e-10f */
+ /* 4x blocks */
+ __ASM_EMIT("sub $4, %[count]")
+ __ASM_EMIT("jb 2f")
+ __ASM_EMIT("shufps $0x00, %%xmm8, %%xmm8")
+ __ASM_EMIT("shufps $0x00, %%xmm9, %%xmm9")
+ __ASM_EMIT("shufps $0x00, %%xmm10, %%xmm10")
+ __ASM_EMIT(".align 0x10")
+ __ASM_EMIT("1:")
+ __ASM_EMIT("movups 0x00(%[a_head]), %%xmm0") /* xmm0 = ah */
+ __ASM_EMIT("movups 0x00(%[b_head]), %%xmm1") /* xmm1 = bh */
+ __ASM_EMIT("movaps %%xmm0, %%xmm2") /* xmm2 = ah */
+ __ASM_EMIT("movups 0x00(%[a_tail]), %%xmm3") /* xmm3 = at */
+ __ASM_EMIT("movups 0x00(%[b_tail]), %%xmm4") /* xmm4 = bt */
+ __ASM_EMIT("movaps %%xmm3, %%xmm5") /* xmm5 = at */
+ __ASM_EMIT("mulps %%xmm1, %%xmm2") /* xmm2 = ah*bh */
+ __ASM_EMIT("mulps %%xmm0, %%xmm0") /* xmm0 = ah*ah */
+ __ASM_EMIT("mulps %%xmm1, %%xmm1") /* xmm1 = bh*bh */
+ __ASM_EMIT("mulps %%xmm4, %%xmm5") /* xmm5 = at*bt */
+ __ASM_EMIT("mulps %%xmm3, %%xmm3") /* xmm3 = at*at */
+ __ASM_EMIT("mulps %%xmm4, %%xmm4") /* xmm4 = bt*bt */
+ __ASM_EMIT("subps %%xmm3, %%xmm0") /* xmm0 = DA = ah*ah - at*at */
+ __ASM_EMIT("subps %%xmm4, %%xmm1") /* xmm1 = DB = bh*bh - bt*bt */
+ __ASM_EMIT("subps %%xmm5, %%xmm2") /* xmm2 = DV = ah*bh - at*bt */
+
+ __ASM_EMIT("xorps %%xmm3, %%xmm3") /* xmm3 = 0 0 0 0 */
+ __ASM_EMIT("xorps %%xmm4, %%xmm4") /* xmm4 = 0 0 0 0 */
+ __ASM_EMIT("xorps %%xmm5, %%xmm5") /* xmm5 = 0 0 0 0 */
+ __ASM_EMIT("movlhps %%xmm0, %%xmm3") /* xmm3 = 0 0 DA[0] DA[1] */
+ __ASM_EMIT("movlhps %%xmm1, %%xmm4") /* xmm4 = 0 0 DB[0] DB[1] */
+ __ASM_EMIT("movlhps %%xmm2, %%xmm5") /* xmm5 = 0 0 DV[0] DV[1] */
+ __ASM_EMIT("addps %%xmm3, %%xmm0") /* xmm0 = DA[0] DA[1] DA[0]+DA[2] DA[1]+DA[3] */
+ __ASM_EMIT("addps %%xmm4, %%xmm1") /* xmm1 = DB[0] DB[1] DB[0]+DB[2] DB[1]+DB[3] */
+ __ASM_EMIT("addps %%xmm5, %%xmm2") /* xmm2 = DV[0] DV[1] DV[0]+DV[2] DV[1]+DV[3] */
+ __ASM_EMIT("shufps $0x99, %%xmm0, %%xmm3") /* xmm3 = 0 DA[0] DA[1] DA[0]+DA[2] */
+ __ASM_EMIT("shufps $0x99, %%xmm1, %%xmm4") /* xmm4 = 0 DB[0] DB[1] DB[0]+DB[2] */
+ __ASM_EMIT("shufps $0x99, %%xmm2, %%xmm5") /* xmm5 = 0 DV[0] DV[1] DV[0]+DV[2] */
+ __ASM_EMIT("addps %%xmm0, %%xmm3") /* xmm3 = DA[0] DA[0]+DA[1] DA[0]+DA[1]+DA[2] DA[0]+DA[1]+DA[2]+DA[3] */
+ __ASM_EMIT("addps %%xmm1, %%xmm4") /* xmm4 = DB[0] DB[0]+DB[1] DB[0]+DB[1]+DB[2] DB[0]+DB[1]+DB[2]+DB[3] */
+ __ASM_EMIT("addps %%xmm2, %%xmm5") /* xmm5 = DV[0] DV[0]+DV[1] DV[0]+DV[1]+DV[2] DV[0]+DV[1]+DV[2]+DV[3] */
+
+ __ASM_EMIT("addps %%xmm3, %%xmm9") /* xmm9 = BA = xa+DA[0] xa+DA[0]+DA[1] xa+DA[0]+DA[1]+DA[2] xa+DA[0]+DA[1]+DA[2]+DA[3] */
+ __ASM_EMIT("addps %%xmm4, %%xmm10") /* xmm10 = BB = xb+DB[0] xb+DB[0]+DB[1] xb+DB[0]+DB[1]+DB[2] xb+DB[0]+DB[1]+DB[2]+DB[3] */
+ __ASM_EMIT("movaps %%xmm9, %%xmm6") /* xmm6 = BA */
+ __ASM_EMIT("addps %%xmm5, %%xmm8") /* xmm8 = T = xv+DV[0] xv+DV[0]+DV[1] xv+DV[0]+DV[1]+DV[2] xv+DV[0]+DV[1]+DV[2]+DV[3] */
+ __ASM_EMIT("mulps %%xmm10, %%xmm6") /* xmm6 = B = BA*BB */
+ __ASM_EMIT("movaps %%xmm8, %%xmm5") /* xmm5 = T */
+ __ASM_EMIT("shufps $0xff, %%xmm8, %%xmm8") /* xmm8 = T[3] T[3] T[3] T[3] */
+ __ASM_EMIT("shufps $0xff, %%xmm9, %%xmm9") /* xmm9 = BA[3] BA[3] BA[3] BA[3] */
+ __ASM_EMIT("shufps $0xff, %%xmm10, %%xmm10") /* xmm10 = BB[3] BB[3] BB[3] BB[3] */
+
+ __ASM_EMIT("sqrtps %%xmm6, %%xmm7") /* xmm7 = sqrtf(B) */
+ __ASM_EMIT("cmpps $5, %%xmm11, %%xmm6") /* xmm6 = B >= 1e-10f */
+ __ASM_EMIT("divps %%xmm7, %%xmm5") /* xmm5 = T/sqrtf(B) */
+ __ASM_EMIT("andps %%xmm6, %%xmm5") /* xmm5 = (B >= 1e-10f) ? T/sqrtf(B) : 0 */
+ __ASM_EMIT("add $0x10, %[a_head]") /* ++a_head */
+ __ASM_EMIT("add $0x10, %[b_head]") /* ++b_head */
+ __ASM_EMIT("movups %%xmm5, 0x00(%[dst])")
+ __ASM_EMIT("add $0x10, %[a_tail]") /* ++a_tail */
+ __ASM_EMIT("add $0x10, %[b_tail]") /* ++b_tail */
+ __ASM_EMIT("add $0x10, %[dst]")
+ __ASM_EMIT("sub $4, %[count]")
+ __ASM_EMIT("jae 1b")
+ __ASM_EMIT("2:")
+ /* 1x blocks */
+ __ASM_EMIT("add $3, %[count]")
+ __ASM_EMIT("jl 4f")
+ __ASM_EMIT("3:")
+ __ASM_EMIT("movss 0x00(%[a_head]), %%xmm0") /* xmm0 = ah */
+ __ASM_EMIT("movss 0x00(%[b_head]), %%xmm1") /* xmm1 = bh */
+ __ASM_EMIT("movss %%xmm0, %%xmm2") /* xmm2 = ah */
+ __ASM_EMIT("movss 0x00(%[a_tail]), %%xmm3") /* xmm3 = at */
+ __ASM_EMIT("movss 0x00(%[b_tail]), %%xmm4") /* xmm4 = bt */
+ __ASM_EMIT("movss %%xmm3, %%xmm5") /* xmm5 = at */
+ __ASM_EMIT("mulss %%xmm1, %%xmm2") /* xmm2 = ah*bh */
+ __ASM_EMIT("mulss %%xmm0, %%xmm0") /* xmm0 = ah*ah */
+ __ASM_EMIT("mulss %%xmm1, %%xmm1") /* xmm1 = bh*bh */
+ __ASM_EMIT("mulss %%xmm4, %%xmm5") /* xmm5 = at*bt */
+ __ASM_EMIT("mulss %%xmm3, %%xmm3") /* xmm3 = at*at */
+ __ASM_EMIT("mulss %%xmm4, %%xmm4") /* xmm4 = bt*bt */
+ __ASM_EMIT("subss %%xmm3, %%xmm0") /* xmm0 = DA = ah*ah - at*at */
+ __ASM_EMIT("subss %%xmm4, %%xmm1") /* xmm1 = DB = bh*bh - bt*bt */
+ __ASM_EMIT("subss %%xmm5, %%xmm2") /* xmm2 = DV = ah*bh - at*bt */
+
+ __ASM_EMIT("addss %%xmm0, %%xmm9") /* xmm9 = BA = xa+DA */
+ __ASM_EMIT("addss %%xmm1, %%xmm10") /* xmm10 = BB = xb+DB */
+ __ASM_EMIT("movaps %%xmm9, %%xmm3") /* xmm3 = BA */
+ __ASM_EMIT("addss %%xmm2, %%xmm8") /* xmm8 = T = xv+DV */
+ __ASM_EMIT("mulss %%xmm10, %%xmm3") /* xmm3 = B = BA*BB */
+ __ASM_EMIT("movaps %%xmm8, %%xmm0") /* xmm0 = T */
+
+ __ASM_EMIT("sqrtss %%xmm3, %%xmm7") /* xmm7 = sqrtf(B) */
+ __ASM_EMIT("cmpss $5, %[CORR_CC], %%xmm3") /* xmm3 = B >= 1e-10f */
+ __ASM_EMIT("divss %%xmm7, %%xmm0") /* xmm0 = T/sqrtf(B) */
+ /* note: removed stray 32-bit leftover that referenced the undefined [ptr] operand */
+ __ASM_EMIT("andps %%xmm3, %%xmm0") /* xmm0 = (B >= 1e-10f) ? T/sqrtf(B) : 0 */
+ __ASM_EMIT("add $0x04, %[a_head]") /* ++a_head */
+ __ASM_EMIT("add $0x04, %[b_head]") /* ++b_head */
+ __ASM_EMIT("movss %%xmm0, 0x00(%[dst])")
+ __ASM_EMIT("add $0x04, %[a_tail]") /* ++a_tail */
+ __ASM_EMIT("add $0x04, %[b_tail]") /* ++b_tail */
+ __ASM_EMIT("add $0x04, %[dst]")
+ __ASM_EMIT("dec %[count]")
+ __ASM_EMIT("jge 3b")
+ __ASM_EMIT("4:")
+
+ /* Store result */
+ __ASM_EMIT("movss %%xmm8, 0x00(%[corr])")
+ __ASM_EMIT("movss %%xmm9, 0x04(%[corr])")
+ __ASM_EMIT("movss %%xmm10, 0x08(%[corr])")
+
+ : [dst] "+r" (dst),
+ [a_head] "+r" (a_head), [b_head] "+r" (b_head),
+ [a_tail] "+r" (a_tail), [b_tail] "+r" (b_tail),
+ [count] "+r" (count)
+ : [corr] "r" (corr),
+ [CORR_CC] "o" (corr_const)
+ : "cc", "memory",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11"
+ );
+ }
+
+ } /* namespace sse3 */
+} /* namespace lsp */
+
+
+
+
+#endif /* PRIVATE_DSP_ARCH_X86_SSE3_CORRELATION_H_ */
diff --git a/make/system.mk b/make/system.mk
index 5796e895..66c7f58d 100644
--- a/make/system.mk
+++ b/make/system.mk
@@ -1,21 +1,21 @@
#
-# Copyright (C) 2020 Linux Studio Plugins Project
-# (C) 2020 Vladimir Sadovnikov
+# Copyright (C) 2024 Linux Studio Plugins Project
+# (C) 2024 Vladimir Sadovnikov
#
-# This file is part of lsp-plugins
+# This file is part of lsp-dsp-lib
#
-# lsp-plugins is free software: you can redistribute it and/or modify
+# lsp-dsp-lib is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# any later version.
#
-# lsp-plugins is distributed in the hope that it will be useful,
+# lsp-dsp-lib is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
-# along with lsp-plugins. If not, see .
+# along with lsp-dsp-lib. If not, see .
#
# Detect operating system
diff --git a/make/tools.mk b/make/tools.mk
index e99498dd..ade5d571 100644
--- a/make/tools.mk
+++ b/make/tools.mk
@@ -76,6 +76,8 @@ ifeq ($(PLATFORM),Solaris)
else ifeq ($(PLATFORM),Windows)
FLAG_RELRO =
FLAG_STDLIB =
+ CFLAGS_EXT += -DWINVER=0x600 -D_WIN32_WINNT=0x600
+ CXXFLAGS_EXT += -DWINVER=0x600 -D_WIN32_WINNT=0x600
EXE_FLAGS_EXT += -static-libgcc -static-libstdc++
SO_FLAGS_EXT += -static-libgcc -static-libstdc++
LDFLAGS_EXT += -T $(CURDIR)/make/ld-windows.script
diff --git a/modules.mk b/modules.mk
index 929f615c..8a17adde 100644
--- a/modules.mk
+++ b/modules.mk
@@ -19,13 +19,13 @@
#
# Variables that describe dependencies
-LSP_COMMON_LIB_VERSION := 1.0.34
+LSP_COMMON_LIB_VERSION := 1.0.35
LSP_COMMON_LIB_NAME := lsp-common-lib
LSP_COMMON_LIB_TYPE := src
LSP_COMMON_LIB_URL_RO := https://github.com/lsp-plugins/$(LSP_COMMON_LIB_NAME).git
LSP_COMMON_LIB_URL_RW := git@github.com:lsp-plugins/$(LSP_COMMON_LIB_NAME).git
-LSP_TEST_FW_VERSION := 1.0.24
+LSP_TEST_FW_VERSION := 1.0.25
LSP_TEST_FW_NAME := lsp-test-fw
LSP_TEST_FW_TYPE := src
LSP_TEST_FW_URL_RO := https://github.com/lsp-plugins/$(LSP_TEST_FW_NAME).git
diff --git a/project.mk b/project.mk
index 86bee130..3e183d1f 100644
--- a/project.mk
+++ b/project.mk
@@ -23,4 +23,4 @@ ARTIFACT_ID = LSP_DSP_LIB
ARTIFACT_NAME = lsp-dsp-lib
ARTIFACT_DESC = DSP library for digital signal processing
ARTIFACT_HEADERS = lsp-plug.in
-ARTIFACT_VERSION = 1.0.21
+ARTIFACT_VERSION = 1.0.22
diff --git a/src/main/aarch64/asimd.cpp b/src/main/aarch64/asimd.cpp
index 417c89af..a5eac977 100644
--- a/src/main/aarch64/asimd.cpp
+++ b/src/main/aarch64/asimd.cpp
@@ -1,6 +1,6 @@
/*
- * Copyright (C) 2023 Linux Studio Plugins Project
- * (C) 2023 Vladimir Sadovnikov
+ * Copyright (C) 2024 Linux Studio Plugins Project
+ * (C) 2024 Vladimir Sadovnikov
*
* This file is part of lsp-dsp-lib
* Created on: 31 мар. 2020 г.
@@ -52,6 +52,7 @@
#define PRIVATE_DSP_ARCH_AARCH64_ASIMD_IMPL
#include
#include
+ #include
#include
#include
#include
@@ -429,6 +430,8 @@
EXPORT1(downsample_8x);
EXPORT1(convolve);
+ EXPORT1(corr_init);
+ EXPORT1(corr_incr);
EXPORT1(abgr32_to_bgrff32);
EXPORT1(rgba32_to_bgra32);
diff --git a/src/main/arm/neon-d32.cpp b/src/main/arm/neon-d32.cpp
index edf61ec2..b7ae9953 100644
--- a/src/main/arm/neon-d32.cpp
+++ b/src/main/arm/neon-d32.cpp
@@ -50,6 +50,7 @@
#define PRIVATE_DSP_ARCH_ARM_NEON_D32_IMPL
#include
#include
+ #include
#include
#include
#include
@@ -141,6 +142,8 @@
EXPORT1(pcomplex_r2c_rdiv2);
EXPORT1(convolve);
+ EXPORT1(corr_init);
+ EXPORT1(corr_incr);
EXPORT1(axis_apply_lin1);
EXPORT1(axis_apply_log1);
diff --git a/src/main/generic/generic.cpp b/src/main/generic/generic.cpp
index 12324237..a99fffed 100644
--- a/src/main/generic/generic.cpp
+++ b/src/main/generic/generic.cpp
@@ -47,6 +47,7 @@ namespace lsp
#include
#include
#include
+ #include
#include
@@ -598,6 +599,8 @@ namespace lsp
EXPORT1(unit_vector_p1pv);
EXPORT1(convolve);
+ EXPORT1(corr_init);
+ EXPORT1(corr_incr);
EXPORT1(base64_enc);
EXPORT1(base64_dec);
diff --git a/src/main/x86/avx.cpp b/src/main/x86/avx.cpp
index 99e76b59..54e517a7 100644
--- a/src/main/x86/avx.cpp
+++ b/src/main/x86/avx.cpp
@@ -69,6 +69,7 @@
#include
#include
#include
+ #include
#include
@@ -391,6 +392,8 @@
CEXPORT1(favx, downsample_8x);
CEXPORT1(favx, convolve);
+ CEXPORT1(favx, corr_init);
+ CEXPORT1(favx, corr_incr);
CEXPORT1(favx, lin_inter_set);
CEXPORT1(favx, lin_inter_mul2);
@@ -483,6 +486,8 @@
CEXPORT2(favx, filter_transfer_apply_pc, filter_transfer_apply_pc_fma3);
CEXPORT2(favx, convolve, convolve_fma3);
+ CEXPORT2(favx, corr_init, corr_init_fma3);
+ CEXPORT2(favx, corr_incr, corr_incr_fma3);
CEXPORT2(favx, axis_apply_lin1, axis_apply_lin1_fma3);
diff --git a/src/main/x86/avx512.cpp b/src/main/x86/avx512.cpp
index 64329fb1..9946c4ba 100644
--- a/src/main/x86/avx512.cpp
+++ b/src/main/x86/avx512.cpp
@@ -49,6 +49,8 @@
#include
#include
#include
+
+ #include
#undef PRIVATE_DSP_ARCH_X86_AVX512_IMPL
namespace lsp
@@ -275,6 +277,9 @@
CEXPORT1(vl, uexpander_x1_curve);
CEXPORT1(vl, dexpander_x1_gain);
CEXPORT1(vl, dexpander_x1_curve);
+
+ CEXPORT1(vl, corr_init);
+ CEXPORT1(vl, corr_incr);
}
} /* namespace avx2 */
} /* namespace lsp */
diff --git a/src/main/x86/sse.cpp b/src/main/x86/sse.cpp
index 96d344ae..7211c429 100644
--- a/src/main/x86/sse.cpp
+++ b/src/main/x86/sse.cpp
@@ -66,6 +66,7 @@
#include
#include
+ #include
#include
#include
@@ -492,6 +493,8 @@
EXPORT1(cull_triangle_raw);
EXPORT1(convolve);
+ EXPORT1(corr_init);
+ EXPORT1(corr_incr);
EXPORT1(lin_inter_set);
EXPORT1(lin_inter_mul2);
diff --git a/src/main/x86/sse3.cpp b/src/main/x86/sse3.cpp
index 5d59d951..78b9be9c 100644
--- a/src/main/x86/sse3.cpp
+++ b/src/main/x86/sse3.cpp
@@ -48,6 +48,7 @@
#include
#include
#include
+ #include
#undef PRIVATE_DSP_ARCH_X86_SSE3_IMPL
namespace lsp
@@ -115,6 +116,8 @@
EXPORT1(split_triangle_raw);
EXPORT1(cull_triangle_raw);
+
+ EXPORT2_X64(corr_incr, x64_corr_incr);
}
#undef EXPORT2
diff --git a/src/test/ptest/convolve.cpp b/src/test/ptest/convolve.cpp
index 5393dc6b..9f2d2a42 100644
--- a/src/test/ptest/convolve.cpp
+++ b/src/test/ptest/convolve.cpp
@@ -109,6 +109,9 @@ PTEST_BEGIN("dsp", convolve, 5, 1000)
float *in = &out[buf_size*2];
float *conv = &in[buf_size];
float *backup = &conv[buf_size];
+ lsp_finally {
+ free_aligned(data);
+ };
for (size_t i=0; i < buf_size*4; ++i)
out[i] = randf(-1.0f, 1.0f);
diff --git a/src/test/ptest/corr_incr.cpp b/src/test/ptest/corr_incr.cpp
new file mode 100644
index 00000000..597c7ecf
--- /dev/null
+++ b/src/test/ptest/corr_incr.cpp
@@ -0,0 +1,174 @@
+/*
+ * Copyright (C) 2024 Linux Studio Plugins Project
+ * (C) 2024 Vladimir Sadovnikov
+ *
+ * This file is part of lsp-dsp-lib
+ * Created on: 10 мар. 2024 г.
+ *
+ * lsp-dsp-lib is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * any later version.
+ *
+ * lsp-dsp-lib is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with lsp-dsp-lib. If not, see .
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#define MIN_RANK 8
+#define MAX_RANK 15
+
+namespace lsp
+{
+ namespace generic
+ {
+ void corr_incr(dsp::correlation_t *corr, float *dst,
+ const float *a_head, const float *b_head,
+ const float *a_tail, const float *b_tail,
+ size_t count);
+ }
+
+ IF_ARCH_X86(
+ namespace sse
+ {
+ void corr_incr(dsp::correlation_t *corr, float *dst,
+ const float *a_head, const float *b_head,
+ const float *a_tail, const float *b_tail,
+ size_t count);
+ }
+
+ namespace avx
+ {
+ void corr_incr(dsp::correlation_t *corr, float *dst,
+ const float *a_head, const float *b_head,
+ const float *a_tail, const float *b_tail,
+ size_t count);
+
+ void corr_incr_fma3(dsp::correlation_t *corr, float *dst,
+ const float *a_head, const float *b_head,
+ const float *a_tail, const float *b_tail,
+ size_t count);
+ }
+
+ namespace avx512
+ {
+ void corr_incr(dsp::correlation_t *corr, float *dst,
+ const float *a_head, const float *b_head,
+ const float *a_tail, const float *b_tail,
+ size_t count);
+ }
+ )
+
+ IF_ARCH_X86_64(
+ namespace sse3
+ {
+ void x64_corr_incr(dsp::correlation_t *corr, float *dst,
+ const float *a_head, const float *b_head,
+ const float *a_tail, const float *b_tail,
+ size_t count);
+ }
+ )
+
+ IF_ARCH_ARM(
+ namespace neon_d32
+ {
+ void corr_incr(dsp::correlation_t *corr, float *dst,
+ const float *a_head, const float *b_head,
+ const float *a_tail, const float *b_tail,
+ size_t count);
+ }
+ )
+
+ IF_ARCH_AARCH64(
+ namespace asimd
+ {
+ void corr_incr(dsp::correlation_t *corr, float *dst,
+ const float *a_head, const float *b_head,
+ const float *a_tail, const float *b_tail,
+ size_t count);
+ }
+ )
+
+ typedef void (* corr_incr_t)(dsp::correlation_t *corr, float *dst,
+ const float *a_head, const float *b_head,
+ const float *a_tail, const float *b_tail,
+ size_t count);
+}
+
+//-----------------------------------------------------------------------------
+// Performance test for incremental correlation computation
+PTEST_BEGIN("dsp", corr_incr, 5, 10000)
+
+ void call(const char *label,
+ float *dst,
+ const float *a_head, const float *b_head,
+ const float *a_tail, const float *b_tail,
+ size_t count, corr_incr_t func)
+ {
+ if (!PTEST_SUPPORTED(func))
+ return;
+
+ char buf[80];
+ sprintf(buf, "%s x %d", label, int(count));
+ printf("Testing %s correlation ...\n", buf);
+
+ dsp::correlation_t corr;
+
+ PTEST_LOOP(buf,
+ corr.v = 0.0f;
+ corr.a = 0.0f;
+ corr.b = 0.0f;
+
+ func(&corr, dst, a_head, b_head, a_tail, b_tail, count);
+ );
+ }
+
+ PTEST_MAIN
+ {
+ size_t buf_size = 1 << MAX_RANK;
+ uint8_t *data = NULL;
+ float *a_head = alloc_aligned(data, buf_size * 5, 64);
+ float *a_tail = &a_head[buf_size];
+ float *b_head = &a_tail[buf_size];
+ float *b_tail = &b_head[buf_size];
+ float *dst = &b_tail[buf_size];
+ lsp_finally {
+ free_aligned(data);
+ };
+
+ for (size_t i=0; i < buf_size*5; ++i)
+ a_head[i] = randf(-1.0f, 1.0f);
+
+ #define CALL(func, count) \
+ call(#func, dst, a_head, b_head, a_tail, b_tail, count, func)
+
+ for (size_t i=MIN_RANK; i<=MAX_RANK; ++i)
+ {
+ const size_t count = 1 << i;
+
+ CALL(generic::corr_incr, count);
+ IF_ARCH_X86(CALL(sse::corr_incr, count));
+ IF_ARCH_X86_64(CALL(sse3::x64_corr_incr, count));
+ IF_ARCH_X86(CALL(avx::corr_incr, count));
+ IF_ARCH_X86(CALL(avx::corr_incr_fma3, count));
+ IF_ARCH_X86(CALL(avx512::corr_incr, count));
+ IF_ARCH_ARM(CALL(neon_d32::corr_incr, count));
+ IF_ARCH_AARCH64(CALL(asimd::corr_incr, count));
+
+ PTEST_SEPARATOR;
+ }
+ }
+
+PTEST_END
+
+
diff --git a/src/test/ptest/corr_init.cpp b/src/test/ptest/corr_init.cpp
new file mode 100644
index 00000000..8bba049d
--- /dev/null
+++ b/src/test/ptest/corr_init.cpp
@@ -0,0 +1,131 @@
+/*
+ * Copyright (C) 2024 Linux Studio Plugins Project
+ * (C) 2024 Vladimir Sadovnikov
+ *
+ * This file is part of lsp-dsp-lib
+ * Created on: 9 мар. 2024 г.
+ *
+ * lsp-dsp-lib is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * any later version.
+ *
+ * lsp-dsp-lib is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with lsp-dsp-lib. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include <lsp-plug.in/common/alloc.h>
+#include <lsp-plug.in/common/types.h>
+#include <lsp-plug.in/dsp/dsp.h>
+#include <lsp-plug.in/test-fw/helpers.h>
+#include <lsp-plug.in/test-fw/ptest.h>
+
+#define MIN_RANK 8
+#define MAX_RANK 15
+
+namespace lsp
+{
+ namespace generic
+ {
+ void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count);
+ }
+
+ IF_ARCH_X86(
+ namespace sse
+ {
+ void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count);
+ }
+
+ namespace avx
+ {
+ void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count);
+ void corr_init_fma3(dsp::correlation_t *corr, const float *a, const float *b, size_t count);
+ }
+
+ namespace avx512
+ {
+ void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count);
+ }
+ )
+
+ IF_ARCH_ARM(
+ namespace neon_d32
+ {
+ void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count);
+ }
+ )
+
+ IF_ARCH_AARCH64(
+ namespace asimd
+ {
+ void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count);
+ }
+ )
+
+ typedef void (* corr_init_t)(dsp::correlation_t *corr, const float *a, const float *b, size_t count);
+}
+
+//-----------------------------------------------------------------------------
+// Performance test for correlation state initialization
+PTEST_BEGIN("dsp", corr_init, 5, 10000)
+
+ void call(const char *label, const float *a, const float *b, size_t count, corr_init_t func)
+ {
+ if (!PTEST_SUPPORTED(func))
+ return;
+
+ char buf[80];
+ sprintf(buf, "%s x %d", label, int(count));
+ printf("Testing %s correlation ...\n", buf);
+
+ dsp::correlation_t corr;
+
+ PTEST_LOOP(buf,
+ corr.v = 0.0f;
+ corr.a = 0.0f;
+ corr.b = 0.0f;
+
+ func(&corr, a, b, count);
+ );
+ }
+
+ PTEST_MAIN
+ {
+ size_t buf_size = 1 << MAX_RANK;
+ uint8_t *data = NULL;
+ float *a = alloc_aligned(data, buf_size * 2, 64);
+ float *b = &a[buf_size];
+ lsp_finally {
+ free_aligned(data);
+ };
+
+ for (size_t i=0; i < buf_size*2; ++i)
+ a[i] = randf(-1.0f, 1.0f);
+
+ #define CALL(func, count) \
+ call(#func, a, b, count, func)
+
+ for (size_t i=MIN_RANK; i<=MAX_RANK; ++i)
+ {
+ const size_t count = 1 << i;
+
+ CALL(generic::corr_init, count);
+ IF_ARCH_X86(CALL(sse::corr_init, count));
+ IF_ARCH_X86(CALL(avx::corr_init, count));
+ IF_ARCH_X86(CALL(avx::corr_init_fma3, count));
+ IF_ARCH_X86(CALL(avx512::corr_init, count));
+ IF_ARCH_ARM(CALL(neon_d32::corr_init, count));
+ IF_ARCH_AARCH64(CALL(asimd::corr_init, count));
+
+ PTEST_SEPARATOR;
+ }
+ }
+
+PTEST_END
+
+
diff --git a/src/test/utest/corr_incr.cpp b/src/test/utest/corr_incr.cpp
new file mode 100644
index 00000000..34df2084
--- /dev/null
+++ b/src/test/utest/corr_incr.cpp
@@ -0,0 +1,219 @@
+/*
+ * Copyright (C) 2024 Linux Studio Plugins Project
+ * (C) 2024 Vladimir Sadovnikov
+ *
+ * This file is part of lsp-dsp-lib
+ * Created on: 8 мар. 2024 г.
+ *
+ * lsp-dsp-lib is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * any later version.
+ *
+ * lsp-dsp-lib is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with lsp-dsp-lib. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include <lsp-plug.in/common/types.h>
+#include <lsp-plug.in/dsp/dsp.h>
+#include <lsp-plug.in/test-fw/helpers.h>
+#include <lsp-plug.in/test-fw/utest.h>
+#include <lsp-plug.in/test-fw/FloatBuffer.h>
+
+namespace lsp
+{
+ namespace generic
+ {
+ void corr_incr(dsp::correlation_t *corr, float *dst,
+ const float *a_head, const float *b_head,
+ const float *a_tail, const float *b_tail,
+ size_t count);
+ }
+
+ IF_ARCH_X86(
+ namespace sse
+ {
+ void corr_incr(dsp::correlation_t *corr, float *dst,
+ const float *a_head, const float *b_head,
+ const float *a_tail, const float *b_tail,
+ size_t count);
+ }
+
+ namespace avx
+ {
+ void corr_incr(dsp::correlation_t *corr, float *dst,
+ const float *a_head, const float *b_head,
+ const float *a_tail, const float *b_tail,
+ size_t count);
+
+ void corr_incr_fma3(dsp::correlation_t *corr, float *dst,
+ const float *a_head, const float *b_head,
+ const float *a_tail, const float *b_tail,
+ size_t count);
+ }
+
+ namespace avx512
+ {
+ void corr_incr(dsp::correlation_t *corr, float *dst,
+ const float *a_head, const float *b_head,
+ const float *a_tail, const float *b_tail,
+ size_t count);
+ }
+ )
+
+ IF_ARCH_X86_64(
+ namespace sse3
+ {
+ void x64_corr_incr(dsp::correlation_t *corr, float *dst,
+ const float *a_head, const float *b_head,
+ const float *a_tail, const float *b_tail,
+ size_t count);
+ }
+ )
+
+ IF_ARCH_ARM(
+ namespace neon_d32
+ {
+ void corr_incr(dsp::correlation_t *corr, float *dst,
+ const float *a_head, const float *b_head,
+ const float *a_tail, const float *b_tail,
+ size_t count);
+ }
+ )
+
+ IF_ARCH_AARCH64(
+ namespace asimd
+ {
+ void corr_incr(dsp::correlation_t *corr, float *dst,
+ const float *a_head, const float *b_head,
+ const float *a_tail, const float *b_tail,
+ size_t count);
+ }
+ )
+
+ static void corr_incr(dsp::correlation_t *corr, float *dst,
+ const float *a_head, const float *b_head,
+ const float *a_tail, const float *b_tail,
+ size_t count)
+ {
+ float vv = corr->v;
+ float va = corr->a;
+ float vb = corr->b;
+
+        for (size_t i=0; i<count; ++i)
+        {
+            vv     += a_head[i] * b_head[i] - a_tail[i] * b_tail[i];
+            va     += a_head[i] * a_head[i] - a_tail[i] * a_tail[i];
+            vb     += b_head[i] * b_head[i] - b_tail[i] * b_tail[i];
+
+            const float d = va * vb;
+            dst[i]  = (d >= 1e-10f) ? vv / sqrtf(d) : 0.0f;
+        }
+ }
+
+ corr->v = vv;
+ corr->a = va;
+ corr->b = vb;
+ }
+
+ typedef void (* corr_incr_t)(dsp::correlation_t *corr, float *dst,
+ const float *a_head, const float *b_head,
+ const float *a_tail, const float *b_tail,
+ size_t count);
+}
+
+UTEST_BEGIN("dsp", corr_incr)
+ void call(const char *label, size_t align, corr_incr_t func)
+ {
+ if (!UTEST_SUPPORTED(func))
+ return;
+
+ for (size_t mask=0; mask <= 0x07; ++mask)
+ {
+ UTEST_FOREACH(tail, 0x80, 0x1ff)
+ {
+ UTEST_FOREACH(count, 0, 1, 2, 3, 4, 5, 8, 16, 24, 32, 33, 64, 47, 0x80, 0x1ff)
+ {
+ FloatBuffer a(tail + count + 1, align, mask & 0x01);
+ FloatBuffer b(tail + count + 1, align, mask & 0x02);
+ FloatBuffer dst1(count, align, mask & 0x04);
+ FloatBuffer dst2(count, align, mask & 0x04);
+
+ dsp::correlation_t corr_a, corr_b;
+ corr_a.v = 0.0f;
+ corr_a.a = 0.0f;
+ corr_a.b = 0.0f;
+ dsp::corr_init(&corr_a, a, b, tail);
+ corr_b = corr_a;
+
+                    printf("Testing %s correlation tail=%d on buffer count=%d mask=0x%x\n", label, int(tail), int(count), int(mask));
+
+ const float *a_tail = a;
+ const float *b_tail = b;
+ const float *a_head = &a_tail[tail];
+ const float *b_head = &b_tail[tail];
+
+ corr_incr(&corr_a, dst1, a_head, b_head, a_tail, b_tail, count);
+ func(&corr_b, dst2, a_head, b_head, a_tail, b_tail, count);
+
+ UTEST_ASSERT_MSG(a.valid(), "Buffer A corrupted");
+ UTEST_ASSERT_MSG(b.valid(), "Buffer B corrupted");
+ UTEST_ASSERT_MSG(dst1.valid(), "Destination buffer 1 corrupted");
+ UTEST_ASSERT_MSG(dst2.valid(), "Destination buffer 2 corrupted");
+
+ // Compare buffers
+ if (!dst1.equals_adaptive(dst2, 1e-4))
+ {
+ a.dump("a ");
+ b.dump("b ");
+ dst1.dump("dst1");
+ dst2.dump("dst2");
+ UTEST_FAIL_MSG("Output of functions for test '%s' differs at index %d, value=%f vs %f\n"
+ "correlation state a={%f, %f, %f}, b={%f, %f, %f}",
+ label, int(dst1.last_diff()), dst1.get(dst1.last_diff()), dst2.get(dst1.last_diff()),
+ corr_a.v, corr_a.a, corr_a.b,
+ corr_b.v, corr_b.a, corr_b.b);
+ }
+
+ // Compare state
+ if ((!float_equals_adaptive(corr_a.v, corr_b.v, 1e-5)) ||
+ (!float_equals_adaptive(corr_a.a, corr_b.a, 1e-5)) ||
+ (!float_equals_adaptive(corr_a.b, corr_b.b, 1e-5)))
+ {
+ UTEST_FAIL_MSG("Correlation state differs a={%f, %f, %f}, b={%f, %f, %f}",
+ corr_a.v, corr_a.a, corr_a.b,
+ corr_b.v, corr_b.a, corr_b.b);
+ }
+ }
+ }
+ }
+ }
+
+ UTEST_MAIN
+ {
+ #define CALL(func, align) \
+ call(#func, align, func)
+
+ CALL(generic::corr_incr, 16);
+ IF_ARCH_X86(CALL(sse::corr_incr, 16));
+ IF_ARCH_X86_64(CALL(sse3::x64_corr_incr, 16));
+ IF_ARCH_X86(CALL(avx::corr_incr, 32));
+ IF_ARCH_X86(CALL(avx::corr_incr_fma3, 32));
+ IF_ARCH_X86(CALL(avx512::corr_incr, 64));
+ IF_ARCH_ARM(CALL(neon_d32::corr_incr, 16));
+ IF_ARCH_AARCH64(CALL(asimd::corr_incr, 16));
+ }
+
+UTEST_END;
+
+
+
diff --git a/src/test/utest/corr_init.cpp b/src/test/utest/corr_init.cpp
new file mode 100644
index 00000000..a1a3c325
--- /dev/null
+++ b/src/test/utest/corr_init.cpp
@@ -0,0 +1,147 @@
+/*
+ * Copyright (C) 2024 Linux Studio Plugins Project
+ * (C) 2024 Vladimir Sadovnikov
+ *
+ * This file is part of lsp-dsp-lib
+ * Created on: 8 мар. 2024 г.
+ *
+ * lsp-dsp-lib is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * any later version.
+ *
+ * lsp-dsp-lib is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with lsp-dsp-lib. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include <lsp-plug.in/common/types.h>
+#include <lsp-plug.in/dsp/dsp.h>
+#include <lsp-plug.in/test-fw/helpers.h>
+#include <lsp-plug.in/test-fw/utest.h>
+#include <lsp-plug.in/test-fw/FloatBuffer.h>
+
+namespace lsp
+{
+ namespace generic
+ {
+ void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count);
+ }
+
+ IF_ARCH_X86(
+ namespace sse
+ {
+ void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count);
+ }
+
+ namespace avx
+ {
+ void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count);
+ void corr_init_fma3(dsp::correlation_t *corr, const float *a, const float *b, size_t count);
+ }
+
+ namespace avx512
+ {
+ void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count);
+ }
+ )
+
+ IF_ARCH_ARM(
+ namespace neon_d32
+ {
+ void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count);
+ }
+ )
+
+ IF_ARCH_AARCH64(
+ namespace asimd
+ {
+ void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count);
+ }
+ )
+
+ static void corr_init(dsp::correlation_t *corr, const float *a, const float *b, size_t count)
+ {
+ float vv = 0.0f;
+ float va = 0.0f;
+ float vb = 0.0f;
+
+        for (size_t i=0; i<count; ++i)
+        {
+            vv     += a[i] * b[i];
+            va     += a[i] * a[i];
+            vb     += b[i] * b[i];
+        }
+
+        corr->v    += vv;
+ corr->a += va;
+ corr->b += vb;
+ }
+
+ typedef void (* corr_init_t)(dsp::correlation_t *corr, const float *a, const float *b, size_t count);
+}
+
+UTEST_BEGIN("dsp", corr_init)
+ void call(const char *label, size_t align, corr_init_t func)
+ {
+ if (!UTEST_SUPPORTED(func))
+ return;
+
+ for (size_t mask=0; mask <= 0x07; ++mask)
+ {
+ UTEST_FOREACH(count, 0, 1, 2, 3, 4, 5, 8, 16, 24, 32, 33, 64, 47, 0x80, 0x1ff)
+ {
+ FloatBuffer a(count, align, mask & 0x01);
+ FloatBuffer b(count, align, mask & 0x02);
+
+ dsp::correlation_t corr_a, corr_b;
+ corr_a.v = 0.1f;
+ corr_a.a = 0.2f;
+ corr_a.b = 0.3f;
+ corr_b.v = 0.1f;
+ corr_b.a = 0.2f;
+ corr_b.b = 0.3f;
+
+                printf("Testing %s corr_init on buffer count=%d mask=0x%x\n", label, int(count), int(mask));
+
+ corr_init(&corr_a, a, b, count);
+ func(&corr_b, a, b, count);
+
+ UTEST_ASSERT_MSG(a.valid(), "Buffer A corrupted");
+ UTEST_ASSERT_MSG(b.valid(), "Buffer B corrupted");
+
+ // Compare state
+ if ((!float_equals_adaptive(corr_a.v, corr_b.v, 1e-5f)) ||
+ (!float_equals_adaptive(corr_a.a, corr_b.a, 1e-5f)) ||
+ (!float_equals_adaptive(corr_a.b, corr_b.b, 1e-5f)))
+ {
+ UTEST_FAIL_MSG("Correlation state differs a={%f, %f, %f}, b={%f, %f, %f}",
+ corr_a.v, corr_a.a, corr_a.b,
+ corr_b.v, corr_b.a, corr_b.b);
+ }
+ }
+ }
+ }
+
+ UTEST_MAIN
+ {
+ #define CALL(func, align) \
+ call(#func, align, func)
+
+ CALL(generic::corr_init, 16);
+ IF_ARCH_X86(CALL(sse::corr_init, 16));
+ IF_ARCH_X86(CALL(avx::corr_init, 32));
+ IF_ARCH_X86(CALL(avx::corr_init_fma3, 32));
+ IF_ARCH_X86(CALL(avx512::corr_init, 64));
+ IF_ARCH_ARM(CALL(neon_d32::corr_init, 16));
+ IF_ARCH_AARCH64(CALL(asimd::corr_init, 16));
+ }
+
+UTEST_END;
+
+
+
diff --git a/src/test/utest/msmatrix/conv2.cpp b/src/test/utest/msmatrix/conv2.cpp
index f22e21e4..540e3c46 100644
--- a/src/test/utest/msmatrix/conv2.cpp
+++ b/src/test/utest/msmatrix/conv2.cpp
@@ -1,6 +1,6 @@
/*
- * Copyright (C) 2023 Linux Studio Plugins Project
- * (C) 2023 Vladimir Sadovnikov
+ * Copyright (C) 2024 Linux Studio Plugins Project
+ * (C) 2024 Vladimir Sadovnikov
*
* This file is part of lsp-dsp-lib
* Created on: 31 мар. 2020 г.
@@ -85,7 +85,7 @@ UTEST_BEGIN("dsp.msmatrix", conv2)
return;
UTEST_FOREACH(count, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 32, 64, 65, 100, 999, 0xfff)
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 63, 64, 65, 100, 999, 0xfff)
{
for (size_t mask=0; mask <= 0x0f; ++mask)
{
diff --git a/src/test/utest/msmatrix/conv2x1.cpp b/src/test/utest/msmatrix/conv2x1.cpp
index ba75853e..889e8a74 100644
--- a/src/test/utest/msmatrix/conv2x1.cpp
+++ b/src/test/utest/msmatrix/conv2x1.cpp
@@ -1,6 +1,6 @@
/*
- * Copyright (C) 2023 Linux Studio Plugins Project
- * (C) 2023 Vladimir Sadovnikov
+ * Copyright (C) 2024 Linux Studio Plugins Project
+ * (C) 2024 Vladimir Sadovnikov
*
* This file is part of lsp-dsp-lib
* Created on: 31 мар. 2020 г.
@@ -97,7 +97,7 @@ UTEST_BEGIN("dsp.msmatrix", conv2x1)
return;
UTEST_FOREACH(count, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 32, 64, 65, 100, 999, 0xfff)
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 63, 64, 65, 100, 999, 0xfff)
{
for (size_t mask=0; mask <= 0x07; ++mask)
{
diff --git a/src/test/utest/resampling/oversampling.cpp b/src/test/utest/resampling/oversampling.cpp
index 6ec2ae51..2e484c9f 100644
--- a/src/test/utest/resampling/oversampling.cpp
+++ b/src/test/utest/resampling/oversampling.cpp
@@ -23,6 +23,7 @@
 #include <lsp-plug.in/common/types.h>
 #include <lsp-plug.in/dsp/dsp.h>
 #include <lsp-plug.in/test-fw/utest.h>
+#include <lsp-plug.in/test-fw/helpers.h>
namespace lsp
{