Commit a4856a7

AVX2: Native poly_pointwise_montgomery

Signed-off-by: Matthias J. Kannwischer <[email protected]>
1 parent eeb9204 commit a4856a7

File tree: 3 files changed, +139 −0 lines changed
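
For context on what the new backend computes: poly_pointwise_montgomery multiplies two polynomials coefficient-wise in the NTT domain and Montgomery-reduces each product, i.e. out[i] = in0[i] * in1[i] * 2^-32 mod q with q = 8380417. A minimal scalar sketch of that operation follows; it is not part of the commit, and the helper name and constant names follow the usual Dilithium reference style rather than this diff.

/*
 * Scalar reference sketch (not part of the commit): coefficient-wise
 * multiplication in the NTT domain followed by Montgomery reduction,
 * out[i] = in0[i] * in1[i] * 2^{-32} mod q.
 */
#include <stdint.h>

#define MLDSA_N 256
#define MLDSA_Q 8380417
#define MLDSA_QINV 58728449 /* q^{-1} mod 2^32 */

static int32_t montgomery_reduce(int64_t a)
{
  /* m = a * q^{-1} mod 2^32 (keep only the low 32 bits) */
  int32_t m = (int32_t)(uint32_t)((uint64_t)a * MLDSA_QINV);
  /* a - m*q is divisible by 2^32; the quotient is the reduced result */
  return (int32_t)((a - (int64_t)m * MLDSA_Q) >> 32);
}

static void poly_pointwise_montgomery_ref(int32_t out[MLDSA_N],
                                          const int32_t in0[MLDSA_N],
                                          const int32_t in1[MLDSA_N])
{
  for (unsigned i = 0; i < MLDSA_N; i++)
  {
    out[i] = montgomery_reduce((int64_t)in0[i] * in1[i]);
  }
}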

mldsa/native/x86_64/meta.h

Lines changed: 9 additions & 0 deletions
@@ -14,6 +14,7 @@
 #define MLD_USE_NATIVE_NTT_CUSTOM_ORDER
 #define MLD_USE_NATIVE_NTT
 #define MLD_USE_NATIVE_INTT
+#define MLD_USE_NATIVE_POINTWISE
 
 #if !defined(__ASSEMBLER__)
 #include <string.h>
@@ -34,6 +35,14 @@ static MLD_INLINE void mld_intt_native(int32_t data[MLDSA_N])
   mld_invntt_avx2((__m256i *)data, mld_qdata.vec);
 }
 
+static MLD_INLINE void mld_pointwise_montgomery_native(
+    int32_t out[MLDSA_N], const int32_t in0[MLDSA_N],
+    const int32_t in1[MLDSA_N])
+{
+  mld_pointwise_montgomery_avx2((__m256i *)out, (const __m256i *)in0,
+                                (const __m256i *)in1, mld_qdata.vec);
+}
+
 #endif /* !__ASSEMBLER__ */
 
 #endif /* !MLD_NATIVE_X86_64_META_H */
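
The new MLD_USE_NATIVE_POINTWISE define follows the same hook pattern as the existing NTT/INTT entries above: when it is set, the generic C layer can dispatch to mld_pointwise_montgomery_native() instead of its portable loop. The dispatch site itself is not part of this diff; the sketch below (reusing MLDSA_N and the montgomery_reduce helper from the reference sketch above) only illustrates the assumed pattern, and the real function and file names in mldsa-native may differ.

/* Illustrative dispatch only -- not taken from this commit. */
static void mld_poly_pointwise_montgomery(int32_t c[MLDSA_N],
                                          const int32_t a[MLDSA_N],
                                          const int32_t b[MLDSA_N])
{
#if defined(MLD_USE_NATIVE_POINTWISE)
  /* AVX2 backend: thin wrapper defined in meta.h above. */
  mld_pointwise_montgomery_native(c, a, b);
#else
  /* Portable fallback: scalar loop as in the reference sketch above. */
  for (unsigned i = 0; i < MLDSA_N; i++)
  {
    c[i] = montgomery_reduce((int64_t)a[i] * b[i]);
  }
#endif
}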

mldsa/native/x86_64/src/arith_native_x86_64.h

Lines changed: 5 additions & 0 deletions
@@ -19,4 +19,9 @@ void mld_invntt_avx2(__m256i *r, const __m256i *mld_qdata);
 #define mld_nttunpack_avx2 MLD_NAMESPACE(nttunpack_avx2)
 void mld_nttunpack_avx2(__m256i *r);
 
+#define mld_pointwise_montgomery_avx2 \
+  MLD_NAMESPACE(mld_pointwise_montgomery_avx2)
+void mld_pointwise_montgomery_avx2(__m256i *r, const __m256i *a,
+                                   const __m256i *b, const __m256i *mld_qdata);
+
 #endif /* !MLD_NATIVE_X86_64_SRC_ARITH_NATIVE_X86_64_H */
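
One detail worth noting about the declaration: the arguments are __m256i pointers, and the assembly below uses aligned loads/stores (vmovdqa), so the coefficient buffers passed through these casts are assumed to be 32-byte aligned; in the library this is presumably guaranteed by how polynomials are allocated. A hypothetical direct call would look like:

/* Hypothetical direct call -- alignment shown explicitly for illustration. */
_Alignas(32) int32_t a[MLDSA_N], b[MLDSA_N], c[MLDSA_N];
/* ... fill a and b with NTT-domain coefficients ... */
mld_pointwise_montgomery_avx2((__m256i *)c, (const __m256i *)a,
                              (const __m256i *)b, mld_qdata.vec);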
Lines changed: 125 additions & 0 deletions
@@ -0,0 +1,125 @@
/*
 * Copyright (c) The mlkem-native project authors
 * Copyright (c) The mldsa-native project authors
 * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
 */
/*
 * This file is derived from the public domain
 * AVX2 Dilithium implementation @[REF_AVX2].
 */

#include "../../../common.h"
#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)

#include "consts.h"

.text
.global MLD_ASM_NAMESPACE(mld_pointwise_montgomery_avx2)
MLD_ASM_FN_SYMBOL(mld_pointwise_montgomery_avx2)
#consts
vmovdqa MLD_AVX2_BACKEND_DATA_OFFSET_8XQINV*4(%rcx),%ymm0
vmovdqa MLD_AVX2_BACKEND_DATA_OFFSET_8XQ*4(%rcx),%ymm1

xor %eax,%eax
_looptop1:
#load
vmovdqa (%rsi),%ymm2
vmovdqa 32(%rsi),%ymm4
vmovdqa 64(%rsi),%ymm6
vmovdqa (%rdx),%ymm10
vmovdqa 32(%rdx),%ymm12
vmovdqa 64(%rdx),%ymm14
vpsrlq $32,%ymm2,%ymm3
vpsrlq $32,%ymm4,%ymm5
vmovshdup %ymm6,%ymm7
vpsrlq $32,%ymm10,%ymm11
vpsrlq $32,%ymm12,%ymm13
vmovshdup %ymm14,%ymm15

#mul
vpmuldq %ymm2,%ymm10,%ymm2
vpmuldq %ymm3,%ymm11,%ymm3
vpmuldq %ymm4,%ymm12,%ymm4
vpmuldq %ymm5,%ymm13,%ymm5
vpmuldq %ymm6,%ymm14,%ymm6
vpmuldq %ymm7,%ymm15,%ymm7

#reduce
vpmuldq %ymm0,%ymm2,%ymm10
vpmuldq %ymm0,%ymm3,%ymm11
vpmuldq %ymm0,%ymm4,%ymm12
vpmuldq %ymm0,%ymm5,%ymm13
vpmuldq %ymm0,%ymm6,%ymm14
vpmuldq %ymm0,%ymm7,%ymm15
vpmuldq %ymm1,%ymm10,%ymm10
vpmuldq %ymm1,%ymm11,%ymm11
vpmuldq %ymm1,%ymm12,%ymm12
vpmuldq %ymm1,%ymm13,%ymm13
vpmuldq %ymm1,%ymm14,%ymm14
vpmuldq %ymm1,%ymm15,%ymm15
vpsubq %ymm10,%ymm2,%ymm2
vpsubq %ymm11,%ymm3,%ymm3
vpsubq %ymm12,%ymm4,%ymm4
vpsubq %ymm13,%ymm5,%ymm5
vpsubq %ymm14,%ymm6,%ymm6
vpsubq %ymm15,%ymm7,%ymm7
vpsrlq $32,%ymm2,%ymm2
vpsrlq $32,%ymm4,%ymm4
vmovshdup %ymm6,%ymm6

#store
vpblendd $0xAA,%ymm3,%ymm2,%ymm2
vpblendd $0xAA,%ymm5,%ymm4,%ymm4
vpblendd $0xAA,%ymm7,%ymm6,%ymm6
vmovdqa %ymm2,(%rdi)
vmovdqa %ymm4,32(%rdi)
vmovdqa %ymm6,64(%rdi)

add $96,%rdi
add $96,%rsi
add $96,%rdx
add $1,%eax
cmp $10,%eax
jb _looptop1

vmovdqa (%rsi),%ymm2
vmovdqa 32(%rsi),%ymm4
vmovdqa (%rdx),%ymm10
vmovdqa 32(%rdx),%ymm12
vpsrlq $32,%ymm2,%ymm3
vpsrlq $32,%ymm4,%ymm5
vmovshdup %ymm10,%ymm11
vmovshdup %ymm12,%ymm13

#mul
vpmuldq %ymm2,%ymm10,%ymm2
vpmuldq %ymm3,%ymm11,%ymm3
vpmuldq %ymm4,%ymm12,%ymm4
vpmuldq %ymm5,%ymm13,%ymm5

#reduce
vpmuldq %ymm0,%ymm2,%ymm10
vpmuldq %ymm0,%ymm3,%ymm11
vpmuldq %ymm0,%ymm4,%ymm12
vpmuldq %ymm0,%ymm5,%ymm13
vpmuldq %ymm1,%ymm10,%ymm10
vpmuldq %ymm1,%ymm11,%ymm11
vpmuldq %ymm1,%ymm12,%ymm12
vpmuldq %ymm1,%ymm13,%ymm13
vpsubq %ymm10,%ymm2,%ymm2
vpsubq %ymm11,%ymm3,%ymm3
vpsubq %ymm12,%ymm4,%ymm4
vpsubq %ymm13,%ymm5,%ymm5
vpsrlq $32,%ymm2,%ymm2
vmovshdup %ymm4,%ymm4

#store
vpblendd $0x55,%ymm2,%ymm3,%ymm2
vpblendd $0x55,%ymm4,%ymm5,%ymm4
vmovdqa %ymm2,(%rdi)
vmovdqa %ymm4,32(%rdi)

ret

#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED */
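
Reading aid for the loop above (not part of the commit): each iteration consumes 96 bytes, i.e. 24 coefficients, from %rsi and %rdx; ten iterations plus the 16-coefficient tail after the loop cover all 256 coefficients. vpmuldq multiplies only the low 32-bit lane of each 64-bit element, so the even-indexed coefficients are multiplied in place while vpsrlq/vmovshdup move the odd-indexed coefficients into those low lanes for a second set of products. The #reduce block then performs a signed Montgomery reduction per 64-bit lane (m = t*qinv mod 2^32, result = (t - m*q) >> 32), and the #store block re-interleaves even and odd results with vpblendd. A rough intrinsics rendering of one 8-coefficient step, under the assumption that qinv8 and q8 hold q^-1 mod 2^32 and q broadcast to all lanes (as loaded from mld_qdata into ymm0/ymm1), is:

#include <immintrin.h>

/* Illustrative only; the shipped implementation is the assembly above. */
static inline __m256i pointwise_montgomery_step(__m256i a, __m256i b,
                                                __m256i qinv8, __m256i q8)
{
  /* Move odd coefficients into the low lane of each 64-bit element so that
   * vpmuldq (_mm256_mul_epi32) can form full 64-bit products. */
  __m256i a_odd = _mm256_srli_epi64(a, 32);
  __m256i b_odd = _mm256_srli_epi64(b, 32);
  __m256i t_even = _mm256_mul_epi32(a, b);         /* a[0]*b[0], a[2]*b[2], ... */
  __m256i t_odd  = _mm256_mul_epi32(a_odd, b_odd); /* a[1]*b[1], a[3]*b[3], ... */

  /* Montgomery reduction per 64-bit lane: m = t*qinv mod 2^32, r = (t - m*q)/2^32. */
  __m256i m_even = _mm256_mul_epi32(qinv8, t_even);
  __m256i m_odd  = _mm256_mul_epi32(qinv8, t_odd);
  t_even = _mm256_sub_epi64(t_even, _mm256_mul_epi32(q8, m_even));
  t_odd  = _mm256_sub_epi64(t_odd,  _mm256_mul_epi32(q8, m_odd));

  /* Even results sit in the high halves; shift them down, then blend with the
   * odd results, whose high halves already occupy the odd dword positions. */
  t_even = _mm256_srli_epi64(t_even, 32);
  return _mm256_blend_epi32(t_even, t_odd, 0xAA);
}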
