diff --git a/mldsa/native/x86_64/src/ntt.S b/mldsa/native/x86_64/src/ntt.S
index d1916dbeb..633e3d52a 100644
--- a/mldsa/native/x86_64/src/ntt.S
+++ b/mldsa/native/x86_64/src/ntt.S
@@ -24,62 +24,62 @@
 #include "consts.h"
 
+.intel_syntax noprefix
+
 .macro shuffle8 r0,r1,r2,r3
-vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2
-vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3
+vperm2i128 ymm\r2,ymm\r0,ymm\r1,0x20
+vperm2i128 ymm\r3,ymm\r0,ymm\r1,0x31
 .endm
 
 .macro shuffle4 r0,r1,r2,r3
-vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2
-vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3
+vpunpcklqdq ymm\r2,ymm\r0,ymm\r1
+vpunpckhqdq ymm\r3,ymm\r0,ymm\r1
 .endm
 
 .macro shuffle2 r0,r1,r2,r3
-#vpsllq $32,%ymm\r1,%ymm\r2
-vmovsldup %ymm\r1,%ymm\r2
-vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
-vpsrlq $32,%ymm\r0,%ymm\r0
-#vmovshdup %ymm\r0,%ymm\r0
-vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
+vmovsldup ymm\r2,ymm\r1
+vpblendd ymm\r2,ymm\r0,ymm\r2,0xAA
+vpsrlq ymm\r0,ymm\r0,32
+vpblendd ymm\r3,ymm\r0,ymm\r1,0xAA
 .endm
 
 .macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2
-vpmuldq %ymm\zl0,%ymm\h,%ymm13
-vmovshdup %ymm\h,%ymm12
-vpmuldq %ymm\zl1,%ymm12,%ymm14
+vpmuldq ymm13,ymm\h,ymm\zl0
+vmovshdup ymm12,ymm\h
+vpmuldq ymm14,ymm12,ymm\zl1
 
-vpmuldq %ymm\zh0,%ymm\h,%ymm\h
-vpmuldq %ymm\zh1,%ymm12,%ymm12
+vpmuldq ymm\h,ymm\h,ymm\zh0
+vpmuldq ymm12,ymm12,ymm\zh1
 
-vpmuldq %ymm0,%ymm13,%ymm13
-vpmuldq %ymm0,%ymm14,%ymm14
+vpmuldq ymm13,ymm13,ymm0
+vpmuldq ymm14,ymm14,ymm0
 
-vmovshdup %ymm\h,%ymm\h
-vpblendd $0xAA,%ymm12,%ymm\h,%ymm\h
+vmovshdup ymm\h,ymm\h
+vpblendd ymm\h,ymm\h,ymm12,0xAA
 
-vpsubd %ymm\h,%ymm\l,%ymm12
-vpaddd %ymm\h,%ymm\l,%ymm\l
+vpsubd ymm12,ymm\l,ymm\h
+vpaddd ymm\l,ymm\l,ymm\h
 
-vmovshdup %ymm13,%ymm13
-vpblendd $0xAA,%ymm14,%ymm13,%ymm13
+vmovshdup ymm13,ymm13
+vpblendd ymm13,ymm13,ymm14,0xAA
 
-vpaddd %ymm13,%ymm12,%ymm\h
-vpsubd %ymm13,%ymm\l,%ymm\l
+vpaddd ymm\h,ymm12,ymm13
+vpsubd ymm\l,ymm\l,ymm13
 .endm
 
 .macro levels0t1 off
 /* level 0 */
-vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+1)*4(%rsi),%ymm1
-vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+1)*4(%rsi),%ymm2
-
-vmovdqa 0+32*\off(%rdi),%ymm4
-vmovdqa 128+32*\off(%rdi),%ymm5
-vmovdqa 256+32*\off(%rdi),%ymm6
-vmovdqa 384+32*\off(%rdi),%ymm7
-vmovdqa 512+32*\off(%rdi),%ymm8
-vmovdqa 640+32*\off(%rdi),%ymm9
-vmovdqa 768+32*\off(%rdi),%ymm10
-vmovdqa 896+32*\off(%rdi),%ymm11
+vpbroadcastd ymm1,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+1)*4]
+vpbroadcastd ymm2,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+1)*4]
+
+vmovdqa ymm4,[rdi+0+32*\off]
+vmovdqa ymm5,[rdi+128+32*\off]
+vmovdqa ymm6,[rdi+256+32*\off]
+vmovdqa ymm7,[rdi+384+32*\off]
+vmovdqa ymm8,[rdi+512+32*\off]
+vmovdqa ymm9,[rdi+640+32*\off]
+vmovdqa ymm10,[rdi+768+32*\off]
+vmovdqa ymm11,[rdi+896+32*\off]
 
 butterfly 4,8
 butterfly 5,9
@@ -87,39 +87,39 @@ butterfly 6,10
 butterfly 7,11
 
 /* level 1 */
-vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+2)*4(%rsi),%ymm1
-vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+2)*4(%rsi),%ymm2
+vpbroadcastd ymm1,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+2)*4]
+vpbroadcastd ymm2,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+2)*4]
 
 butterfly 4,6
 butterfly 5,7
 
-vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+3)*4(%rsi),%ymm1
-vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+3)*4(%rsi),%ymm2
+vpbroadcastd ymm1,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+3)*4]
+vpbroadcastd ymm2,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+3)*4]
 
 butterfly 8,10
 butterfly 9,11
 
-vmovdqa %ymm4, 0+32*\off(%rdi)
-vmovdqa %ymm5,128+32*\off(%rdi)
-vmovdqa %ymm6,256+32*\off(%rdi)
-vmovdqa %ymm7,384+32*\off(%rdi)
-vmovdqa %ymm8,512+32*\off(%rdi)
-vmovdqa %ymm9,640+32*\off(%rdi)
-vmovdqa %ymm10,768+32*\off(%rdi)
-vmovdqa %ymm11,896+32*\off(%rdi)
+vmovdqa [rdi+0+32*\off],ymm4
+vmovdqa [rdi+128+32*\off],ymm5
+vmovdqa [rdi+256+32*\off],ymm6
+vmovdqa [rdi+384+32*\off],ymm7
+vmovdqa [rdi+512+32*\off],ymm8
+vmovdqa [rdi+640+32*\off],ymm9
+vmovdqa [rdi+768+32*\off],ymm10
+vmovdqa [rdi+896+32*\off],ymm11
 .endm
 
 .macro levels2t7 off
 /* level 2 */
-vmovdqa 256*\off+ 0(%rdi),%ymm4
-vmovdqa 256*\off+ 32(%rdi),%ymm5
-vmovdqa 256*\off+ 64(%rdi),%ymm6
-vmovdqa 256*\off+ 96(%rdi),%ymm7
-vmovdqa 256*\off+128(%rdi),%ymm8
-vmovdqa 256*\off+160(%rdi),%ymm9
-vmovdqa 256*\off+192(%rdi),%ymm10
-vmovdqa 256*\off+224(%rdi),%ymm11
-
-vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+4+\off)*4(%rsi),%ymm1
-vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+4+\off)*4(%rsi),%ymm2
+vmovdqa ymm4,[rdi+0+256*\off]
+vmovdqa ymm5,[rdi+32+256*\off]
+vmovdqa ymm6,[rdi+64+256*\off]
+vmovdqa ymm7,[rdi+96+256*\off]
+vmovdqa ymm8,[rdi+128+256*\off]
+vmovdqa ymm9,[rdi+160+256*\off]
+vmovdqa ymm10,[rdi+192+256*\off]
+vmovdqa ymm11,[rdi+224+256*\off]
+
+vpbroadcastd ymm1,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+4+\off)*4]
+vpbroadcastd ymm2,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+4+\off)*4]
 
 butterfly 4,8
 butterfly 5,9
@@ -132,8 +132,8 @@ shuffle8 6,10,5,10
 shuffle8 7,11,6,11
 
 /* level 3 */
-vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+8+8*\off)*4(%rsi),%ymm1
-vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+8+8*\off)*4(%rsi),%ymm2
+vmovdqa ymm1,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+8+8*\off)*4]
+vmovdqa ymm2,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+8+8*\off)*4]
 
 butterfly 3,5
 butterfly 8,10
@@ -146,8 +146,8 @@ shuffle4 4,6,8,6
 shuffle4 9,11,4,11
 
 /* level 4 */
-vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+40+8*\off)*4(%rsi),%ymm1
-vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+40+8*\off)*4(%rsi),%ymm2
+vmovdqa ymm1,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+40+8*\off)*4]
+vmovdqa ymm2,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+40+8*\off)*4]
 
 butterfly 7,8
 butterfly 5,6
@@ -160,10 +160,10 @@ shuffle2 3,4,5,4
 shuffle2 10,11,3,11
 
 /* level 5 */
-vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+72+8*\off)*4(%rsi),%ymm1
-vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+72+8*\off)*4(%rsi),%ymm2
-vpsrlq $32,%ymm1,%ymm10
-vmovshdup %ymm2,%ymm15
+vmovdqa ymm1,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+72+8*\off)*4]
+vmovdqa ymm2,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+72+8*\off)*4]
+vpsrlq ymm10,ymm1,32
+vmovshdup ymm15,ymm2
 
 butterfly 9,5,1,10,2,15
 butterfly 8,4,1,10,2,15
@@ -171,70 +171,70 @@ butterfly 7,3,1,10,2,15
 butterfly 6,11,1,10,2,15
 
 /* level 6 */
-vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104+8*\off)*4(%rsi),%ymm1
-vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104+8*\off)*4(%rsi),%ymm2
-vpsrlq $32,%ymm1,%ymm10
-vmovshdup %ymm2,%ymm15
+vmovdqa ymm1,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104+8*\off)*4]
+vmovdqa ymm2,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104+8*\off)*4]
+vpsrlq ymm10,ymm1,32
+vmovshdup ymm15,ymm2
 
 butterfly 9,7,1,10,2,15
 butterfly 8,6,1,10,2,15
 
-vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104+8*\off+32)*4(%rsi),%ymm1
-vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104+8*\off+32)*4(%rsi),%ymm2
-vpsrlq $32,%ymm1,%ymm10
-vmovshdup %ymm2,%ymm15
+vmovdqa ymm1,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104+8*\off+32)*4]
+vmovdqa ymm2,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104+8*\off+32)*4]
+vpsrlq ymm10,ymm1,32
+vmovshdup ymm15,ymm2
 
 butterfly 5,3,1,10,2,15
 butterfly 4,11,1,10,2,15
 
 /* level 7 */
-vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off)*4(%rsi),%ymm1
-vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off)*4(%rsi),%ymm2
-vpsrlq $32,%ymm1,%ymm10
-vmovshdup %ymm2,%ymm15
+vmovdqa ymm1,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off)*4]
+vmovdqa ymm2,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off)*4]
+vpsrlq ymm10,ymm1,32
+vmovshdup ymm15,ymm2
 
 butterfly 9,8,1,10,2,15
 
-vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off+32)*4(%rsi),%ymm1
-vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off+32)*4(%rsi),%ymm2
-vpsrlq $32,%ymm1,%ymm10
-vmovshdup %ymm2,%ymm15
+vmovdqa ymm1,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off+32)*4]
+vmovdqa ymm2,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off+32)*4]
+vpsrlq ymm10,ymm1,32
+vmovshdup ymm15,ymm2
 
 butterfly 7,6,1,10,2,15
 
-vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off+64)*4(%rsi),%ymm1
-vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off+64)*4(%rsi),%ymm2
-vpsrlq $32,%ymm1,%ymm10
-vmovshdup %ymm2,%ymm15
+vmovdqa ymm1,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off+64)*4]
+vmovdqa ymm2,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off+64)*4]
+vpsrlq ymm10,ymm1,32
+vmovshdup ymm15,ymm2
 
 butterfly 5,4,1,10,2,15
 
-vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off+96)*4(%rsi),%ymm1
-vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off+96)*4(%rsi),%ymm2
-vpsrlq $32,%ymm1,%ymm10
-vmovshdup %ymm2,%ymm15
+vmovdqa ymm1,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off+96)*4]
+vmovdqa ymm2,[rsi+(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off+96)*4]
+vpsrlq ymm10,ymm1,32
+vmovshdup ymm15,ymm2
 
 butterfly 3,11,1,10,2,15
 
-vmovdqa %ymm9,256*\off+ 0(%rdi)
-vmovdqa %ymm8,256*\off+ 32(%rdi)
-vmovdqa %ymm7,256*\off+ 64(%rdi)
-vmovdqa %ymm6,256*\off+ 96(%rdi)
-vmovdqa %ymm5,256*\off+128(%rdi)
-vmovdqa %ymm4,256*\off+160(%rdi)
-vmovdqa %ymm3,256*\off+192(%rdi)
-vmovdqa %ymm11,256*\off+224(%rdi)
+vmovdqa [rdi+0+256*\off],ymm9
+vmovdqa [rdi+32+256*\off],ymm8
+vmovdqa [rdi+64+256*\off],ymm7
+vmovdqa [rdi+96+256*\off],ymm6
+vmovdqa [rdi+128+256*\off],ymm5
+vmovdqa [rdi+160+256*\off],ymm4
+vmovdqa [rdi+192+256*\off],ymm3
+vmovdqa [rdi+224+256*\off],ymm11
 .endm
 
 .text
 .balign 4
 .global MLD_ASM_NAMESPACE(ntt_avx2)
 MLD_ASM_FN_SYMBOL(ntt_avx2)
-vmovdqa MLD_AVX2_BACKEND_DATA_OFFSET_8XQ*4(%rsi),%ymm0
+vmovdqa ymm0,[rsi+MLD_AVX2_BACKEND_DATA_OFFSET_8XQ*4]
 
-levels0t1 0
-levels0t1 1
-levels0t1 2
-levels0t1 3
+levels0t1 0
+levels0t1 1
+levels0t1 2
+levels0t1 3
 
-levels2t7 0
-levels2t7 1
-levels2t7 2
-levels2t7 3
+levels2t7 0
+levels2t7 1
+levels2t7 2
+levels2t7 3
 
 ret
 
 #endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \