From 19b9f63d9a9e60547e0b235130ca3d055c9cecd1 Mon Sep 17 00:00:00 2001
From: jammychiou1
Date: Sun, 26 Oct 2025 22:57:05 +0800
Subject: [PATCH 1/3] Add bounds reasoning comments to AVX2 ntt/intt

Since the bound 3q/4 applies to every montmul-by-const and is used
repeatedly, we dedicate a section at the end of intt.S, where this bound
is absolutely required, and refer to that section whenever the bound is
mentioned elsewhere.

In the process, we noticed that the original MLD_INTT_BOUND, defined as
(MLDSA_Q * 3 / 4) = floor(3q/4), was too strong to justify concisely, so
we bump it to (MLDSA_Q * 3 / 4 + 1) = ceil(3q/4).

Signed-off-by: jammychiou1
---
 dev/x86_64/src/intt.S | 71 +++++++++++++++++++++++++++++++++++++++++++
 dev/x86_64/src/ntt.S  | 55 +++++++++++++++++++++++++++++----
 mldsa/src/ntt.h       |  4 +--
 3 files changed, 122 insertions(+), 8 deletions(-)

diff --git a/dev/x86_64/src/intt.S b/dev/x86_64/src/intt.S
index f45d0fd87..1b8b7a0cc 100644
--- a/dev/x86_64/src/intt.S
+++ b/dev/x86_64/src/intt.S
@@ -43,6 +43,13 @@ vpsrlq $32,%ymm\r0,%ymm\r0
 vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
 .endm
 
+/*
+ * Compute l + h and montmul(h - l, zh), then store the results back to l and
+ * h, respectively.
+ *
+ * The abs bound of "Montgomery multiplication with signed canonical constant"
+ * is ceil(3q/4) (see the end of this file).
+ */
 .macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2
 vpsubd %ymm\l,%ymm\h,%ymm12
 vpaddd %ymm\h,%ymm\l,%ymm\l
@@ -74,6 +81,8 @@ vmovdqa 256*\off+160(%rdi),%ymm9
 vmovdqa 256*\off+192(%rdi),%ymm10
 vmovdqa 256*\off+224(%rdi),%ymm11
 
+/* All: abs bound < q */
+
 /* level 0 */
 vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+296-8*\off-8)*4(%rsi),%ymm3
 vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+296-8*\off-8)*4(%rsi),%ymm15
@@ -99,6 +108,19 @@ vmovshdup %ymm3,%ymm1
 vmovshdup %ymm15,%ymm2
 butterfly 10,11,1,3,2,15
 
+/* 4, 6, 8, 10: abs bound < 2q; 5, 7, 9, 11: abs bound < ceil(3q/4) */
+/*
+ * Note that since 2^31 / q > 256, the sum of all 256 coefficients does not
+ * overflow. This allows us to greatly simplify the range analysis by relaxing
+ * and unifying the bounds of all coefficients on the same layer. As a concrete
+ * example, here we relax the bounds on 5, 7, 9, 11 and conclude that
+ *
+ * All: abs bound < 2q
+ *
+ * In all but the last of the following layers, we do the same relaxation
+ * without explicit mention.
+ */ + /* level 1 */ vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168-8*\off-8)*4(%rsi),%ymm3 vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168-8*\off-8)*4(%rsi),%ymm15 @@ -114,6 +136,8 @@ vmovshdup %ymm15,%ymm2 butterfly 8,10,1,3,2,15 butterfly 9,11,1,3,2,15 +/* All: abs bound < 4q */ + /* level 2 */ vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104-8*\off-8)*4(%rsi),%ymm3 vpermq $0x1B,(MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104-8*\off-8)*4(%rsi),%ymm15 @@ -124,6 +148,8 @@ butterfly 5,9,1,3,2,15 butterfly 6,10,1,3,2,15 butterfly 7,11,1,3,2,15 +/* All: abs bound < 8q */ + /* level 3 */ shuffle2 4,5,3,5 shuffle2 6,7,4,7 @@ -137,6 +163,8 @@ butterfly 4,7 butterfly 6,9 butterfly 8,11 +/* All: abs bound < 16q */ + /* level 4 */ shuffle4 3,4,10,4 shuffle4 6,8,3,8 @@ -150,6 +178,8 @@ butterfly 3,8 butterfly 6,7 butterfly 5,11 +/* All: abs bound < 32q */ + /* level 5 */ shuffle8 10,3,9,3 shuffle8 6,5,10,5 @@ -163,6 +193,8 @@ butterfly 10,5 butterfly 6,8 butterfly 4,11 +/* All: abs bound < 64q */ + vmovdqa %ymm9,256*\off+ 0(%rdi) vmovdqa %ymm10,256*\off+ 32(%rdi) vmovdqa %ymm6,256*\off+ 64(%rdi) @@ -194,6 +226,8 @@ vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+2)*4(%rsi),%ymm2 butterfly 8,10 butterfly 9,11 +/* All: abs bound < 128q */ + /* level 7 */ vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+0)*4(%rsi),%ymm1 vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+0)*4(%rsi),%ymm2 @@ -203,11 +237,26 @@ butterfly 5,9 butterfly 6,10 butterfly 7,11 +/* 4, 5, 6, 7: abs bound < 256q; 8, 9, 10, 11: abs bound < ceil(3q/4) */ + vmovdqa %ymm8,512+32*\off(%rdi) vmovdqa %ymm9,640+32*\off(%rdi) vmovdqa %ymm10,768+32*\off(%rdi) vmovdqa %ymm11,896+32*\off(%rdi) +/* + * In order to (a) remove the factor of 256 arising from the 256-point INTT + * butterflies and (b) transform the output into Montgomery domain, we need to + * multiply all coefficients by 2^32/256. + * + * For ymm{8,9,10,11}, the scaling has been merged into the last butterfly, so + * only ymm{4,5,6,7} need to be scaled explicitly. + * + * The scaling is achieved by computing montmul(-, MLD_AVX2_DIV). + * + * 4, 5, 6, 7: abs bound < 256q + */ + vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_8XDIV_QINV)*4(%rsi),%ymm1 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_8XDIV)*4(%rsi),%ymm2 vpmuldq %ymm1,%ymm4,%ymm12 @@ -256,6 +305,28 @@ vmovshdup %ymm7,%ymm7 vpblendd $0xAA,%ymm8,%ymm6,%ymm6 vpblendd $0xAA,%ymm9,%ymm7,%ymm7 +/* + * The bound ceil(3q/4) for this scaling, as well as any other "Montgomery + * multiplication with signed canonical constant", is justified as follows. + * + * In Section 2.2 of https://eprint.iacr.org/2023/1962, they showed a bound that + * works for any variable input a, as long as the constant b is signed + * canonical: + * + * |montmul(a, b)| <= (|a| (q/2) + (R/2) q) / R = (q/2) (1 + |a|/R). + * + * Therefore, even if we know nothing about a except that it fits inside + * int32_t (thus |a| <= R/2), we still have |montmul(a, b)| <= 3q/4. This can be + * strengthened to |montmul_pos(a, b)| <= floor(3q/4) < ceil(3q/4) since LHS is + * an integer and 3q/4 isn't. + * + * TODO: Use proper citation. Currently, citations within asm can cause linter + * to complain about unused citation, because comments are not preserved + * after simpasm. 
+ */ + +/* 4, 5, 6, 7: abs bound < ceil(3q/4) */ + vmovdqa %ymm4, 0+32*\off(%rdi) vmovdqa %ymm5,128+32*\off(%rdi) vmovdqa %ymm6,256+32*\off(%rdi) diff --git a/dev/x86_64/src/ntt.S b/dev/x86_64/src/ntt.S index 8fae4ccbc..0329a0dfc 100644 --- a/dev/x86_64/src/ntt.S +++ b/dev/x86_64/src/ntt.S @@ -44,6 +44,17 @@ vpsrlq $32,%ymm\r0,%ymm\r0 vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 .endm +/* + * Compute l + montmul(h, zh), l - montmul(h, zh) then store the results back to + * l, h respectively. + * + * Although the abs bound of "Montgomery multiplication with signed canonical + * constant" is ceil(3q/4) (see the end of dev/x86_64/src/intt.S), we use the + * more convenient bound q here. + * + * In conclusion, the magnitudes of all coefficients grow by at most q after + * each layer. + */ .macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2 vpmuldq %ymm\zl0,%ymm\h,%ymm13 vmovshdup %ymm\h,%ymm12 @@ -56,16 +67,30 @@ vpmuldq %ymm0,%ymm13,%ymm13 vpmuldq %ymm0,%ymm14,%ymm14 vmovshdup %ymm\h,%ymm\h -vpblendd $0xAA,%ymm12,%ymm\h,%ymm\h +vpblendd $0xAA,%ymm12,%ymm\h,%ymm\h /* mulhi(h, zh) */ -vpsubd %ymm\h,%ymm\l,%ymm12 -vpaddd %ymm\h,%ymm\l,%ymm\l +/* + * Originally, mulhi(h, zh) should be subtracted by mulhi(q, mullo(h, zl)) in + * order to complete computing + * + * montmul(h, zh) = mulhi(h, zh) - mulhi(q, mullo(h, zl)). + * + * Here, since mulhi(q, mullo(h, zl)) has not been computed yet, this task is + * delayed until after add/sub. + */ +vpsubd %ymm\h,%ymm\l,%ymm12 /* l - mulhi(h, zh) + * = l - montmul(h, zh) + * - mulhi(q, mullo(h, zl)) */ +vpaddd %ymm\h,%ymm\l,%ymm\l /* l + mulhi(h, zh) + * = l + montmul(h, zh) + * + mulhi(q, mullo(h, zl)) */ vmovshdup %ymm13,%ymm13 -vpblendd $0xAA,%ymm14,%ymm13,%ymm13 +vpblendd $0xAA,%ymm14,%ymm13,%ymm13 /* mulhi(q, mullo(h, zl)) */ -vpaddd %ymm13,%ymm12,%ymm\h -vpsubd %ymm13,%ymm\l,%ymm\l +/* Finish the delayed task mentioned above */ +vpaddd %ymm13,%ymm12,%ymm\h /* l - montmul(h, zh) */ +vpsubd %ymm13,%ymm\l,%ymm\l /* l + montmul(h, zh) */ .endm .macro levels0t1 off @@ -82,11 +107,15 @@ vmovdqa 640+32*\off(%rdi),%ymm9 vmovdqa 768+32*\off(%rdi),%ymm10 vmovdqa 896+32*\off(%rdi),%ymm11 +/* All: abs bound < q */ + butterfly 4,8 butterfly 5,9 butterfly 6,10 butterfly 7,11 +/* All: abs bound < 2q */ + /* level 1 */ vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+2)*4(%rsi),%ymm1 vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+2)*4(%rsi),%ymm2 @@ -98,6 +127,8 @@ vpbroadcastd (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+3)*4(%rsi),%ymm2 butterfly 8,10 butterfly 9,11 +/* All: abs bound < 3q */ + vmovdqa %ymm4, 0+32*\off(%rdi) vmovdqa %ymm5,128+32*\off(%rdi) vmovdqa %ymm6,256+32*\off(%rdi) @@ -132,6 +163,8 @@ shuffle8 5,9,4,9 shuffle8 6,10,5,10 shuffle8 7,11,6,11 +/* All: abs bound < 4q */ + /* level 3 */ vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+8+8*\off)*4(%rsi),%ymm1 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+8+8*\off)*4(%rsi),%ymm2 @@ -146,6 +179,8 @@ shuffle4 8,10,3,10 shuffle4 4,6,8,6 shuffle4 9,11,4,11 +/* All: abs bound < 5q */ + /* level 4 */ vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+40+8*\off)*4(%rsi),%ymm1 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+40+8*\off)*4(%rsi),%ymm2 @@ -160,6 +195,8 @@ shuffle2 5,6,7,6 shuffle2 3,4,5,4 shuffle2 10,11,3,11 +/* All: abs bound < 6q */ + /* level 5 */ vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+72+8*\off)*4(%rsi),%ymm1 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+72+8*\off)*4(%rsi),%ymm2 @@ -171,6 +208,8 @@ butterfly 8,4,1,10,2,15 butterfly 7,3,1,10,2,15 butterfly 6,11,1,10,2,15 +/* All: abs bound < 7q */ + /* level 
6 */ vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+104+8*\off)*4(%rsi),%ymm1 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+104+8*\off)*4(%rsi),%ymm2 @@ -186,6 +225,8 @@ vmovshdup %ymm2,%ymm15 butterfly 5,3,1,10,2,15 butterfly 4,11,1,10,2,15 +/* All: abs bound < 8q */ + /* level 7 */ vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV+168+8*\off)*4(%rsi),%ymm1 vmovdqa (MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS+168+8*\off)*4(%rsi),%ymm2 @@ -211,6 +252,8 @@ vpsrlq $32,%ymm1,%ymm10 vmovshdup %ymm2,%ymm15 butterfly 3,11,1,10,2,15 +/* All: abs bound < 9q */ + vmovdqa %ymm9,256*\off+ 0(%rdi) vmovdqa %ymm8,256*\off+ 32(%rdi) vmovdqa %ymm7,256*\off+ 64(%rdi) diff --git a/mldsa/src/ntt.h b/mldsa/src/ntt.h index 9cbd0a19f..e64d57a12 100644 --- a/mldsa/src/ntt.h +++ b/mldsa/src/ntt.h @@ -21,8 +21,8 @@ /* Absolute exclusive upper bound for the output of the forward NTT */ #define MLD_NTT_BOUND (9 * MLDSA_Q) -/* Absolute exclusive upper bound for the output of the inverse NTT*/ -#define MLD_INTT_BOUND (MLDSA_Q * 3 / 4) +/* Absolute exclusive upper bound for the output of the inverse NTT */ +#define MLD_INTT_BOUND (MLDSA_Q * 3 / 4 + 1) /* ceil(3 * MLDSA_Q / 4) */ #define mld_ntt MLD_NAMESPACE(ntt) /************************************************* From 0cd0f3a12c5a33414547895f31c38d5ee248286c Mon Sep 17 00:00:00 2001 From: jammychiou1 Date: Wed, 5 Nov 2025 17:57:05 +0800 Subject: [PATCH 2/3] Port test/test_bounds.py to confirm bound for montmul-by-const The file test/test_bounds.py was ported from mlkem-native. We modified it to check the bound |montmul(a, b)| <= (q/2) (1 + |a|/R) for "Montgomery multiplication with signed canonical constant", in particular the generic bound 3q/4 for |a| <= R/2 which was mentioned repeatedly in the bound comments for AVX2 [I]NTT. Signed-off-by: jammychiou1 --- dev/x86_64/src/intt.S | 3 + test/test_bounds.py | 124 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+) create mode 100644 test/test_bounds.py diff --git a/dev/x86_64/src/intt.S b/dev/x86_64/src/intt.S index 1b8b7a0cc..4e9979588 100644 --- a/dev/x86_64/src/intt.S +++ b/dev/x86_64/src/intt.S @@ -320,6 +320,9 @@ vpblendd $0xAA,%ymm9,%ymm7,%ymm7 * strengthened to |montmul_pos(a, b)| <= floor(3q/4) < ceil(3q/4) since LHS is * an integer and 3q/4 isn't. * + * See test/test_bounds.py for more empirical evidence (and some minor technical + * details). + * * TODO: Use proper citation. Currently, citations within asm can cause linter * to complain about unused citation, because comments are not preserved * after simpasm. diff --git a/test/test_bounds.py b/test/test_bounds.py new file mode 100644 index 000000000..505d8b95a --- /dev/null +++ b/test/test_bounds.py @@ -0,0 +1,124 @@ +# Copyright (c) The mlkem-native project authors +# Copyright (c) The mldsa-native project authors +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +# +# The purpose of this script is to provide either brute-force proof +# or empirical evidence to arithmetic bounds for the modular +# arithmetic primitives used in this repository. 
+# + +import random +from functools import lru_cache +from fractions import Fraction +from math import ceil + +# Global constants +R = 2**32 +Q = 8380417 +Qinv = pow(Q, -1, R) +NQinv = pow(-Q, -1, R) + + +# +# Montgomery multiplication +# + + +def lift_signed_i32(x): + """Returns signed canonical representative modulo R=2^32.""" + x = x % R + if x >= R // 2: + x -= R + return x + + +@lru_cache(maxsize=None) +def montmul_neg_twiddle(b): + return (b * NQinv) % R + + +@lru_cache(maxsize=None) +def montmul_pos_twiddle(b): + return (b * Qinv) % R + + +def montmul_neg(a, b): + b_twiddle = montmul_neg_twiddle(b) + return (a * b + Q * lift_signed_i32(a * b_twiddle)) // R + + +def montmul_pos(a, b): + b_twiddle = montmul_pos_twiddle(b) + return (a * b - Q * lift_signed_i32(a * b_twiddle)) // R + + +# +# Generic test functions +# + + +def test_random(f, test_name, num_tests=10000000, bound_a=R // 2, bound_b=Q // 2): + print(f"Randomly checking {test_name} ({num_tests} tests)...") + for i in range(num_tests): + if i % 100000 == 0: + print(f"... run {i} tests ({((i * 1000) // num_tests)/10}%)") + a = random.randrange(-bound_a, bound_a) + b = random.randrange(-bound_b, bound_b) + f(a, b) + + +# +# Test bound on "Montgomery multiplication with signed canonical constant", as +# used in AVX2 [I]NTT +# + +""" +In @[Survey_Hwang23, Section 2.2], the author noted the bound* + + |montmul(a, b)| <= (q/2) (1 + |a|/R). + +In particular, knowing that a fits inside int32_t (thus |a| <= R/2) already +implies |montmul(a, b)| <= 3q/4 < ceil(3q/4). + +(*) Strictly speaking, they considered the negative/additive variant + montmul_neg(a, b), but the exact same bound and proof also work for the + positive/subtractive variant montmul_pos(a, b). +""" + + +def montmul_pos_const_bound(a): + return Fraction(Q, 2) * (1 + Fraction(abs(a), R)) + + +def montmul_pos_const_bound_test(a, b): + ab = montmul_pos(a, b) + bound = montmul_pos_const_bound(a) + if abs(ab) > bound: + print(f"montmul_pos_const_bound_test failure for (a,b)={(a,b)}") + print(f"montmul_pos(a,b): {ab}") + print(f"bound: {bound}") + assert False + + +def montmul_pos_const_bound_test_random(): + test_random( + montmul_pos_const_bound_test, + "bound on Montgomery multiplication with constant, as used in AVX2 [I]NTT", + ) + + +def montmul_pos_const_bound_tight(): + """ + This example shows that, unless we know more about a or b, the bound + |montmul(a, b)| < ceil(3q/4) is the tightest exclusive bound. 
+ """ + a_worst = -R // 2 + b_worst = -(Q - 3) // 2 + ab_worst = montmul_pos(a_worst, b_worst) + bound = ceil(Fraction(3 * Q, 4)) + assert ab_worst == bound - 1 + + +montmul_pos_const_bound_test_random() +montmul_pos_const_bound_tight() From 5e4bc80fd94c2c9550a71c5dfc04b387baeab2e2 Mon Sep 17 00:00:00 2001 From: jammychiou1 Date: Mon, 27 Oct 2025 11:26:57 +0800 Subject: [PATCH 3/3] Add bounds reasoning comments to AVX2 basemul Signed-off-by: jammychiou1 --- dev/x86_64/src/pointwise.S | 7 +++++++ dev/x86_64/src/pointwise_acc_l4.S | 14 ++++++++++++++ dev/x86_64/src/pointwise_acc_l5.S | 15 +++++++++++++++ dev/x86_64/src/pointwise_acc_l7.S | 17 +++++++++++++++++ 4 files changed, 53 insertions(+) diff --git a/dev/x86_64/src/pointwise.S b/dev/x86_64/src/pointwise.S index 8bd73616f..6445054d5 100644 --- a/dev/x86_64/src/pointwise.S +++ b/dev/x86_64/src/pointwise.S @@ -61,6 +61,7 @@ _looptop1: vpsrlq ymm11, ymm10, 32 vpsrlq ymm13, ymm12, 32 vmovshdup ymm15, ymm14 + /* All: abs bound < 9q */ // Multiply vpmuldq ymm2, ymm2, ymm10 @@ -69,6 +70,7 @@ _looptop1: vpmuldq ymm5, ymm5, ymm13 vpmuldq ymm6, ymm6, ymm14 vpmuldq ymm7, ymm7, ymm15 + /* All: abs bound < 81q^2 < 81*2^46 < 2^53 = 2^21R < qR/2 */ // Reduce vpmuldq ymm10, ymm0, ymm2 @@ -92,6 +94,11 @@ _looptop1: vpsrlq ymm2, ymm2, 32 vpsrlq ymm4, ymm4, 32 vmovshdup ymm6, ymm6 + /* + * All coefficients are Montgomery-reduced. This results in the bound + * + * All: abs bound <= "input abs bound"/R + q/2 < (qR/2)/R + q/2 = q + */ // Store vpblendd ymm2, ymm2, ymm3, 0xAA diff --git a/dev/x86_64/src/pointwise_acc_l4.S b/dev/x86_64/src/pointwise_acc_l4.S index e64881ccb..44c2b62b0 100644 --- a/dev/x86_64/src/pointwise_acc_l4.S +++ b/dev/x86_64/src/pointwise_acc_l4.S @@ -37,12 +37,17 @@ vpsrlq ymm9, ymm8, 32 vmovshdup ymm11, ymm10 vmovshdup ymm13, ymm12 + /* + * 6, 7, 8, 9: from the first input polynomial, abs bound < q + * 10, 11, 12, 13: from the second input polynomial, abs bound < 9q + */ // Multiply vpmuldq ymm6, ymm6, ymm10 vpmuldq ymm7, ymm7, ymm11 vpmuldq ymm8, ymm8, ymm12 vpmuldq ymm9, ymm9, ymm13 + /* All: abs bound < 9q^2 */ .endm .macro acc @@ -80,15 +85,19 @@ _looptop2: vmovdqa ymm3, ymm7 vmovdqa ymm4, ymm8 vmovdqa ymm5, ymm9 + /* All: abs bound < 9q^2 */ pointwise 1024 acc + /* All: abs bound < 18q^2 */ pointwise 2048 acc + /* All: abs bound < 27q^2 */ pointwise 3072 acc + /* All: abs bound < 36q^2 < 36*2^46 < 2^52 = 2^20R < qR/2 */ // Reduce vpmuldq ymm6, ymm0, ymm2 @@ -105,6 +114,11 @@ _looptop2: vpsubq ymm5, ymm5, ymm9 vpsrlq ymm2, ymm2, 32 vmovshdup ymm4, ymm4 + /* + * All coefficients are Montgomery-reduced. 
This results in the bound + * + * All: abs bound <= "input abs bound"/R + q/2 < (qR/2)/R + q/2 = q + */ // Store vpblendd ymm2, ymm2, ymm3, 0xAA diff --git a/dev/x86_64/src/pointwise_acc_l5.S b/dev/x86_64/src/pointwise_acc_l5.S index db7348f19..020f517eb 100644 --- a/dev/x86_64/src/pointwise_acc_l5.S +++ b/dev/x86_64/src/pointwise_acc_l5.S @@ -37,12 +37,17 @@ vpsrlq ymm9, ymm8, 32 vmovshdup ymm11, ymm10 vmovshdup ymm13, ymm12 + /* + * 6, 7, 8, 9: from the first input polynomial, abs bound < q + * 10, 11, 12, 13: from the second input polynomial, abs bound < 9q + */ // Multiply vpmuldq ymm6, ymm6, ymm10 vpmuldq ymm7, ymm7, ymm11 vpmuldq ymm8, ymm8, ymm12 vpmuldq ymm9, ymm9, ymm13 + /* All: abs bound < 9q^2 */ .endm .macro acc @@ -80,18 +85,23 @@ _looptop2: vmovdqa ymm3, ymm7 vmovdqa ymm4, ymm8 vmovdqa ymm5, ymm9 + /* All: abs bound < 9q^2 */ pointwise 1024 acc + /* All: abs bound < 18q^2 */ pointwise 2048 acc + /* All: abs bound < 27q^2 */ pointwise 3072 acc + /* All: abs bound < 36q^2 */ pointwise 4096 acc + /* All: abs bound < 45q^2 < 45*2^46 < 2^52 = 2^20R < qR/2 */ // Reduce vpmuldq ymm6, ymm0, ymm2 @@ -108,6 +118,11 @@ _looptop2: vpsubq ymm5, ymm5, ymm9 vpsrlq ymm2, ymm2, 32 vmovshdup ymm4, ymm4 + /* + * All coefficients are Montgomery-reduced. This results in the bound + * + * All: abs bound <= "input abs bound"/R + q/2 < (qR/2)/R + q/2 = q + */ // Store vpblendd ymm2, ymm2, ymm3, 0xAA diff --git a/dev/x86_64/src/pointwise_acc_l7.S b/dev/x86_64/src/pointwise_acc_l7.S index bae230d75..835e87b05 100644 --- a/dev/x86_64/src/pointwise_acc_l7.S +++ b/dev/x86_64/src/pointwise_acc_l7.S @@ -37,12 +37,17 @@ vpsrlq ymm9, ymm8, 32 vmovshdup ymm11, ymm10 vmovshdup ymm13, ymm12 + /* + * 6, 7, 8, 9: from the first input polynomial, abs bound < q + * 10, 11, 12, 13: from the second input polynomial, abs bound < 9q + */ // Multiply vpmuldq ymm6, ymm6, ymm10 vpmuldq ymm7, ymm7, ymm11 vpmuldq ymm8, ymm8, ymm12 vpmuldq ymm9, ymm9, ymm13 + /* All: abs bound < 9q^2 */ .endm .macro acc @@ -80,24 +85,31 @@ _looptop2: vmovdqa ymm3, ymm7 vmovdqa ymm4, ymm8 vmovdqa ymm5, ymm9 + /* All: abs bound < 9q^2 */ pointwise 1024 acc + /* All: abs bound < 18q^2 */ pointwise 2048 acc + /* All: abs bound < 27q^2 */ pointwise 3072 acc + /* All: abs bound < 36q^2 */ pointwise 4096 acc + /* All: abs bound < 45q^2 */ pointwise 5120 acc + /* All: abs bound < 54q^2 */ pointwise 6144 acc + /* All: abs bound < 63q^2 < 63*2^46 < 2^52 = 2^20R < qR/2 */ // Reduce vpmuldq ymm6, ymm0, ymm2 @@ -114,6 +126,11 @@ _looptop2: vpsubq ymm5, ymm5, ymm9 vpsrlq ymm2, ymm2, 32 vmovshdup ymm4, ymm4 + /* + * All coefficients are Montgomery-reduced. This results in the bound + * + * All: abs bound <= "input abs bound"/R + q/2 < (qR/2)/R + q/2 = q + */ // Store vpblendd ymm2, ymm2, ymm3, 0xAA
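
As a supplementary sketch (hypothetical, not part of the patch series), the static inequalities quoted in the new bound comments can be re-checked with a few lines of Python in the style of test/test_bounds.py; the constants match those used there, and everything else is illustrative only.

# Hypothetical companion check, in the style of test/test_bounds.py: re-verify
# the static inequalities quoted in the AVX2 NTT/INTT/basemul bound comments.
from fractions import Fraction
from math import ceil

R = 2**32
Q = 8380417

# Forward NTT: coefficients grow by at most q per layer; inputs are < q and
# there are 8 layers, so MLD_NTT_BOUND = 9q must fit in int32_t.
assert 9 * Q < 2**31

# Inverse NTT: bounds double per layer (2q, 4q, ..., 256q) and must not
# overflow int32_t; this is the "2^31 / q > 256" remark in intt.S.
assert 2**31 > 256 * Q

# MLD_INTT_BOUND = MLDSA_Q * 3 / 4 + 1 (C integer division) equals ceil(3q/4).
assert Q * 3 // 4 + 1 == ceil(Fraction(3 * Q, 4))

# Basemul accumulators: the products stay below qR/2, so a single Montgomery
# reduction brings every coefficient back below q.
assert 81 * Q * Q < 2**53 < Q * R // 2  # pointwise.S
assert 36 * Q * Q < 2**52 < Q * R // 2  # pointwise_acc_l4.S
assert 45 * Q * Q < 2**52 < Q * R // 2  # pointwise_acc_l5.S
assert 63 * Q * Q < 2**52 < Q * R // 2  # pointwise_acc_l7.S

print("all static bound inequalities hold")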
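
The delayed Montgomery reduction annotated in the ntt.S butterfly can likewise be modelled numerically. The following is a minimal, hypothetical sketch (not part of the patch; it assumes the zl twiddles are zh * q^-1 mod 2^32, matching the montmul_pos convention of test/test_bounds.py) showing that applying the correction term mulhi(q, mullo(h, zl)) only after the add/sub agrees with computing montmul(h, zh) up front.

# Hypothetical model of the delayed correction in the ntt.S butterfly.
import random

R = 2**32
Q = 8380417
QINV = pow(Q, -1, R)  # q^-1 mod 2^32 (sign convention of montmul_pos)


def lift_signed_i32(x):
    """Signed canonical representative modulo R = 2^32."""
    x = x % R
    return x - R if x >= R // 2 else x


def mulhi(a, b):
    """High 32 bits of the signed 64-bit product (arithmetic shift right)."""
    return (a * b) >> 32


def montmul_pos(a, b):
    """Reference Montgomery multiplication, as in test/test_bounds.py."""
    t = lift_signed_i32(a * ((b * QINV) % R))
    return (a * b - Q * t) // R


def butterfly_delayed(l, h, zh):
    """Model of the ntt.S butterfly with the delayed correction."""
    zl = (zh * QINV) % R                    # precomputed twisted constant
    hi = mulhi(h, zh)                       # mulhi(h, zh)
    lo = mulhi(Q, lift_signed_i32(h * zl))  # mulhi(q, mullo(h, zl))
    sub = l - hi                            # = l - montmul(h, zh) - lo
    add = l + hi                            # = l + montmul(h, zh) + lo
    return add - lo, sub + lo               # finish the delayed correction


for _ in range(100000):
    l = random.randrange(-8 * Q, 8 * Q)
    h = random.randrange(-8 * Q, 8 * Q)
    zh = random.randrange(-(Q - 1) // 2, (Q - 1) // 2 + 1)
    t = montmul_pos(h, zh)
    assert butterfly_delayed(l, h, zh) == (l + t, l - t)

print("delayed butterfly matches montmul_pos on random inputs")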