@@ -389,9 +389,9 @@ static vint16m2_t mlk_rv64v_intt2(vint16m2_t vp, vint16m1_t cz)
   t0 = __riscv_vget_v_i16m2_i16m1(vp, 0);
   t1 = __riscv_vget_v_i16m2_i16m1(vp, 1);
 
-  /* move to positive range [0, q-1] for the reverse transform */
-  t0 = fq_mulq_vx(t0, MLK_RVV_MONT_R1, vl);
-  t1 = fq_mulq_vx(t1, MLK_RVV_MONT_R1, vl);
+  /* pre-scale and move to positive range [0, q-1] for inverse transform */
+  t0 = fq_mulq_vx(t0, MLK_RVV_MONT_NR, vl);
+  t1 = fq_mulq_vx(t1, MLK_RVV_MONT_NR, vl);
 
   c0 = __riscv_vrgather_vv_i16m1(cz, cs2, vl);
   MLK_RVV_BFLY_RV(t0, t1, vt, c0, vl);
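
The constant multiplication above now does double duty: the same per-lane multiply that brings the coefficients back into the positive range [0, q-1] also carries the inverse-NTT scaling that was previously applied in a separate pass at the end of mlk_rv64v_poly_invntt_tomont (removed in the next hunk). A minimal justification, assuming fq_mulq_vx(x, c, vl) is a per-lane Montgomery multiplication by a scalar constant:

```latex
% Assuming fq_mulq_vx(x, c, vl) computes the per-lane Montgomery product
%   mont(x, c) = x * c * R^{-1} mod q,   q = 3329,  R = 2^16.
\[
  \mathrm{mont}\bigl(\mathrm{mont}(x, c_1),\, c_2\bigr)
    \;=\; x\, c_1 c_2\, R^{-2} \bmod q
    \;=\; \mathrm{mont}\bigl(x,\ c_1 c_2 R^{-1} \bmod q\bigr)
\]
```

Two back-to-back constant multiplications therefore collapse into a single multiply by a pre-combined constant, and since the butterfly layers are linear, that combined factor can be applied once at the start of the inverse transform instead of at the end.
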
@@ -512,23 +512,6 @@ void mlk_rv64v_poly_invntt_tomont(int16_t *r)
   MLK_RVV_BFLY_RX(v6, ve, vt, izeta[0x01], vl);
   MLK_RVV_BFLY_RX(v7, vf, vt, izeta[0x01], vl);
 
-  v0 = fq_mulq_vx(v0, MLK_RVV_MONT_NR, vl);
-  v1 = fq_mulq_vx(v1, MLK_RVV_MONT_NR, vl);
-  v2 = fq_mulq_vx(v2, MLK_RVV_MONT_NR, vl);
-  v3 = fq_mulq_vx(v3, MLK_RVV_MONT_NR, vl);
-  v4 = fq_mulq_vx(v4, MLK_RVV_MONT_NR, vl);
-  v5 = fq_mulq_vx(v5, MLK_RVV_MONT_NR, vl);
-  v6 = fq_mulq_vx(v6, MLK_RVV_MONT_NR, vl);
-  v7 = fq_mulq_vx(v7, MLK_RVV_MONT_NR, vl);
-  v8 = fq_mulq_vx(v8, MLK_RVV_MONT_NR, vl);
-  v9 = fq_mulq_vx(v9, MLK_RVV_MONT_NR, vl);
-  va = fq_mulq_vx(va, MLK_RVV_MONT_NR, vl);
-  vb = fq_mulq_vx(vb, MLK_RVV_MONT_NR, vl);
-  vc = fq_mulq_vx(vc, MLK_RVV_MONT_NR, vl);
-  vd = fq_mulq_vx(vd, MLK_RVV_MONT_NR, vl);
-  ve = fq_mulq_vx(ve, MLK_RVV_MONT_NR, vl);
-  vf = fq_mulq_vx(vf, MLK_RVV_MONT_NR, vl);
-
   __riscv_vse16_v_i16m1(&r[0x00], v0, vl);
   __riscv_vse16_v_i16m1(&r[0x10], v1, vl);
   __riscv_vse16_v_i16m1(&r[0x20], v2, vl);
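
For illustration, a scalar sketch of why dropping the sixteen trailing multiplications is safe. This is not the mlkem-native vector code: fq_mulq, MLK_Q, MLK_QINV, and fold_is_exact below are hypothetical stand-ins mirroring the standard Kyber Montgomery arithmetic (q = 3329, R = 2^16), and the vector routine fq_mulq_vx is assumed to perform the same reduction per lane.

```c
#include <stdint.h>

#define MLK_Q 3329
#define MLK_QINV -3327 /* q^-1 mod 2^16 */

/* Montgomery multiplication x*c*R^-1 mod q (R = 2^16), normalized to
 * [0, q-1]. Requires |x|, |c| < q so the reduction lands in (-q, q). */
static int16_t fq_mulq(int16_t x, int16_t c)
{
  int32_t p = (int32_t)x * c;                            /* 32-bit product    */
  int16_t m = (int16_t)((int16_t)p * MLK_QINV);          /* p * q^-1 mod 2^16 */
  int16_t t = (int16_t)((p - (int32_t)m * MLK_Q) >> 16); /* (p - m*q) / R     */
  return (int16_t)(t < 0 ? t + MLK_Q : t);               /* into [0, q-1]     */
}

/* Applying c1 and then c2 equals a single multiply by the pre-combined
 * constant c1*c2*R^-1 mod q; both results are fully reduced to [0, q-1],
 * so they agree exactly, not just modulo q. */
int fold_is_exact(int16_t x, int16_t c1, int16_t c2)
{
  int16_t two_step = fq_mulq(fq_mulq(x, c1), c2);
  int16_t folded = fq_mulq(x, fq_mulq(c1, c2));
  return two_step == folded; /* always 1 for |x|, |c1|, |c2| < q */
}
```

Under that assumption, the factor applied by the removed fq_mulq_vx calls is absorbed into the two multiplications that mlk_rv64v_intt2 already performs per call, so no separate scaling pass over v0..vf is needed before the stores.
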