From 5944710a5faff1b175d5c4c4b6ddee2c13b3d095 Mon Sep 17 00:00:00 2001
From: Al-Kindi-0 <82364884+Al-Kindi-0@users.noreply.github.com>
Date: Thu, 19 Dec 2024 12:27:36 +0100
Subject: [PATCH] Optimize computation of DEEP queries in recursive verifier
 (#1594)

* feat: optimize computation of DEEP queries

chore: update changelog

chore: address feedback
Signed-off-by: Al-Kindi-0 <82364884+Al-Kindi-0@users.noreply.github.com>

* refactor: remove file

* feat: add paramter check to recursive verifier

chore: changelog update

chore: add back file and update it

Revert "refactor: remove file"

This reverts commit 384a5af794697cd4060d1a80f167fa9a571962b3.

---------

Signed-off-by: Al-Kindi-0 <82364884+Al-Kindi-0@users.noreply.github.com>
---
 CHANGELOG.md                              |   2 +
 stdlib/asm/crypto/stark/constants.masm    |  12 +-
 stdlib/asm/crypto/stark/deep_queries.masm | 139 +++++++++++++---------
 stdlib/asm/crypto/stark/mod.masm          |  19 +--
 stdlib/asm/crypto/stark/utils.masm        |  32 +++++
 stdlib/asm/crypto/stark/verifier.masm     |  17 ++-
 6 files changed, 149 insertions(+), 72 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 32fb2da348..922fa9d0b0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,8 @@
 #### Enhancements
 - Added options `--kernel`, `--debug` and `--output` to `miden bundle` (#1447).
 - Added `miden_core::mast::MastForest::advice_map` to load it into the advice provider before the `MastForest` execution (#1574).
+- Optimized the computation of the DEEP queries in the recursive verifier (#1594).
+- Added validity checks for the inputs to the recursive verifier (#1596).
 
 ## 0.11.0 (2024-11-04)
 
diff --git a/stdlib/asm/crypto/stark/constants.masm b/stdlib/asm/crypto/stark/constants.masm
index cf8784c9d7..8529917593 100644
--- a/stdlib/asm/crypto/stark/constants.masm
+++ b/stdlib/asm/crypto/stark/constants.masm
@@ -39,11 +39,15 @@ const.COMPOSITION_COEF_PTR=4294900200
 
 # We need 2 Felt for each trace column and each of the 8 constraint composition columns. We thus need
 # (70 + 7 + 8) * 2 Felt i.e. 43 memory slots.
+# Note that there is a cap on the number of such coefficients so that the memory region allocated for
+# these coefficients does not overlap with the memory region storing the FRI queries.
+# This cap is of a 100 coefficients which is equivalent to 50 memory slots. This gives 150 memory
+# slots for all of the FRI queries i.e., 150 FRI queries.
 const.DEEP_RAND_CC_PTR=4294903000
 
 # FRI
 #
-#       (FRI_COM_PTR - 100)    ---|
+#       (FRI_COM_PTR - 150)    ---|
 #              .
 #              .                  | <- FRI queries
 #              .
@@ -97,6 +101,7 @@ const.TMP5=4294903319
 const.TMP6=4294903320
 const.TMP7=4294903321
 const.TMP8=4294903322
+const.TMP9=4294903323
 
 
 
@@ -135,6 +140,7 @@ const.TMP8=4294903322
 #   | TMP6                                     |       4294903320        |
 #   | TMP7                                     |       4294903321        |
 #   | TMP8                                     |       4294903322        |
+#   | TMP9                                     |       4294903323        |
 #   +------------------------------------------+-------------------------+
 
 # ACCESSORS
@@ -305,3 +311,7 @@ end
 export.tmp8
     push.TMP8
 end
+
+export.tmp9
+    push.TMP9
+end
diff --git a/stdlib/asm/crypto/stark/deep_queries.masm b/stdlib/asm/crypto/stark/deep_queries.masm
index 0ce9e0e962..41c721c066 100644
--- a/stdlib/asm/crypto/stark/deep_queries.masm
+++ b/stdlib/asm/crypto/stark/deep_queries.masm
@@ -127,22 +127,23 @@ end
 #! Loads the next query rows in the main, auxiliary and constraint composition polynomials traces.
 #! It takes a pointer to the current random query index and returns that index.
 #!
-#! Input: [query_ptr, ...]
-#! Output: [index, query_ptr, ...]
+#! Input: [Y, query_ptr, ...]
+#! Output: [Y, Y, index, query_ptr, ...]
+#!
+#! where:
+#! - Y is a "garbage" word.
 #!
-#! Cycles: 219
+#! Cycles: 206
 proc.load_query_row
     # Main trace portion of the query
 
     ## Get the next query index
-    padw
     dup.4
     mem_loadw
-    swap
-    #=> [depth, index, y, y, query_ptr, ...]
+    #=> [index, depth, y, y, query_ptr, ...] where y are "garbage" values
 
     ## Get main trace commitment and use it to get the leaf
-    movup.3 movup.3
+    movdn.3 movdn.2
     push.0.0
     exec.constants::main_trace_com_ptr mem_loadw
     #=>[R, depth, index, query_ptr, ...]
@@ -273,50 +274,48 @@ proc.load_query_row
     assert_eq
     #=> [Y, ptr, y, y, y, depth, index, query_ptr, ...]
 
-    dropw dropw drop
-    #=> [index, query_ptr, ...]
+    drop
+    #=> [Y, Y, index, query_ptr, ...]
 end
 
 #! Takes a query index and computes x := offset * domain_gen^index. It also computes the denominators
 #! (x - z) and (x - gz).
 #!
-#! Input: [index, ...]
-#! Output: [Z, x, index, ...]  where Z := [-gz1, x -gz0, -z1, x - z0]
+#! Input: [Y, Y, index, ...]
+#! Output: [Z, Y, x, index, ...]  
+#!
+#! where:
+#! - Z := [-gz1, x -gz0, -z1, x - z0]
+#! - Y is a "garbage" word
 #!
-#! Cycles: 68
+#! Cycles: 58
 proc.compute_denominators
     # Compute x = offset * domain_gen^index
-    padw
     exec.constants::lde_size_ptr mem_loadw
-    #=> [lde_size, depth, domain_gen, 0, index, ...]
+    #=> [lde_size, depth, domain_gen, 0, Y, index, ...]
     movup.2
-    dup.4
+    dup.8
     exp.u32
     exec.constants::domain_offset mul
-    #=> [x, lde_size, depth, 0, index, ...]
+    #=> [x, lde_size, depth, 0, Y, index, ...]
 
     # Get z and gz from memory
     movdn.3
-    #=> [lde_size, depth, 0, x, index, ...]
+    #=> [lde_size, depth, 0, x, Y, index, ...]
     push.0
     exec.constants::tmp1 mem_loadw
-    #=> [gz1, gz0, z1, z0, x, index, ...]
+    #=> [-z0, -gz0, -gz1, -z1, x, Y, index, ...]
 
-    # Compute Z := [-z1, x - z0, -gz1, x -gz0]
-    neg
-    dup.4
-    movup.2
-    sub
-    #=> [x-gz0, -gz1, z1, z0, x, index, ...]
+    dup.4 add
+    #=> [x-z0, -gz0, -gz1, -z1, x, Y, index, ...]
     movdn.3
-    movdn.2
-    #=> [z1, z0, -gz1, x-gz0, x, index, ...]
-    neg
-    movdn.3
-    dup.4 swap
-    sub
-    movdn.3
-    #=> [Z, x, index, ...] where Z := [-gz1, x -gz0, -z1, x - z0]
+    #=> [-gz0, -gz1, -z1, x-z0, x, Y, index, ...]
+
+    movup.4 dup movdn.9
+    #=> [x, -gz0, -gz1, -z1, x-z0, Y, x, index, ...]
+
+    add swap
+    #=> [-gz1, x - gz0, -z1, x-z0, Y, x, index, ...]
 end
 
 #! Computes the random linear combination involving the main trace columns and accumulates
@@ -429,7 +428,7 @@ proc.combine_constraint_poly_columns
     dropw
     swapw
     exec.constants::tmp3 mem_loadw
-    #=> [Acc3, Acc2, y, y, y, y, Acc1`, Acc0`, ...]
+    #=> [Acc3, Acc2, y, y, y, y, Acc1`, Acc0`, ...] where y are "garbage" values
     movdn.5 movdn.5
     #=> [y, y, y, y, Acc3, Acc2, Acc1`, Acc0`, ...]
     dropw
@@ -466,12 +465,43 @@ end
 #! Compute the DEEP composition polynomial FRI queries.
 #!
 #! Input: [query_ptr, ...]
-#! Output: [...]
-#! Cycles: 6 + num_queries * 473
+#! Output: [query_ptr, ...]
+#! Cycles: 24 + num_queries * 445
 export.compute_deep_composition_polynomial_queries
+
+    # Get pointer to help test for the last query to be processed
     exec.constants::fri_com_ptr
     dup.1
-    #=>[query_ptr, query_end_ptr, ...]
+    # =>[query_ptr, query_end_ptr, query_ptr...]
+    # Store the pointers to:
+    # 1. random values for computing DEEP polynomial
+    # 2. OOD evaluation frame
+    # 3. trace row values for the current query
+    push.0
+    exec.constants::deep_rand_coef_ptr
+    exec.constants::ood_trace_ptr
+    exec.constants::current_trace_row_ptr
+    exec.constants::tmp9 mem_storew
+    # =>[Y, query_ptr, query_end_ptr, query_ptr, ...]
+
+    # Compute the negations of z and gz where z is the OOD point
+    # We do it here as this computation is common to all queries.
+    exec.constants::z_ptr mem_loadw
+    # => [zN_1, zN_0, z1, z0, query_ptr, query_end_ptr, query_ptr, ...]
+    drop drop
+    neg swap neg  
+    # => [-z0, -z1, query_ptr, query_end_ptr, query_ptr, ...]
+    dup.1 exec.constants::trace_domain_generator_ptr mem_load mul
+    # => [-gz1, -z0, -z1, query_ptr, query_end_ptr, query_ptr, ...]
+    swap
+    # => [-z0, -gz1, -z1, query_ptr, query_end_ptr, query_ptr, ...]
+    dup exec.constants::trace_domain_generator_ptr mem_load mul
+    # => [-gz0, -z0, -gz1, -z1, query_ptr, query_end_ptr, query_ptr, ...]
+    swap
+    # => [-z0, -gz0, -gz1, -z1, query_ptr, query_end_ptr, query_ptr, ...]
+    # Save to temporary location `tmp1` for later use when computing the denominators
+    exec.constants::tmp1 mem_storew
+    # => [Y, query_ptr, query_end_ptr, query_ptr, ...]
 
     push.1
     while.true
@@ -480,19 +510,18 @@ export.compute_deep_composition_polynomial_queries
         # Load the (main, aux, constraint)-traces rows associated with the current query and get
         # the index of the query.
         #
-        # Cycles: 219
+        # Cycles: 206
         exec.load_query_row
-        #=>[index, query_ptr, query_end_ptr, ...]
+        #=>[Y, Y, index, query_ptr, query_end_ptr, query_ptr, ...]
 
 
         # II)
         #
         # Compute x := offset * domain_gen^index and denominators (x - z) and (x - gz)
         #
-        # Cycles: 68
+        # Cycles: 58
         exec.compute_denominators
-        #=> [Z, x, index, query_ptr, query_end_ptr, ...] where Z := [-gz1, x - gz0, -z1, x - z0]
-
+        #=> [Z, Y, x, index, query_ptr, query_end_ptr, query_ptr, ...] where Z := [-gz1, x - gz0, -z1, x - z0]
 
         # III)
         #
@@ -503,26 +532,23 @@ export.compute_deep_composition_polynomial_queries
 
         ## a) Push pointers
         ##
-        ## Cycles: 4
-        push.0
-        exec.constants::deep_rand_coef_ptr
-        exec.constants::ood_trace_ptr
-        exec.constants::current_trace_row_ptr
-        #=> [P, Z, x, index, query_ptr, query_end_ptr, ...]
+        ## Cycles: 3
+        swapw exec.constants::tmp9 mem_loadw
+        #=> [P, Z, x, index, query_ptr, query_end_ptr, query_ptr, ...]
         # where P := [CURRENT_TRACE_ROW_PTR, OOD_TRACE_PTR, DEEP_RAND_CC_PTR, 0]
 
         ## b) Push the accumulators
         ##
         ## Cycles: 4
         padw
-        #=> [Acc, P, Z, x, index, query_ptr, query_end_ptr, ...]
+        #=> [Acc, P, Z, x, index, query_ptr, query_end_ptr, query_ptr, ...]
         #=> where Acc =: [Acc3, Acc2, Acc1, Acc0]
 
         ## c) This will be used to mstream the elements T_i(x)
         ##
         ## Cycles: 8
         padw padw
-        #=> [Y, Y, Acc, P, Z, x, index, query_ptr, query_end_ptr, ...]
+        #=> [Y, Y, Acc, P, Z, x, index, query_ptr, query_end_ptr, query_ptr, ...]
 
         ## d) Compute the random linear combination
         ##
@@ -530,13 +556,13 @@ export.compute_deep_composition_polynomial_queries
         exec.combine_main_trace_columns
         exec.combine_aux_trace_columns     
         exec.combine_constraint_poly_columns
-        #=> [Acc, Z, x, index, query_ptr, query_end_ptr, ...]
+        #=> [Acc, Z, x, index, query_ptr, query_end_ptr, query_ptr, ...]
 
         ## e) Divide by denominators and sum to get final result
         ##
         ## Cycles: 38
         exec.divide_by_denominators_and_sum
-        #=> [eval1, eval0, x, index, query_ptr, query_end_ptr, ...]
+        #=> [eval1, eval0, x, index, query_ptr, query_end_ptr, query_ptr, ...]
 
 
         # IV)
@@ -549,22 +575,21 @@ export.compute_deep_composition_polynomial_queries
         ## Cycles: 4
         movup.3 movup.3
         exec.constants::domain_offset_inv mul
-        #=> [poe, index, eval1, eval0, query_ptr, query_end_ptr, ...]
+        #=> [poe, index, eval1, eval0, query_ptr, query_end_ptr, query_ptr, ...]
 
         ## b) Store [eval0, eval1, index, poe]
         ##
         ## Cycles: 5
         dup.4 add.1 swap.5
         mem_storew
-        #=> [poe, index, eval1, eval0, query_ptr+1, query_end_ptr, ...]
+        #=> [poe, index, eval1, eval0, query_ptr+1, query_end_ptr, query_ptr, ...]
 
         ## c) Prepare stack for next iteration
         ##
-        ## Cycles: 8
-        dropw
-        dup.1 dup.1
+        ## Cycles: 4
+        dup.5 dup.5
         neq
         #=> [?, query_ptr+1, query_end_ptr, ...]
     end
-    drop drop
+    dropw drop drop
 end
diff --git a/stdlib/asm/crypto/stark/mod.masm b/stdlib/asm/crypto/stark/mod.masm
index 492f6da59d..776b802297 100644
--- a/stdlib/asm/crypto/stark/mod.masm
+++ b/stdlib/asm/crypto/stark/mod.masm
@@ -4,23 +4,24 @@ use.std::crypto::stark::verifier
 #!   The following simplifying assumptions are currently made:
 #!   - The blowup is set to 8.
 #!   - The maximal allowed degree of the remainder polynomial is 7.
-#!   - Only the input and output stacks, assumed of fixed size equal to 16, are handled in regards
-#!   to public inputs.
+#!   - The public inputs are composed of the input and output stacks, of fixed size equal to 16.
 #!   - There are two trace segments, main and auxiliary. It is assumed that the main trace segment
-#!   is 73 columns wide while the auxiliary trace segment is 9 columns wide.
+#!   is 70 columns wide while the auxiliary trace segment is 7 columns wide.
 #!   - The OOD evaluation frame is composed of two interleaved rows, current and next, each composed
-#!    of 73 elements representing the main trace portion and 9 elements for the auxiliary trace one.
+#!    of 70 elements representing the main trace portion and 7 elements for the auxiliary trace one.
 #!   - To boost soundness, the protocol is run on a quadratic extension field and this means that
 #!    the OOD evaluation frame is composed of elements in a quadratic extension field i.e. tuples.
-#!    Similarly, elements of the auxiliary trace are quadratic extension field elements.
+#!    Similarly, elements of the auxiliary trace are quadratic extension field elements. The random
+#!    values for computing random linear combinations are also in this extension field.
 #!   - The following procedure makes use of global memory address beyond 3 * 2^30 and these are
 #!    defined in `constants.masm`.
 #!
-#! Input: [log(trace_length), num_queries, log(blowup), grinding]
-#! Output: []
+#! Input: [log(trace_length), num_queries, log(blowup), grinding, ...]
+#! Output: [...]
+#!
 #! Cycles:
 #!  1- Remainder codeword size 32:
-#!   5000 + num_queries * (40 + num_fri_layers * 76 + 26 + 463) + 83 * num_fri_layers + 10 * log(trace_length) + 1633
+#!   5013 + num_queries * (40 + num_fri_layers * 76 + 26 + 445) + 83 * num_fri_layers + 10 * log(trace_length) + 1633
 #!  2- Remainder codeword size 64:
-#!   5000 + num_queries * (40 + num_fri_layers * 76 + 26 + 463) + 83 * num_fri_layers + 10 * log(trace_length) + 3109
+#!   5013 + num_queries * (40 + num_fri_layers * 76 + 26 + 445) + 83 * num_fri_layers + 10 * log(trace_length) + 3109
 export.verifier::verify
diff --git a/stdlib/asm/crypto/stark/utils.masm b/stdlib/asm/crypto/stark/utils.masm
index 27553f0dde..5f701da93c 100644
--- a/stdlib/asm/crypto/stark/utils.masm
+++ b/stdlib/asm/crypto/stark/utils.masm
@@ -16,3 +16,35 @@ export.compute_lde_generator
     exp.u32
     # => [domain_gen, ..]
 end
+
+#! Validates the inputs to the recursive verifier.
+#!
+#! Input: [log(trace_length), num_queries, log(blowup), grinding, ...]
+#! Output: [log(trace_length), num_queries, log(blowup), grinding, ...]
+#!
+#! Cycles: 28
+export.validate_inputs
+    # 1) Assert that all inputs are u32 so that we can use u32 operations in what follows
+    dupw
+    u32assertw
+    # => [log(trace_length), num_queries, log(blowup), grinding, ...]
+
+    # 2) Assert that the trace length is at most 29. The 2-adicity of our field is 32 and since
+    #    the blowup factor is 8, we need to make sure that the LDE size is at most 2^32.
+    #    We also check that the trace length is greater than the minimal length supported i.e., 2^6.
+    dup u32lt.30 assert
+    u32gt.5 assert
+
+    # 3) Assert that the number of FRI queries is at most 150. This restriction is a soft one
+    #    and is due to the memory layout in the `constants.masm` files but can be updated
+    #    therein.
+    #    We also make sure that there is at least one FRI query.
+    dup u32lt.151 assert
+    u32gt.0 assert
+
+    # 4) Assert that the the log(blowup) is 3
+    eq.3 assert
+
+    # 5) Assert that the grinding factor is at most 31
+    u32lt.32 assert
+end
diff --git a/stdlib/asm/crypto/stark/verifier.masm b/stdlib/asm/crypto/stark/verifier.masm
index edcc4adeca..0183fe2a78 100644
--- a/stdlib/asm/crypto/stark/verifier.masm
+++ b/stdlib/asm/crypto/stark/verifier.masm
@@ -6,7 +6,7 @@ use.std::crypto::stark::random_coin
 use.std::crypto::stark::ood_frames
 use.std::crypto::stark::public_inputs
 use.std::crypto::stark::constants
-
+use.std::crypto::stark::utils
 
 #!   Verify a STARK proof attesting to the correct execution of a program in the Miden VM.
 #!   The following simplifying assumptions are currently made:
@@ -25,18 +25,25 @@ use.std::crypto::stark::constants
 #!    defined in `constants.masm`.
 #!
 #! Input: [log(trace_length), num_queries, log(blowup), grinding, ...]
-#! Output: []
+#! Output: [...]
+#!
 #! Cycles:
 #!  1- Remainder codeword size 32:
-#!   4975 + num_queries * (40 + num_fri_layers * 76 + 26 + 473) + 83 * num_fri_layers + 10 * log(trace_length) + 1633
+#!   5013 + num_queries * (40 + num_fri_layers * 76 + 26 + 445) + 83 * num_fri_layers + 10 * log(trace_length) + 1633
 #!  2- Remainder codeword size 64:
-#!   4975 + num_queries * (40 + num_fri_layers * 76 + 26 + 473) + 83 * num_fri_layers + 10 * log(trace_length) + 3109
+#!   5013 + num_queries * (40 + num_fri_layers * 76 + 26 + 445) + 83 * num_fri_layers + 10 * log(trace_length) + 3109
 export.verify
 
     #==============================================================================================
     #       I)  Hash proof context and hash-&-load public inputs
     #==============================================================================================
 
+    # Validate inputs
+    #
+    # Cycles: 28
+    exec.utils::validate_inputs
+    # => [log(trace_length), num_queries, log(blowup), grinding, ...]
+
     # Initialize the seed using proof context
     #
     # Cycles: 82
@@ -221,7 +228,7 @@ export.verify
 
     # Compute deep compostion polynomial queries
     #
-    # Cycles: 14 + num_queries * 473
+    # Cycles: 24 + num_queries * 445
     #=> [query_ptr, ...]
     exec.deep_queries::compute_deep_composition_polynomial_queries
     #=> [query_ptr, ...]