From 5944710a5faff1b175d5c4c4b6ddee2c13b3d095 Mon Sep 17 00:00:00 2001 From: Al-Kindi-0 <82364884+Al-Kindi-0@users.noreply.github.com> Date: Thu, 19 Dec 2024 12:27:36 +0100 Subject: [PATCH] Optimize computation of DEEP queries in recursive verifier (#1594) * feat: optimize computation of DEEP queries chore: update changelog chore: address feedback Signed-off-by: Al-Kindi-0 <82364884+Al-Kindi-0@users.noreply.github.com> * refactor: remove file * feat: add paramter check to recursive verifier chore: changelog update chore: add back file and update it Revert "refactor: remove file" This reverts commit 384a5af794697cd4060d1a80f167fa9a571962b3. --------- Signed-off-by: Al-Kindi-0 <82364884+Al-Kindi-0@users.noreply.github.com> --- CHANGELOG.md | 2 + stdlib/asm/crypto/stark/constants.masm | 12 +- stdlib/asm/crypto/stark/deep_queries.masm | 139 +++++++++++++--------- stdlib/asm/crypto/stark/mod.masm | 19 +-- stdlib/asm/crypto/stark/utils.masm | 32 +++++ stdlib/asm/crypto/stark/verifier.masm | 17 ++- 6 files changed, 149 insertions(+), 72 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 32fb2da348..922fa9d0b0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,8 @@ #### Enhancements - Added options `--kernel`, `--debug` and `--output` to `miden bundle` (#1447). - Added `miden_core::mast::MastForest::advice_map` to load it into the advice provider before the `MastForest` execution (#1574). +- Optimized the computation of the DEEP queries in the recursive verifier (#1594). +- Added validity checks for the inputs to the recursive verifier (#1596). ## 0.11.0 (2024-11-04) diff --git a/stdlib/asm/crypto/stark/constants.masm b/stdlib/asm/crypto/stark/constants.masm index cf8784c9d7..8529917593 100644 --- a/stdlib/asm/crypto/stark/constants.masm +++ b/stdlib/asm/crypto/stark/constants.masm @@ -39,11 +39,15 @@ const.COMPOSITION_COEF_PTR=4294900200 # We need 2 Felt for each trace column and each of the 8 constraint composition columns. We thus need # (70 + 7 + 8) * 2 Felt i.e. 43 memory slots. +# Note that there is a cap on the number of such coefficients so that the memory region allocated for +# these coefficients does not overlap with the memory region storing the FRI queries. +# This cap is of a 100 coefficients which is equivalent to 50 memory slots. This gives 150 memory +# slots for all of the FRI queries i.e., 150 FRI queries. const.DEEP_RAND_CC_PTR=4294903000 # FRI # -# (FRI_COM_PTR - 100) ---| +# (FRI_COM_PTR - 150) ---| # . # . | <- FRI queries # . @@ -97,6 +101,7 @@ const.TMP5=4294903319 const.TMP6=4294903320 const.TMP7=4294903321 const.TMP8=4294903322 +const.TMP9=4294903323 @@ -135,6 +140,7 @@ const.TMP8=4294903322 # | TMP6 | 4294903320 | # | TMP7 | 4294903321 | # | TMP8 | 4294903322 | +# | TMP9 | 4294903323 | # +------------------------------------------+-------------------------+ # ACCESSORS @@ -305,3 +311,7 @@ end export.tmp8 push.TMP8 end + +export.tmp9 + push.TMP9 +end diff --git a/stdlib/asm/crypto/stark/deep_queries.masm b/stdlib/asm/crypto/stark/deep_queries.masm index 0ce9e0e962..41c721c066 100644 --- a/stdlib/asm/crypto/stark/deep_queries.masm +++ b/stdlib/asm/crypto/stark/deep_queries.masm @@ -127,22 +127,23 @@ end #! Loads the next query rows in the main, auxiliary and constraint composition polynomials traces. #! It takes a pointer to the current random query index and returns that index. #! -#! Input: [query_ptr, ...] -#! Output: [index, query_ptr, ...] +#! Input: [Y, query_ptr, ...] +#! Output: [Y, Y, index, query_ptr, ...] +#! +#! where: +#! - Y is a "garbage" word. #! -#! Cycles: 219 +#! Cycles: 206 proc.load_query_row # Main trace portion of the query ## Get the next query index - padw dup.4 mem_loadw - swap - #=> [depth, index, y, y, query_ptr, ...] + #=> [index, depth, y, y, query_ptr, ...] where y are "garbage" values ## Get main trace commitment and use it to get the leaf - movup.3 movup.3 + movdn.3 movdn.2 push.0.0 exec.constants::main_trace_com_ptr mem_loadw #=>[R, depth, index, query_ptr, ...] @@ -273,50 +274,48 @@ proc.load_query_row assert_eq #=> [Y, ptr, y, y, y, depth, index, query_ptr, ...] - dropw dropw drop - #=> [index, query_ptr, ...] + drop + #=> [Y, Y, index, query_ptr, ...] end #! Takes a query index and computes x := offset * domain_gen^index. It also computes the denominators #! (x - z) and (x - gz). #! -#! Input: [index, ...] -#! Output: [Z, x, index, ...] where Z := [-gz1, x -gz0, -z1, x - z0] +#! Input: [Y, Y, index, ...] +#! Output: [Z, Y, x, index, ...] +#! +#! where: +#! - Z := [-gz1, x -gz0, -z1, x - z0] +#! - Y is a "garbage" word #! -#! Cycles: 68 +#! Cycles: 58 proc.compute_denominators # Compute x = offset * domain_gen^index - padw exec.constants::lde_size_ptr mem_loadw - #=> [lde_size, depth, domain_gen, 0, index, ...] + #=> [lde_size, depth, domain_gen, 0, Y, index, ...] movup.2 - dup.4 + dup.8 exp.u32 exec.constants::domain_offset mul - #=> [x, lde_size, depth, 0, index, ...] + #=> [x, lde_size, depth, 0, Y, index, ...] # Get z and gz from memory movdn.3 - #=> [lde_size, depth, 0, x, index, ...] + #=> [lde_size, depth, 0, x, Y, index, ...] push.0 exec.constants::tmp1 mem_loadw - #=> [gz1, gz0, z1, z0, x, index, ...] + #=> [-z0, -gz0, -gz1, -z1, x, Y, index, ...] - # Compute Z := [-z1, x - z0, -gz1, x -gz0] - neg - dup.4 - movup.2 - sub - #=> [x-gz0, -gz1, z1, z0, x, index, ...] + dup.4 add + #=> [x-z0, -gz0, -gz1, -z1, x, Y, index, ...] movdn.3 - movdn.2 - #=> [z1, z0, -gz1, x-gz0, x, index, ...] - neg - movdn.3 - dup.4 swap - sub - movdn.3 - #=> [Z, x, index, ...] where Z := [-gz1, x -gz0, -z1, x - z0] + #=> [-gz0, -gz1, -z1, x-z0, x, Y, index, ...] + + movup.4 dup movdn.9 + #=> [x, -gz0, -gz1, -z1, x-z0, Y, x, index, ...] + + add swap + #=> [-gz1, x - gz0, -z1, x-z0, Y, x, index, ...] end #! Computes the random linear combination involving the main trace columns and accumulates @@ -429,7 +428,7 @@ proc.combine_constraint_poly_columns dropw swapw exec.constants::tmp3 mem_loadw - #=> [Acc3, Acc2, y, y, y, y, Acc1`, Acc0`, ...] + #=> [Acc3, Acc2, y, y, y, y, Acc1`, Acc0`, ...] where y are "garbage" values movdn.5 movdn.5 #=> [y, y, y, y, Acc3, Acc2, Acc1`, Acc0`, ...] dropw @@ -466,12 +465,43 @@ end #! Compute the DEEP composition polynomial FRI queries. #! #! Input: [query_ptr, ...] -#! Output: [...] -#! Cycles: 6 + num_queries * 473 +#! Output: [query_ptr, ...] +#! Cycles: 24 + num_queries * 445 export.compute_deep_composition_polynomial_queries + + # Get pointer to help test for the last query to be processed exec.constants::fri_com_ptr dup.1 - #=>[query_ptr, query_end_ptr, ...] + # =>[query_ptr, query_end_ptr, query_ptr...] + # Store the pointers to: + # 1. random values for computing DEEP polynomial + # 2. OOD evaluation frame + # 3. trace row values for the current query + push.0 + exec.constants::deep_rand_coef_ptr + exec.constants::ood_trace_ptr + exec.constants::current_trace_row_ptr + exec.constants::tmp9 mem_storew + # =>[Y, query_ptr, query_end_ptr, query_ptr, ...] + + # Compute the negations of z and gz where z is the OOD point + # We do it here as this computation is common to all queries. + exec.constants::z_ptr mem_loadw + # => [zN_1, zN_0, z1, z0, query_ptr, query_end_ptr, query_ptr, ...] + drop drop + neg swap neg + # => [-z0, -z1, query_ptr, query_end_ptr, query_ptr, ...] + dup.1 exec.constants::trace_domain_generator_ptr mem_load mul + # => [-gz1, -z0, -z1, query_ptr, query_end_ptr, query_ptr, ...] + swap + # => [-z0, -gz1, -z1, query_ptr, query_end_ptr, query_ptr, ...] + dup exec.constants::trace_domain_generator_ptr mem_load mul + # => [-gz0, -z0, -gz1, -z1, query_ptr, query_end_ptr, query_ptr, ...] + swap + # => [-z0, -gz0, -gz1, -z1, query_ptr, query_end_ptr, query_ptr, ...] + # Save to temporary location `tmp1` for later use when computing the denominators + exec.constants::tmp1 mem_storew + # => [Y, query_ptr, query_end_ptr, query_ptr, ...] push.1 while.true @@ -480,19 +510,18 @@ export.compute_deep_composition_polynomial_queries # Load the (main, aux, constraint)-traces rows associated with the current query and get # the index of the query. # - # Cycles: 219 + # Cycles: 206 exec.load_query_row - #=>[index, query_ptr, query_end_ptr, ...] + #=>[Y, Y, index, query_ptr, query_end_ptr, query_ptr, ...] # II) # # Compute x := offset * domain_gen^index and denominators (x - z) and (x - gz) # - # Cycles: 68 + # Cycles: 58 exec.compute_denominators - #=> [Z, x, index, query_ptr, query_end_ptr, ...] where Z := [-gz1, x - gz0, -z1, x - z0] - + #=> [Z, Y, x, index, query_ptr, query_end_ptr, query_ptr, ...] where Z := [-gz1, x - gz0, -z1, x - z0] # III) # @@ -503,26 +532,23 @@ export.compute_deep_composition_polynomial_queries ## a) Push pointers ## - ## Cycles: 4 - push.0 - exec.constants::deep_rand_coef_ptr - exec.constants::ood_trace_ptr - exec.constants::current_trace_row_ptr - #=> [P, Z, x, index, query_ptr, query_end_ptr, ...] + ## Cycles: 3 + swapw exec.constants::tmp9 mem_loadw + #=> [P, Z, x, index, query_ptr, query_end_ptr, query_ptr, ...] # where P := [CURRENT_TRACE_ROW_PTR, OOD_TRACE_PTR, DEEP_RAND_CC_PTR, 0] ## b) Push the accumulators ## ## Cycles: 4 padw - #=> [Acc, P, Z, x, index, query_ptr, query_end_ptr, ...] + #=> [Acc, P, Z, x, index, query_ptr, query_end_ptr, query_ptr, ...] #=> where Acc =: [Acc3, Acc2, Acc1, Acc0] ## c) This will be used to mstream the elements T_i(x) ## ## Cycles: 8 padw padw - #=> [Y, Y, Acc, P, Z, x, index, query_ptr, query_end_ptr, ...] + #=> [Y, Y, Acc, P, Z, x, index, query_ptr, query_end_ptr, query_ptr, ...] ## d) Compute the random linear combination ## @@ -530,13 +556,13 @@ export.compute_deep_composition_polynomial_queries exec.combine_main_trace_columns exec.combine_aux_trace_columns exec.combine_constraint_poly_columns - #=> [Acc, Z, x, index, query_ptr, query_end_ptr, ...] + #=> [Acc, Z, x, index, query_ptr, query_end_ptr, query_ptr, ...] ## e) Divide by denominators and sum to get final result ## ## Cycles: 38 exec.divide_by_denominators_and_sum - #=> [eval1, eval0, x, index, query_ptr, query_end_ptr, ...] + #=> [eval1, eval0, x, index, query_ptr, query_end_ptr, query_ptr, ...] # IV) @@ -549,22 +575,21 @@ export.compute_deep_composition_polynomial_queries ## Cycles: 4 movup.3 movup.3 exec.constants::domain_offset_inv mul - #=> [poe, index, eval1, eval0, query_ptr, query_end_ptr, ...] + #=> [poe, index, eval1, eval0, query_ptr, query_end_ptr, query_ptr, ...] ## b) Store [eval0, eval1, index, poe] ## ## Cycles: 5 dup.4 add.1 swap.5 mem_storew - #=> [poe, index, eval1, eval0, query_ptr+1, query_end_ptr, ...] + #=> [poe, index, eval1, eval0, query_ptr+1, query_end_ptr, query_ptr, ...] ## c) Prepare stack for next iteration ## - ## Cycles: 8 - dropw - dup.1 dup.1 + ## Cycles: 4 + dup.5 dup.5 neq #=> [?, query_ptr+1, query_end_ptr, ...] end - drop drop + dropw drop drop end diff --git a/stdlib/asm/crypto/stark/mod.masm b/stdlib/asm/crypto/stark/mod.masm index 492f6da59d..776b802297 100644 --- a/stdlib/asm/crypto/stark/mod.masm +++ b/stdlib/asm/crypto/stark/mod.masm @@ -4,23 +4,24 @@ use.std::crypto::stark::verifier #! The following simplifying assumptions are currently made: #! - The blowup is set to 8. #! - The maximal allowed degree of the remainder polynomial is 7. -#! - Only the input and output stacks, assumed of fixed size equal to 16, are handled in regards -#! to public inputs. +#! - The public inputs are composed of the input and output stacks, of fixed size equal to 16. #! - There are two trace segments, main and auxiliary. It is assumed that the main trace segment -#! is 73 columns wide while the auxiliary trace segment is 9 columns wide. +#! is 70 columns wide while the auxiliary trace segment is 7 columns wide. #! - The OOD evaluation frame is composed of two interleaved rows, current and next, each composed -#! of 73 elements representing the main trace portion and 9 elements for the auxiliary trace one. +#! of 70 elements representing the main trace portion and 7 elements for the auxiliary trace one. #! - To boost soundness, the protocol is run on a quadratic extension field and this means that #! the OOD evaluation frame is composed of elements in a quadratic extension field i.e. tuples. -#! Similarly, elements of the auxiliary trace are quadratic extension field elements. +#! Similarly, elements of the auxiliary trace are quadratic extension field elements. The random +#! values for computing random linear combinations are also in this extension field. #! - The following procedure makes use of global memory address beyond 3 * 2^30 and these are #! defined in `constants.masm`. #! -#! Input: [log(trace_length), num_queries, log(blowup), grinding] -#! Output: [] +#! Input: [log(trace_length), num_queries, log(blowup), grinding, ...] +#! Output: [...] +#! #! Cycles: #! 1- Remainder codeword size 32: -#! 5000 + num_queries * (40 + num_fri_layers * 76 + 26 + 463) + 83 * num_fri_layers + 10 * log(trace_length) + 1633 +#! 5013 + num_queries * (40 + num_fri_layers * 76 + 26 + 445) + 83 * num_fri_layers + 10 * log(trace_length) + 1633 #! 2- Remainder codeword size 64: -#! 5000 + num_queries * (40 + num_fri_layers * 76 + 26 + 463) + 83 * num_fri_layers + 10 * log(trace_length) + 3109 +#! 5013 + num_queries * (40 + num_fri_layers * 76 + 26 + 445) + 83 * num_fri_layers + 10 * log(trace_length) + 3109 export.verifier::verify diff --git a/stdlib/asm/crypto/stark/utils.masm b/stdlib/asm/crypto/stark/utils.masm index 27553f0dde..5f701da93c 100644 --- a/stdlib/asm/crypto/stark/utils.masm +++ b/stdlib/asm/crypto/stark/utils.masm @@ -16,3 +16,35 @@ export.compute_lde_generator exp.u32 # => [domain_gen, ..] end + +#! Validates the inputs to the recursive verifier. +#! +#! Input: [log(trace_length), num_queries, log(blowup), grinding, ...] +#! Output: [log(trace_length), num_queries, log(blowup), grinding, ...] +#! +#! Cycles: 28 +export.validate_inputs + # 1) Assert that all inputs are u32 so that we can use u32 operations in what follows + dupw + u32assertw + # => [log(trace_length), num_queries, log(blowup), grinding, ...] + + # 2) Assert that the trace length is at most 29. The 2-adicity of our field is 32 and since + # the blowup factor is 8, we need to make sure that the LDE size is at most 2^32. + # We also check that the trace length is greater than the minimal length supported i.e., 2^6. + dup u32lt.30 assert + u32gt.5 assert + + # 3) Assert that the number of FRI queries is at most 150. This restriction is a soft one + # and is due to the memory layout in the `constants.masm` files but can be updated + # therein. + # We also make sure that there is at least one FRI query. + dup u32lt.151 assert + u32gt.0 assert + + # 4) Assert that the the log(blowup) is 3 + eq.3 assert + + # 5) Assert that the grinding factor is at most 31 + u32lt.32 assert +end diff --git a/stdlib/asm/crypto/stark/verifier.masm b/stdlib/asm/crypto/stark/verifier.masm index edcc4adeca..0183fe2a78 100644 --- a/stdlib/asm/crypto/stark/verifier.masm +++ b/stdlib/asm/crypto/stark/verifier.masm @@ -6,7 +6,7 @@ use.std::crypto::stark::random_coin use.std::crypto::stark::ood_frames use.std::crypto::stark::public_inputs use.std::crypto::stark::constants - +use.std::crypto::stark::utils #! Verify a STARK proof attesting to the correct execution of a program in the Miden VM. #! The following simplifying assumptions are currently made: @@ -25,18 +25,25 @@ use.std::crypto::stark::constants #! defined in `constants.masm`. #! #! Input: [log(trace_length), num_queries, log(blowup), grinding, ...] -#! Output: [] +#! Output: [...] +#! #! Cycles: #! 1- Remainder codeword size 32: -#! 4975 + num_queries * (40 + num_fri_layers * 76 + 26 + 473) + 83 * num_fri_layers + 10 * log(trace_length) + 1633 +#! 5013 + num_queries * (40 + num_fri_layers * 76 + 26 + 445) + 83 * num_fri_layers + 10 * log(trace_length) + 1633 #! 2- Remainder codeword size 64: -#! 4975 + num_queries * (40 + num_fri_layers * 76 + 26 + 473) + 83 * num_fri_layers + 10 * log(trace_length) + 3109 +#! 5013 + num_queries * (40 + num_fri_layers * 76 + 26 + 445) + 83 * num_fri_layers + 10 * log(trace_length) + 3109 export.verify #============================================================================================== # I) Hash proof context and hash-&-load public inputs #============================================================================================== + # Validate inputs + # + # Cycles: 28 + exec.utils::validate_inputs + # => [log(trace_length), num_queries, log(blowup), grinding, ...] + # Initialize the seed using proof context # # Cycles: 82 @@ -221,7 +228,7 @@ export.verify # Compute deep compostion polynomial queries # - # Cycles: 14 + num_queries * 473 + # Cycles: 24 + num_queries * 445 #=> [query_ptr, ...] exec.deep_queries::compute_deep_composition_polynomial_queries #=> [query_ptr, ...]