leanEthereum · TomWambsgans · Sep 14, 2025 · Sep 11, 2025 · Sep 11, 2025 · Sep 11, 2025
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/README.md b/README.md
@@ -56,6 +56,15 @@ RUSTFLAGS='-C target-cpu=native' NUM_XMSS_AGGREGATED='500' cargo test --release
 ![Alt text](docs/benchmark_graphs/graphs/xmss_aggregated_time.svg)
 ![Alt text](docs/benchmark_graphs/graphs/xmss_aggregated_overhead.svg)
 
+### Proof size
+
+With conjecture "up to capacity", current proofs with rate = 1/2 are about about ≈ 400 - 500 KiB, in which ≈ 300 KiB comes from WHIR.
+
+- The remaining 100 - 200 KiB will be significantly reduced in the future (this part has not been optimized at all).
+- WHIR proof size will also be reduced, thanks to merkle pruning (TODO).
+
+Reasonable target: 256 KiB for fast proof, 128 KiB for slower proofs (rate = 1/4 or 1/8).
+
 ## Credits
 
 - [Plonky3](https://github.com/Plonky3/Plonky3) for its various performant crates (Finite fields, poseidon2 AIR etc)

diff --git a/TODO.md b/TODO.md
@@ -10,7 +10,6 @@
 - use RowMAjorMatrix instead of Vec<Vec> for witness, and avoid any transpositions as suggested by Thomas
 - Fill Precompile tables during bytecode execution
 - Use Univariate Skip to commit to tables with k.2^n rows (k small)
-- increase density of multi commitments -> we can almost reduce by 2x commitment costs (question: will perf be good enough in order to avoid using the "jagged pcs" (cf sp1 hypercube)?)
 - avoid field embedding in the initial sumcheck of logup*, when table / values are in base field
 - opti logup* GKR: 
     - when the indexes are not a power of 2 (which is the case in the execution table)
@@ -23,8 +22,50 @@
 - Sumcheck, case z = 0, no need to fold, only keep first half of the values (done in PR 33 by Lambda) (and also in WHIR?)
 - Custom AVX2 / AVX512 / Neon implem in Plonky3 for all of the finite field operations (done for degree 4 extension, but not degree 5)
 - the 2 executions of the program, before generating the validity proof, can be merged, using some kind of placeholders
-- both WHIR verif + XMSS aggregation programs have 40% of unused memory!! -> TODO improve the compiler to reduce memory fragmentation
 - Many times, we evaluate different multilinear polynomials (diferent columns of the same table etc) at a common point. OPTI = compute the eq(.) once, and then dot_product with everything
+- To commit to multiple AIR table using 1 single pcs, the most general form our "packed pcs" api should accept is:
+ a list of n (n not a power of 2) columns, each ending with m repeated values (in this manner we can reduce proof size when they are a lot of columns (poseidons ...))
+
+About "the packed pcs" (similar to SP1 Jagged PCS, slightly less efficient, but simpler (no sumchecks)):
+- The best strategy is probably to pack as much as possible (the cost increasing the density = additional inner evaluations), if we can fit below a power of 2 - epsilon  (epsilon = 20% for instance, tbd), if the sum of the non zero data is just above a power of 2, no packed technique, even the best, can help us, so we should spread aniway (to reduce the pressure of inner evaluations)
+- About those inner evaluations, there is a trick: we need to compute M1(a, b, c, d, ...) then M2(b, c, d, ...), then M3(c, d, ...) -> The trick = compute the "eq(.) for (b, c, d), then dot product with M3. Then expand to eq(b, c, d, ...), dot product with M2. Then expand to eq(a, b, c, d, ...), dot product with M1. The idea is that in this order, computing each "eq" is easier is we start from the previous one.
+- Currently the packed pcs works as follows:
+
+```
+┌─┐┌─┐┌─┐┌─┐┌─┐┌─┐┌─┐┌─┐┌─┐┌─┐┌─┐┌─┐┌─┐┌─┐
+| || || || || || || || || || || || || || |
+| || || || || || || || || || || || || || |
+| || || || || || || || || || || || || || |
+| || || || || || || || || || || || || || |
+| || || || || || || || || || || || || || |
+| || || || || || || || || || || || || || |
+└─┘└─┘└─┘└─┘└─┘└─┘└─┘└─┘└─┘└─┘└─┘└─┘└─┘└─┘
+┌─┐┌─┐┌─┐┌─┐┌─┐┌─┐┌─┐┌─┐┌─┐┌─┐┌─┐┌─┐┌─┐┌─┐
+| || || || || || || || || || || || || || |
+| || || || || || || || || || || || || || |
+└─┘└─┘└─┘└─┘└─┘└─┘└─┘└─┘└─┘└─┘└─┘└─┘└─┘└─┘
+┌─┐┌─┐┌─┐┌─┐┌─┐┌─┐┌─┐┌─┐┌─┐┌─┐┌─┐┌─┐┌─┐┌─┐
+└─┘└─┘└─┘└─┘└─┘└─┘└─┘└─┘└─┘└─┘└─┘└─┘└─┘└─┘
+```
+
+But we reduce proof size a lot using instead (TODO):
+
+```
+┌────────────────────────┐┌──────────┐┌─┐
+|                        ||          || |
+|                        ||          || |
+|                        ||          || |
+|                        ||          || |
+|                        ||          || |
+|                        ||          || |
+└────────────────────────┘└──────────┘└─┘
+┌────────────────────────┐┌──────────┐┌─┐
+|                        ||          || |
+|                        ||          || |
+└────────────────────────┘└──────────┘└─┘
+┌────────────────────────┐┌──────────┐┌─┐
+└────────────────────────┘└──────────┘└─┘
+```
 
 ## Not Perf
 
@@ -37,6 +78,7 @@
 
 - KoalaBear extension of degree 5: the current implem (in a fork of Plonky3) has not been been optimized
 - KoalaBear extension of degree 6: in order to use the (proven) Johnson bound in WHIR
+- current "packed PCS" is not optimal in the end: can lead to [16][4][2][2] (instead of [16][8])
 
 ## Known leanISA compiler bugs:
 

diff --git a/crates/compiler/src/a_simplify_lang.rs b/crates/compiler/src/a_simplify_lang.rs
@@ -127,6 +127,7 @@ pub enum SimpleLine {
         var: Var,
         size: SimpleExpr,
         vectorized: bool,
+        vectorized_len: SimpleExpr,
     },
     ConstMalloc {
         // always not vectorized
@@ -283,6 +284,7 @@ fn simplify_lines(
                         arg1: right,
                     });
                 }
+                Expression::Log2Ceil { .. } => unreachable!(),
             },
             Line::ArrayAssign {
                 array,
@@ -589,9 +591,17 @@ fn simplify_lines(
                 var,
                 size,
                 vectorized,
+                vectorized_len,
             } => {
                 let simplified_size =
                     simplify_expr(size, &mut res, counters, array_manager, const_malloc);
+                let simplified_vectorized_len = simplify_expr(
+                    vectorized_len,
+                    &mut res,
+                    counters,
+                    array_manager,
+                    const_malloc,
+                );
                 if simplified_size.is_constant()
                     && !*vectorized
                     && const_malloc.forbidden_vars.contains(var)
@@ -619,6 +629,7 @@ fn simplify_lines(
                             var: var.clone(),
                             size: simplified_size,
                             vectorized: *vectorized,
+                            vectorized_len: simplified_vectorized_len,
                         });
                     }
                 }
@@ -724,6 +735,14 @@ fn simplify_expr(
             });
             SimpleExpr::Var(aux_var)
         }
+        Expression::Log2Ceil { value } => {
+            let const_value = simplify_expr(value, lines, counters, array_manager, const_malloc)
+                .as_constant()
+                .unwrap();
+            SimpleExpr::Constant(ConstExpression::Log2Ceil {
+                value: Box::new(const_value),
+            })
+        }
     }
 }
 
@@ -884,6 +903,9 @@ fn inline_expr(expr: &mut Expression, args: &BTreeMap<Var, SimpleExpr>, inlining
             inline_expr(left, args, inlining_count);
             inline_expr(right, args, inlining_count);
         }
+        Expression::Log2Ceil { value } => {
+            inline_expr(value, args, inlining_count);
+        }
     }
 }
 
@@ -1036,6 +1058,9 @@ fn vars_in_expression(expr: &Expression) -> BTreeSet<Var> {
             vars.extend(vars_in_expression(left));
             vars.extend(vars_in_expression(right));
         }
+        Expression::Log2Ceil { value } => {
+            vars.extend(vars_in_expression(value));
+        }
     }
     vars
 }
@@ -1221,6 +1246,15 @@ fn replace_vars_for_unroll_in_expr(
                 internal_vars,
             );
         }
+        Expression::Log2Ceil { value } => {
+            replace_vars_for_unroll_in_expr(
+                value,
+                iterator,
+                unroll_index,
+                iterator_value,
+                internal_vars,
+            );
+        }
     }
 }
 
@@ -1434,6 +1468,7 @@ fn replace_vars_for_unroll(
                 var,
                 size,
                 vectorized: _,
+                vectorized_len,
             } => {
                 assert!(var != iterator, "Weird");
                 *var = format!("@unrolled_{unroll_index}_{iterator_value}_{var}");
@@ -1444,7 +1479,13 @@ fn replace_vars_for_unroll(
                     iterator_value,
                     internal_vars,
                 );
-                // vectorized is not changed
+                replace_vars_for_unroll_in_expr(
+                    vectorized_len,
+                    iterator,
+                    unroll_index,
+                    iterator_value,
+                    internal_vars,
+                );
             }
             Line::DecomposeBits { var, to_decompose } => {
                 assert!(var != iterator, "Weird");
@@ -1734,6 +1775,9 @@ fn replace_vars_by_const_in_expr(expr: &mut Expression, map: &BTreeMap<Var, F>)
             replace_vars_by_const_in_expr(left, map);
             replace_vars_by_const_in_expr(right, map);
         }
+        Expression::Log2Ceil { value } => {
+            replace_vars_by_const_in_expr(value, map);
+        }
     }
 }
 
@@ -2029,13 +2073,13 @@ impl SimpleLine {
                 var,
                 size,
                 vectorized,
+                vectorized_len,
             } => {
-                let alloc_type = if *vectorized {
-                    "malloc_vectorized"
+                if *vectorized {
+                    format!("{var} = malloc_vec({size}, {vectorized_len})")
                 } else {
-                    "malloc"
-                };
-                format!("{var} = {alloc_type}({size})")
+                    format!("{var} = malloc({size})")
+                }
             }
             Self::ConstMalloc {
                 var,

diff --git a/crates/compiler/src/b_compile_intermediate.rs b/crates/compiler/src/b_compile_intermediate.rs
@@ -491,9 +491,20 @@ fn compile_lines(
 
             SimpleLine::FunctionRet { return_data } => {
                 if compiler.func_name == "main" {
+                    // pC -> ending_pc, fp -> 0
+                    let zero_value_offset = IntermediateValue::MemoryAfterFp {
+                        offset: compiler.stack_size.into(),
+                    };
+                    compiler.stack_size += 1;
+                    instructions.push(IntermediateInstruction::Computation {
+                        operation: Operation::Add,
+                        arg_a: IntermediateValue::Constant(0.into()),
+                        arg_c: IntermediateValue::Constant(0.into()),
+                        res: zero_value_offset.clone(),
+                    });
                     instructions.push(IntermediateInstruction::Jump {
                         dest: IntermediateValue::label("@end_program".to_string()),
-                        updated_fp: None,
+                        updated_fp: Some(zero_value_offset),
                     });
                 } else {
                     compile_function_ret(&mut instructions, return_data, compiler);
@@ -504,12 +515,14 @@ fn compile_lines(
                 var,
                 size,
                 vectorized,
+                vectorized_len,
             } => {
                 declared_vars.insert(var.clone());
                 instructions.push(IntermediateInstruction::RequestMemory {
                     offset: compiler.get_offset(&var.clone().into()),
                     size: IntermediateValue::from_simple_expr(size, compiler),
                     vectorized: *vectorized,
+                    vectorized_len: IntermediateValue::from_simple_expr(vectorized_len, compiler),
                 });
             }
             SimpleLine::ConstMalloc { var, size, label } => {
@@ -631,6 +644,7 @@ fn setup_function_call(
             offset: new_fp_pos.into(),
             size: ConstExpression::function_size(func_name.to_string()).into(),
             vectorized: false,
+            vectorized_len: IntermediateValue::Constant(ConstExpression::zero()),
         },
         IntermediateInstruction::Deref {
             shift_0: new_fp_pos.into(),

diff --git a/crates/compiler/src/c_compile_final.rs b/crates/compiler/src/c_compile_final.rs
@@ -303,12 +303,17 @@ fn compile_block(
                 offset,
                 size,
                 vectorized,
+                vectorized_len,
             } => {
                 let size = try_as_mem_or_constant(&size).unwrap();
+                let vectorized_len = try_as_constant(&vectorized_len, compiler)
+                    .unwrap()
+                    .to_usize();
                 let hint = Hint::RequestMemory {
                     offset: eval_const_expression_usize(&offset, compiler),
                     vectorized,
                     size,
+                    vectorized_len,
                 };
                 hints.entry(pc).or_default().push(hint);
             }

diff --git a/crates/compiler/src/grammar.pest b/crates/compiler/src/grammar.pest
@@ -72,9 +72,11 @@ div_expr = { exp_expr ~ ("/" ~ exp_expr)* }
 exp_expr = { primary ~ ("**" ~ primary)* }
 primary = { 
     "(" ~ expression ~ ")" | 
+    log2_ceil_expr |
     array_access_expr |
     var_or_constant
 }
+log2_ceil_expr = { "log2_ceil" ~ "(" ~ expression ~ ")" }
 array_access_expr = { identifier ~ "[" ~ expression ~ "]" }
 
 // Basic elements

diff --git a/crates/compiler/src/intermediate_bytecode.rs b/crates/compiler/src/intermediate_bytecode.rs
@@ -134,7 +134,8 @@ pub enum IntermediateInstruction {
     RequestMemory {
         offset: ConstExpression, // m[fp + offset] where the hint will be stored
         size: IntermediateValue, // the hint
-        vectorized: bool, // if true, will be 8-alligned, and the returned pointer will be "divied" by 8 (i.e. everything is in chunks of 8 field elements)
+        vectorized: bool, // if true, will be (2^vectorized_len)-alligned, and the returned pointer will be "divied" by 2^vectorized_len
+        vectorized_len: IntermediateValue,
     },
     DecomposeBits {
         res_offset: usize, // m[fp + res_offset..fp + res_offset + 31 * len(to_decompose)] will contain the decomposed bits
@@ -297,13 +298,17 @@ impl Display for IntermediateInstruction {
                 offset,
                 size,
                 vectorized,
-            } => write!(
-                f,
-                "m[fp + {}] = {}({})",
-                offset,
-                if *vectorized { "malloc_vec" } else { "malloc" },
-                size
-            ),
+                vectorized_len,
+            } => {
+                if *vectorized {
+                    write!(
+                        f,
+                        "m[fp + {offset}] = request_memory_vec({size}, {vectorized_len})"
+                    )
+                } else {
+                    write!(f, "m[fp + {offset}] = request_memory({size})")
+                }
+            }
             Self::Print { line_info, content } => {
                 write!(f, "print {line_info}: ")?;
                 for (i, c) in content.iter().enumerate() {