From 3585dada3f71e7b13a6d6e9a292733273d87b0b6 Mon Sep 17 00:00:00 2001
From: Vindaar
Date: Tue, 9 Dec 2025 15:11:32 +0100
Subject: [PATCH 01/13] implement variant of tree layer computation using SIMD packing

---
 src/symmetric/tweak_hash.rs          | 26 +++++++++
 src/symmetric/tweak_hash/poseidon.rs | 83 ++++++++++++++++++++++++++++
 src/symmetric/tweak_hash_tree.rs     | 18 +-----
 3 files changed, 111 insertions(+), 16 deletions(-)

diff --git a/src/symmetric/tweak_hash.rs b/src/symmetric/tweak_hash.rs
index efbb90d..dd0d3ba 100644
--- a/src/symmetric/tweak_hash.rs
+++ b/src/symmetric/tweak_hash.rs
@@ -1,5 +1,7 @@
 use rand::Rng;

+use rayon::prelude::*;
+
 use crate::serialization::Serializable;
 use crate::symmetric::prf::Pseudorandom;

@@ -46,6 +48,30 @@ pub trait TweakableHash {
         message: &[Self::Domain],
     ) -> Self::Domain;

+    /// Applies the calculation for a single tweak hash tree layer.
+    fn compute_tree_layer(
+        parameter: &Self::Parameter,
+        level: u8,
+        parent_start: usize,
+        children: &[Self::Domain],
+    ) -> Vec<Self::Domain> {
+        // default implementation is scalar. tweak_hash/poseidon.rs provides a SIMD variant
+        children
+            .par_chunks_exact(2)
+            .enumerate()
+            .map(|(i, children)| {
+                // Parent index in this layer
+                let parent_pos = (parent_start + i) as u32;
+                // Hash children into their parent using the tweak
+                Self::apply(
+                    parameter,
+                    &Self::tree_tweak((level as u8) + 1, parent_pos),
+                    children,
+                )
+            })
+            .collect()
+    }
+
     /// Computes bottom tree leaves by walking hash chains for multiple epochs.
     ///
     /// This method has a default scalar implementation that processes epochs in parallel.
diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs
index bcd709f..7f4fb72 100644
--- a/src/symmetric/tweak_hash/poseidon.rs
+++ b/src/symmetric/tweak_hash/poseidon.rs
@@ -371,6 +371,89 @@ impl<
         }
     }

+    fn compute_tree_layer(
+        parameter: &Self::Parameter,
+        level: u8,
+        parent_start: usize,
+        children: &[Self::Domain],
+    ) -> Vec<Self::Domain> {
+        // SIMD implementation specifically for Poseidon
+
+        // Broadcast the hash parameter to all SIMD lanes.
+        // Each lane will use the same parameter
+        let packed_parameter: [PackedF; PARAMETER_LEN] =
+            array::from_fn(|i| PackedF::from(parameter[i]));
+
+        const WIDTH: usize = PackedF::WIDTH;
+
+        // permutation to use for the compression. 24 as we merge two inputs
+        let perm = poseidon2_24();
+
+        // preallocate a vector that can hold the SIMD part as well as any possible scalar remainder
+        let mut parents = Vec::with_capacity(children.len() / 2);
+        parents.par_extend(children.par_chunks_exact(2 * WIDTH).enumerate().flat_map(
+            |(i, children)| {
+                let parent_pos = (parent_start + i * WIDTH) as u32;
+                let packed_tweak = array::from_fn::<_, TWEAK_LEN, _>(|t_idx| {
+                    PackedF::from_fn(|lane| {
+                        let parent_pos_per_lane = parent_pos + (lane as u32);
+                        Self::tree_tweak(level, parent_pos_per_lane)
+                            .to_field_elements::<TWEAK_LEN>()[t_idx]
+                    })
+                });
+
+                // Assemble the packed input for the hash function.
+                // Layout: [parameter | tweak | left | right]
+                let mut packed_input = [PackedF::ZERO; MERGE_COMPRESSION_WIDTH];
+                let mut current_pos = 0;

+                // Copy parameter into the input buffer.
+                packed_input[current_pos..current_pos + PARAMETER_LEN]
+                    .copy_from_slice(&packed_parameter);
+                current_pos += PARAMETER_LEN;
+
+                // Copy tweak into the input buffer.
+                packed_input[current_pos..current_pos + TWEAK_LEN].copy_from_slice(&packed_tweak);
+                current_pos += TWEAK_LEN;
+
+                // Copy the left child value into the input buffer.
+                let lefts: [FieldArray<HASH_LEN>; WIDTH] = array::from_fn(|k| children[2 * k]);
+                let packed_lefts = pack_array(&lefts);
+                packed_input[current_pos..current_pos + HASH_LEN].copy_from_slice(&packed_lefts);
+                current_pos += HASH_LEN;
+
+                // Copy the right child value into the input buffer.
+                let rights: [FieldArray<HASH_LEN>; WIDTH] = array::from_fn(|k| children[2 * k + 1]);
+                let packed_rights = pack_array(&rights);
+                packed_input[current_pos..current_pos + HASH_LEN].copy_from_slice(&packed_rights);
+
+                let packed_parents =
+                    poseidon_compress::<PackedF, MERGE_COMPRESSION_WIDTH, HASH_LEN>(
+                        &perm,
+                        &packed_input,
+                    );
+
+                // unpack the parents from SIMD to scalar output
+                let mut parents = [FieldArray([F::ZERO; HASH_LEN]); WIDTH];
+                unpack_array(&packed_parents, &mut parents);
+
+                parents
+            },
+        ));
+
+        // handle non-WIDTH leftover elements
+        let remainder = children.par_chunks_exact(2 * WIDTH).remainder();
+
+        // TODO: parallel iterator here likely not worth it?
+        let num_simd_parents = parents.len();
+        parents.par_extend(remainder.par_chunks_exact(2).enumerate().map(|(i, pair)| {
+            let pos = parent_start + num_simd_parents + i;
+            Self::apply(parameter, &Self::tree_tweak(level, pos as u32), pair)
+        }));
+
+        parents
+    }
+
     fn compute_tree_leaves(
         prf_key: &PRF::Key,
         parameter: &Self::Parameter,
diff --git a/src/symmetric/tweak_hash_tree.rs b/src/symmetric/tweak_hash_tree.rs
index 74ef9ad..a976008 100644
--- a/src/symmetric/tweak_hash_tree.rs
+++ b/src/symmetric/tweak_hash_tree.rs
@@ -2,7 +2,6 @@ use crate::serialization::Serializable;
 use crate::symmetric::tweak_hash::TweakableHash;
 use rand::rngs::StdRng;
 use rand::{Rng, SeedableRng};
-use rayon::prelude::*;
 use serde::{Deserialize, Serialize};
 use ssz::{Decode, DecodeError, Encode};

@@ -391,21 +390,8 @@ where
         // Compute all parents in parallel, pairing children two-by-two
         //
         // We do exact chunks of two children, no remainder.
-        let parents = prev
-            .nodes
-            .par_chunks_exact(2)
-            .enumerate()
-            .map(|(i, children)| {
-                // Parent index in this layer
-                let parent_pos = (parent_start + i) as u32;
-                // Hash children into their parent using the tweak
-                TH::apply(
-                    parameter,
-                    &TH::tree_tweak((level as u8) + 1, parent_pos),
-                    children,
-                )
-            })
-            .collect();
+        let parents =
+            TH::compute_tree_layer(&parameter, level as u8 + 1, parent_start, &prev.nodes);

         // Add the new layer with padding so next iteration also has even start and length
         layers.push(HashTreeLayer::padded(rng, parents, parent_start));

From c6c16fc79d40520e5e6106edaf7d83ca19070883 Mon Sep 17 00:00:00 2001
From: Vindaar
Date: Tue, 9 Dec 2025 15:48:05 +0100
Subject: [PATCH 02/13] [tests] add test cases for SIMD tree construction

---
 src/symmetric/tweak_hash/poseidon.rs | 314 +++++++++++++++++++++++++++
 1 file changed, 314 insertions(+)

diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs
index 7f4fb72..5442161 100644
--- a/src/symmetric/tweak_hash/poseidon.rs
+++ b/src/symmetric/tweak_hash/poseidon.rs
@@ -1370,4 +1370,318 @@ mod tests {
             }
         }
     }
+
+    // ==================== compute_tree_layer tests ====================
+
+    /// Scalar reference implementation for compute_tree_layer.
+    /// Used to verify the SIMD implementation produces correct results.
+    fn compute_tree_layer_scalar<TH: TweakableHash>(
+        parameter: &TH::Parameter,
+        level: u8,
+        parent_start: usize,
+        children: &[TH::Domain],
+    ) -> Vec<TH::Domain> {
+        children
+            .chunks_exact(2)
+            .enumerate()
+            .map(|(i, pair)| {
+                TH::apply(
+                    parameter,
+                    &TH::tree_tweak(level, (parent_start + i) as u32),
+                    pair,
+                )
+            })
+            .collect()
+    }
+
+    #[test]
+    fn test_compute_tree_layer_matches_scalar() {
+        let mut rng = rand::rng();
+        let parameter = PoseidonTweak44::rand_parameter(&mut rng);
+
+        // Test with 16 children (8 pairs)
+        let children: Vec<_> = (0..16)
+            .map(|_| PoseidonTweak44::rand_domain(&mut rng))
+            .collect();
+
+        let level = 1u8;
+        let parent_start = 0usize;
+
+        let simd_result =
+            PoseidonTweak44::compute_tree_layer(&parameter, level, parent_start, &children);
+        let scalar_result = compute_tree_layer_scalar::<PoseidonTweak44>(
+            &parameter,
+            level,
+            parent_start,
+            &children,
+        );
+
+        assert_eq!(simd_result.len(), scalar_result.len());
+        assert_eq!(simd_result, scalar_result);
+    }
+
+    #[test]
+    fn test_compute_tree_layer_output_length() {
+        let mut rng = rand::rng();
+        let parameter = PoseidonTweak44::rand_parameter(&mut rng);
+
+        // Test various input sizes
+        for num_pairs in [1, 2, 4, 7, 8, 15, 16, 17, 32, 33] {
+            let children: Vec<_> = (0..num_pairs * 2)
+                .map(|_| PoseidonTweak44::rand_domain(&mut rng))
+                .collect();
+
+            let result = PoseidonTweak44::compute_tree_layer(&parameter, 1, 0, &children);
+
+            assert_eq!(
+                result.len(),
+                num_pairs,
+                "Expected {} parents for {} children, got {}",
+                num_pairs,
+                num_pairs * 2,
+                result.len()
+            );
+        }
+    }
+
+    #[test]
+    fn test_compute_tree_layer_determinism() {
+        let mut rng = rand::rng();
+        let parameter = PoseidonTweak44::rand_parameter(&mut rng);
+
+        let children: Vec<_> = (0..20)
+            .map(|_| PoseidonTweak44::rand_domain(&mut rng))
+            .collect();
+
+        let result1 = PoseidonTweak44::compute_tree_layer(&parameter, 1, 0, &children);
+        let result2 = PoseidonTweak44::compute_tree_layer(&parameter, 1, 0, &children);
+
+        assert_eq!(
+            result1, result2,
+            "compute_tree_layer should be deterministic"
+        );
+    }
+
+    #[test]
+    fn test_compute_tree_layer_level_affects_output() {
+        let mut rng = rand::rng();
+        let parameter = PoseidonTweak44::rand_parameter(&mut rng);
+
+        let children: Vec<_> = (0..16)
+            .map(|_| PoseidonTweak44::rand_domain(&mut rng))
+            .collect();
+
+        let result_level_1 = PoseidonTweak44::compute_tree_layer(&parameter, 1, 0, &children);
+        let result_level_2 = PoseidonTweak44::compute_tree_layer(&parameter, 2, 0, &children);
+
+        assert_ne!(
+            result_level_1, result_level_2,
+            "Different levels should produce different outputs"
+        );
+    }
+
+    #[test]
+    fn test_compute_tree_layer_parent_start_affects_output() {
+        let mut rng = rand::rng();
+        let parameter = PoseidonTweak44::rand_parameter(&mut rng);
+
+        let children: Vec<_> = (0..16)
+            .map(|_| PoseidonTweak44::rand_domain(&mut rng))
+            .collect();
+
+        let result_start_0 = PoseidonTweak44::compute_tree_layer(&parameter, 1, 0, &children);
+        let result_start_10 = PoseidonTweak44::compute_tree_layer(&parameter, 1, 10, &children);
+
+        assert_ne!(
+            result_start_0, result_start_10,
+            "Different parent_start should produce different outputs"
+        );
+    }
+
+    #[test]
+    fn test_compute_tree_layer_simd_boundary_exact_width() {
+        // Test with exactly 2 * WIDTH children (one full SIMD batch, no remainder)
+        let mut rng = rand::rng();
+        let parameter = PoseidonTweak44::rand_parameter(&mut rng);
+
+        let width = PackedF::WIDTH;
+        let children: Vec<_> = (0..2 * width)
+            .map(|_| PoseidonTweak44::rand_domain(&mut rng))
+            .collect();
+
+        let simd_result =
+            PoseidonTweak44::compute_tree_layer(&parameter, 1, 0, &children);
+        let scalar_result =
+            compute_tree_layer_scalar::<PoseidonTweak44>(&parameter, 1, 0, &children);
+
+        assert_eq!(simd_result, scalar_result);
+    }
+
+    #[test]
+    fn test_compute_tree_layer_simd_boundary_with_remainder() {
+        // Test with 2 * WIDTH + 2 children (one SIMD batch + one remainder pair)
+        let mut rng = rand::rng();
+        let parameter = PoseidonTweak44::rand_parameter(&mut rng);
+
+        let width = PackedF::WIDTH;
+        let children: Vec<_> = (0..2 * width + 2)
+            .map(|_| PoseidonTweak44::rand_domain(&mut rng))
+            .collect();
+
+        let simd_result = PoseidonTweak44::compute_tree_layer(&parameter, 1, 0, &children);
+        let scalar_result =
+            compute_tree_layer_scalar::<PoseidonTweak44>(&parameter, 1, 0, &children);
+
+        assert_eq!(
+            simd_result.len(),
+            width + 1,
+            "Should have WIDTH + 1 parents"
+        );
+        assert_eq!(simd_result, scalar_result);
+    }
+
+    #[test]
+    fn test_compute_tree_layer_only_remainder() {
+        // Test with fewer than 2 * WIDTH children (entire computation is remainder)
+        let mut rng = rand::rng();
+        let parameter = PoseidonTweak44::rand_parameter(&mut rng);
+
+        let width = PackedF::WIDTH;
+
+        // Test sizes smaller than one SIMD batch
+        for num_pairs in 1..width {
+            let children: Vec<_> = (0..num_pairs * 2)
+                .map(|_| PoseidonTweak44::rand_domain(&mut rng))
+                .collect();
+
+            let simd_result = PoseidonTweak44::compute_tree_layer(&parameter, 1, 0, &children);
+            let scalar_result =
+                compute_tree_layer_scalar::<PoseidonTweak44>(&parameter, 1, 0, &children);
+
+            assert_eq!(
+                simd_result, scalar_result,
+                "Failed for num_pairs = {}",
+                num_pairs
+            );
+        }
+    }
+
+    #[test]
+    fn test_compute_tree_layer_two_simd_batches() {
+        // Test with 4 * WIDTH children (two full SIMD batches)
+        let mut rng = rand::rng();
+        let parameter = PoseidonTweak44::rand_parameter(&mut rng);
+
+        let width = PackedF::WIDTH;
+        let children: Vec<_> = (0..4 * width)
+            .map(|_| PoseidonTweak44::rand_domain(&mut rng))
+            .collect();
+
+        let simd_result = PoseidonTweak44::compute_tree_layer(&parameter, 1, 0, &children);
+        let scalar_result =
+            compute_tree_layer_scalar::<PoseidonTweak44>(&parameter, 1, 0, &children);
+
+        assert_eq!(simd_result.len(), 2 * width);
+        assert_eq!(simd_result, scalar_result);
+    }
+
+    #[test]
+    fn test_compute_tree_layer_two_batches_with_remainder() {
+        // Test with 4 * WIDTH + 2 children (two SIMD batches + one remainder pair)
+        let mut rng = rand::rng();
+        let parameter = PoseidonTweak44::rand_parameter(&mut rng);
+
+        let width = PackedF::WIDTH;
+        let children: Vec<_> = (0..4 * width + 2)
+            .map(|_| PoseidonTweak44::rand_domain(&mut rng))
+            .collect();
+
+        let simd_result = PoseidonTweak44::compute_tree_layer(&parameter, 1, 0, &children);
+        let scalar_result =
+            compute_tree_layer_scalar::<PoseidonTweak44>(&parameter, 1, 0, &children);
+
+        assert_eq!(simd_result.len(), 2 * width + 1);
+        assert_eq!(simd_result, scalar_result);
+    }
+
+    #[test]
+    fn test_compute_tree_layer_boundary_sweep() {
+        // Test all pair counts from 1 to 4 * WIDTH + 1 to catch off-by-one errors
+        let mut rng = rand::rng();
+        let parameter = PoseidonTweak44::rand_parameter(&mut rng);
+
+        let width = PackedF::WIDTH;
+        let max_pairs = 4 * width + 1;
+
+        for num_pairs in 1..=max_pairs {
+            let children: Vec<_> = (0..num_pairs * 2)
+                .map(|_| PoseidonTweak44::rand_domain(&mut rng))
+                .collect();
+
+            let simd_result = PoseidonTweak44::compute_tree_layer(&parameter, 1, 0, &children);
+            let scalar_result =
+                compute_tree_layer_scalar::<PoseidonTweak44>(&parameter, 1, 0, &children);
+
+            assert_eq!(
+                simd_result, scalar_result,
+                "Mismatch for num_pairs = {} (WIDTH = {})",
+                num_pairs, width
+            );
+        }
+    }
+
+    #[test]
+    fn test_compute_tree_layer_nonzero_parent_start() {
+        // Test with various parent_start values to ensure tweaks are correct
+        let mut rng = rand::rng();
+        let parameter = PoseidonTweak44::rand_parameter(&mut rng);
+
+        let width = PackedF::WIDTH;
+
+        for parent_start in [0, 1, 10, 100, 1000] {
+            let children: Vec<_> = (0..2 * width + 4)
+                .map(|_| PoseidonTweak44::rand_domain(&mut rng))
+                .collect();
+
+            let simd_result =
+                PoseidonTweak44::compute_tree_layer(&parameter, 1, parent_start, &children);
+            let scalar_result = compute_tree_layer_scalar::<PoseidonTweak44>(
+                &parameter,
+                1,
+                parent_start,
+                &children,
+            );
+
+            assert_eq!(
+                simd_result, scalar_result,
+                "Mismatch for parent_start = {}",
+                parent_start
+            );
+        }
+    }
+
+    proptest! {
+        #[test]
+        fn proptest_compute_tree_layer_matches_scalar(
+            num_pairs in 1usize..64,
+            level in 0u8..32,
+            parent_start in 0usize..1000,
+            seed in any::<u64>(),
+        ) {
+            use rand::SeedableRng;
+            let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
+
+            let parameter = PoseidonTweak44::rand_parameter(&mut rng);
+            let children: Vec<_> = (0..num_pairs * 2)
+                .map(|_| PoseidonTweak44::rand_domain(&mut rng))
+                .collect();
+
+            let simd_result =
+                PoseidonTweak44::compute_tree_layer(&parameter, level, parent_start, &children);
+            let scalar_result =
+                compute_tree_layer_scalar::<PoseidonTweak44>(&parameter, level, parent_start, &children);
+
+            prop_assert_eq!(simd_result.len(), num_pairs);
+            prop_assert_eq!(simd_result, scalar_result);
+        }
+    }
 }

From 3cc78ea394c6d4ba4404f424d5f8aa526e9dca81 Mon Sep 17 00:00:00 2001
From: Vindaar
Date: Wed, 10 Dec 2025 13:12:40 +0100
Subject: [PATCH 03/13] address clippy errors

---
 src/symmetric/tweak_hash.rs          | 2 +-
 src/symmetric/tweak_hash/poseidon.rs | 3 +--
 src/symmetric/tweak_hash_tree.rs     | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/symmetric/tweak_hash.rs b/src/symmetric/tweak_hash.rs
index dd0d3ba..61426b8 100644
--- a/src/symmetric/tweak_hash.rs
+++ b/src/symmetric/tweak_hash.rs
@@ -65,7 +65,7 @@ pub trait TweakableHash {
                 // Hash children into their parent using the tweak
                 Self::apply(
                     parameter,
-                    &Self::tree_tweak((level as u8) + 1, parent_pos),
+                    &Self::tree_tweak(level + 1, parent_pos),
                     children,
                 )
             })
diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs
index 5442161..b12d94b 100644
--- a/src/symmetric/tweak_hash/poseidon.rs
+++ b/src/symmetric/tweak_hash/poseidon.rs
@@ -378,14 +378,13 @@ impl<
         children: &[Self::Domain],
     ) -> Vec<Self::Domain> {
         // SIMD implementation specifically for Poseidon
+        const WIDTH: usize = PackedF::WIDTH;

         // Broadcast the hash parameter to all SIMD lanes.
         // Each lane will use the same parameter
         let packed_parameter: [PackedF; PARAMETER_LEN] =
             array::from_fn(|i| PackedF::from(parameter[i]));

-        const WIDTH: usize = PackedF::WIDTH;
-
         // permutation to use for the compression. 24 as we merge two inputs
         let perm = poseidon2_24();

diff --git a/src/symmetric/tweak_hash_tree.rs b/src/symmetric/tweak_hash_tree.rs
index a976008..7a1a54a 100644
--- a/src/symmetric/tweak_hash_tree.rs
+++ b/src/symmetric/tweak_hash_tree.rs
@@ -391,7 +391,7 @@ where
         //
         // We do exact chunks of two children, no remainder.
         let parents =
-            TH::compute_tree_layer(&parameter, level as u8 + 1, parent_start, &prev.nodes);
+            TH::compute_tree_layer(parameter, level as u8 + 1, parent_start, &prev.nodes);

         // Add the new layer with padding so next iteration also has even start and length
         layers.push(HashTreeLayer::padded(rng, parents, parent_start));

From c9e234f1bea6975f5bcd9321c09a58f05623e9d4 Mon Sep 17 00:00:00 2001
From: Vindaar
Date: Mon, 15 Dec 2025 19:20:02 +0100
Subject: [PATCH 04/13] avoid temporary arrays entirely during packing

Co-Authored-By: Thomas Coratger <60488569+tcoratger@users.noreply.github.com>
---
 src/simd_utils.rs                    | 234 ++++++++++++++++++++++++---
 src/symmetric/tweak_hash/poseidon.rs | 173 ++++++++++----------
 2 files changed, 298 insertions(+), 109 deletions(-)

diff --git a/src/simd_utils.rs b/src/simd_utils.rs
index c9c74bc..0b90940 100644
--- a/src/simd_utils.rs
+++ b/src/simd_utils.rs
@@ -2,7 +2,7 @@ use core::array;

 use p3_field::PackedValue;

-use crate::{PackedF, array::FieldArray};
+use crate::{F, PackedF, array::FieldArray};

 /// Packs scalar arrays into SIMD-friendly vertical layout.
 ///
@@ -26,7 +26,7 @@ use crate::{F, PackedF, array::FieldArray};
 ///
 /// This vertical packing enables efficient SIMD operations where a single instruction
 /// processes the same element position across multiple arrays simultaneously.
-#[inline]
+#[inline(always)]
 pub fn pack_array<const N: usize>(data: &[FieldArray<N>]) -> [PackedF; N] {
     array::from_fn(|i| PackedF::from_fn(|j| data[j][i]))
 }
@@ -37,32 +37,96 @@ pub fn pack_array<const N: usize>(data: &[FieldArray<N>]) -> [PackedF; N] {
 ///
 /// This is the inverse operation of `pack_array`. The output buffer must be preallocated
 /// with size `[WIDTH]` where `WIDTH = PackedF::WIDTH`, and each element is a `FieldArray`.
-///
-/// Input layout (vertical): each PackedF holds one element from each array
-/// ```text
-/// packed_data[0] = PackedF([a0, b0, c0, ...])
-/// packed_data[1] = PackedF([a1, b1, c1, ...])
-/// packed_data[2] = PackedF([a2, b2, c2, ...])
-/// ...
-/// ```
-///
-/// Output layout (horizontal): each FieldArray is one complete array
-/// ```text
-/// output[0] = FieldArray([a0, a1, a2, ..., aN])
-/// output[1] = FieldArray([b0, b1, b2, ..., bN])
-/// output[2] = FieldArray([c0, c1, c2, ..., cN])
-/// ...
-/// ```
-#[inline]
+#[inline(always)]
 pub fn unpack_array<const N: usize>(packed_data: &[PackedF; N], output: &mut [FieldArray<N>]) {
-    for (i, data) in packed_data.iter().enumerate().take(N) {
-        let unpacked_v = data.as_slice();
-        for j in 0..PackedF::WIDTH {
-            output[j][i] = unpacked_v[j];
+    // Optimized for cache locality: iterate over output lanes first
+    for j in 0..PackedF::WIDTH {
+        for i in 0..N {
+            output[j].0[i] = packed_data[i].as_slice()[j];
         }
     }
 }

+#[inline(always)]
+pub fn unpack_to_array<const N: usize>(
+    packed_data: [PackedF; N],
+) -> [FieldArray<N>; PackedF::WIDTH] {
+    array::from_fn(|j| FieldArray(array::from_fn(|i| packed_data[i].as_slice()[j])))
+}
+
+#[inline(always)]
+pub fn pack_column(col: [F; PackedF::WIDTH]) -> PackedF {
+    PackedF::from_fn(|i| col[i])
+}
+
+/// Pack contiguous FieldArrays directly into a destination slice at the given offset.
+///
+/// Packs `data[0..WIDTH]` into `dest[offset..offset+N]`.
+/// This avoids creating an intermediate `[PackedF; N]` array.
+///
+/// # Arguments
+/// * `dest` - Destination slice to pack into
+/// * `offset` - Starting index in `dest`
+/// * `data` - Source slice of FieldArrays (must have length >= WIDTH)
+#[inline(always)]
+pub fn pack_into<const N: usize>(dest: &mut [PackedF], offset: usize, data: &[FieldArray<N>]) {
+    for i in 0..N {
+        dest[offset + i] = PackedF::from_fn(|lane| data[lane][i]);
+    }
+}
+
+/// Pack even-indexed FieldArrays (stride 2) directly into destination.
+///
+/// Packs `data[0], data[2], data[4], ...` into `dest[offset..offset+N]`.
+/// Useful for packing left children from interleaved `[L0, R0, L1, R1, ...]` pairs.
+///
+/// # Arguments
+/// * `dest` - Destination slice to pack into
+/// * `offset` - Starting index in `dest`
+/// * `data` - Source slice of interleaved pairs (must have length >= 2 * WIDTH)
+#[inline(always)]
+pub fn pack_even_into<const N: usize>(dest: &mut [PackedF], offset: usize, data: &[FieldArray<N>]) {
+    for i in 0..N {
+        dest[offset + i] = PackedF::from_fn(|lane| data[2 * lane][i]);
+    }
+}
+
+/// Pack odd-indexed FieldArrays (stride 2) directly into destination.
+///
+/// Packs `data[1], data[3], data[5], ...` into `dest[offset..offset+N]`.
+/// Useful for packing right children from interleaved `[L0, R0, L1, R1, ...]` pairs.
+///
+/// # Arguments
+/// * `dest` - Destination slice to pack into
+/// * `offset` - Starting index in `dest`
+/// * `data` - Source slice of interleaved pairs (must have length >= 2 * WIDTH)
+#[inline(always)]
+pub fn pack_odd_into<const N: usize>(dest: &mut [PackedF], offset: usize, data: &[FieldArray<N>]) {
+    for i in 0..N {
+        dest[offset + i] = PackedF::from_fn(|lane| data[2 * lane + 1][i]);
+    }
+}
+
+/// Pack values generated by a function directly into destination.
+///
+/// For each element index `i` in `0..N`, generates a PackedF by calling
+/// `f(i, lane)` for each SIMD lane.
+///
+/// # Arguments
+/// * `dest` - Destination slice to pack into
+/// * `offset` - Starting index in `dest`
+/// * `f` - Function that takes (element_index, lane_index) and returns a field element
+#[inline(always)]
+pub fn pack_fn_into<const N: usize>(
+    dest: &mut [PackedF],
+    offset: usize,
+    f: impl Fn(usize, usize) -> F,
+) {
+    for i in 0..N {
+        dest[offset + i] = PackedF::from_fn(|lane| f(i, lane));
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use crate::F;
@@ -111,6 +175,24 @@ mod tests {
         }
     }

+    #[test]
+    fn test_unpack_to_array() {
+        // Create packed data
+        let packed: [PackedF; 2] = [
+            PackedF::from_fn(|i| F::from_u64(i as u64)),
+            PackedF::from_fn(|i| F::from_u64((i + 100) as u64)),
+        ];
+
+        // Unpack using the new function
+        let output = unpack_to_array(packed);
+
+        // Verify
+        for (lane, arr) in output.iter().enumerate() {
+            assert_eq!(arr[0], F::from_u64(lane as u64));
+            assert_eq!(arr[1], F::from_u64((lane + 100) as u64));
+        }
+    }
+
     #[test]
     fn test_pack_preserves_element_order() {
         // Create data where each array has sequential values
@@ -176,5 +258,111 @@ mod tests {
             // Verify roundtrip
             prop_assert_eq!(original, unpacked);
         }
+
+        #[test]
+        fn proptest_unpack_to_array_matches_unpack_array(
+            _seed in any::<u64>()
+        ) {
+            let mut rng = rand::rng();
+
+            // Generate random packed data
+            let packed: [PackedF; 8] = array::from_fn(|_| {
+                PackedF::from_fn(|_| rng.random())
+            });
+
+            // Unpack using both methods
+            let mut output1 = [FieldArray([F::ZERO; 8]); PackedF::WIDTH];
+            unpack_array(&packed, &mut output1);
+            let output2 = unpack_to_array(packed);
+
+            // Verify they match
+            prop_assert_eq!(output1, output2);
+        }
+
+        #[test]
+        fn proptest_pack_into_matches_pack_array(
+            _seed in any::<u64>()
+        ) {
+            let mut rng = rand::rng();
+
+            // Generate random data
+            let data: [FieldArray<7>; PackedF::WIDTH] = array::from_fn(|_| {
+                FieldArray(array::from_fn(|_| rng.random()))
+            });
+
+            // Pack using pack_array
+            let expected = pack_array(&data);
+
+            // Pack using pack_into
+            let mut dest = [PackedF::ZERO; 10];
+            pack_into(&mut dest, 2, &data);
+
+            // Verify they match at the offset
+            for i in 0..7 {
+                prop_assert_eq!(dest[2 + i], expected[i]);
+            }
+        }
+
+        #[test]
+        fn proptest_pack_even_odd_into(
+            _seed in any::<u64>()
+        ) {
+            let mut rng = rand::rng();
+
+            // Generate interleaved pairs: [L0, R0, L1, R1, ...]
+            let pairs: [FieldArray<5>; 2 * PackedF::WIDTH] = array::from_fn(|_| {
+                FieldArray(array::from_fn(|_| rng.random()))
+            });
+
+            // Pack even (left children) and odd (right children)
+            let mut dest = [PackedF::ZERO; 12];
+            pack_even_into(&mut dest, 1, &pairs);
+            pack_odd_into(&mut dest, 6, &pairs);
+
+            // Verify even indices were packed correctly
+            for i in 0..5 {
+                for lane in 0..PackedF::WIDTH {
+                    prop_assert_eq!(
+                        dest[1 + i].as_slice()[lane],
+                        pairs[2 * lane][i],
+                        "Even packing mismatch at element {}, lane {}", i, lane
+                    );
+                }
+            }
+
+            // Verify odd indices were packed correctly
+            for i in 0..5 {
+                for lane in 0..PackedF::WIDTH {
+                    prop_assert_eq!(
+                        dest[6 + i].as_slice()[lane],
+                        pairs[2 * lane + 1][i],
+                        "Odd packing mismatch at element {}, lane {}", i, lane
+                    );
+                }
+            }
+        }
+
+        #[test]
+        fn proptest_pack_fn_into(
+            _seed in any::<u64>()
+        ) {
+            // Pack using a function that generates predictable values
+            let mut dest = [PackedF::ZERO; 8];
+            pack_fn_into::<4>(&mut dest, 3, |elem_idx, lane_idx| {
+                F::from_u64((elem_idx * 100 + lane_idx) as u64)
+            });
+
+            // Verify
+            for i in 0..4 {
+                for lane in 0..PackedF::WIDTH {
+                    let expected = F::from_u64((i * 100 + lane) as u64);
+                    prop_assert_eq!(
+                        dest[3 + i].as_slice()[lane],
+                        expected,
+                        "pack_fn_into mismatch at element {}, lane {}", i, lane
+                    );
+                }
+            }
+        }
     }
 }
diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs
index b12d94b..eddec2f 100644
--- a/src/symmetric/tweak_hash/poseidon.rs
+++ b/src/symmetric/tweak_hash/poseidon.rs
@@ -9,7 +9,7 @@ use crate::TWEAK_SEPARATOR_FOR_TREE_HASH;
 use crate::array::FieldArray;
 use crate::poseidon2_16;
 use crate::poseidon2_24;
-use crate::simd_utils::{pack_array, unpack_array};
+use crate::simd_utils::{pack_array, pack_even_into, pack_fn_into, pack_odd_into, unpack_array};
 use crate::symmetric::prf::Pseudorandom;
 use crate::symmetric::tweak_hash::chain;
 use crate::{F, PackedF};
@@ -377,78 +377,71 @@ impl<
         parent_start: usize,
         children: &[Self::Domain],
     ) -> Vec<Self::Domain> {
-        // SIMD implementation specifically for Poseidon
         const WIDTH: usize = PackedF::WIDTH;

-        // Broadcast the hash parameter to all SIMD lanes.
-        // Each lane will use the same parameter
+        // Pre-allocate output vector
+        let output_len = children.len() / 2;
+        let mut parents = vec![FieldArray([F::ZERO; HASH_LEN]); output_len];
+
+        // Broadcast the hash parameter to all SIMD lanes (computed once)
         let packed_parameter: [PackedF; PARAMETER_LEN] =
-            array::from_fn(|i| PackedF::from(parameter[i]));
+            array::from_fn(|i| PackedF::from(parameter.0[i]));

-        // permutation to use for the compression. 24 as we merge two inputs
+        // Permutation for merging two inputs (width-24)
         let perm = poseidon2_24();

-        // preallocate a vector that can hold the SIMD part as well as any possible scalar remainder
-        let mut parents = Vec::with_capacity(children.len() / 2);
-        parents.par_extend(children.par_chunks_exact(2 * WIDTH).enumerate().flat_map(
-            |(i, children)| {
-                let parent_pos = (parent_start + i * WIDTH) as u32;
-                let packed_tweak = array::from_fn::<_, TWEAK_LEN, _>(|t_idx| {
-                    PackedF::from_fn(|lane| {
-                        let parent_pos_per_lane = parent_pos + (lane as u32);
-                        Self::tree_tweak(level, parent_pos_per_lane)
-                            .to_field_elements::<TWEAK_LEN>()[t_idx]
-                    })
-                });
+        // Offsets for assembling packed_input: [parameter | tweak | left | right]
+        let tweak_offset = PARAMETER_LEN;
+        let left_offset = PARAMETER_LEN + TWEAK_LEN;
+        let right_offset = PARAMETER_LEN + TWEAK_LEN + HASH_LEN;

-                // Assemble the packed input for the hash function.
-                // Layout: [parameter | tweak | left | right]
+        // Process SIMD batches with in-place mutation
+        parents
+            .par_chunks_exact_mut(WIDTH)
+            .zip(children.par_chunks_exact(2 * WIDTH))
+            .enumerate()
+            .for_each(|(chunk_idx, (parents_chunk, children_chunk))| {
+                let parent_pos = (parent_start + chunk_idx * WIDTH) as u32;
+
+                // Assemble packed input directly: [parameter | tweak | left | right]
                 let mut packed_input = [PackedF::ZERO; MERGE_COMPRESSION_WIDTH];
-                let mut current_pos = 0;

-                // Copy parameter into the input buffer.
-                packed_input[current_pos..current_pos + PARAMETER_LEN]
-                    .copy_from_slice(&packed_parameter);
-                current_pos += PARAMETER_LEN;
+                // Copy pre-packed parameter
+                packed_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter);

-                // Copy tweak into the input buffer.
-                packed_input[current_pos..current_pos + TWEAK_LEN].copy_from_slice(&packed_tweak);
-                current_pos += TWEAK_LEN;
+                // Pack tweaks directly into destination
+                pack_fn_into::<TWEAK_LEN>(&mut packed_input, tweak_offset, |t_idx, lane| {
+                    Self::tree_tweak(level, parent_pos + lane as u32)
+                        .to_field_elements::<TWEAK_LEN>()[t_idx]
+                });

-                // Copy the left child value into the input buffer.
-                let lefts: [FieldArray<HASH_LEN>; WIDTH] = array::from_fn(|k| children[2 * k]);
-                let packed_lefts = pack_array(&lefts);
-                packed_input[current_pos..current_pos + HASH_LEN].copy_from_slice(&packed_lefts);
-                current_pos += HASH_LEN;
+                // Pack left children (even indices) directly into destination
+                pack_even_into(&mut packed_input, left_offset, children_chunk);

-                // Copy the right child value into the input buffer.
-                let rights: [FieldArray<HASH_LEN>; WIDTH] = array::from_fn(|k| children[2 * k + 1]);
-                let packed_rights = pack_array(&rights);
-                packed_input[current_pos..current_pos + HASH_LEN].copy_from_slice(&packed_rights);
+                // Pack right children (odd indices) directly into destination
+                pack_odd_into(&mut packed_input, right_offset, children_chunk);

+                // Compress all WIDTH parent pairs simultaneously
                 let packed_parents =
                     poseidon_compress::<PackedF, MERGE_COMPRESSION_WIDTH, HASH_LEN>(
                         &perm,
                         &packed_input,
                     );

-                // unpack the parents from SIMD to scalar output
-                let mut parents = [FieldArray([F::ZERO; HASH_LEN]); WIDTH];
-                unpack_array(&packed_parents, &mut parents);
-
-                parents
-            },
-        ));
+                // Unpack directly to output slice
+                unpack_array(&packed_parents, parents_chunk);
+            });

-        // handle non-WIDTH leftover elements
-        let remainder = children.par_chunks_exact(2 * WIDTH).remainder();
+        // Handle remainder (elements that don't fill a complete SIMD batch)
+        let remainder_start = (children.len() / (2 * WIDTH)) * WIDTH;
+        let children_remainder = &children[remainder_start * 2..];
+        let parents_remainder = &mut parents[remainder_start..];

-        // TODO: parallel iterator here likely not worth it?
-        let num_simd_parents = parents.len();
-        parents.par_extend(remainder.par_chunks_exact(2).enumerate().map(|(i, pair)| {
-            let pos = parent_start + num_simd_parents + i;
-            Self::apply(parameter, &Self::tree_tweak(level, pos as u32), pair)
-        }));
+        for (i, pair) in children_remainder.chunks_exact(2).enumerate() {
+            let pos = parent_start + remainder_start + i;
+            parents_remainder[i] =
+                Self::apply(parameter, &Self::tree_tweak(level, pos as u32), pair);
+        }

         parents
     }
@@ -548,6 +541,10 @@ impl<
         // Cache strategy: process one chain at a time to maximize locality.
         // All epochs for that chain stay in registers across iterations.
+        // Offsets for chain compression: [parameter | tweak | current_value]
+        let chain_tweak_offset = PARAMETER_LEN;
+        let chain_value_offset = PARAMETER_LEN + TWEAK_LEN;
+
         for (chain_index, packed_chain) in
             packed_chains.iter_mut().enumerate().take(num_chains)
         {
@@ -557,32 +554,25 @@ impl<
                 // Current position in the chain.
                 let pos = (step + 1) as u8;

-                // Generate tweaks for all epochs in this SIMD batch.
-                // Each lane gets a tweak specific to its epoch.
-                let packed_tweak = array::from_fn::<_, TWEAK_LEN, _>(|t_idx| {
-                    PackedF::from_fn(|lane| {
-                        Self::chain_tweak(epoch_chunk[lane], chain_index as u8, pos)
-                            .to_field_elements::<TWEAK_LEN>()[t_idx]
-                    })
-                });
-
                 // Assemble the packed input for the hash function.
                 // Layout: [parameter | tweak | current_value]
                 let mut packed_input = [PackedF::ZERO; CHAIN_COMPRESSION_WIDTH];
-                let mut current_pos = 0;

-                // Copy parameter into the input buffer.
-                packed_input[current_pos..current_pos + PARAMETER_LEN]
-                    .copy_from_slice(&packed_parameter);
-                current_pos += PARAMETER_LEN;
+                // Copy pre-packed parameter
+                packed_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter);

-                // Copy tweak into the input buffer.
-                packed_input[current_pos..current_pos + TWEAK_LEN]
-                    .copy_from_slice(&packed_tweak);
-                current_pos += TWEAK_LEN;
+                // Pack tweaks directly into destination
+                pack_fn_into::<TWEAK_LEN>(
+                    &mut packed_input,
+                    chain_tweak_offset,
+                    |t_idx, lane| {
+                        Self::chain_tweak(epoch_chunk[lane], chain_index as u8, pos)
+                            .to_field_elements::<TWEAK_LEN>()[t_idx]
+                    },
+                );

-                // Copy current chain value into the input buffer.
-                packed_input[current_pos..current_pos + HASH_LEN]
+                // Copy current chain value (already packed)
+                packed_input[chain_value_offset..chain_value_offset + HASH_LEN]
                     .copy_from_slice(packed_chain);

                 // Apply the hash function to advance the chain.
@@ -602,23 +592,34 @@ impl<
         //
         // This uses the sponge construction for variable-length input.

-        // Generate tree tweaks for all epochs.
-        // Level 0 indicates this is a bottom-layer leaf in the tree.
-        let packed_tree_tweak = array::from_fn::<_, TWEAK_LEN, _>(|t_idx| {
-            PackedF::from_fn(|lane| {
+        // Assemble the sponge input.
+        // Layout: [parameter | tree_tweak | all_chain_ends]
+        let sponge_tweak_offset = PARAMETER_LEN;
+        let sponge_chains_offset = PARAMETER_LEN + TWEAK_LEN;
+        let sponge_input_len = PARAMETER_LEN + TWEAK_LEN + NUM_CHUNKS * HASH_LEN;
+
+        let mut packed_leaf_input = vec![PackedF::ZERO; sponge_input_len];
+
+        // Copy pre-packed parameter
+        packed_leaf_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter);
+
+        // Pack tree tweaks directly (level 0 for bottom-layer leaves)
+        pack_fn_into::<TWEAK_LEN>(
+            &mut packed_leaf_input,
+            sponge_tweak_offset,
+            |t_idx, lane| {
                 Self::tree_tweak(0, epoch_chunk[lane]).to_field_elements::<TWEAK_LEN>()
                     [t_idx]
-            })
-        });
+            },
+        );

-        // Assemble the sponge input.
-        // Layout: [parameter | tree_tweak | all_chain_ends]
-        let packed_leaf_input: Vec<_> = packed_parameter
-            .iter()
-            .chain(packed_tree_tweak.iter())
-            .chain(packed_chains.iter().flatten())
-            .copied()
-            .collect();
+        // Copy all chain ends (already packed)
+        for (c_idx, chain) in packed_chains.iter().enumerate() {
+            packed_leaf_input
+                [sponge_chains_offset + c_idx * HASH_LEN
+                    ..sponge_chains_offset + (c_idx + 1) * HASH_LEN]
+                .copy_from_slice(chain);
+        }

         // Apply the sponge hash to produce the leaf.
         // This absorbs all chain ends and squeezes out the final hash.
From c1dfc588dded2fd98d2d0a30fc7ea821c78088c8 Mon Sep 17 00:00:00 2001
From: Vindaar
Date: Tue, 16 Dec 2025 13:09:23 +0100
Subject: [PATCH 05/13] use `#[inline]` without (always)

---
 src/simd_utils.rs | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/simd_utils.rs b/src/simd_utils.rs
index 0b90940..da1afa0 100644
--- a/src/simd_utils.rs
+++ b/src/simd_utils.rs
@@ -26,7 +26,7 @@ use crate::{F, PackedF, array::FieldArray};
 ///
 /// This vertical packing enables efficient SIMD operations where a single instruction
 /// processes the same element position across multiple arrays simultaneously.
-#[inline(always)]
+#[inline]
 pub fn pack_array<const N: usize>(data: &[FieldArray<N>]) -> [PackedF; N] {
     array::from_fn(|i| PackedF::from_fn(|j| data[j][i]))
 }
@@ -37,7 +37,7 @@ pub fn pack_array<const N: usize>(data: &[FieldArray<N>]) -> [PackedF; N] {
 ///
 /// This is the inverse operation of `pack_array`. The output buffer must be preallocated
 /// with size `[WIDTH]` where `WIDTH = PackedF::WIDTH`, and each element is a `FieldArray`.
-#[inline(always)]
+#[inline]
 pub fn unpack_array<const N: usize>(packed_data: &[PackedF; N], output: &mut [FieldArray<N>]) {
     // Optimized for cache locality: iterate over output lanes first
@@ -84,7 +84,7 @@ pub fn pack_into<const N: usize>(dest: &mut [PackedF], offset: usize, data: &[FieldArray<N>]) {
 /// * `dest` - Destination slice to pack into
 /// * `offset` - Starting index in `dest`
 /// * `data` - Source slice of interleaved pairs (must have length >= 2 * WIDTH)
-#[inline(always)]
+#[inline]
 pub fn pack_even_into<const N: usize>(dest: &mut [PackedF], offset: usize, data: &[FieldArray<N>]) {
     for i in 0..N {
         dest[offset + i] = PackedF::from_fn(|lane| data[2 * lane][i]);
     }
 }
@@ -100,7 +100,7 @@ pub fn pack_even_into<const N: usize>(dest: &mut [PackedF], offset: usize, data:
 /// * `dest` - Destination slice to pack into
 /// * `offset` - Starting index in `dest`
 /// * `data` - Source slice of interleaved pairs (must have length >= 2 * WIDTH)
-#[inline(always)]
+#[inline]
 pub fn pack_odd_into<const N: usize>(dest: &mut [PackedF], offset: usize, data: &[FieldArray<N>]) {
     for i in 0..N {
         dest[offset + i] = PackedF::from_fn(|lane| data[2 * lane + 1][i]);
     }
 }
@@ -116,7 +116,7 @@ pub fn pack_odd_into<const N: usize>(dest: &mut [PackedF], offset: usize, data:
 /// * `dest` - Destination slice to pack into
 /// * `offset` - Starting index in `dest`
 /// * `f` - Function that takes (element_index, lane_index) and returns a field element
-#[inline(always)]
+#[inline]
 pub fn pack_fn_into<const N: usize>(
     dest: &mut [PackedF],
     offset: usize,

From 6a182fd3d1fb3ae3dc39a92f0bb921048236cd19 Mon Sep 17 00:00:00 2001
From: Vindaar
Date: Tue, 16 Dec 2025 13:10:07 +0100
Subject: [PATCH 06/13] remove `unpack_to_array`, `pack_column` and `pack_into` & their tests

---
 src/simd_utils.rs | 90 -----------------------------------------------
 1 file changed, 90 deletions(-)

diff --git a/src/simd_utils.rs b/src/simd_utils.rs
index da1afa0..550705b 100644
--- a/src/simd_utils.rs
+++ b/src/simd_utils.rs
@@ -47,34 +47,6 @@ pub fn unpack_array<const N: usize>(packed_data: &[PackedF; N], output: &mut [FieldArray<N>]) {
     }
 }

-#[inline(always)]
-pub fn unpack_to_array<const N: usize>(
-    packed_data: [PackedF; N],
-) -> [FieldArray<N>; PackedF::WIDTH] {
-    array::from_fn(|j| FieldArray(array::from_fn(|i| packed_data[i].as_slice()[j])))
-}
-
-#[inline(always)]
-pub fn pack_column(col: [F; PackedF::WIDTH]) -> PackedF {
-    PackedF::from_fn(|i| col[i])
-}
-
-/// Pack contiguous FieldArrays directly into a destination slice at the given offset.
-///
-/// Packs `data[0..WIDTH]` into `dest[offset..offset+N]`.
-/// This avoids creating an intermediate `[PackedF; N]` array.
-///
-/// # Arguments
-/// * `dest` - Destination slice to pack into
-/// * `offset` - Starting index in `dest`
-/// * `data` - Source slice of FieldArrays (must have length >= WIDTH)
-#[inline(always)]
-pub fn pack_into<const N: usize>(dest: &mut [PackedF], offset: usize, data: &[FieldArray<N>]) {
-    for i in 0..N {
-        dest[offset + i] = PackedF::from_fn(|lane| data[lane][i]);
-    }
-}
-
 /// Pack even-indexed FieldArrays (stride 2) directly into destination.
@@ -175,24 +147,6 @@ mod tests {
         }
     }

-    #[test]
-    fn test_unpack_to_array() {
-        // Create packed data
-        let packed: [PackedF; 2] = [
-            PackedF::from_fn(|i| F::from_u64(i as u64)),
-            PackedF::from_fn(|i| F::from_u64((i + 100) as u64)),
-        ];
-
-        // Unpack using the new function
-        let output = unpack_to_array(packed);
-
-        // Verify
-        for (lane, arr) in output.iter().enumerate() {
-            assert_eq!(arr[0], F::from_u64(lane as u64));
-            assert_eq!(arr[1], F::from_u64((lane + 100) as u64));
-        }
-    }
-
     #[test]
     fn test_pack_preserves_element_order() {
         // Create data where each array has sequential values
@@ -259,50 +213,6 @@ mod tests {
             prop_assert_eq!(original, unpacked);
         }

-        #[test]
-        fn proptest_unpack_to_array_matches_unpack_array(
-            _seed in any::<u64>()
-        ) {
-            let mut rng = rand::rng();
-
-            // Generate random packed data
-            let packed: [PackedF; 8] = array::from_fn(|_| {
-                PackedF::from_fn(|_| rng.random())
-            });
-
-            // Unpack using both methods
-            let mut output1 = [FieldArray([F::ZERO; 8]); PackedF::WIDTH];
-            unpack_array(&packed, &mut output1);
-            let output2 = unpack_to_array(packed);
-
-            // Verify they match
-            prop_assert_eq!(output1, output2);
-        }
-
-        #[test]
-        fn proptest_pack_into_matches_pack_array(
-            _seed in any::<u64>()
-        ) {
-            let mut rng = rand::rng();
-
-            // Generate random data
-            let data: [FieldArray<7>; PackedF::WIDTH] = array::from_fn(|_| {
-                FieldArray(array::from_fn(|_| rng.random()))
-            });
-
-            // Pack using pack_array
-            let expected = pack_array(&data);
-
-            // Pack using pack_into
-            let mut dest = [PackedF::ZERO; 10];
-            pack_into(&mut dest, 2, &data);
-
-            // Verify they match at the offset
-            for i in 0..7 {
-                prop_assert_eq!(dest[2 + i], expected[i]);
-            }
-        }
-

From 026bf3b2afb44c7c7a4b24f1910fe2dc10749563 Mon Sep 17 00:00:00 2001
From: Vindaar
Date: Tue, 16 Dec 2025 13:10:46 +0100
Subject: [PATCH 07/13] annotate `unpack_array` for loop with needless_range_loop

No, I do not prefer
```
for (j, <item>) in output.iter_mut().enumerate().take(PackedF::WIDTH){
```
thank you, Clippy.
---
 src/simd_utils.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/simd_utils.rs b/src/simd_utils.rs
index 550705b..c4d355f 100644
--- a/src/simd_utils.rs
+++ b/src/simd_utils.rs
@@ -40,6 +40,7 @@ pub fn unpack_array<const N: usize>(packed_data: &[PackedF; N], output: &mut [FieldArray<N>]) {
     // Optimized for cache locality: iterate over output lanes first
+    #[allow(clippy::needless_range_loop)]
     for j in 0..PackedF::WIDTH {

From 6faeaff43b7c66b0095e328c92d3fda191e78485 Mon Sep 17 00:00:00 2001
From: Vindaar
Date: Tue, 16 Dec 2025 13:17:02 +0100
Subject: [PATCH 08/13] put input / output layout doc comment for `unpack_array` back

Was removed in
https://github.com/tcoratger/leanSig/commit/e8a727381248df83f123f2a8c9a616d9bdb1f277
and I overlooked it while squashing a bunch of our commits.
---
 src/simd_utils.rs | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/simd_utils.rs b/src/simd_utils.rs
index c4d355f..a181a6d 100644
--- a/src/simd_utils.rs
+++ b/src/simd_utils.rs
@@ -37,6 +37,22 @@ pub fn pack_array<const N: usize>(data: &[FieldArray<N>]) -> [PackedF; N] {
 ///
 /// This is the inverse operation of `pack_array`. The output buffer must be preallocated
 /// with size `[WIDTH]` where `WIDTH = PackedF::WIDTH`, and each element is a `FieldArray`.
+///
+/// Input layout (vertical): each PackedF holds one element from each array
+/// ```text
+/// packed_data[0] = PackedF([a0, b0, c0, ...])
+/// packed_data[1] = PackedF([a1, b1, c1, ...])
+/// packed_data[2] = PackedF([a2, b2, c2, ...])
+/// ...
+/// ```
+///
+/// Output layout (horizontal): each FieldArray is one complete array
+/// ```text
+/// output[0] = FieldArray([a0, a1, a2, ..., aN])
+/// output[1] = FieldArray([b0, b1, b2, ..., bN])
+/// output[2] = FieldArray([c0, c1, c2, ..., cN])
+/// ...
+/// ```
 #[inline]
 pub fn unpack_array<const N: usize>(packed_data: &[PackedF; N], output: &mut [FieldArray<N>]) {
     // Optimized for cache locality: iterate over output lanes first

From d06234767a655a835865f5874c47ec0bd0daaae5 Mon Sep 17 00:00:00 2001
From: Vindaar
Date: Tue, 16 Dec 2025 13:24:50 +0100
Subject: [PATCH 09/13] remove comment about test section

---
 src/symmetric/tweak_hash/poseidon.rs | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs
index eddec2f..265c1cd 100644
--- a/src/symmetric/tweak_hash/poseidon.rs
+++ b/src/symmetric/tweak_hash/poseidon.rs
@@ -1371,8 +1371,6 @@ mod tests {
         }
     }

-    // ==================== compute_tree_layer tests ====================
-
     /// Scalar reference implementation for compute_tree_layer.
     /// Used to verify the SIMD implementation produces correct results.

From 83a8a3c348db261645b28b04db9cfd3c38fb4131 Mon Sep 17 00:00:00 2001
From: Vindaar
Date: Tue, 16 Dec 2025 13:25:01 +0100
Subject: [PATCH 10/13] use clippy too_many_lines for `compute_tree_leaves`

---
 src/symmetric/tweak_hash/poseidon.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs
index 265c1cd..026779f 100644
--- a/src/symmetric/tweak_hash/poseidon.rs
+++ b/src/symmetric/tweak_hash/poseidon.rs
@@ -446,6 +446,7 @@ impl<
         parents
     }

+    #[allow(clippy::too_many_lines)]
     fn compute_tree_leaves(
         prf_key: &PRF::Key,
         parameter: &Self::Parameter,

From 755292bc469c0f0afcadacf971ae158efeed362f Mon Sep 17 00:00:00 2001
From: Vindaar
Date: Tue, 16 Dec 2025 13:59:27 +0100
Subject: [PATCH 11/13] use chunked iterator for assignment of packed chains

---
 src/symmetric/tweak_hash/poseidon.rs | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs
index 026779f..eefd700 100644
--- a/src/symmetric/tweak_hash/poseidon.rs
+++ b/src/symmetric/tweak_hash/poseidon.rs
@@ -615,11 +615,9 @@ impl<
         );

         // Copy all chain ends (already packed)
-        for (c_idx, chain) in packed_chains.iter().enumerate() {
-            packed_leaf_input
-                [sponge_chains_offset + c_idx * HASH_LEN
-                    ..sponge_chains_offset + (c_idx + 1) * HASH_LEN]
-                .copy_from_slice(chain);
+        let dst = &mut packed_leaf_input[sponge_chains_offset ..
sponge_chains_offset + packed_chains.len() * HASH_LEN]; + for (dst_chunk, src_chain) in dst.chunks_exact_mut(HASH_LEN).zip(packed_chains.iter()) { + dst_chunk.copy_from_slice(src_chain); } // Apply the sponge hash to produce the leaf. From 99cbaf5c923bffe0652cad932fad7be471d76390 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Wed, 17 Dec 2025 12:17:49 +0100 Subject: [PATCH 12/13] fix `level` passed to `tree_tweak` in default impl --- src/symmetric/tweak_hash.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/symmetric/tweak_hash.rs b/src/symmetric/tweak_hash.rs index 61426b8..8f98f2f 100644 --- a/src/symmetric/tweak_hash.rs +++ b/src/symmetric/tweak_hash.rs @@ -63,11 +63,7 @@ pub trait TweakableHash { // Parent index in this layer let parent_pos = (parent_start + i) as u32; // Hash children into their parent using the tweak - Self::apply( - parameter, - &Self::tree_tweak(level + 1, parent_pos), - children, - ) + Self::apply(parameter, &Self::tree_tweak(level, parent_pos), children) }) .collect() } From 9571be327fd7542e7b5ce11f6b80591d77b7e38d Mon Sep 17 00:00:00 2001 From: Vindaar Date: Wed, 17 Dec 2025 12:58:03 +0100 Subject: [PATCH 13/13] add doc comments for compute_tree_layer and poseidon override --- src/symmetric/tweak_hash.rs | 21 ++++++++++++++++++++- src/symmetric/tweak_hash/poseidon.rs | 4 ++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/src/symmetric/tweak_hash.rs b/src/symmetric/tweak_hash.rs index 8f98f2f..ff032bf 100644 --- a/src/symmetric/tweak_hash.rs +++ b/src/symmetric/tweak_hash.rs @@ -48,7 +48,26 @@ pub trait TweakableHash { message: &[Self::Domain], ) -> Self::Domain; - /// Applies the calculation for a single tweak hash tree layer. + /// Computes one layer of a Merkle tree by hashing pairs of children into parents. + /// + /// Consecutive pairs of child nodes produce their parent node by hashing + /// `(children[2*i], children[2*i+1])`. Each hash application uses a unique + /// tweak derived from the tree level and position. + /// + /// # Arguments + /// * `parameter` - Public parameter for the hash function + /// * `level` - Tree level of the *parent* nodes being computed. NOTE: callers + /// need to pass `level + 1` where `level` is the children's level, since + /// tree levels are numbered from leaves (level 0) upward. + /// * `parent_start` - Starting index of the first parent in this layer, used + /// for computing position-dependent tweaks + /// * `children` - Slice of child nodes to hash pairwise (length must be even) + /// + /// # Returns + /// A vector of parent nodes with length `children.len() / 2`. + /// + /// This default implementation processes pairs in parallel using Rayon. + /// The Poseidon implementation overrides this with a SIMD-accelerated variant. fn compute_tree_layer( parameter: &Self::Parameter, level: u8, diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs index eefd700..c257297 100644 --- a/src/symmetric/tweak_hash/poseidon.rs +++ b/src/symmetric/tweak_hash/poseidon.rs @@ -371,6 +371,10 @@ impl< } } + /// SIMD-accelerated computation of one Merkle tree layer. + /// + /// Processes `PackedF::WIDTH` parent pairs simultaneously using SIMD instructions, + /// with a scalar fallback for any remainder elements. fn compute_tree_layer( parameter: &Self::Parameter, level: u8,
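
The core trick running through this series is the vertical ("structure-of-arrays") packing: element `i` of `WIDTH` different hashes sits in one SIMD register, so a single packed Poseidon permutation advances `WIDTH` chains or tree nodes at once. Below is a minimal standalone sketch of that transpose — it models `PackedF` as a plain `[u64; WIDTH]` with an assumed `WIDTH = 4` and `u64` standing in for the field type; none of the crate's real types or Poseidon calls are used here.

```rust
use std::array;

const WIDTH: usize = 4; // stand-in for PackedF::WIDTH
type Packed = [u64; WIDTH]; // stand-in for PackedF: one element position across WIDTH inputs

/// Horizontal -> vertical: data[lane][i] becomes packed[i][lane] (cf. pack_array).
fn pack_array<const N: usize>(data: &[[u64; N]]) -> [Packed; N] {
    array::from_fn(|i| array::from_fn(|lane| data[lane][i]))
}

/// Vertical -> horizontal: the inverse transpose (cf. unpack_array).
fn unpack_array<const N: usize>(packed: &[Packed; N], output: &mut [[u64; N]]) {
    for lane in 0..WIDTH {
        for i in 0..N {
            output[lane][i] = packed[i][lane];
        }
    }
}

fn main() {
    // Four 3-element "hashes", one per SIMD lane.
    let data: Vec<[u64; 3]> = (0..WIDTH as u64)
        .map(|lane| [10 * lane, 10 * lane + 1, 10 * lane + 2])
        .collect();

    let packed = pack_array::<3>(&data);
    // packed[0] holds element 0 of every input, so one SIMD op touches all lanes.
    assert_eq!(packed[0], [0, 10, 20, 30]);

    let mut round_trip = vec![[0u64; 3]; WIDTH];
    unpack_array(&packed, &mut round_trip);
    assert_eq!(round_trip, data); // packing is a lossless transpose
}
```

`pack_even_into`/`pack_odd_into` from the patches are this same transpose restricted to the even or odd positions of an interleaved `[L0, R0, L1, R1, ...]` child slice, and `pack_fn_into` generates each lane's value on the fly (e.g. per-lane tweaks) instead of reading it from memory.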