From 3585dada3f71e7b13a6d6e9a292733273d87b0b6 Mon Sep 17 00:00:00 2001
From: Vindaar
Date: Tue, 9 Dec 2025 15:11:32 +0100
Subject: [PATCH 01/13] implement variant of tree layer computation using SIMD packing

---
 src/symmetric/tweak_hash.rs          | 26 +++++++++
 src/symmetric/tweak_hash/poseidon.rs | 83 ++++++++++++++++++++++++++++
 src/symmetric/tweak_hash_tree.rs     | 18 +-----
 3 files changed, 111 insertions(+), 16 deletions(-)

diff --git a/src/symmetric/tweak_hash.rs b/src/symmetric/tweak_hash.rs
index efbb90d..dd0d3ba 100644
--- a/src/symmetric/tweak_hash.rs
+++ b/src/symmetric/tweak_hash.rs
@@ -1,5 +1,7 @@
 use rand::Rng;

+use rayon::prelude::*;
+
 use crate::serialization::Serializable;
 use crate::symmetric::prf::Pseudorandom;

@@ -46,6 +48,30 @@ pub trait TweakableHash {
         message: &[Self::Domain],
     ) -> Self::Domain;

+    /// Applies the calculation for a single tweak hash tree layer.
+    fn compute_tree_layer(
+        parameter: &Self::Parameter,
+        level: u8,
+        parent_start: usize,
+        children: &[Self::Domain],
+    ) -> Vec<Self::Domain> {
+        // default implementation is scalar. tweak_hash/poseidon.rs provides a SIMD variant
+        children
+            .par_chunks_exact(2)
+            .enumerate()
+            .map(|(i, children)| {
+                // Parent index in this layer
+                let parent_pos = (parent_start + i) as u32;
+                // Hash children into their parent using the tweak
+                Self::apply(
+                    parameter,
+                    &Self::tree_tweak((level as u8) + 1, parent_pos),
+                    children,
+                )
+            })
+            .collect()
+    }
+
     /// Computes bottom tree leaves by walking hash chains for multiple epochs.
     ///
     /// This method has a default scalar implementation that processes epochs in parallel.
diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs
index bcd709f..7f4fb72 100644
--- a/src/symmetric/tweak_hash/poseidon.rs
+++ b/src/symmetric/tweak_hash/poseidon.rs
@@ -371,6 +371,89 @@ impl<
         }
     }

+    fn compute_tree_layer(
+        parameter: &Self::Parameter,
+        level: u8,
+        parent_start: usize,
+        children: &[Self::Domain],
+    ) -> Vec<Self::Domain> {
+        // SIMD implementation specifically for Poseidon
+
+        // Broadcast the hash parameter to all SIMD lanes.
+        // Each lane will use the same parameter
+        let packed_parameter: [PackedF; PARAMETER_LEN] =
+            array::from_fn(|i| PackedF::from(parameter[i]));
+
+        const WIDTH: usize = PackedF::WIDTH;
+
+        // permutation to use for the compression. 24 as we merge two inputs
+        let perm = poseidon2_24();
+
+        // preallocate a vector that can hold the SIMD part as well as any possible scalar remainder
+        let mut parents = Vec::with_capacity(children.len() / 2);
+        parents.par_extend(children.par_chunks_exact(2 * WIDTH).enumerate().flat_map(
+            |(i, children)| {
+                let parent_pos = (parent_start + i * WIDTH) as u32;
+                let packed_tweak = array::from_fn::<_, TWEAK_LEN, _>(|t_idx| {
+                    PackedF::from_fn(|lane| {
+                        let parent_pos_per_lane = parent_pos + (lane as u32);
+                        Self::tree_tweak(level, parent_pos_per_lane)
+                            .to_field_elements::<TWEAK_LEN>()[t_idx]
+                    })
+                });
+
+                // Assemble the packed input for the hash function.
+                // Layout: [parameter | tweak | left | right]
+                let mut packed_input = [PackedF::ZERO; MERGE_COMPRESSION_WIDTH];
+                let mut current_pos = 0;

+                // Copy parameter into the input buffer.
+                packed_input[current_pos..current_pos + PARAMETER_LEN]
+                    .copy_from_slice(&packed_parameter);
+                current_pos += PARAMETER_LEN;
+
+                // Copy tweak into the input buffer.
+                packed_input[current_pos..current_pos + TWEAK_LEN].copy_from_slice(&packed_tweak);
+                current_pos += TWEAK_LEN;
+
+                // Copy the left child value into the input buffer.
+                let lefts: [FieldArray<HASH_LEN>; WIDTH] = array::from_fn(|k| children[2 * k]);
+                let packed_lefts = pack_array(&lefts);
+                packed_input[current_pos..current_pos + HASH_LEN].copy_from_slice(&packed_lefts);
+                current_pos += HASH_LEN;
+
+                // Copy the right child value into the input buffer.
+                let rights: [FieldArray<HASH_LEN>; WIDTH] = array::from_fn(|k| children[2 * k + 1]);
+                let packed_rights = pack_array(&rights);
+                packed_input[current_pos..current_pos + HASH_LEN].copy_from_slice(&packed_rights);
+
+                let packed_parents =
+                    poseidon_compress::<PackedF, MERGE_COMPRESSION_WIDTH, HASH_LEN>(
+                        &perm,
+                        &packed_input,
+                    );
+
+                // unpack the parents from SIMD to scalar output
+                let mut parents = [FieldArray([F::ZERO; HASH_LEN]); WIDTH];
+                unpack_array(&packed_parents, &mut parents);
+
+                parents
+            },
+        ));
+
+        // handle non-WIDTH leftover elements
+        let remainder = children.par_chunks_exact(2 * WIDTH).remainder();
+
+        // TODO: parallel iterator here likely not worth it?
+        let num_simd_parents = parents.len();
+        parents.par_extend(remainder.par_chunks_exact(2).enumerate().map(|(i, pair)| {
+            let pos = parent_start + num_simd_parents + i;
+            Self::apply(parameter, &Self::tree_tweak(level, pos as u32), pair)
+        }));
+
+        parents
+    }
+
     fn compute_tree_leaves(
         prf_key: &PRF::Key,
         parameter: &Self::Parameter,
diff --git a/src/symmetric/tweak_hash_tree.rs b/src/symmetric/tweak_hash_tree.rs
index 74ef9ad..a976008 100644
--- a/src/symmetric/tweak_hash_tree.rs
+++ b/src/symmetric/tweak_hash_tree.rs
@@ -2,7 +2,6 @@ use crate::serialization::Serializable;
 use crate::symmetric::tweak_hash::TweakableHash;
 use rand::rngs::StdRng;
 use rand::{Rng, SeedableRng};
-use rayon::prelude::*;
 use serde::{Deserialize, Serialize};
 use ssz::{Decode, DecodeError, Encode};

@@ -391,21 +390,8 @@ where
         // Compute all parents in parallel, pairing children two-by-two
         //
         // We do exact chunks of two children, no remainder.
-        let parents = prev
-            .nodes
-            .par_chunks_exact(2)
-            .enumerate()
-            .map(|(i, children)| {
-                // Parent index in this layer
-                let parent_pos = (parent_start + i) as u32;
-                // Hash children into their parent using the tweak
-                TH::apply(
-                    parameter,
-                    &TH::tree_tweak((level as u8) + 1, parent_pos),
-                    children,
-                )
-            })
-            .collect();
+        let parents =
+            TH::compute_tree_layer(&parameter, level as u8 + 1, parent_start, &prev.nodes);

         // Add the new layer with padding so next iteration also has even start and length
         layers.push(HashTreeLayer::padded(rng, parents, parent_start));

From c6c16fc79d40520e5e6106edaf7d83ca19070883 Mon Sep 17 00:00:00 2001
From: Vindaar
Date: Tue, 9 Dec 2025 15:48:05 +0100
Subject: [PATCH 02/13] [tests] add test cases for SIMD tree construction

---
 src/symmetric/tweak_hash/poseidon.rs | 314 +++++++++++++++++++++++++++
 1 file changed, 314 insertions(+)

diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs
index 7f4fb72..5442161 100644
--- a/src/symmetric/tweak_hash/poseidon.rs
+++ b/src/symmetric/tweak_hash/poseidon.rs
@@ -1370,4 +1370,318 @@ mod tests {
             }
         }
     }
+
+    // ==================== compute_tree_layer tests ====================
+
+    /// Scalar reference implementation for compute_tree_layer.
+    /// Used to verify the SIMD implementation produces correct results.
+    fn compute_tree_layer_scalar<TH: TweakableHash>(
+        parameter: &TH::Parameter,
+        level: u8,
+        parent_start: usize,
+        children: &[TH::Domain],
+    ) -> Vec<TH::Domain> {
+        children
+            .chunks_exact(2)
+            .enumerate()
+            .map(|(i, pair)| {
+                TH::apply(
+                    parameter,
+                    &TH::tree_tweak(level, (parent_start + i) as u32),
+                    pair,
+                )
+            })
+            .collect()
+    }
+
+    #[test]
+    fn test_compute_tree_layer_matches_scalar() {
+        let mut rng = rand::rng();
+        let parameter = PoseidonTweak44::rand_parameter(&mut rng);
+
+        // Test with 16 children (8 pairs)
+        let children: Vec<_> = (0..16)
+            .map(|_| PoseidonTweak44::rand_domain(&mut rng))
+            .collect();
+
+        let level = 1u8;
+        let parent_start = 0usize;
+
+        let simd_result =
+            PoseidonTweak44::compute_tree_layer(&parameter, level, parent_start, &children);
+        let scalar_result = compute_tree_layer_scalar::<PoseidonTweak44>(
+            &parameter,
+            level,
+            parent_start,
+            &children,
+        );
+
+        assert_eq!(simd_result.len(), scalar_result.len());
+        assert_eq!(simd_result, scalar_result);
+    }
+
+    #[test]
+    fn test_compute_tree_layer_output_length() {
+        let mut rng = rand::rng();
+        let parameter = PoseidonTweak44::rand_parameter(&mut rng);
+
+        // Test various input sizes
+        for num_pairs in [1, 2, 4, 7, 8, 15, 16, 17, 32, 33] {
+            let children: Vec<_> = (0..num_pairs * 2)
+                .map(|_| PoseidonTweak44::rand_domain(&mut rng))
+                .collect();
+
+            let result = PoseidonTweak44::compute_tree_layer(&parameter, 1, 0, &children);
+
+            assert_eq!(
+                result.len(),
+                num_pairs,
+                "Expected {} parents for {} children, got {}",
+                num_pairs,
+                num_pairs * 2,
+                result.len()
+            );
+        }
+    }
+
+    #[test]
+    fn test_compute_tree_layer_determinism() {
+        let mut rng = rand::rng();
+        let parameter = PoseidonTweak44::rand_parameter(&mut rng);
+
+        let children: Vec<_> = (0..20)
+            .map(|_| PoseidonTweak44::rand_domain(&mut rng))
+            .collect();
+
+        let result1 = PoseidonTweak44::compute_tree_layer(&parameter, 1, 0, &children);
+        let result2 = PoseidonTweak44::compute_tree_layer(&parameter, 1, 0, &children);
+
+        assert_eq!(
+            result1, result2,
+            "compute_tree_layer should be deterministic"
+        );
+    }
+
+    #[test]
+    fn test_compute_tree_layer_level_affects_output() {
+        let mut rng = rand::rng();
+        let parameter = PoseidonTweak44::rand_parameter(&mut rng);
+
+        let children: Vec<_> = (0..16)
+            .map(|_| PoseidonTweak44::rand_domain(&mut rng))
+            .collect();
+
+        let result_level_1 = PoseidonTweak44::compute_tree_layer(&parameter, 1, 0, &children);
+        let result_level_2 = PoseidonTweak44::compute_tree_layer(&parameter, 2, 0, &children);
+
+        assert_ne!(
+            result_level_1, result_level_2,
+            "Different levels should produce different outputs"
+        );
+    }
+
+    #[test]
+    fn test_compute_tree_layer_parent_start_affects_output() {
+        let mut rng = rand::rng();
+        let parameter = PoseidonTweak44::rand_parameter(&mut rng);
+
+        let children: Vec<_> = (0..16)
+            .map(|_| PoseidonTweak44::rand_domain(&mut rng))
+            .collect();
+
+        let result_start_0 = PoseidonTweak44::compute_tree_layer(&parameter, 1, 0, &children);
+        let result_start_10 = PoseidonTweak44::compute_tree_layer(&parameter, 1, 10, &children);
+
+        assert_ne!(
+            result_start_0, result_start_10,
+            "Different parent_start should produce different outputs"
+        );
+    }
+
+    #[test]
+    fn test_compute_tree_layer_simd_boundary_exact_width() {
+        // Test with exactly 2 * WIDTH children (one full SIMD batch, no remainder)
+        let mut rng = rand::rng();
+        let parameter = PoseidonTweak44::rand_parameter(&mut rng);
+
+        let width = PackedF::WIDTH;
+        let children: Vec<_> = (0..2 * width)
+            .map(|_| PoseidonTweak44::rand_domain(&mut rng))
+            .collect();
+
+        let simd_result =
+            PoseidonTweak44::compute_tree_layer(&parameter, 1, 0, &children);
+        let scalar_result =
+            compute_tree_layer_scalar::<PoseidonTweak44>(&parameter, 1, 0, &children);
+
+        assert_eq!(simd_result, scalar_result);
+    }
+
+    #[test]
+    fn test_compute_tree_layer_simd_boundary_with_remainder() {
+        // Test with 2 * WIDTH + 2 children (one SIMD batch + one remainder pair)
+        let mut rng = rand::rng();
+        let parameter = PoseidonTweak44::rand_parameter(&mut rng);
+
+        let width = PackedF::WIDTH;
+        let children: Vec<_> = (0..2 * width + 2)
+            .map(|_| PoseidonTweak44::rand_domain(&mut rng))
+            .collect();
+
+        let simd_result = PoseidonTweak44::compute_tree_layer(&parameter, 1, 0, &children);
+        let scalar_result =
+            compute_tree_layer_scalar::<PoseidonTweak44>(&parameter, 1, 0, &children);
+
+        assert_eq!(
+            simd_result.len(),
+            width + 1,
+            "Should have WIDTH + 1 parents"
+        );
+        assert_eq!(simd_result, scalar_result);
+    }
+
+    #[test]
+    fn test_compute_tree_layer_only_remainder() {
+        // Test with fewer than 2 * WIDTH children (entire computation is remainder)
+        let mut rng = rand::rng();
+        let parameter = PoseidonTweak44::rand_parameter(&mut rng);
+
+        let width = PackedF::WIDTH;
+
+        // Test sizes smaller than one SIMD batch
+        for num_pairs in 1..width {
+            let children: Vec<_> = (0..num_pairs * 2)
+                .map(|_| PoseidonTweak44::rand_domain(&mut rng))
+                .collect();
+
+            let simd_result = PoseidonTweak44::compute_tree_layer(&parameter, 1, 0, &children);
+            let scalar_result =
+                compute_tree_layer_scalar::<PoseidonTweak44>(&parameter, 1, 0, &children);
+
+            assert_eq!(
+                simd_result, scalar_result,
+                "Failed for num_pairs = {}",
+                num_pairs
+            );
+        }
+    }
+
+    #[test]
+    fn test_compute_tree_layer_two_simd_batches() {
+        // Test with 4 * WIDTH children (two full SIMD batches)
+        let mut rng = rand::rng();
+        let parameter = PoseidonTweak44::rand_parameter(&mut rng);
+
+        let width = PackedF::WIDTH;
+        let children: Vec<_> = (0..4 * width)
+            .map(|_| PoseidonTweak44::rand_domain(&mut rng))
+            .collect();
+
+        let simd_result = PoseidonTweak44::compute_tree_layer(&parameter, 1, 0, &children);
+        let scalar_result =
+            compute_tree_layer_scalar::<PoseidonTweak44>(&parameter, 1, 0, &children);
+
+        assert_eq!(simd_result.len(), 2 * width);
+        assert_eq!(simd_result, scalar_result);
+    }
+
+    #[test]
+    fn test_compute_tree_layer_two_batches_with_remainder() {
+        // Test with 4 * WIDTH + 2 children (two SIMD batches + one remainder pair)
+        let mut rng = rand::rng();
+        let parameter = PoseidonTweak44::rand_parameter(&mut rng);
+
+        let width = PackedF::WIDTH;
+        let children: Vec<_> = (0..4 * width + 2)
+            .map(|_| PoseidonTweak44::rand_domain(&mut rng))
+            .collect();
+
+        let simd_result = PoseidonTweak44::compute_tree_layer(&parameter, 1, 0, &children);
+        let scalar_result =
+            compute_tree_layer_scalar::<PoseidonTweak44>(&parameter, 1, 0, &children);
+
+        assert_eq!(simd_result.len(), 2 * width + 1);
+        assert_eq!(simd_result, scalar_result);
+    }
+
+    #[test]
+    fn test_compute_tree_layer_boundary_sweep() {
+        // Test all pair counts from 1 to 4 * WIDTH + 1 to catch off-by-one errors
+        let mut rng = rand::rng();
+        let parameter = PoseidonTweak44::rand_parameter(&mut rng);
+
+        let width = PackedF::WIDTH;
+        let max_pairs = 4 * width + 1;
+
+        for num_pairs in 1..=max_pairs {
+            let children: Vec<_> = (0..num_pairs * 2)
+                .map(|_| PoseidonTweak44::rand_domain(&mut rng))
+                .collect();
+
+            let simd_result = PoseidonTweak44::compute_tree_layer(&parameter, 1, 0, &children);
+            let scalar_result =
+                compute_tree_layer_scalar::<PoseidonTweak44>(&parameter, 1, 0, &children);
+
+            assert_eq!(
+                simd_result, scalar_result,
+                "Mismatch for num_pairs = {} (WIDTH = {})",
+                num_pairs, width
+            );
+        }
+    }
+
+    #[test]
+    fn test_compute_tree_layer_nonzero_parent_start() {
+        // Test with various parent_start values to ensure tweaks are correct
+        let mut rng = rand::rng();
+        let parameter = PoseidonTweak44::rand_parameter(&mut rng);
+
+        let width = PackedF::WIDTH;
+
+        for parent_start in [0, 1, 10, 100, 1000] {
+            let children: Vec<_> = (0..2 * width + 4)
+                .map(|_| PoseidonTweak44::rand_domain(&mut rng))
+                .collect();
+
+            let simd_result =
+                PoseidonTweak44::compute_tree_layer(&parameter, 1, parent_start, &children);
+            let scalar_result = compute_tree_layer_scalar::<PoseidonTweak44>(
+                &parameter,
+                1,
+                parent_start,
+                &children,
+            );
+
+            assert_eq!(
+                simd_result, scalar_result,
+                "Mismatch for parent_start = {}",
+                parent_start
+            );
+        }
+    }
+
+    proptest! {
+        #[test]
+        fn proptest_compute_tree_layer_matches_scalar(
+            num_pairs in 1usize..64,
+            level in 0u8..32,
+            parent_start in 0usize..1000,
+            seed in any::<u64>(),
+        ) {
+            use rand::SeedableRng;
+            let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
+
+            let parameter = PoseidonTweak44::rand_parameter(&mut rng);
+            let children: Vec<_> = (0..num_pairs * 2)
+                .map(|_| PoseidonTweak44::rand_domain(&mut rng))
+                .collect();
+
+            let simd_result =
+                PoseidonTweak44::compute_tree_layer(&parameter, level, parent_start, &children);
+            let scalar_result =
+                compute_tree_layer_scalar::<PoseidonTweak44>(&parameter, level, parent_start, &children);
+
+            prop_assert_eq!(simd_result.len(), num_pairs);
+            prop_assert_eq!(simd_result, scalar_result);
+        }
+    }
 }

From 3cc78ea394c6d4ba4404f424d5f8aa526e9dca81 Mon Sep 17 00:00:00 2001
From: Vindaar
Date: Wed, 10 Dec 2025 13:12:40 +0100
Subject: [PATCH 03/13] address clippy errors

---
 src/symmetric/tweak_hash.rs          | 2 +-
 src/symmetric/tweak_hash/poseidon.rs | 3 +--
 src/symmetric/tweak_hash_tree.rs     | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/symmetric/tweak_hash.rs b/src/symmetric/tweak_hash.rs
index dd0d3ba..61426b8 100644
--- a/src/symmetric/tweak_hash.rs
+++ b/src/symmetric/tweak_hash.rs
@@ -65,7 +65,7 @@ pub trait TweakableHash {
                 // Hash children into their parent using the tweak
                 Self::apply(
                     parameter,
-                    &Self::tree_tweak((level as u8) + 1, parent_pos),
+                    &Self::tree_tweak(level + 1, parent_pos),
                     children,
                 )
             })
diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs
index 5442161..b12d94b 100644
--- a/src/symmetric/tweak_hash/poseidon.rs
+++ b/src/symmetric/tweak_hash/poseidon.rs
@@ -378,14 +378,13 @@ impl<
         children: &[Self::Domain],
     ) -> Vec<Self::Domain> {
         // SIMD implementation specifically for Poseidon
+        const WIDTH: usize = PackedF::WIDTH;

         // Broadcast the hash parameter to all SIMD lanes.
         // Each lane will use the same parameter
         let packed_parameter: [PackedF; PARAMETER_LEN] =
             array::from_fn(|i| PackedF::from(parameter[i]));

-        const WIDTH: usize = PackedF::WIDTH;
-
         // permutation to use for the compression. 24 as we merge two inputs
         let perm = poseidon2_24();

diff --git a/src/symmetric/tweak_hash_tree.rs b/src/symmetric/tweak_hash_tree.rs
index a976008..7a1a54a 100644
--- a/src/symmetric/tweak_hash_tree.rs
+++ b/src/symmetric/tweak_hash_tree.rs
@@ -391,7 +391,7 @@ where
         //
         // We do exact chunks of two children, no remainder.
         let parents =
-            TH::compute_tree_layer(&parameter, level as u8 + 1, parent_start, &prev.nodes);
+            TH::compute_tree_layer(parameter, level as u8 + 1, parent_start, &prev.nodes);

         // Add the new layer with padding so next iteration also has even start and length
         layers.push(HashTreeLayer::padded(rng, parents, parent_start));

From c9e234f1bea6975f5bcd9321c09a58f05623e9d4 Mon Sep 17 00:00:00 2001
From: Vindaar
Date: Mon, 15 Dec 2025 19:20:02 +0100
Subject: [PATCH 04/13] avoid temporary arrays entirely during packing

Co-Authored-By: Thomas Coratger <60488569+tcoratger@users.noreply.github.com>
---
 src/simd_utils.rs                    | 234 ++++++++++++++++++++++++---
 src/symmetric/tweak_hash/poseidon.rs | 173 ++++++++++----------
 2 files changed, 298 insertions(+), 109 deletions(-)

diff --git a/src/simd_utils.rs b/src/simd_utils.rs
index c9c74bc..0b90940 100644
--- a/src/simd_utils.rs
+++ b/src/simd_utils.rs
@@ -2,7 +2,7 @@ use core::array;

 use p3_field::PackedValue;

-use crate::{PackedF, array::FieldArray};
+use crate::{F, PackedF, array::FieldArray};

 /// Packs scalar arrays into SIMD-friendly vertical layout.
 ///
@@ -26,7 +26,7 @@ use crate::{F, PackedF, array::FieldArray};
 ///
 /// This vertical packing enables efficient SIMD operations where a single instruction
 /// processes the same element position across multiple arrays simultaneously.
-#[inline]
+#[inline(always)]
 pub fn pack_array<const N: usize>(data: &[FieldArray<N>]) -> [PackedF; N] {
     array::from_fn(|i| PackedF::from_fn(|j| data[j][i]))
 }
@@ -37,32 +37,96 @@ pub fn pack_array<const N: usize>(data: &[FieldArray<N>]) -> [PackedF; N] {
 ///
 /// This is the inverse operation of `pack_array`. The output buffer must be preallocated
 /// with size `[WIDTH]` where `WIDTH = PackedF::WIDTH`, and each element is a `FieldArray`.
-///
-/// Input layout (vertical): each PackedF holds one element from each array
-/// ```text
-/// packed_data[0] = PackedF([a0, b0, c0, ...])
-/// packed_data[1] = PackedF([a1, b1, c1, ...])
-/// packed_data[2] = PackedF([a2, b2, c2, ...])
-/// ...
-/// ```
-///
-/// Output layout (horizontal): each FieldArray is one complete array
-/// ```text
-/// output[0] = FieldArray([a0, a1, a2, ..., aN])
-/// output[1] = FieldArray([b0, b1, b2, ..., bN])
-/// output[2] = FieldArray([c0, c1, c2, ..., cN])
-/// ...
-/// ```
-#[inline]
+#[inline(always)]
 pub fn unpack_array<const N: usize>(packed_data: &[PackedF; N], output: &mut [FieldArray<N>]) {
-    for (i, data) in packed_data.iter().enumerate().take(N) {
-        let unpacked_v = data.as_slice();
-        for j in 0..PackedF::WIDTH {
-            output[j][i] = unpacked_v[j];
+    // Optimized for cache locality: iterate over output lanes first
+    for j in 0..PackedF::WIDTH {
+        for i in 0..N {
+            output[j].0[i] = packed_data[i].as_slice()[j];
         }
     }
 }

+#[inline(always)]
+pub fn unpack_to_array<const N: usize>(
+    packed_data: [PackedF; N],
+) -> [FieldArray<N>; PackedF::WIDTH] {
+    array::from_fn(|j| FieldArray(array::from_fn(|i| packed_data[i].as_slice()[j])))
+}
+
+#[inline(always)]
+pub fn pack_column(col: [F; PackedF::WIDTH]) -> PackedF {
+    PackedF::from_fn(|i| col[i])
+}
+
+/// Pack contiguous FieldArrays directly into a destination slice at the given offset.
+///
+/// Packs `data[0..WIDTH]` into `dest[offset..offset+N]`.
+/// This avoids creating an intermediate `[PackedF; N]` array.
+///
+/// # Arguments
+/// * `dest` - Destination slice to pack into
+/// * `offset` - Starting index in `dest`
+/// * `data` - Source slice of FieldArrays (must have length >= WIDTH)
+#[inline(always)]
+pub fn pack_into<const N: usize>(dest: &mut [PackedF], offset: usize, data: &[FieldArray<N>]) {
+    for i in 0..N {
+        dest[offset + i] = PackedF::from_fn(|lane| data[lane][i]);
+    }
+}
+
+/// Pack even-indexed FieldArrays (stride 2) directly into destination.
+///
+/// Packs `data[0], data[2], data[4], ...` into `dest[offset..offset+N]`.
+/// Useful for packing left children from interleaved `[L0, R0, L1, R1, ...]` pairs.
+///
+/// # Arguments
+/// * `dest` - Destination slice to pack into
+/// * `offset` - Starting index in `dest`
+/// * `data` - Source slice of interleaved pairs (must have length >= 2 * WIDTH)
+#[inline(always)]
+pub fn pack_even_into<const N: usize>(dest: &mut [PackedF], offset: usize, data: &[FieldArray<N>]) {
+    for i in 0..N {
+        dest[offset + i] = PackedF::from_fn(|lane| data[2 * lane][i]);
+    }
+}
+
+/// Pack odd-indexed FieldArrays (stride 2) directly into destination.
+///
+/// Packs `data[1], data[3], data[5], ...` into `dest[offset..offset+N]`.
+/// Useful for packing right children from interleaved `[L0, R0, L1, R1, ...]` pairs.
+///
+/// # Arguments
+/// * `dest` - Destination slice to pack into
+/// * `offset` - Starting index in `dest`
+/// * `data` - Source slice of interleaved pairs (must have length >= 2 * WIDTH)
+#[inline(always)]
+pub fn pack_odd_into<const N: usize>(dest: &mut [PackedF], offset: usize, data: &[FieldArray<N>]) {
+    for i in 0..N {
+        dest[offset + i] = PackedF::from_fn(|lane| data[2 * lane + 1][i]);
+    }
+}
+
+/// Pack values generated by a function directly into destination.
+///
+/// For each element index `i` in `0..N`, generates a PackedF by calling
+/// `f(i, lane)` for each SIMD lane.
+///
+/// # Arguments
+/// * `dest` - Destination slice to pack into
+/// * `offset` - Starting index in `dest`
+/// * `f` - Function that takes (element_index, lane_index) and returns a field element
+#[inline(always)]
+pub fn pack_fn_into<const N: usize>(
+    dest: &mut [PackedF],
+    offset: usize,
+    f: impl Fn(usize, usize) -> F,
+) {
+    for i in 0..N {
+        dest[offset + i] = PackedF::from_fn(|lane| f(i, lane));
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use crate::F;
@@ -111,6 +175,24 @@ mod tests {
         }
     }

+    #[test]
+    fn test_unpack_to_array() {
+        // Create packed data
+        let packed: [PackedF; 2] = [
+            PackedF::from_fn(|i| F::from_u64(i as u64)),
+            PackedF::from_fn(|i| F::from_u64((i + 100) as u64)),
+        ];
+
+        // Unpack using the new function
+        let output = unpack_to_array(packed);
+
+        // Verify
+        for (lane, arr) in output.iter().enumerate() {
+            assert_eq!(arr[0], F::from_u64(lane as u64));
+            assert_eq!(arr[1], F::from_u64((lane + 100) as u64));
+        }
+    }
+
     #[test]
     fn test_pack_preserves_element_order() {
         // Create data where each array has sequential values
@@ -176,5 +258,111 @@ mod tests {
             // Verify roundtrip
             prop_assert_eq!(original, unpacked);
         }
+
+        #[test]
+        fn proptest_unpack_to_array_matches_unpack_array(
+            _seed in any::<u64>()
+        ) {
+            let mut rng = rand::rng();
+
+            // Generate random packed data
+            let packed: [PackedF; 8] = array::from_fn(|_| {
+                PackedF::from_fn(|_| rng.random())
+            });
+
+            // Unpack using both methods
+            let mut output1 = [FieldArray([F::ZERO; 8]); PackedF::WIDTH];
+            unpack_array(&packed, &mut output1);
+            let output2 = unpack_to_array(packed);
+
+            // Verify they match
+            prop_assert_eq!(output1, output2);
+        }
+
+        #[test]
+        fn proptest_pack_into_matches_pack_array(
+            _seed in any::<u64>()
+        ) {
+            let mut rng = rand::rng();
+
+            // Generate random data
+            let data: [FieldArray<7>; PackedF::WIDTH] = array::from_fn(|_| {
+                FieldArray(array::from_fn(|_| rng.random()))
+            });
+
+            // Pack using pack_array
+            let expected = pack_array(&data);
+
+            // Pack using pack_into
+            let mut dest = [PackedF::ZERO; 10];
+            pack_into(&mut dest, 2, &data);
+
+            // Verify they match at the offset
+            for i in 0..7 {
+                prop_assert_eq!(dest[2 + i], expected[i]);
+            }
+        }
+
+        #[test]
+        fn proptest_pack_even_odd_into(
+            _seed in any::<u64>()
+        ) {
+            let mut rng = rand::rng();
+
+            // Generate interleaved pairs: [L0, R0, L1, R1, ...]
+            let pairs: [FieldArray<5>; 2 * PackedF::WIDTH] = array::from_fn(|_| {
+                FieldArray(array::from_fn(|_| rng.random()))
+            });
+
+            // Pack even (left children) and odd (right children)
+            let mut dest = [PackedF::ZERO; 12];
+            pack_even_into(&mut dest, 1, &pairs);
+            pack_odd_into(&mut dest, 6, &pairs);
+
+            // Verify even indices were packed correctly
+            for i in 0..5 {
+                for lane in 0..PackedF::WIDTH {
+                    prop_assert_eq!(
+                        dest[1 + i].as_slice()[lane],
+                        pairs[2 * lane][i],
+                        "Even packing mismatch at element {}, lane {}", i, lane
+                    );
+                }
+            }
+
+            // Verify odd indices were packed correctly
+            for i in 0..5 {
+                for lane in 0..PackedF::WIDTH {
+                    prop_assert_eq!(
+                        dest[6 + i].as_slice()[lane],
+                        pairs[2 * lane + 1][i],
+                        "Odd packing mismatch at element {}, lane {}", i, lane
+                    );
+                }
+            }
+        }
+
+        #[test]
+        fn proptest_pack_fn_into(
+            _seed in any::<u64>()
+        ) {
+            // Pack using a function that generates predictable values
+            let mut dest = [PackedF::ZERO; 8];
+            pack_fn_into::<4>(&mut dest, 3, |elem_idx, lane_idx| {
+                F::from_u64((elem_idx * 100 + lane_idx) as u64)
+            });
+
+            // Verify
+            for i in 0..4 {
+                for lane in 0..PackedF::WIDTH {
+                    let expected = F::from_u64((i * 100 + lane) as u64);
+                    prop_assert_eq!(
+                        dest[3 + i].as_slice()[lane],
+                        expected,
+                        "pack_fn_into mismatch at element {}, lane {}", i, lane
+                    );
+                }
+            }
+        }
     }
 }
diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs
index b12d94b..eddec2f 100644
--- a/src/symmetric/tweak_hash/poseidon.rs
+++ b/src/symmetric/tweak_hash/poseidon.rs
@@ -9,7 +9,7 @@ use crate::TWEAK_SEPARATOR_FOR_TREE_HASH;
 use crate::array::FieldArray;
 use crate::poseidon2_16;
 use crate::poseidon2_24;
-use crate::simd_utils::{pack_array, unpack_array};
+use crate::simd_utils::{pack_array, pack_even_into, pack_fn_into, pack_odd_into, unpack_array};
 use crate::symmetric::prf::Pseudorandom;
 use crate::symmetric::tweak_hash::chain;
 use crate::{F, PackedF};
@@ -377,78 +377,71 @@ impl<
         parent_start: usize,
         children: &[Self::Domain],
     ) -> Vec<Self::Domain> {
-        // SIMD implementation specifically for Poseidon
         const WIDTH: usize = PackedF::WIDTH;

-        // Broadcast the hash parameter to all SIMD lanes.
-        // Each lane will use the same parameter
+        // Pre-allocate output vector
+        let output_len = children.len() / 2;
+        let mut parents = vec![FieldArray([F::ZERO; HASH_LEN]); output_len];
+
+        // Broadcast the hash parameter to all SIMD lanes (computed once)
         let packed_parameter: [PackedF; PARAMETER_LEN] =
-            array::from_fn(|i| PackedF::from(parameter[i]));
+            array::from_fn(|i| PackedF::from(parameter.0[i]));

-        // permutation to use for the compression. 24 as we merge two inputs
+        // Permutation for merging two inputs (width-24)
         let perm = poseidon2_24();

-        // preallocate a vector that can hold the SIMD part as well as any possible scalar remainder
-        let mut parents = Vec::with_capacity(children.len() / 2);
-        parents.par_extend(children.par_chunks_exact(2 * WIDTH).enumerate().flat_map(
-            |(i, children)| {
-                let parent_pos = (parent_start + i * WIDTH) as u32;
-                let packed_tweak = array::from_fn::<_, TWEAK_LEN, _>(|t_idx| {
-                    PackedF::from_fn(|lane| {
-                        let parent_pos_per_lane = parent_pos + (lane as u32);
-                        Self::tree_tweak(level, parent_pos_per_lane)
-                            .to_field_elements::<TWEAK_LEN>()[t_idx]
-                    })
-                });
+        // Offsets for assembling packed_input: [parameter | tweak | left | right]
+        let tweak_offset = PARAMETER_LEN;
+        let left_offset = PARAMETER_LEN + TWEAK_LEN;
+        let right_offset = PARAMETER_LEN + TWEAK_LEN + HASH_LEN;

-                // Assemble the packed input for the hash function.
-                // Layout: [parameter | tweak | left | right]
+        // Process SIMD batches with in-place mutation
+        parents
+            .par_chunks_exact_mut(WIDTH)
+            .zip(children.par_chunks_exact(2 * WIDTH))
+            .enumerate()
+            .for_each(|(chunk_idx, (parents_chunk, children_chunk))| {
+                let parent_pos = (parent_start + chunk_idx * WIDTH) as u32;
+
+                // Assemble packed input directly: [parameter | tweak | left | right]
                 let mut packed_input = [PackedF::ZERO; MERGE_COMPRESSION_WIDTH];
-                let mut current_pos = 0;

-                // Copy parameter into the input buffer.
-                packed_input[current_pos..current_pos + PARAMETER_LEN]
-                    .copy_from_slice(&packed_parameter);
-                current_pos += PARAMETER_LEN;
+                // Copy pre-packed parameter
+                packed_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter);

-                // Copy tweak into the input buffer.
-                packed_input[current_pos..current_pos + TWEAK_LEN].copy_from_slice(&packed_tweak);
-                current_pos += TWEAK_LEN;
+                // Pack tweaks directly into destination
+                pack_fn_into::<TWEAK_LEN>(&mut packed_input, tweak_offset, |t_idx, lane| {
+                    Self::tree_tweak(level, parent_pos + lane as u32)
+                        .to_field_elements::<TWEAK_LEN>()[t_idx]
+                });

-                // Copy the left child value into the input buffer.
-                let lefts: [FieldArray<HASH_LEN>; WIDTH] = array::from_fn(|k| children[2 * k]);
-                let packed_lefts = pack_array(&lefts);
-                packed_input[current_pos..current_pos + HASH_LEN].copy_from_slice(&packed_lefts);
-                current_pos += HASH_LEN;
+                // Pack left children (even indices) directly into destination
+                pack_even_into(&mut packed_input, left_offset, children_chunk);

-                // Copy the right child value into the input buffer.
-                let rights: [FieldArray<HASH_LEN>; WIDTH] = array::from_fn(|k| children[2 * k + 1]);
-                let packed_rights = pack_array(&rights);
-                packed_input[current_pos..current_pos + HASH_LEN].copy_from_slice(&packed_rights);
+                // Pack right children (odd indices) directly into destination
+                pack_odd_into(&mut packed_input, right_offset, children_chunk);

+                // Compress all WIDTH parent pairs simultaneously
                 let packed_parents =
                     poseidon_compress::<PackedF, MERGE_COMPRESSION_WIDTH, HASH_LEN>(
                         &perm,
                         &packed_input,
                     );

-                // unpack the parents from SIMD to scalar output
-                let mut parents = [FieldArray([F::ZERO; HASH_LEN]); WIDTH];
-                unpack_array(&packed_parents, &mut parents);
-
-                parents
-            },
-        ));
+                // Unpack directly to output slice
+                unpack_array(&packed_parents, parents_chunk);
+            });

-        // handle non-WIDTH leftover elements
-        let remainder = children.par_chunks_exact(2 * WIDTH).remainder();
+        // Handle remainder (elements that don't fill a complete SIMD batch)
+        let remainder_start = (children.len() / (2 * WIDTH)) * WIDTH;
+        let children_remainder = &children[remainder_start * 2..];
+        let parents_remainder = &mut parents[remainder_start..];

-        // TODO: parallel iterator here likely not worth it?
-        let num_simd_parents = parents.len();
-        parents.par_extend(remainder.par_chunks_exact(2).enumerate().map(|(i, pair)| {
-            let pos = parent_start + num_simd_parents + i;
-            Self::apply(parameter, &Self::tree_tweak(level, pos as u32), pair)
-        }));
+        for (i, pair) in children_remainder.chunks_exact(2).enumerate() {
+            let pos = parent_start + remainder_start + i;
+            parents_remainder[i] =
+                Self::apply(parameter, &Self::tree_tweak(level, pos as u32), pair);
+        }

         parents
     }
@@ -548,6 +541,10 @@ impl<
         // Cache strategy: process one chain at a time to maximize locality.
         // All epochs for that chain stay in registers across iterations.
+        // Offsets for chain compression: [parameter | tweak | current_value]
+        let chain_tweak_offset = PARAMETER_LEN;
+        let chain_value_offset = PARAMETER_LEN + TWEAK_LEN;
+
         for (chain_index, packed_chain) in
             packed_chains.iter_mut().enumerate().take(num_chains)
         {
@@ -557,32 +554,25 @@ impl<
                 // Current position in the chain.
                 let pos = (step + 1) as u8;

-                // Generate tweaks for all epochs in this SIMD batch.
-                // Each lane gets a tweak specific to its epoch.
-                let packed_tweak = array::from_fn::<_, TWEAK_LEN, _>(|t_idx| {
-                    PackedF::from_fn(|lane| {
-                        Self::chain_tweak(epoch_chunk[lane], chain_index as u8, pos)
-                            .to_field_elements::<TWEAK_LEN>()[t_idx]
-                    })
-                });
-
                 // Assemble the packed input for the hash function.
                 // Layout: [parameter | tweak | current_value]
                 let mut packed_input = [PackedF::ZERO; CHAIN_COMPRESSION_WIDTH];
-                let mut current_pos = 0;

-                // Copy parameter into the input buffer.
-                packed_input[current_pos..current_pos + PARAMETER_LEN]
-                    .copy_from_slice(&packed_parameter);
-                current_pos += PARAMETER_LEN;
+                // Copy pre-packed parameter
+                packed_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter);

-                // Copy tweak into the input buffer.
-                packed_input[current_pos..current_pos + TWEAK_LEN]
-                    .copy_from_slice(&packed_tweak);
-                current_pos += TWEAK_LEN;
+                // Pack tweaks directly into destination
+                pack_fn_into::<TWEAK_LEN>(
+                    &mut packed_input,
+                    chain_tweak_offset,
+                    |t_idx, lane| {
+                        Self::chain_tweak(epoch_chunk[lane], chain_index as u8, pos)
+                            .to_field_elements::<TWEAK_LEN>()[t_idx]
+                    },
+                );

-                // Copy current chain value into the input buffer.
-                packed_input[current_pos..current_pos + HASH_LEN]
+                // Copy current chain value (already packed)
+                packed_input[chain_value_offset..chain_value_offset + HASH_LEN]
                     .copy_from_slice(packed_chain);

                 // Apply the hash function to advance the chain.
@@ -602,23 +592,34 @@ impl<
         //
         // This uses the sponge construction for variable-length input.

-        // Generate tree tweaks for all epochs.
-        // Level 0 indicates this is a bottom-layer leaf in the tree.
-        let packed_tree_tweak = array::from_fn::<_, TWEAK_LEN, _>(|t_idx| {
-            PackedF::from_fn(|lane| {
+        // Assemble the sponge input.
+        // Layout: [parameter | tree_tweak | all_chain_ends]
+        let sponge_tweak_offset = PARAMETER_LEN;
+        let sponge_chains_offset = PARAMETER_LEN + TWEAK_LEN;
+        let sponge_input_len = PARAMETER_LEN + TWEAK_LEN + NUM_CHUNKS * HASH_LEN;
+
+        let mut packed_leaf_input = vec![PackedF::ZERO; sponge_input_len];
+
+        // Copy pre-packed parameter
+        packed_leaf_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter);
+
+        // Pack tree tweaks directly (level 0 for bottom-layer leaves)
+        pack_fn_into::<TWEAK_LEN>(
+            &mut packed_leaf_input,
+            sponge_tweak_offset,
+            |t_idx, lane| {
                 Self::tree_tweak(0, epoch_chunk[lane]).to_field_elements::<TWEAK_LEN>()
                     [t_idx]
-            })
-        });
+            },
+        );

-        // Assemble the sponge input.
-        // Layout: [parameter | tree_tweak | all_chain_ends]
-        let packed_leaf_input: Vec<_> = packed_parameter
-            .iter()
-            .chain(packed_tree_tweak.iter())
-            .chain(packed_chains.iter().flatten())
-            .copied()
-            .collect();
+        // Copy all chain ends (already packed)
+        for (c_idx, chain) in packed_chains.iter().enumerate() {
+            packed_leaf_input
+                [sponge_chains_offset + c_idx * HASH_LEN
+                    ..sponge_chains_offset + (c_idx + 1) * HASH_LEN]
+                .copy_from_slice(chain);
+        }

         // Apply the sponge hash to produce the leaf.
         // This absorbs all chain ends and squeezes out the final hash.
From c1dfc588dded2fd98d2d0a30fc7ea821c78088c8 Mon Sep 17 00:00:00 2001
From: Vindaar
Date: Tue, 16 Dec 2025 13:09:23 +0100
Subject: [PATCH 05/13] use `#[inline]` without (always)

---
 src/simd_utils.rs | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/simd_utils.rs b/src/simd_utils.rs
index 0b90940..da1afa0 100644
--- a/src/simd_utils.rs
+++ b/src/simd_utils.rs
@@ -26,7 +26,7 @@ use crate::{F, PackedF, array::FieldArray};
 ///
 /// This vertical packing enables efficient SIMD operations where a single instruction
 /// processes the same element position across multiple arrays simultaneously.
-#[inline(always)]
+#[inline]
 pub fn pack_array<const N: usize>(data: &[FieldArray<N>]) -> [PackedF; N] {
     array::from_fn(|i| PackedF::from_fn(|j| data[j][i]))
 }
@@ -37,7 +37,7 @@ pub fn pack_array<const N: usize>(data: &[FieldArray<N>]) -> [PackedF; N] {
 ///
 /// This is the inverse operation of `pack_array`. The output buffer must be preallocated
 /// with size `[WIDTH]` where `WIDTH = PackedF::WIDTH`, and each element is a `FieldArray`.
-#[inline(always)]
+#[inline]
 pub fn unpack_array<const N: usize>(packed_data: &[PackedF; N], output: &mut [FieldArray<N>]) {
     // Optimized for cache locality: iterate over output lanes first
@@ -84,7 +84,7 @@ pub fn pack_into<const N: usize>(dest: &mut [PackedF], offset: usize, data: &[FieldArray<N>]) {
 /// * `dest` - Destination slice to pack into
 /// * `offset` - Starting index in `dest`
 /// * `data` - Source slice of interleaved pairs (must have length >= 2 * WIDTH)
-#[inline(always)]
+#[inline]
 pub fn pack_even_into<const N: usize>(dest: &mut [PackedF], offset: usize, data: &[FieldArray<N>]) {
     for i in 0..N {
         dest[offset + i] = PackedF::from_fn(|lane| data[2 * lane][i]);
     }
 }
@@ -100,7 +100,7 @@ pub fn pack_even_into<const N: usize>(dest: &mut [PackedF], offset: usize, data:
 /// * `dest` - Destination slice to pack into
 /// * `offset` - Starting index in `dest`
 /// * `data` - Source slice of interleaved pairs (must have length >= 2 * WIDTH)
-#[inline(always)]
+#[inline]
 pub fn pack_odd_into<const N: usize>(dest: &mut [PackedF], offset: usize, data: &[FieldArray<N>]) {
     for i in 0..N {
         dest[offset + i] = PackedF::from_fn(|lane| data[2 * lane + 1][i]);
     }
 }
@@ -116,7 +116,7 @@ pub fn pack_odd_into<const N: usize>(dest: &mut [PackedF], offset: usize, data:
 /// * `dest` - Destination slice to pack into
 /// * `offset` - Starting index in `dest`
 /// * `f` - Function that takes (element_index, lane_index) and returns a field element
-#[inline(always)]
+#[inline]
 pub fn pack_fn_into<const N: usize>(
     dest: &mut [PackedF],
     offset: usize,

From 6a182fd3d1fb3ae3dc39a92f0bb921048236cd19 Mon Sep 17 00:00:00 2001
From: Vindaar
Date: Tue, 16 Dec 2025 13:10:07 +0100
Subject: [PATCH 06/13] remove `unpack_to_array`, `pack_column` and `pack_into` & their tests

---
 src/simd_utils.rs | 90 -----------------------------------------------
 1 file changed, 90 deletions(-)

diff --git a/src/simd_utils.rs b/src/simd_utils.rs
index da1afa0..550705b 100644
--- a/src/simd_utils.rs
+++ b/src/simd_utils.rs
@@ -47,34 +47,6 @@ pub fn unpack_array<const N: usize>(packed_data: &[PackedF; N], output: &mut [FieldArray<N>]) {
     }
 }

-#[inline(always)]
-pub fn unpack_to_array<const N: usize>(
-    packed_data: [PackedF; N],
-) -> [FieldArray<N>; PackedF::WIDTH] {
-    array::from_fn(|j| FieldArray(array::from_fn(|i| packed_data[i].as_slice()[j])))
-}
-
-#[inline(always)]
-pub fn pack_column(col: [F; PackedF::WIDTH]) -> PackedF {
-    PackedF::from_fn(|i| col[i])
-}
-
-/// Pack contiguous FieldArrays directly into a destination slice at the given offset.
-///
-/// Packs `data[0..WIDTH]` into `dest[offset..offset+N]`.
-/// This avoids creating an intermediate `[PackedF; N]` array.
-///
-/// # Arguments
-/// * `dest` - Destination slice to pack into
-/// * `offset` - Starting index in `dest`
-/// * `data` - Source slice of FieldArrays (must have length >= WIDTH)
-#[inline(always)]
-pub fn pack_into<const N: usize>(dest: &mut [PackedF], offset: usize, data: &[FieldArray<N>]) {
-    for i in 0..N {
-        dest[offset + i] = PackedF::from_fn(|lane| data[lane][i]);
-    }
-}
-
 /// Pack even-indexed FieldArrays (stride 2) directly into destination.
@@ -175,24 +147,6 @@ mod tests {
         }
     }

-    #[test]
-    fn test_unpack_to_array() {
-        // Create packed data
-        let packed: [PackedF; 2] = [
-            PackedF::from_fn(|i| F::from_u64(i as u64)),
-            PackedF::from_fn(|i| F::from_u64((i + 100) as u64)),
-        ];
-
-        // Unpack using the new function
-        let output = unpack_to_array(packed);
-
-        // Verify
-        for (lane, arr) in output.iter().enumerate() {
-            assert_eq!(arr[0], F::from_u64(lane as u64));
-            assert_eq!(arr[1], F::from_u64((lane + 100) as u64));
-        }
-    }
-
     #[test]
     fn test_pack_preserves_element_order() {
         // Create data where each array has sequential values
@@ -259,50 +213,6 @@ mod tests {
             prop_assert_eq!(original, unpacked);
         }

-        #[test]
-        fn proptest_unpack_to_array_matches_unpack_array(
-            _seed in any::<u64>()
-        ) {
-            let mut rng = rand::rng();
-
-            // Generate random packed data
-            let packed: [PackedF; 8] = array::from_fn(|_| {
-                PackedF::from_fn(|_| rng.random())
-            });
-
-            // Unpack using both methods
-            let mut output1 = [FieldArray([F::ZERO; 8]); PackedF::WIDTH];
-            unpack_array(&packed, &mut output1);
-            let output2 = unpack_to_array(packed);
-
-            // Verify they match
-            prop_assert_eq!(output1, output2);
-        }
-
-        #[test]
-        fn proptest_pack_into_matches_pack_array(
-            _seed in any::<u64>()
-        ) {
-            let mut rng = rand::rng();
-
-            // Generate random data
-            let data: [FieldArray<7>; PackedF::WIDTH] = array::from_fn(|_| {
-                FieldArray(array::from_fn(|_| rng.random()))
-            });
-
-            // Pack using pack_array
-            let expected = pack_array(&data);
-
-            // Pack using pack_into
-            let mut dest = [PackedF::ZERO; 10];
-            pack_into(&mut dest, 2, &data);
-
-            // Verify they match at the offset
-            for i in 0..7 {
-                prop_assert_eq!(dest[2 + i], expected[i]);
-            }
-        }
-

From 026bf3b2afb44c7c7a4b24f1910fe2dc10749563 Mon Sep 17 00:00:00 2001
From: Vindaar
Date: Tue, 16 Dec 2025 13:10:46 +0100
Subject: [PATCH 07/13] annotate `unpack_array` for loop with needless_range_loop

No, I do not prefer
```
for (j, <item>) in output.iter_mut().enumerate().take(PackedF::WIDTH){
```
thank you, Clippy.
---
 src/simd_utils.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/simd_utils.rs b/src/simd_utils.rs
index 550705b..c4d355f 100644
--- a/src/simd_utils.rs
+++ b/src/simd_utils.rs
@@ -40,6 +40,7 @@ pub fn unpack_array<const N: usize>(packed_data: &[PackedF; N], output: &mut [FieldArray<N>]) {
     // Optimized for cache locality: iterate over output lanes first
+    #[allow(clippy::needless_range_loop)]
     for j in 0..PackedF::WIDTH {

From 6faeaff43b7c66b0095e328c92d3fda191e78485 Mon Sep 17 00:00:00 2001
From: Vindaar
Date: Tue, 16 Dec 2025 13:17:02 +0100
Subject: [PATCH 08/13] put input / output layout doc comment for `unpack_array` back

Was removed in
https://github.com/tcoratger/leanSig/commit/e8a727381248df83f123f2a8c9a616d9bdb1f277
and I overlooked it while squashing a bunch of our commits.
---
 src/simd_utils.rs | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/simd_utils.rs b/src/simd_utils.rs
index c4d355f..a181a6d 100644
--- a/src/simd_utils.rs
+++ b/src/simd_utils.rs
@@ -37,6 +37,22 @@ pub fn pack_array<const N: usize>(data: &[FieldArray<N>]) -> [PackedF; N] {
 ///
 /// This is the inverse operation of `pack_array`. The output buffer must be preallocated
 /// with size `[WIDTH]` where `WIDTH = PackedF::WIDTH`, and each element is a `FieldArray`.
+///
+/// Input layout (vertical): each PackedF holds one element from each array
+/// ```text
+/// packed_data[0] = PackedF([a0, b0, c0, ...])
+/// packed_data[1] = PackedF([a1, b1, c1, ...])
+/// packed_data[2] = PackedF([a2, b2, c2, ...])
+/// ...
+/// ```
+///
+/// Output layout (horizontal): each FieldArray is one complete array
+/// ```text
+/// output[0] = FieldArray([a0, a1, a2, ..., aN])
+/// output[1] = FieldArray([b0, b1, b2, ..., bN])
+/// output[2] = FieldArray([c0, c1, c2, ..., cN])
+/// ...
+/// ```
 #[inline]
 pub fn unpack_array<const N: usize>(packed_data: &[PackedF; N], output: &mut [FieldArray<N>]) {
     // Optimized for cache locality: iterate over output lanes first

From d06234767a655a835865f5874c47ec0bd0daaae5 Mon Sep 17 00:00:00 2001
From: Vindaar
Date: Tue, 16 Dec 2025 13:24:50 +0100
Subject: [PATCH 09/13] remove comment about test section

---
 src/symmetric/tweak_hash/poseidon.rs | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs
index eddec2f..265c1cd 100644
--- a/src/symmetric/tweak_hash/poseidon.rs
+++ b/src/symmetric/tweak_hash/poseidon.rs
@@ -1371,8 +1371,6 @@ mod tests {
         }
     }

-    // ==================== compute_tree_layer tests ====================
-
     /// Scalar reference implementation for compute_tree_layer.
     /// Used to verify the SIMD implementation produces correct results.

From 83a8a3c348db261645b28b04db9cfd3c38fb4131 Mon Sep 17 00:00:00 2001
From: Vindaar
Date: Tue, 16 Dec 2025 13:25:01 +0100
Subject: [PATCH 10/13] use clippy too_many_lines for `compute_tree_leaves`

---
 src/symmetric/tweak_hash/poseidon.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs
index 265c1cd..026779f 100644
--- a/src/symmetric/tweak_hash/poseidon.rs
+++ b/src/symmetric/tweak_hash/poseidon.rs
@@ -446,6 +446,7 @@ impl<
         parents
     }

+    #[allow(clippy::too_many_lines)]
     fn compute_tree_leaves(
         prf_key: &PRF::Key,
         parameter: &Self::Parameter,

From 755292bc469c0f0afcadacf971ae158efeed362f Mon Sep 17 00:00:00 2001
From: Vindaar
Date: Tue, 16 Dec 2025 13:59:27 +0100
Subject: [PATCH 11/13] use chunked iterator for assignment of packed chains

---
 src/symmetric/tweak_hash/poseidon.rs | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs
index 026779f..eefd700 100644
--- a/src/symmetric/tweak_hash/poseidon.rs
+++ b/src/symmetric/tweak_hash/poseidon.rs
@@ -615,11 +615,9 @@ impl<
         );

         // Copy all chain ends (already packed)
-        for (c_idx, chain) in packed_chains.iter().enumerate() {
-            packed_leaf_input
-                [sponge_chains_offset + c_idx * HASH_LEN
-                    ..sponge_chains_offset + (c_idx + 1) * HASH_LEN]
-                .copy_from_slice(chain);
+        let dst = &mut packed_leaf_input[sponge_chains_offset ..
sponge_chains_offset + packed_chains.len() * HASH_LEN]; + for (dst_chunk, src_chain) in dst.chunks_exact_mut(HASH_LEN).zip(packed_chains.iter()) { + dst_chunk.copy_from_slice(src_chain); } // Apply the sponge hash to produce the leaf. From 99cbaf5c923bffe0652cad932fad7be471d76390 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Wed, 17 Dec 2025 12:17:49 +0100 Subject: [PATCH 12/13] fix `level` passed to `tree_tweak` in default impl --- src/symmetric/tweak_hash.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/symmetric/tweak_hash.rs b/src/symmetric/tweak_hash.rs index 61426b8..8f98f2f 100644 --- a/src/symmetric/tweak_hash.rs +++ b/src/symmetric/tweak_hash.rs @@ -63,11 +63,7 @@ pub trait TweakableHash { // Parent index in this layer let parent_pos = (parent_start + i) as u32; // Hash children into their parent using the tweak - Self::apply( - parameter, - &Self::tree_tweak(level + 1, parent_pos), - children, - ) + Self::apply(parameter, &Self::tree_tweak(level, parent_pos), children) }) .collect() } From 9571be327fd7542e7b5ce11f6b80591d77b7e38d Mon Sep 17 00:00:00 2001 From: Vindaar Date: Wed, 17 Dec 2025 12:58:03 +0100 Subject: [PATCH 13/13] add doc comments for compute_tree_layer and poseidon override --- src/symmetric/tweak_hash.rs | 21 ++++++++++++++++++++- src/symmetric/tweak_hash/poseidon.rs | 4 ++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/src/symmetric/tweak_hash.rs b/src/symmetric/tweak_hash.rs index 8f98f2f..ff032bf 100644 --- a/src/symmetric/tweak_hash.rs +++ b/src/symmetric/tweak_hash.rs @@ -48,7 +48,26 @@ pub trait TweakableHash { message: &[Self::Domain], ) -> Self::Domain; - /// Applies the calculation for a single tweak hash tree layer. + /// Computes one layer of a Merkle tree by hashing pairs of children into parents. + /// + /// Consecutive pairs of child nodes produce their parent node by hashing + /// `(children[2*i], children[2*i+1])`. Each hash application uses a unique + /// tweak derived from the tree level and position. + /// + /// # Arguments + /// * `parameter` - Public parameter for the hash function + /// * `level` - Tree level of the *parent* nodes being computed. NOTE: callers + /// need to pass `level + 1` where `level` is the children's level, since + /// tree levels are numbered from leaves (level 0) upward. + /// * `parent_start` - Starting index of the first parent in this layer, used + /// for computing position-dependent tweaks + /// * `children` - Slice of child nodes to hash pairwise (length must be even) + /// + /// # Returns + /// A vector of parent nodes with length `children.len() / 2`. + /// + /// This default implementation processes pairs in parallel using Rayon. + /// The Poseidon implementation overrides this with a SIMD-accelerated variant. fn compute_tree_layer( parameter: &Self::Parameter, level: u8, diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs index eefd700..c257297 100644 --- a/src/symmetric/tweak_hash/poseidon.rs +++ b/src/symmetric/tweak_hash/poseidon.rs @@ -371,6 +371,10 @@ impl< } } + /// SIMD-accelerated computation of one Merkle tree layer. + /// + /// Processes `PackedF::WIDTH` parent pairs simultaneously using SIMD instructions, + /// with a scalar fallback for any remainder elements. fn compute_tree_layer( parameter: &Self::Parameter, level: u8,
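
The core trick running through this series is the vertical ("structure-of-arrays") packing: element `i` of `WIDTH` different hashes sits in one SIMD register, so a single packed Poseidon permutation advances `WIDTH` chains or tree nodes at once. Below is a minimal standalone sketch of that transpose — it models `PackedF` as a plain `[u64; WIDTH]` with an assumed `WIDTH = 4` and `u64` standing in for the field type; none of the crate's real types or Poseidon calls are used here.

```rust
use std::array;

const WIDTH: usize = 4; // stand-in for PackedF::WIDTH
type Packed = [u64; WIDTH]; // stand-in for PackedF: one element position across WIDTH inputs

/// Horizontal -> vertical: data[lane][i] becomes packed[i][lane] (cf. pack_array).
fn pack_array<const N: usize>(data: &[[u64; N]]) -> [Packed; N] {
    array::from_fn(|i| array::from_fn(|lane| data[lane][i]))
}

/// Vertical -> horizontal: the inverse transpose (cf. unpack_array).
fn unpack_array<const N: usize>(packed: &[Packed; N], output: &mut [[u64; N]]) {
    for lane in 0..WIDTH {
        for i in 0..N {
            output[lane][i] = packed[i][lane];
        }
    }
}

fn main() {
    // Four 3-element "hashes", one per SIMD lane.
    let data: Vec<[u64; 3]> = (0..WIDTH as u64)
        .map(|lane| [10 * lane, 10 * lane + 1, 10 * lane + 2])
        .collect();

    let packed = pack_array::<3>(&data);
    // packed[0] holds element 0 of every input, so one SIMD op touches all lanes.
    assert_eq!(packed[0], [0, 10, 20, 30]);

    let mut round_trip = vec![[0u64; 3]; WIDTH];
    unpack_array(&packed, &mut round_trip);
    assert_eq!(round_trip, data); // packing is a lossless transpose
}
```

`pack_even_into`/`pack_odd_into` from the patches are this same transpose restricted to the even or odd positions of an interleaved `[L0, R0, L1, R1, ...]` child slice, and `pack_fn_into` generates each lane's value on the fly (e.g. per-lane tweaks) instead of reading it from memory.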