Merged
Changes from 4 commits
234 changes: 211 additions & 23 deletions src/simd_utils.rs
@@ -2,7 +2,7 @@

use p3_field::PackedValue;

use crate::{PackedF, array::FieldArray};
use crate::{F, PackedF, array::FieldArray};

/// Packs scalar arrays into SIMD-friendly vertical layout.
///
@@ -26,7 +26,7 @@
///
/// This vertical packing enables efficient SIMD operations where a single instruction
/// processes the same element position across multiple arrays simultaneously.
#[inline]
#[inline(always)]

Check failure on line 29 in src/simd_utils.rs (GitHub Actions / Clippy): you have declared `#[inline(always)]` on `pack_array`. This is usually a bad idea
Contributor:

Here I think that with clippy you can remove the `always`; it should not make a big difference (at least from my personal experiments locally, since I've just tried removing it).

pub fn pack_array<const N: usize>(data: &[FieldArray<N>]) -> [PackedF; N] {
array::from_fn(|i| PackedF::from_fn(|j| data[j][i]))
}
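
As a quick illustration of the vertical layout described above, a minimal sketch in the style of the tests further down (it assumes the crate's `F`, `PackedF`, and `FieldArray` types and the `F::from_u64` constructor used in those tests; the concrete values are illustrative only):

```rust
// Sketch: WIDTH arrays of N = 2 elements become 2 packed values.
let data: Vec<FieldArray<2>> = (0..PackedF::WIDTH)
    .map(|lane| FieldArray([F::from_u64(lane as u64), F::from_u64(100 + lane as u64)]))
    .collect();
let packed = pack_array::<2>(&data);
// packed[i] gathers element i across all input arrays: lane j of packed[i] == data[j][i].
for lane in 0..PackedF::WIDTH {
    assert_eq!(packed[0].as_slice()[lane], F::from_u64(lane as u64));
    assert_eq!(packed[1].as_slice()[lane], F::from_u64(100 + lane as u64));
}
```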
@@ -37,32 +37,96 @@
///
/// This is the inverse operation of `pack_array`. The output buffer must be preallocated
/// with size `[WIDTH]` where `WIDTH = PackedF::WIDTH`, and each element is a `FieldArray<N>`.
///
/// Input layout (vertical): each PackedF holds one element from each array
/// ```text
/// packed_data[0] = PackedF([a0, b0, c0, ...])
/// packed_data[1] = PackedF([a1, b1, c1, ...])
/// packed_data[2] = PackedF([a2, b2, c2, ...])
/// ...
/// ```
///
/// Output layout (horizontal): each FieldArray is one complete array
/// ```text
/// output[0] = FieldArray([a0, a1, a2, ..., aN])
/// output[1] = FieldArray([b0, b1, b2, ..., bN])
/// output[2] = FieldArray([c0, c1, c2, ..., cN])
/// ...
/// ```
#[inline]
#[inline(always)]

Check failure on line 40 in src/simd_utils.rs (GitHub Actions / Clippy): you have declared `#[inline(always)]` on `unpack_array`. This is usually a bad idea
pub fn unpack_array<const N: usize>(packed_data: &[PackedF; N], output: &mut [FieldArray<N>]) {
for (i, data) in packed_data.iter().enumerate().take(N) {
let unpacked_v = data.as_slice();
for j in 0..PackedF::WIDTH {
output[j][i] = unpacked_v[j];
// Optimized for cache locality: iterate over output lanes first
for j in 0..PackedF::WIDTH {

Check failure on line 43 in src/simd_utils.rs (GitHub Actions / Clippy): the loop variable `j` is used to index `output`
for i in 0..N {

Check failure on line 44 in src/simd_utils.rs (GitHub Actions / Clippy): the loop variable `i` is used to index `packed_data`
output[j].0[i] = packed_data[i].as_slice()[j];
}
}
}
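
Continuing the `pack_array` sketch above, a hedged sketch of the inverse direction (same assumed types; `FieldArray` is assumed to be `Clone`/`PartialEq` as the existing tests suggest):

```rust
// Sketch: unpack back into a preallocated buffer of WIDTH FieldArrays.
let mut unpacked = vec![FieldArray([F::ZERO; 2]); PackedF::WIDTH];
unpack_array(&packed, &mut unpacked);
// Round trip: every lane recovers its original array.
assert_eq!(unpacked, data);
```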

#[inline(always)]

Check failure on line 50 in src/simd_utils.rs (GitHub Actions / Clippy): you have declared `#[inline(always)]` on `unpack_to_array`. This is usually a bad idea
pub fn unpack_to_array<const N: usize>(

Check failure on line 51 in src/simd_utils.rs (GitHub Actions / Clippy): function `unpack_to_array` is never used
packed_data: [PackedF; N],
) -> [FieldArray<N>; PackedF::WIDTH] {
array::from_fn(|j| FieldArray(array::from_fn(|i| packed_data[i].as_slice()[j])))
}

#[inline(always)]

Check failure on line 57 in src/simd_utils.rs (GitHub Actions / Clippy): you have declared `#[inline(always)]` on `pack_column`. This is usually a bad idea
pub fn pack_column(col: [F; PackedF::WIDTH]) -> PackedF {

Check failure on line 58 in src/simd_utils.rs (GitHub Actions / Clippy): function `pack_column` is never used
PackedF::from_fn(|i| col[i])
}

/// Pack contiguous FieldArrays directly into a destination slice at the given offset.
///
/// Packs `data[0..WIDTH]` into `dest[offset..offset+N]`.
/// This avoids creating an intermediate `[PackedF; N]` array.
///
/// # Arguments
/// * `dest` - Destination slice to pack into
/// * `offset` - Starting index in `dest`
/// * `data` - Source slice of FieldArrays (must have length >= WIDTH)
#[inline(always)]

Check failure on line 71 in src/simd_utils.rs (GitHub Actions / Clippy): you have declared `#[inline(always)]` on `pack_into`. This is usually a bad idea
pub fn pack_into<const N: usize>(dest: &mut [PackedF], offset: usize, data: &[FieldArray<N>]) {

Check failure on line 72 in src/simd_utils.rs (GitHub Actions / Clippy): function `pack_into` is never used
for i in 0..N {
dest[offset + i] = PackedF::from_fn(|lane| data[lane][i]);
}
}
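
A short hedged sketch of how this variant writes into a wider destination buffer (mirroring the proptest added below; it assumes the crate's field type `F` is `Copy`, as the existing tests imply):

```rust
// Sketch: pack WIDTH arrays of N = 3 elements into dest[4..7], leaving the rest untouched.
let data: Vec<FieldArray<3>> = (0..PackedF::WIDTH)
    .map(|lane| FieldArray([F::from_u64(lane as u64); 3]))
    .collect();
let mut dest = vec![PackedF::ZERO; 16];
pack_into::<3>(&mut dest, 4, &data);
for lane in 0..PackedF::WIDTH {
    // dest[4] holds element 0 of every lane.
    assert_eq!(dest[4].as_slice()[lane], F::from_u64(lane as u64));
}
```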

/// Pack even-indexed FieldArrays (stride 2) directly into destination.
///
/// Packs `data[0], data[2], data[4], ...` into `dest[offset..offset+N]`.
/// Useful for packing left children from interleaved `[L0, R0, L1, R1, ...]` pairs.
///
/// # Arguments
/// * `dest` - Destination slice to pack into
/// * `offset` - Starting index in `dest`
/// * `data` - Source slice of interleaved pairs (must have length >= 2 * WIDTH)
#[inline(always)]
pub fn pack_even_into<const N: usize>(dest: &mut [PackedF], offset: usize, data: &[FieldArray<N>]) {
for i in 0..N {
dest[offset + i] = PackedF::from_fn(|lane| data[2 * lane][i]);
}
}

/// Pack odd-indexed FieldArrays (stride 2) directly into destination.
///
/// Packs `data[1], data[3], data[5], ...` into `dest[offset..offset+N]`.
/// Useful for packing right children from interleaved `[L0, R0, L1, R1, ...]` pairs.
///
/// # Arguments
/// * `dest` - Destination slice to pack into
/// * `offset` - Starting index in `dest`
/// * `data` - Source slice of interleaved pairs (must have length >= 2 * WIDTH)
#[inline(always)]
pub fn pack_odd_into<const N: usize>(dest: &mut [PackedF], offset: usize, data: &[FieldArray<N>]) {
for i in 0..N {
dest[offset + i] = PackedF::from_fn(|lane| data[2 * lane + 1][i]);
}
}
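
A hedged sketch of the intended interleaved-pair usage for both strided variants (left/right children of a tree layer, as the doc comments describe; the layout and values below are illustrative only):

```rust
// Sketch: 2 * WIDTH interleaved nodes [L0, R0, L1, R1, ...], each a FieldArray<1>.
let pairs: Vec<FieldArray<1>> = (0..2 * PackedF::WIDTH)
    .map(|k| FieldArray([F::from_u64(k as u64)]))
    .collect();
let mut dest = vec![PackedF::ZERO; 2];
pack_even_into::<1>(&mut dest, 0, &pairs); // dest[0]: lanes = L0, L1, L2, ...
pack_odd_into::<1>(&mut dest, 1, &pairs);  // dest[1]: lanes = R0, R1, R2, ...
for lane in 0..PackedF::WIDTH {
    assert_eq!(dest[0].as_slice()[lane], F::from_u64(2 * lane as u64));     // left child of pair `lane`
    assert_eq!(dest[1].as_slice()[lane], F::from_u64(2 * lane as u64 + 1)); // right child of pair `lane`
}
```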

/// Pack values generated by a function directly into destination.
///
/// For each element index `i` in `0..N`, generates a PackedF by calling
/// `f(i, lane)` for each SIMD lane.
///
/// # Arguments
/// * `dest` - Destination slice to pack into
/// * `offset` - Starting index in `dest`
/// * `f` - Function that takes (element_index, lane_index) and returns a field element
#[inline(always)]
pub fn pack_fn_into<const N: usize>(
dest: &mut [PackedF],
offset: usize,
f: impl Fn(usize, usize) -> F,
) {
for i in 0..N {
dest[offset + i] = PackedF::from_fn(|lane| f(i, lane));
}
}
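
A minimal sketch of the function-driven variant; the only point demonstrated is the `(element_index, lane_index) -> F` closure shape, with arbitrary values:

```rust
// Sketch: generate lanes on the fly instead of reading from a slice.
let mut dest = vec![PackedF::ZERO; 2];
pack_fn_into::<2>(&mut dest, 0, |i, lane| F::from_u64((10 * i + lane) as u64));
for lane in 0..PackedF::WIDTH {
    // dest[1] holds element index 1, so lane `lane` is f(1, lane) = 10 + lane.
    assert_eq!(dest[1].as_slice()[lane], F::from_u64((10 + lane) as u64));
}
```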

#[cfg(test)]
mod tests {
use crate::F;
@@ -111,6 +175,24 @@
}
}

#[test]
fn test_unpack_to_array() {
// Create packed data
let packed: [PackedF; 2] = [
PackedF::from_fn(|i| F::from_u64(i as u64)),
PackedF::from_fn(|i| F::from_u64((i + 100) as u64)),
];

// Unpack using the new function
let output = unpack_to_array(packed);

// Verify
for (lane, arr) in output.iter().enumerate() {
assert_eq!(arr[0], F::from_u64(lane as u64));
assert_eq!(arr[1], F::from_u64((lane + 100) as u64));
}
}

#[test]
fn test_pack_preserves_element_order() {
// Create data where each array has sequential values
@@ -176,5 +258,111 @@
// Verify roundtrip
prop_assert_eq!(original, unpacked);
}

#[test]
fn proptest_unpack_to_array_matches_unpack_array(
_seed in any::<u64>()
) {
let mut rng = rand::rng();

// Generate random packed data
let packed: [PackedF; 8] = array::from_fn(|_| {
PackedF::from_fn(|_| rng.random())
});

// Unpack using both methods
let mut output1 = [FieldArray([F::ZERO; 8]); PackedF::WIDTH];
unpack_array(&packed, &mut output1);
let output2 = unpack_to_array(packed);

// Verify they match
prop_assert_eq!(output1, output2);
}

#[test]
fn proptest_pack_into_matches_pack_array(
_seed in any::<u64>()
) {
let mut rng = rand::rng();

// Generate random data
let data: [FieldArray<7>; PackedF::WIDTH] = array::from_fn(|_| {
FieldArray(array::from_fn(|_| rng.random()))
});

// Pack using pack_array
let expected = pack_array(&data);

// Pack using pack_into
let mut dest = [PackedF::ZERO; 10];
pack_into(&mut dest, 2, &data);

// Verify they match at the offset
for i in 0..7 {
prop_assert_eq!(dest[2 + i], expected[i]);
}
}

#[test]
fn proptest_pack_even_odd_into(
_seed in any::<u64>()
) {
let mut rng = rand::rng();

// Generate interleaved pairs: [L0, R0, L1, R1, ...]
let pairs: [FieldArray<5>; 2 * PackedF::WIDTH] = array::from_fn(|_| {
FieldArray(array::from_fn(|_| rng.random()))
});

// Pack even (left children) and odd (right children)
let mut dest = [PackedF::ZERO; 12];
pack_even_into(&mut dest, 1, &pairs);
pack_odd_into(&mut dest, 6, &pairs);

// Verify even indices were packed correctly
for i in 0..5 {
for lane in 0..PackedF::WIDTH {
prop_assert_eq!(
dest[1 + i].as_slice()[lane],
pairs[2 * lane][i],
"Even packing mismatch at element {}, lane {}", i, lane
);
}
}

// Verify odd indices were packed correctly
for i in 0..5 {
for lane in 0..PackedF::WIDTH {
prop_assert_eq!(
dest[6 + i].as_slice()[lane],
pairs[2 * lane + 1][i],
"Odd packing mismatch at element {}, lane {}", i, lane
);
}
}
}

#[test]
fn proptest_pack_fn_into(
_seed in any::<u64>()
) {
// Pack using a function that generates predictable values
let mut dest = [PackedF::ZERO; 8];
pack_fn_into::<4>(&mut dest, 3, |elem_idx, lane_idx| {
F::from_u64((elem_idx * 100 + lane_idx) as u64)
});

// Verify
for i in 0..4 {
for lane in 0..PackedF::WIDTH {
let expected = F::from_u64((i * 100 + lane) as u64);
prop_assert_eq!(
dest[3 + i].as_slice()[lane],
expected,
"pack_fn_into mismatch at element {}, lane {}", i, lane
);
}
}
}
}
}
26 changes: 26 additions & 0 deletions src/symmetric/tweak_hash.rs
@@ -1,5 +1,7 @@
use rand::Rng;

use rayon::prelude::*;

use crate::serialization::Serializable;
use crate::symmetric::prf::Pseudorandom;

@@ -46,6 +48,30 @@ pub trait TweakableHash {
message: &[Self::Domain],
) -> Self::Domain;

/// Applies the calculation for a single tweak hash tree layer.
fn compute_tree_layer(
parameter: &Self::Parameter,
level: u8,
parent_start: usize,
children: &[Self::Domain],
) -> Vec<Self::Domain> {
// default implementation is scalar. tweak_tree/poseidon.rs provides a SIMD variant
children
.par_chunks_exact(2)
.enumerate()
.map(|(i, children)| {
// Parent index in this layer
let parent_pos = (parent_start + i) as u32;
// Hash children into their parent using the tweak
Self::apply(
parameter,
&Self::tree_tweak(level + 1, parent_pos),
Contributor:

Is this `level + 1` correct? If I compare with the old code and unpack the function call, this would mean that we now add 1 to the level twice, whereas we did so only once in the old code. Maybe this did not show up in tests because this default implementation is not used. Can you add a test that also checks this?

Contributor:

Just checked, and above you seem to have just `level` in the scalar implementation that you use for testing.

Contributor Author:

Oh, great catch! Yeah, I copied the code for the default implementation before I decided to just change the API to pass level + 1 directly and didn't update the default.

children,
)
})
.collect()
}
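
Following the thread above, a hedged sketch of what the corrected default could look like if, as the author states, callers already pass the parent level into `compute_tree_layer`; this illustrates the reviewers' point and is not the merged code:

```rust
fn compute_tree_layer(
    parameter: &Self::Parameter,
    level: u8, // assumed to already be the parent level (callers pass level + 1)
    parent_start: usize,
    children: &[Self::Domain],
) -> Vec<Self::Domain> {
    children
        .par_chunks_exact(2)
        .enumerate()
        .map(|(i, children)| {
            // Parent index in this layer
            let parent_pos = (parent_start + i) as u32;
            // No extra increment here: `level` is already the parent's level.
            Self::apply(parameter, &Self::tree_tweak(level, parent_pos), children)
        })
        .collect()
}
```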

/// Computes bottom tree leaves by walking hash chains for multiple epochs.
///
/// This method has a default scalar implementation that processes epochs in parallel.