diff --git a/.github/workflows/rust-lint.yml b/.github/workflows/rust-lint.yml
index ed514af0..14b944e9 100644
--- a/.github/workflows/rust-lint.yml
+++ b/.github/workflows/rust-lint.yml
@@ -10,6 +10,13 @@ jobs:
       - uses: actions/checkout@v3
       - uses: dtolnay/rust-toolchain@stable
 
+      # Note: This is a workaround for an issue that recently started appearing in
+      # lint checks; I'm not yet sure whether it's caused by GitHub Actions updating
+      # something behind the scenes:
+      #   error: 'cargo-fmt' is not installed for the toolchain 'stable-x86_64-unknown-linux-gnu'
+      - name: Install rustfmt
+        run: rustup component add rustfmt clippy
+
       - name: Install tools
         run: |
           cargo install cargo-deny
diff --git a/CHANGELOG.md b/CHANGELOG.md
index b7a4f75f..62d14e01 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,7 +15,39 @@ The format is based on Keep a Changelog and this project adheres to
 ### Migration
 - If there are breaking changes, put a short, actionable checklist here.
 
-## [0.14.0-alpha] - 2024-09-08
+---
+
+## [0.15.0-alpha] - 2025-09-25
+### Breaking
+- Default payload alignment increased from 16 bytes to 64 bytes to ensure
+  SIMD- and cacheline-safe zero-copy access across SSE/AVX/AVX-512 code
+  paths. Readers/writers compiled with `<= 0.14.x-alpha` that assume
+  16-byte alignment will not be able to parse 0.15.x stores correctly.
+
+### Added
+- Debug/test-only assertions (`debug_assert_aligned`,
+  `debug_assert_aligned_offset`) to validate both pointer- and
+  offset-level alignment invariants.
+
+### Changed
+- Updated documentation and examples to reflect the new 64-byte default
+  `PAYLOAD_ALIGNMENT` (still configurable in
+  `simd-r-drive-entry-handle/src/constants.rs`).
+- `EntryHandle::as_arrow_buffer` and `into_arrow_buffer` now check both
+  pointer and offset alignment when compiled in test or debug mode.
+
+### Migration
+- Stores created with 0.15.x are not backward-compatible with
+  0.14.x readers/writers due to the alignment change.
+- To migrate:
+  1. Read entries with your existing 0.14.x binary.
+  2. Rewrite them into a fresh 0.15.x store (which applies 64-byte
+     alignment), as sketched below.
+  3. Deploy upgraded readers before upgrading writers in multi-service
+     environments.
+
+---
+
+## [0.14.0-alpha] - 2025-09-08
 ### Breaking
 - Files written by 0.14.0-alpha use padded payload starts for fixed alignment.
   Older readers (<= 0.13.x-alpha) may misinterpret pre-pad bytes as part of the
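For step 2 of the migration, the rewrite loop can be scripted roughly as follows. This is a minimal sketch, not shipped tooling: `DataStore::open` and the `read` accessor are assumed API names, the key list must come from your application, and in practice the read side runs in a 0.14.x binary while the write side runs in a 0.15.x binary (a single process cannot link both crate versions, so this one-function form is for illustration only).

```rust
use simd_r_drive::DataStore;
use std::path::Path;

/// Re-write every known entry from an old store into a fresh store so the
/// new file gets 64-byte aligned payload starts. Illustrative only.
fn migrate(old: &Path, new: &Path, keys: &[Vec<u8>]) {
    let old_store = DataStore::open(old).expect("open existing 0.14.x store");
    let new_store = DataStore::open(new).expect("create fresh 0.15.x store");

    for key in keys {
        // Zero-copy read from the old store...
        if let Ok(Some(entry)) = old_store.read(key) {
            // ...appended to the new store at a 64-byte aligned offset.
            new_store.write(key, entry.as_slice()).expect("rewrite entry");
        }
    }
}
```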
diff --git a/Cargo.lock b/Cargo.lock
index c7d68453..de3d18f1 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1819,7 +1819,7 @@ dependencies = [
 
 [[package]]
 name = "simd-r-drive"
-version = "0.14.0-alpha"
+version = "0.15.0-alpha"
 dependencies = [
  "async-trait",
  "bincode",
@@ -1847,7 +1847,7 @@ dependencies = [
 
 [[package]]
 name = "simd-r-drive-entry-handle"
-version = "0.14.0-alpha"
+version = "0.15.0-alpha"
 dependencies = [
  "arrow",
  "crc32fast",
@@ -1856,7 +1856,7 @@ dependencies = [
 
 [[package]]
 name = "simd-r-drive-extensions"
-version = "0.14.0-alpha"
+version = "0.15.0-alpha"
 dependencies = [
  "bincode",
  "doc-comment",
@@ -1868,7 +1868,7 @@ dependencies = [
 
 [[package]]
 name = "simd-r-drive-muxio-service-definition"
-version = "0.14.0-alpha"
+version = "0.15.0-alpha"
 dependencies = [
  "bitcode",
  "muxio-rpc-service",
@@ -1876,7 +1876,7 @@ dependencies = [
 
 [[package]]
 name = "simd-r-drive-ws-client"
-version = "0.14.0-alpha"
+version = "0.15.0-alpha"
 dependencies = [
  "async-trait",
  "muxio-rpc-service",
@@ -1890,7 +1890,7 @@ dependencies = [
 
 [[package]]
 name = "simd-r-drive-ws-server"
-version = "0.14.0-alpha"
+version = "0.15.0-alpha"
 dependencies = [
  "clap",
  "indoc",
@@ -2532,3 +2532,19 @@ dependencies = [
  "quote",
  "syn",
 ]
+
+[[patch.unused]]
+name = "muxio-rpc-service"
+version = "0.10.0-alpha"
+
+[[patch.unused]]
+name = "muxio-rpc-service-caller"
+version = "0.10.0-alpha"
+
+[[patch.unused]]
+name = "muxio-tokio-rpc-client"
+version = "0.10.0-alpha"
+
+[[patch.unused]]
+name = "muxio-tokio-rpc-server"
+version = "0.10.0-alpha"
diff --git a/Cargo.toml b/Cargo.toml
index 5229a819..797b617c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [workspace.package]
 authors = ["Jeremy Harris "]
-version = "0.14.0-alpha"
+version = "0.15.0-alpha"
 edition = "2024"
 repository = "https://github.com/jzombie/rust-simd-r-drive"
 license = "Apache-2.0"
@@ -79,10 +79,10 @@ resolver = "2"
 
 [workspace.dependencies]
 # Intra-workspace crates
-simd-r-drive = { path = ".", version = "0.14.0-alpha" }
-simd-r-drive-entry-handle = { path = "./simd-r-drive-entry-handle", version = "0.14.0-alpha" }
-simd-r-drive-ws-client = { path = "./experiments/simd-r-drive-ws-client", version = "0.14.0-alpha" }
-simd-r-drive-muxio-service-definition = { path = "./experiments/simd-r-drive-muxio-service-definition", version = "0.14.0-alpha" }
+simd-r-drive = { path = ".", version = "0.15.0-alpha" }
+simd-r-drive-entry-handle = { path = "./simd-r-drive-entry-handle", version = "0.15.0-alpha" }
+simd-r-drive-ws-client = { path = "./experiments/simd-r-drive-ws-client", version = "0.15.0-alpha" }
+simd-r-drive-muxio-service-definition = { path = "./experiments/simd-r-drive-muxio-service-definition", version = "0.15.0-alpha" }
 muxio-tokio-rpc-client = "0.9.0-alpha"
 muxio-tokio-rpc-server = "0.9.0-alpha"
 muxio-rpc-service = "0.9.0-alpha"
diff --git a/README.md b/README.md
index e3d8caf6..2a33467a 100644
--- a/README.md
+++ b/README.md
@@ -4,6 +4,8 @@
 `SIMD R Drive` is a high-performance, thread-safe storage engine using a single-file storage container optimized for zero-copy binary access.
 
+Payloads are written at fixed 64-byte aligned boundaries, ensuring efficient zero-copy access and predictable performance for SIMD and cache-friendly workloads.
+
 Can be used as a command line interface (CLI) app, or as a library in another application. Continuously tested on Mac, Linux, and Windows.
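For orientation before the format details below, a minimal embedded (library) round trip looks roughly like this sketch; `DataStore::open` and `read` are assumed constructor/accessor names, and error handling is elided:

```rust
use simd_r_drive::DataStore;
use std::path::Path;

fn main() {
    // Single-file storage container; assumed to be created on first open.
    let store = DataStore::open(Path::new("data.bin")).expect("open store");

    // Append a payload under a binary key.
    store.write(b"greeting", b"hello world").expect("write");

    // Zero-copy read: the handle borrows the mmap rather than copying bytes.
    let entry = store.read(b"greeting").expect("read").expect("entry present");
    assert_eq!(entry.as_slice(), b"hello world");
}
```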
 
 [Documentation](https://docs.rs/simd-r-drive/latest/simd_r_drive/)
@@ -48,11 +50,13 @@ Additionally, `SIMD R Drive` is designed to handle datasets larger than availabl
 
 ## Fixed Payload Alignment (Zero-Copy Typed Slices)
 
-Every non-tombstone payload now starts at a fixed, power-of-two boundary (16 bytes by default, configurable). This guarantees that, when your payload length matches the element size, you can reinterpret bytes as typed slices (e.g., `&[u16]`, `&[u32]`, `&[u64]`, `&[u128]`) without copying.
+Every non-tombstone payload now begins on a fixed, power-of-two boundary (64 bytes by default). This matches the size of a typical CPU cacheline and ensures SIMD/vector loads (AVX, AVX-512, SVE, etc.) can operate at full speed without crossing cacheline boundaries.
+
+When your payload length matches the element size, you can safely reinterpret the bytes as typed slices (e.g., `&[u16]`, `&[u32]`, `&[u64]`, `&[u128]`) without copying.
 
-This change is transparent to the public API and works with all write modes, including streaming. The on-disk layout may include a few padding bytes per entry to maintain alignment. Tombstones are unaffected.
+The on-disk layout may include a few padding bytes per entry to maintain alignment. Tombstones are unaffected.
 
-Practical benefits include faster vectorized reads, simpler use of zero-copy helpers (e.g., casting libraries), and fewer fallback copies. If you need a stricter boundary for a target platform, adjust the [alignment constant](./src/storage_engine/constants.rs) and rebuild.
+Practical benefits include cache-friendly zero-copy reads, predictable SIMD performance, simpler use of casting libraries, and fewer fallback copies. If a different boundary is required for your hardware, adjust the [alignment constant](./simd-r-drive-entry-handle/src/constants.rs) and rebuild.
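To make the zero-copy claim concrete, here is a sketch of the typed-slice view using the same `bytemuck::try_cast_slice` the test suite relies on; the helper name is illustrative:

```rust
use bytemuck::try_cast_slice;

/// View a 64-byte-aligned payload as `&[u64]` without copying.
fn as_u64_slice(payload: &[u8]) -> &[u64] {
    // Succeeds because the payload base is 64-byte aligned (well above
    // align_of::<u64>()) and the length is a multiple of size_of::<u64>().
    try_cast_slice(payload).expect("aligned payload with length % 8 == 0")
}
```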
 
 ## Single-File Storage Container for Binary Data
 
@@ -103,6 +107,8 @@ Think of it as a self-contained binary filesystem—capable of storing and retri
 
+_Note: Illustration is conceptual and does not show the 64-byte aligned boundaries used in the actual on-disk format. In practice, every payload is padded to start on a fixed 64-byte boundary for cacheline and SIMD efficiency._
+
 Aligned entry (non-tombstone):
 
 | Offset Range | Field | Size (Bytes) | Description |
diff --git a/experiments/bindings/python-ws-client/Cargo.lock b/experiments/bindings/python-ws-client/Cargo.lock
index 2ff051d4..6f99a211 100644
--- a/experiments/bindings/python-ws-client/Cargo.lock
+++ b/experiments/bindings/python-ws-client/Cargo.lock
@@ -1048,7 +1048,7 @@ dependencies = [
 
 [[package]]
 name = "simd-r-drive"
-version = "0.14.0-alpha"
+version = "0.15.0-alpha"
 dependencies = [
  "async-trait",
  "clap",
@@ -1064,7 +1064,7 @@ dependencies = [
 
 [[package]]
 name = "simd-r-drive-entry-handle"
-version = "0.14.0-alpha"
+version = "0.15.0-alpha"
 dependencies = [
  "crc32fast",
  "memmap2",
@@ -1072,7 +1072,7 @@ dependencies = [
 
 [[package]]
 name = "simd-r-drive-muxio-service-definition"
-version = "0.14.0-alpha"
+version = "0.15.0-alpha"
 dependencies = [
  "bitcode",
  "muxio-rpc-service",
@@ -1080,7 +1080,7 @@ dependencies = [
 
 [[package]]
 name = "simd-r-drive-ws-client"
-version = "0.14.0-alpha"
+version = "0.15.0-alpha"
 dependencies = [
  "async-trait",
  "muxio-rpc-service",
diff --git a/experiments/bindings/python_(old_client)/pyproject.toml b/experiments/bindings/python_(old_client)/pyproject.toml
index 2ea15565..df5aef25 100644
--- a/experiments/bindings/python_(old_client)/pyproject.toml
+++ b/experiments/bindings/python_(old_client)/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "simd-r-drive-py"
-version = "0.14.0-alpha"
+version = "0.15.0-alpha"
 description = "SIMD-optimized append-only schema-less storage engine. Key-based binary storage in a single-file storage container."
 repository = "https://github.com/jzombie/rust-simd-r-drive"
 license = "Apache-2.0"
diff --git a/simd-r-drive-entry-handle/src/constants.rs b/simd-r-drive-entry-handle/src/constants.rs
index 62bf73ee..16b54a06 100644
--- a/simd-r-drive-entry-handle/src/constants.rs
+++ b/simd-r-drive-entry-handle/src/constants.rs
@@ -9,3 +9,10 @@ pub const CHECKSUM_RANGE: Range<usize> = 16..20;
 
 // Define checksum length explicitly since `CHECKSUM_RANGE.len()` isn't `const`
 pub const CHECKSUM_LEN: usize = CHECKSUM_RANGE.end - CHECKSUM_RANGE.start;
+
+/// Fixed alignment (power of two) for the start of every payload.
+/// 64 bytes matches cache-line size and SIMD-friendly alignment,
+/// improving the chances of staying zero-copy in vector kernels.
+/// Max pre-pad per entry is `PAYLOAD_ALIGNMENT - 1` bytes.
+pub const PAYLOAD_ALIGN_LOG2: u8 = 6; // 2^6 = 64
+pub const PAYLOAD_ALIGNMENT: u64 = 1 << PAYLOAD_ALIGN_LOG2;
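The pre-pad implied by these constants is the standard power-of-two align-up. The helper below is an illustrative sketch of that arithmetic, not code from this changeset:

```rust
const PAYLOAD_ALIGN_LOG2: u8 = 6; // 2^6 = 64
const PAYLOAD_ALIGNMENT: u64 = 1 << PAYLOAD_ALIGN_LOG2;

/// Round `offset` up to the next PAYLOAD_ALIGNMENT boundary.
/// Valid only because PAYLOAD_ALIGNMENT is a power of two.
fn align_up(offset: u64) -> u64 {
    (offset + PAYLOAD_ALIGNMENT - 1) & !(PAYLOAD_ALIGNMENT - 1)
}

fn main() {
    assert_eq!(align_up(0), 0);
    assert_eq!(align_up(1), 64);
    assert_eq!(align_up(64), 64);
    assert_eq!(align_up(65), 128);
    // Worst-case pre-pad is PAYLOAD_ALIGNMENT - 1 = 63 bytes.
    assert_eq!(align_up(65) - 65, 63);
}
```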
diff --git a/simd-r-drive-entry-handle/src/debug_assert_aligned.rs b/simd-r-drive-entry-handle/src/debug_assert_aligned.rs
new file mode 100644
index 00000000..f16b4ed5
--- /dev/null
+++ b/simd-r-drive-entry-handle/src/debug_assert_aligned.rs
@@ -0,0 +1,88 @@
+/// Debug-only pointer alignment assertion that is safe to export.
+///
+/// Why this style:
+/// - We need to re-export a symbol other crates can call, but we do not
+///   want benches or release builds to pull in debug-only deps or code.
+/// - Putting `#[cfg(...)]` on the function itself makes the symbol
+///   vanish in release/bench. Callers would then need their own cfg
+///   fences, which is brittle across crates.
+/// - By keeping the function always present and gating only its body,
+///   callers can invoke it unconditionally. In debug/test it asserts;
+///   in release/bench it compiles to a no-op.
+///
+/// Build behavior:
+/// - In debug/test, the inner block runs and uses `debug_assert!`.
+/// - In release/bench, the else block keeps the args "used" so the
+///   function is a true no-op (no codegen warnings, no panic paths).
+///
+/// Cost:
+/// - Inlining plus the cfg-ed body means zero runtime cost in release
+///   and bench profiles.
+///
+/// Usage:
+/// - Call anywhere you want a cheap alignment check in debug/test,
+///   including from other crates that depend on this one.
+#[inline]
+pub fn debug_assert_aligned(ptr: *const u8, align: usize) {
+    #[cfg(any(test, debug_assertions))]
+    {
+        debug_assert!(align.is_power_of_two());
+        debug_assert!(
+            (ptr as usize & (align - 1)) == 0,
+            "buffer base is not {}-byte aligned",
+            align
+        );
+    }
+
+    #[cfg(not(any(test, debug_assertions)))]
+    {
+        // Release/bench: no-op. Keep args used to avoid warnings.
+        let _ = ptr;
+        let _ = align;
+    }
+}
+
+/// Debug-only file-offset alignment assertion that is safe to export.
+///
+/// Same rationale as `debug_assert_aligned`: keep a stable symbol that
+/// callers can invoke without cfg fences, while ensuring zero cost in
+/// release/bench builds.
+///
+/// Why not a module-level cfg or `use`:
+/// - Some bench setups compile with `--all-features` and may still pull
+///   modules in ways that trip cfg-ed imports. Gating inside the body
+///   avoids those hazards and keeps the bench linker happy.
+///
+/// Behavior:
+/// - Debug/test: checks that `off` is a multiple of the configured
+///   `PAYLOAD_ALIGNMENT`.
+/// - Release/bench: no-op, arguments are marked used.
+///
+/// Notes:
+/// - This asserts the *derived start offset* of a payload, not the
+///   pointer. Use the pointer variant to assert the actual address you
+///   hand to consumers like Arrow.
+#[inline]
+pub fn debug_assert_aligned_offset(off: u64) {
+    #[cfg(any(test, debug_assertions))]
+    {
+        use crate::constants::PAYLOAD_ALIGNMENT;
+
+        debug_assert!(
+            PAYLOAD_ALIGNMENT.is_power_of_two(),
+            "PAYLOAD_ALIGNMENT must be a power of two"
+        );
+        debug_assert!(
+            off.is_multiple_of(PAYLOAD_ALIGNMENT),
+            "derived payload start not {}-byte aligned (got {})",
+            PAYLOAD_ALIGNMENT,
+            off
+        );
+    }
+
+    #[cfg(not(any(test, debug_assertions)))]
+    {
+        // Release/bench: no-op. Keep arg used to avoid warnings.
+        let _ = off;
+    }
+}
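Because both symbols stay exported in every build profile, downstream code can call them without cfg fences of its own. A hypothetical caller:

```rust
use simd_r_drive_entry_handle::{debug_assert_aligned, debug_assert_aligned_offset};

fn hand_off_payload(payload: &[u8], derived_file_offset: u64) {
    // Checked in debug/test builds; compiles to a no-op in release/bench.
    debug_assert_aligned(payload.as_ptr(), 64);
    debug_assert_aligned_offset(derived_file_offset);

    // ... pass `payload` to a SIMD kernel or an Arrow consumer ...
}
```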
diff --git a/simd-r-drive-entry-handle/src/entry_handle.rs b/simd-r-drive-entry-handle/src/entry_handle.rs
index 905faf49..1fdec85c 100644
--- a/simd-r-drive-entry-handle/src/entry_handle.rs
+++ b/simd-r-drive-entry-handle/src/entry_handle.rs
@@ -387,11 +387,20 @@ impl EntryHandle {
         use std::ptr::NonNull;
         use std::sync::Arc;
 
-        // Pointer to the start of the payload.
-        let ptr = NonNull::new(self.as_slice().as_ptr() as *mut u8).expect("non-null slice ptr");
+        let slice = self.as_slice();
+        #[cfg(any(test, debug_assertions))]
+        {
+            use crate::{
+                constants::PAYLOAD_ALIGNMENT, debug_assert_aligned, debug_assert_aligned_offset,
+            };
+            // Assert actual pointer alignment.
+            debug_assert_aligned(slice.as_ptr(), PAYLOAD_ALIGNMENT as usize);
+            // Assert derived file offset alignment.
+            debug_assert_aligned_offset(self.range.start as u64);
+        }
 
-        // Owner keeps the mmap alive for the Buffer's lifetime.
-        unsafe { Buffer::from_custom_allocation(ptr, self.size(), Arc::new(self.clone())) }
+        let ptr = NonNull::new(slice.as_ptr() as *mut u8).expect("non-null slice ptr");
+        unsafe { Buffer::from_custom_allocation(ptr, slice.len(), Arc::new(self.clone())) }
     }
 
     /// Convert this handle into an Arrow `Buffer` without copying.
@@ -418,11 +427,20 @@ impl EntryHandle {
         use std::ptr::NonNull;
         use std::sync::Arc;
 
-        let len: usize = self.size();
-        let ptr = NonNull::new(self.as_slice().as_ptr() as *mut u8).expect("non-null slice ptr");
+        let slice = self.as_slice();
+        #[cfg(any(test, debug_assertions))]
+        {
+            use crate::{
+                constants::PAYLOAD_ALIGNMENT, debug_assert_aligned, debug_assert_aligned_offset,
+            };
+            // Assert actual pointer alignment.
+            debug_assert_aligned(slice.as_ptr(), PAYLOAD_ALIGNMENT as usize);
+            // Assert derived file offset alignment.
+            debug_assert_aligned_offset(self.range.start as u64);
+        }
 
-        // Move self into the owner to avoid an extra Arc bump later.
-        unsafe { Buffer::from_custom_allocation(ptr, len, Arc::new(self)) }
+        let ptr = NonNull::new(slice.as_ptr() as *mut u8).expect("non-null slice ptr");
+        unsafe { Buffer::from_custom_allocation(ptr, slice.len(), Arc::new(self)) }
     }
 }
diff --git a/simd-r-drive-entry-handle/src/lib.rs b/simd-r-drive-entry-handle/src/lib.rs
index 674881f9..673af58a 100644
--- a/simd-r-drive-entry-handle/src/lib.rs
+++ b/simd-r-drive-entry-handle/src/lib.rs
@@ -5,3 +5,6 @@ pub use entry_handle::*;
 
 pub mod entry_metadata;
 pub use entry_metadata::*;
+
+pub mod debug_assert_aligned;
+pub use debug_assert_aligned::*;
diff --git a/src/storage_engine/constants.rs b/src/storage_engine/constants.rs
index b6af64f2..283fd4ba 100644
--- a/src/storage_engine/constants.rs
+++ b/src/storage_engine/constants.rs
@@ -5,8 +5,3 @@ pub const NULL_BYTE: [u8; 1] = [0];
 
 /// Stream copy chunk size.
 pub const WRITE_STREAM_BUFFER_SIZE: usize = 64 * 1024; // 64 KB
-
-/// Fixed alignment (power of two) for the start of every payload.
-/// 16 bytes covers u8/u16/u32/u64/u128 on mainstream targets.
-pub const PAYLOAD_ALIGN_LOG2: u8 = 4;
-pub const PAYLOAD_ALIGNMENT: u64 = 1 << PAYLOAD_ALIGN_LOG2;
diff --git a/src/storage_engine/data_store.rs b/src/storage_engine/data_store.rs
index 05000afc..a4d186e4 100644
--- a/src/storage_engine/data_store.rs
+++ b/src/storage_engine/data_store.rs
@@ -17,6 +17,9 @@ use std::sync::atomic::{AtomicU64, Ordering};
 use std::sync::{Arc, Mutex, RwLock, RwLockReadGuard};
 use tracing::{debug, info, warn};
 
+#[cfg(any(test, debug_assertions))]
+use simd_r_drive_entry_handle::debug_assert_aligned_offset;
+
 #[cfg(feature = "parallel")]
 use rayon::prelude::*;
 
@@ -344,6 +347,11 @@ impl DataStore {
             return None;
         }
 
+        #[cfg(any(test, debug_assertions))]
+        {
+            debug_assert_aligned_offset(entry_start as u64);
+        }
+
         Some(EntryHandle {
             mmap_arc: mmap_arc.clone(),
             range: entry_start..entry_end,
@@ -399,6 +407,11 @@ impl DataStore {
         {
             prev_tail
         } else {
+            #[cfg(any(test, debug_assertions))]
+            {
+                debug_assert_aligned_offset(derived_start);
+            }
+
             derived_start
         };
 
@@ -539,6 +552,11 @@ impl DataStore {
             return None;
         }
 
+        #[cfg(any(test, debug_assertions))]
+        {
+            debug_assert_aligned_offset(entry_start as u64);
+        }
+
         Some(EntryHandle {
             mmap_arc: mmap_arc.clone(),
             range: entry_start..entry_end,
@@ -1072,6 +1090,11 @@ impl DataStoreReader for DataStore {
             return Ok(None);
         }
 
+        #[cfg(any(test, debug_assertions))]
+        {
+            debug_assert_aligned_offset(entry_start as u64);
+        }
+
         Ok(Some(EntryHandle {
             mmap_arc,
             range: entry_start..entry_end,
diff --git a/src/utils/align_or_copy.rs b/src/utils/align_or_copy.rs
index 9192a6e7..24cc6c55 100644
--- a/src/utils/align_or_copy.rs
+++ b/src/utils/align_or_copy.rs
@@ -59,7 +59,7 @@ where
         Cow::Borrowed(aligned)
     } else {
         assert!(
-            bytes.len() % N == 0,
+            bytes.len().is_multiple_of(N),
             "Input length must be a multiple of element size"
         );
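Tying the `EntryHandle` and `DataStore` changes above together, a zero-copy Arrow hand-off might look like the following sketch. It assumes the crate's `arrow` integration is enabled and that reads go through a `read` accessor returning `Result<Option<EntryHandle>>`, as the `DataStoreReader` hunk above suggests; both are assumptions, not confirmed API:

```rust
use simd_r_drive::DataStore;

/// Borrow an entry's payload as an Arrow buffer without copying.
fn to_arrow(store: &DataStore, key: &[u8]) -> Option<arrow::buffer::Buffer> {
    let entry = store.read(key).ok()??;
    // In debug/test builds this also runs the pointer- and
    // offset-alignment assertions added in this changeset.
    Some(entry.as_arrow_buffer())
}
```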
diff --git a/tests/alignment_tests.rs b/tests/alignment_tests.rs
index 0b6cf353..4083dc7d 100644
--- a/tests/alignment_tests.rs
+++ b/tests/alignment_tests.rs
@@ -25,7 +25,7 @@ fn assert_payload_addr_aligned(bytes: &[u8]) {
     let ptr = bytes.as_ptr() as usize;
     let a = PAYLOAD_ALIGNMENT as usize;
     assert!(
-        ptr % a == 0,
+        ptr.is_multiple_of(a),
         "payload start address is not {}-byte aligned",
         a
     );
@@ -42,13 +42,13 @@ fn assert_can_view_as<T>(bytes: &[u8]) {
     );
     let ptr = bytes.as_ptr() as usize;
     assert!(
-        ptr % a_t == 0,
+        ptr.is_multiple_of(a_t),
         "payload addr {} is not aligned to T (align {})",
         ptr,
         a_t
     );
     assert!(
-        bytes.len() % size_of::<T>() == 0,
+        bytes.len().is_multiple_of(size_of::<T>()),
         "payload length {} is not a multiple of {}",
         bytes.len(),
         size_of::<T>()
@@ -67,47 +67,68 @@ fn assert_bytemuck_view_u128(bytes: &[u8]) {
 }
 
 #[cfg(target_arch = "x86_64")]
-fn assert_simd_16_byte_loadable(bytes: &[u8]) {
+fn assert_simd_64_byte_loadable(bytes: &[u8]) {
+    // Enforce 64B-aligned base for clean cacheline-friendly loads.
     assert!(
-        (bytes.as_ptr() as usize) % 16 == 0,
-        "SIMD pointer must be 16-byte aligned"
+        (bytes.as_ptr() as usize).is_multiple_of(64),
+        "SIMD pointer must be 64-byte aligned"
     );
-    let lanes = bytes.len() / 16;
+    // Process only the full 64B lanes; ignore any tail < 64B.
+    let lanes = bytes.len() / 64;
+    if lanes == 0 {
+        return;
+    }
     unsafe {
         for i in 0..lanes {
-            let p = bytes.as_ptr().add(i * 16) as *const __m128i;
-            let v = _mm_load_si128(p);
-            core::hint::black_box(v);
+            let base = bytes.as_ptr().add(i * 64);
+            let p0 = base.add(0) as *const __m128i;
+            let p1 = base.add(16) as *const __m128i;
+            let p2 = base.add(32) as *const __m128i;
+            let p3 = base.add(48) as *const __m128i;
+            let v0 = _mm_load_si128(p0);
+            let v1 = _mm_load_si128(p1);
+            let v2 = _mm_load_si128(p2);
+            let v3 = _mm_load_si128(p3);
+            core::hint::black_box((v0, v1, v2, v3));
         }
     }
 }
 
 #[cfg(target_arch = "aarch64")]
-fn assert_simd_16_byte_loadable(bytes: &[u8]) {
+fn assert_simd_64_byte_loadable(bytes: &[u8]) {
+    // Enforce 64B-aligned base for clean cacheline-friendly loads.
     assert!(
-        (bytes.as_ptr() as usize) % 16 == 0,
-        "SIMD pointer must be 16-byte aligned"
+        (bytes.as_ptr() as usize).is_multiple_of(64),
+        "SIMD pointer must be 64-byte aligned"
     );
-    let lanes = bytes.len() / 16;
+    // Process only the full 64B lanes; ignore any tail < 64B.
+    let lanes = bytes.len() / 64;
+    if lanes == 0 {
+        return;
+    }
     unsafe {
         for i in 0..lanes {
-            let p = bytes.as_ptr().add(i * 16);
-            let v0 = vld1q_u8(p);
-            core::hint::black_box(v0);
-            let p_vec = p as *const uint8x16_t;
-            let v1: uint8x16_t = core::ptr::read(p_vec);
-            core::hint::black_box(v1);
+            let base = bytes.as_ptr().add(i * 64);
+            let v0 = vld1q_u8(base.add(0));
+            let v1 = vld1q_u8(base.add(16));
+            let v2 = vld1q_u8(base.add(32));
+            let v3 = vld1q_u8(base.add(48));
+            // Also test an aligned typed read path.
+            let p0 = base.add(0) as *const uint8x16_t;
+            let r0: uint8x16_t = core::ptr::read(p0);
+            core::hint::black_box((v0, v1, v2, v3, r0));
         }
     }
 }
 
 #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
-fn assert_simd_16_byte_loadable(bytes: &[u8]) {
-    // Portable fallback: re-assert address and u128 view conditions.
+fn assert_simd_64_byte_loadable(bytes: &[u8]) {
+    // Portable fallback: enforce 64B alignment; if we have >= 64B,
+    // prove we could read at least one 16B lane safely.
     assert_payload_addr_aligned(bytes);
-    if bytes.len() >= 16 && bytes.len() % 16 == 0 {
+    if bytes.len() >= 64 {
         assert_can_view_as::<u128>(bytes);
-        assert_bytemuck_view_u128(bytes);
+        let _: &[u128] = try_cast_slice(bytes).expect("cast &[u8]->&[u128] failed");
     }
 }
@@ -160,10 +181,10 @@ fn byte_alignment_unaligned_then_overwrite_and_simd() {
     // Phase 2: delete one string (tombstone, no pre-pad).
     store.delete(b"k_s2").unwrap();
 
-    // Phase 3: overwrite with 16B-multiple payloads.
-    let s1_aligned = vec![0xA5u8; 2 * 16]; // 32 bytes
-    let s3_aligned = vec![0xB6u8; 3 * 16]; // 48 bytes
-    let u32_aligned = vec![0xCCu8; 16 * 4]; // 64 bytes
+    // Phase 3: overwrite with 64B-multiple payloads.
+    let s1_aligned = vec![0xA5u8; 64]; // 64 bytes
+    let s3_aligned = vec![0xB6u8; 2 * 64]; // 128 bytes
+    let u32_aligned = vec![0xCCu8; 64]; // 64 bytes
 
     store.write(b"k_s1", &s1_aligned).unwrap();
     store.write(b"k_s3", &s3_aligned).unwrap();
@@ -198,7 +219,7 @@ fn byte_alignment_unaligned_then_overwrite_and_simd() {
     assert_bytemuck_view_u64(e_u64_new.as_slice());
     assert_bytemuck_view_u128(e_u128_new.as_slice());
 
-    // SIMD loads or portable fallback.
+    // SIMD 64B lanes (or portable fallback).
     for bytes in [
         e_s1_new.as_slice(),
         e_s3_new.as_slice(),
@@ -206,8 +227,8 @@
         e_u64_new.as_slice(),
         e_u128_new.as_slice(),
     ] {
-        if bytes.len() >= 16 {
-            assert_simd_16_byte_loadable(bytes);
+        if bytes.len() >= 64 {
+            assert_simd_64_byte_loadable(bytes);
         }
     }
 
@@ -215,8 +236,8 @@
     for entry in store.iter_entries() {
         let bytes = entry.as_slice();
         assert_payload_addr_aligned(bytes);
-        if bytes.len() >= 16 {
-            assert_simd_16_byte_loadable(bytes);
+        if bytes.len() >= 64 {
+            assert_simd_64_byte_loadable(bytes);
         }
     }