diff --git a/Cargo.lock b/Cargo.lock
index 81a6cd1531..db2dc1e4fb 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1461,6 +1461,7 @@ dependencies = [
"commonware-parallel",
"commonware-utils",
"console-subscriber",
+ "crc32fast",
"criterion",
"futures",
"getrandom 0.2.16",
diff --git a/consensus/fuzz/src/lib.rs b/consensus/fuzz/src/lib.rs
index 988491fa6a..409203cc80 100644
--- a/consensus/fuzz/src/lib.rs
+++ b/consensus/fuzz/src/lib.rs
@@ -28,14 +28,20 @@ use commonware_cryptography::{
};
use commonware_p2p::simulated::{Config as NetworkConfig, Link, Network};
use commonware_runtime::{buffer::PoolRef, deterministic, Clock, Metrics, Runner, Spawner};
-use commonware_utils::{max_faults, NZUsize};
+use commonware_utils::{max_faults, NZUsize, NZU16};
use futures::{channel::mpsc::Receiver, future::join_all, StreamExt};
use rand::{rngs::StdRng, RngCore, SeedableRng};
-use std::{cell::RefCell, num::NonZeroUsize, panic, sync::Arc, time::Duration};
+use std::{
+ cell::RefCell,
+ num::{NonZeroU16, NonZeroUsize},
+ panic,
+ sync::Arc,
+ time::Duration,
+};
pub const EPOCH: u64 = 333;
-const PAGE_SIZE: NonZeroUsize = NZUsize!(1024);
+const PAGE_SIZE: NonZeroU16 = NZU16!(1024);
const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(10);
const MIN_REQUIRED_CONTAINERS: u64 = 10;
const MAX_REQUIRED_CONTAINERS: u64 = 50;
diff --git a/consensus/src/aggregation/mod.rs b/consensus/src/aggregation/mod.rs
index 7eacb9a938..7400ba0a1d 100644
--- a/consensus/src/aggregation/mod.rs
+++ b/consensus/src/aggregation/mod.rs
@@ -103,19 +103,19 @@ mod tests {
deterministic::{self, Context},
Clock, Metrics, Quota, Runner, Spawner,
};
- use commonware_utils::{test_rng, NZUsize, NonZeroDuration};
+ use commonware_utils::{test_rng, NZUsize, NonZeroDuration, NZU16};
use futures::{channel::oneshot, future::join_all};
use rand::{rngs::StdRng, Rng};
use std::{
collections::BTreeMap,
- num::{NonZeroU32, NonZeroUsize},
+ num::{NonZeroU16, NonZeroU32, NonZeroUsize},
time::Duration,
};
use tracing::debug;
type Registrations<P> = BTreeMap<P, (Sender<P>, Receiver<P>)>;
- const PAGE_SIZE: NonZeroUsize = NZUsize!(1024);
+ const PAGE_SIZE: NonZeroU16 = NZU16!(1024);
const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(10);
const TEST_QUOTA: Quota = Quota::per_second(NonZeroU32::MAX);
const TEST_NAMESPACE: &[u8] = b"my testing namespace";
diff --git a/consensus/src/marshal/mod.rs b/consensus/src/marshal/mod.rs
index 3e483a9a39..1de1edb636 100644
--- a/consensus/src/marshal/mod.rs
+++ b/consensus/src/marshal/mod.rs
@@ -130,7 +130,7 @@ mod tests {
};
use commonware_runtime::{buffer::PoolRef, deterministic, Clock, Metrics, Quota, Runner};
use commonware_storage::archive::immutable;
- use commonware_utils::{vec::NonEmptyVec, NZUsize, NZU64};
+ use commonware_utils::{vec::NonEmptyVec, NZUsize, NZU16, NZU64};
use futures::StreamExt;
use rand::{
seq::{IteratorRandom, SliceRandom},
@@ -138,7 +138,7 @@ mod tests {
};
use std::{
collections::BTreeMap,
- num::{NonZeroU32, NonZeroU64, NonZeroUsize},
+ num::{NonZeroU16, NonZeroU32, NonZeroU64, NonZeroUsize},
time::{Duration, Instant},
};
use tracing::info;
@@ -150,7 +150,7 @@ mod tests {
type S = bls12381_threshold::Scheme;
type P = ConstantProvider;
- const PAGE_SIZE: NonZeroUsize = NZUsize!(1024);
+ const PAGE_SIZE: NonZeroU16 = NZU16!(1024);
const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(10);
const NAMESPACE: &[u8] = b"test";
const NUM_VALIDATORS: u32 = 4;
diff --git a/consensus/src/ordered_broadcast/mod.rs b/consensus/src/ordered_broadcast/mod.rs
index d65f1bba7a..75dcb5e06f 100644
--- a/consensus/src/ordered_broadcast/mod.rs
+++ b/consensus/src/ordered_broadcast/mod.rs
@@ -97,16 +97,16 @@ mod tests {
deterministic::{self, Context},
Clock, Metrics, Quota, Runner, Spawner,
};
- use commonware_utils::NZUsize;
+ use commonware_utils::{NZUsize, NZU16};
use futures::{channel::oneshot, future::join_all};
use std::{
collections::{BTreeMap, HashMap},
- num::{NonZeroU32, NonZeroUsize},
+ num::{NonZeroU16, NonZeroU32, NonZeroUsize},
time::Duration,
};
use tracing::debug;
- const PAGE_SIZE: NonZeroUsize = NZUsize!(1024);
+ const PAGE_SIZE: NonZeroU16 = NZU16!(1024);
const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(10);
const TEST_QUOTA: Quota = Quota::per_second(NonZeroU32::MAX);
const TEST_NAMESPACE: &[u8] = b"ordered_broadcast_test";
diff --git a/consensus/src/simplex/actors/voter/mod.rs b/consensus/src/simplex/actors/voter/mod.rs
index 7b13cb5293..4ca1bcbd4b 100644
--- a/consensus/src/simplex/actors/voter/mod.rs
+++ b/consensus/src/simplex/actors/voter/mod.rs
@@ -71,15 +71,15 @@ mod tests {
use commonware_macros::{select, test_traced};
use commonware_p2p::simulated::{Config as NConfig, Network};
use commonware_runtime::{deterministic, Clock, Metrics, Quota, Runner};
- use commonware_utils::{quorum, NZUsize};
+ use commonware_utils::{quorum, NZUsize, NZU16};
use futures::{channel::mpsc, FutureExt, StreamExt};
use std::{
- num::NonZeroU32,
+ num::{NonZeroU16, NonZeroU32},
sync::{Arc, Mutex},
time::Duration,
};
- const PAGE_SIZE: NonZeroUsize = NZUsize!(1024);
+ const PAGE_SIZE: NonZeroU16 = NZU16!(1024);
const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(10);
const TEST_QUOTA: Quota = Quota::per_second(NonZeroU32::MAX);
diff --git a/consensus/src/simplex/mod.rs b/consensus/src/simplex/mod.rs
index be2cad6b7d..0d2439eac4 100644
--- a/consensus/src/simplex/mod.rs
+++ b/consensus/src/simplex/mod.rs
@@ -320,20 +320,20 @@ mod tests {
use commonware_runtime::{
buffer::PoolRef, deterministic, Clock, Metrics, Quota, Runner, Spawner,
};
- use commonware_utils::{max_faults, quorum, test_rng, NZUsize};
+ use commonware_utils::{max_faults, quorum, test_rng, NZUsize, NZU16};
use engine::Engine;
use futures::{future::join_all, StreamExt};
use rand::{rngs::StdRng, Rng as _};
use std::{
collections::{BTreeMap, HashMap},
- num::{NonZeroU32, NonZeroUsize},
+ num::{NonZeroU16, NonZeroU32, NonZeroUsize},
sync::{Arc, Mutex},
time::Duration,
};
use tracing::{debug, info, warn};
use types::Activity;
- const PAGE_SIZE: NonZeroUsize = NZUsize!(1024);
+ const PAGE_SIZE: NonZeroU16 = NZU16!(1024);
const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(10);
const TEST_QUOTA: Quota = Quota::per_second(NonZeroU32::MAX);
diff --git a/examples/bridge/src/bin/validator.rs b/examples/bridge/src/bin/validator.rs
index fb290f00b6..420adce080 100644
--- a/examples/bridge/src/bin/validator.rs
+++ b/examples/bridge/src/bin/validator.rs
@@ -20,7 +20,7 @@ use commonware_runtime::{
buffer::PoolRef, tokio, Metrics, Network, Quota, RayonPoolSpawner, Runner,
};
use commonware_stream::{dial, Config as StreamConfig};
-use commonware_utils::{from_hex, ordered::Set, union, NZUsize, TryCollect, NZU32};
+use commonware_utils::{from_hex, ordered::Set, union, NZUsize, TryCollect, NZU16, NZU32};
use std::{
net::{IpAddr, Ipv4Addr, SocketAddr},
str::FromStr,
@@ -264,7 +264,7 @@ fn main() {
activity_timeout: ViewDelta::new(10),
skip_timeout: ViewDelta::new(5),
fetch_concurrent: 32,
- buffer_pool: PoolRef::new(NZUsize!(16_384), NZUsize!(10_000)),
+ buffer_pool: PoolRef::new(NZU16!(16_384), NZUsize!(10_000)),
},
);
diff --git a/examples/log/src/main.rs b/examples/log/src/main.rs
index db93410646..b61ad3775c 100644
--- a/examples/log/src/main.rs
+++ b/examples/log/src/main.rs
@@ -55,7 +55,7 @@ use commonware_consensus::{
use commonware_cryptography::{ed25519, Sha256, Signer as _};
use commonware_p2p::{authenticated::discovery, Manager};
use commonware_runtime::{buffer::PoolRef, tokio, Metrics, Quota, Runner};
-use commonware_utils::{ordered::Set, union, NZUsize, TryCollect, NZU32};
+use commonware_utils::{ordered::Set, union, NZUsize, TryCollect, NZU16, NZU32};
use std::{
net::{IpAddr, Ipv4Addr, SocketAddr},
str::FromStr,
@@ -223,7 +223,7 @@ fn main() {
activity_timeout: ViewDelta::new(10),
skip_timeout: ViewDelta::new(5),
fetch_concurrent: 32,
- buffer_pool: PoolRef::new(NZUsize!(16_384), NZUsize!(10_000)),
+ buffer_pool: PoolRef::new(NZU16!(16_384), NZUsize!(10_000)),
};
let engine = simplex::Engine::new(context.with_label("engine"), cfg);
diff --git a/examples/reshare/src/dkg/state.rs b/examples/reshare/src/dkg/state.rs
index 447f0b258b..20e1656398 100644
--- a/examples/reshare/src/dkg/state.rs
+++ b/examples/reshare/src/dkg/state.rs
@@ -30,15 +30,15 @@ use commonware_storage::journal::{
contiguous::variable::{Config as CVConfig, Journal as CVJournal},
segmented::variable::{Config as SVConfig, Journal as SVJournal},
};
-use commonware_utils::{NZUsize, NZU64};
+use commonware_utils::{NZUsize, NZU16, NZU64};
use futures::StreamExt;
use std::{
collections::BTreeMap,
- num::{NonZeroU32, NonZeroUsize},
+ num::{NonZeroU16, NonZeroU32, NonZeroUsize},
};
use tracing::debug;
-const PAGE_SIZE: NonZeroUsize = NZUsize!(1 << 12);
+const PAGE_SIZE: NonZeroU16 = NZU16!(1 << 12);
const POOL_CAPACITY: NonZeroUsize = NZUsize!(1 << 20);
const WRITE_BUFFER: NonZeroUsize = NZUsize!(1 << 12);
const READ_BUFFER: NonZeroUsize = NZUsize!(1 << 20);
diff --git a/examples/reshare/src/engine.rs b/examples/reshare/src/engine.rs
index 331f3dc9aa..48edc03173 100644
--- a/examples/reshare/src/engine.rs
+++ b/examples/reshare/src/engine.rs
@@ -27,10 +27,14 @@ use commonware_runtime::{
buffer::PoolRef, spawn_cell, Clock, ContextCell, Handle, Metrics, Network, Spawner, Storage,
};
use commonware_storage::archive::immutable;
-use commonware_utils::{ordered::Set, union, NZUsize, NZU32, NZU64};
+use commonware_utils::{ordered::Set, union, NZUsize, NZU16, NZU32, NZU64};
use futures::{channel::mpsc, future::try_join_all};
use rand_core::CryptoRngCore;
-use std::{marker::PhantomData, num::NonZero, time::Instant};
+use std::{
+ marker::PhantomData,
+ num::{NonZero, NonZeroU16},
+ time::Instant,
+};
use tracing::{error, info, warn};
const MAILBOX_SIZE: usize = 10;
@@ -45,7 +49,7 @@ const FREEZER_VALUE_TARGET_SIZE: u64 = 1024 * 1024 * 1024; // 1GB
const FREEZER_VALUE_COMPRESSION: Option<u8> = Some(3);
const REPLAY_BUFFER: NonZero<usize> = NZUsize!(8 * 1024 * 1024); // 8MB
const WRITE_BUFFER: NonZero<usize> = NZUsize!(1024 * 1024); // 1MB
-const BUFFER_POOL_PAGE_SIZE: NonZero<usize> = NZUsize!(4_096); // 4KB
+const BUFFER_POOL_PAGE_SIZE: NonZeroU16 = NZU16!(4_096); // 4KB
const BUFFER_POOL_CAPACITY: NonZero<usize> = NZUsize!(8_192); // 32MB
const MAX_REPAIR: NonZero<usize> = NZUsize!(50);
diff --git a/examples/reshare/src/orchestrator/actor.rs b/examples/reshare/src/orchestrator/actor.rs
index 689a4edba4..3f4fe9a44b 100644
--- a/examples/reshare/src/orchestrator/actor.rs
+++ b/examples/reshare/src/orchestrator/actor.rs
@@ -23,7 +23,7 @@ use commonware_parallel::Strategy;
use commonware_runtime::{
buffer::PoolRef, spawn_cell, Clock, ContextCell, Handle, Metrics, Network, Spawner, Storage,
};
-use commonware_utils::{vec::NonEmptyVec, NZUsize};
+use commonware_utils::{vec::NonEmptyVec, NZUsize, NZU16};
use futures::{channel::mpsc, StreamExt};
use rand_core::CryptoRngCore;
use std::{collections::BTreeMap, marker::PhantomData, time::Duration};
@@ -105,7 +105,7 @@ where
config: Config,
) -> (Self, Mailbox) {
let (sender, mailbox) = mpsc::channel(config.mailbox_size);
- let pool_ref = PoolRef::new(NZUsize!(16_384), NZUsize!(10_000));
+ let pool_ref = PoolRef::new(NZU16!(16_384), NZUsize!(10_000));
(
Self {
diff --git a/examples/sync/src/databases/any.rs b/examples/sync/src/databases/any.rs
index fc2bbc8b26..4bd206d74e 100644
--- a/examples/sync/src/databases/any.rs
+++ b/examples/sync/src/databases/any.rs
@@ -17,7 +17,7 @@ use commonware_storage::{
operation::Committable,
},
};
-use commonware_utils::{NZUsize, NZU64};
+use commonware_utils::{NZUsize, NZU16, NZU64};
use std::{future::Future, num::NonZeroU64};
use tracing::error;
@@ -39,7 +39,7 @@ pub fn create_config() -> Config {
log_write_buffer: NZUsize!(1024),
translator: Translator::default(),
thread_pool: None,
- buffer_pool: buffer::PoolRef::new(NZUsize!(1024), NZUsize!(10)),
+ buffer_pool: buffer::PoolRef::new(NZU16!(1024), NZUsize!(10)),
}
}
diff --git a/examples/sync/src/databases/immutable.rs b/examples/sync/src/databases/immutable.rs
index 245cccebb3..264dc0ff79 100644
--- a/examples/sync/src/databases/immutable.rs
+++ b/examples/sync/src/databases/immutable.rs
@@ -11,7 +11,7 @@ use commonware_storage::{
Durable, Merkleized,
},
};
-use commonware_utils::{NZUsize, NZU64};
+use commonware_utils::{NZUsize, NZU16, NZU64};
use std::{future::Future, num::NonZeroU64};
use tracing::error;
@@ -36,7 +36,7 @@ pub fn create_config() -> Config {
log_write_buffer: NZUsize!(1024),
translator: commonware_storage::translator::EightCap,
thread_pool: None,
- buffer_pool: commonware_runtime::buffer::PoolRef::new(NZUsize!(1024), NZUsize!(10)),
+ buffer_pool: commonware_runtime::buffer::PoolRef::new(NZU16!(1024), NZUsize!(10)),
}
}
diff --git a/runtime/Cargo.toml b/runtime/Cargo.toml
index 0d07c2783a..8ec609c8b8 100644
--- a/runtime/Cargo.toml
+++ b/runtime/Cargo.toml
@@ -23,6 +23,7 @@ commonware-conformance = { workspace = true, optional = true }
commonware-macros.workspace = true
commonware-parallel = { workspace = true, features = ["std"] }
commonware-utils = { workspace = true, features = ["std"] }
+crc32fast.workspace = true
futures.workspace = true
governor.workspace = true
io-uring = { workspace = true, optional = true }
diff --git a/runtime/conformance.toml b/runtime/conformance.toml
index a065ec07fb..3aa9ea75bf 100644
--- a/runtime/conformance.toml
+++ b/runtime/conformance.toml
@@ -1,3 +1,7 @@
["commonware_runtime::storage::tests::conformance::CodecConformance"]
n_cases = 65536
hash = "541c356728d47b13f1d3ac800926ef3ae2396c82f5d4e043f5c7641c4c22b4b9"
+
+["commonware_runtime::utils::buffer::pool::tests::conformance::CodecConformance"]
+n_cases = 65536
+hash = "2ca927141b521b7cccc541ec0df8614e418d317fc864ce11f428aefb330cf256"
diff --git a/runtime/fuzz/Cargo.toml b/runtime/fuzz/Cargo.toml
index 1632bdbb8d..92a133d280 100644
--- a/runtime/fuzz/Cargo.toml
+++ b/runtime/fuzz/Cargo.toml
@@ -19,3 +19,9 @@ name = "buffer"
path = "fuzz_targets/buffer.rs"
test = false
doc = false
+
+[[bin]]
+name = "blob_integrity"
+path = "fuzz_targets/blob_integrity.rs"
+test = false
+doc = false
diff --git a/runtime/fuzz/fuzz_targets/blob_integrity.rs b/runtime/fuzz/fuzz_targets/blob_integrity.rs
new file mode 100644
index 0000000000..002a51c01c
--- /dev/null
+++ b/runtime/fuzz/fuzz_targets/blob_integrity.rs
@@ -0,0 +1,249 @@
+//! Fuzz test for blob integrity verification.
+//!
+//! This test verifies that random bit corruptions in persisted blob data are appropriately
+//! detected and gracefully handled by page-oriented blob wrappers.
+//!
+//! Strategy:
+//! 1. Write several pages worth of data to an Append blob
+//! 2. Flip a random bit in the underlying blob
+//! 3. Attempt to read various ranges:
+//! - Reads from uncorrupted pages should succeed with correct data
+//! - Reads from corrupted pages should either fail OR return correct data
+//! (if the bit flip was in padding/unused bytes)
+//! 4. Test both Append.read_at() and as_blob_reader()
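+//!
+//! Worked example of the page math (illustrative values): with a logical page size of 4, each
+//! physical page occupies 4 + 12 = 16 bytes (data plus CRC record), so a corruption at physical
+//! byte 21 lands in physical page 21 / 16 = 1, and only reads whose logical range overlaps
+//! logical page 1 may legitimately fail.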
+
+#![no_main]
+
+use arbitrary::{Arbitrary, Unstructured};
+use commonware_runtime::{
+ buffer::pool::{Append, PoolRef},
+ deterministic, Blob, Runner, Storage,
+};
+use commonware_utils::{NZUsize, NZU16};
+use libfuzzer_sys::fuzz_target;
+
+/// CRC record size.
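+/// Assumed to equal `CHECKSUM_SIZE` from `buffer::pool`: two 6-byte checksum slots appended to
+/// each logical page.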
+const CRC_SIZE: u64 = 12;
+/// Buffer capacity for the Append wrapper.
+const BUFFER_CAPACITY: usize = 1024;
+/// Buffer capacity for the blob reader.
+const READER_BUFFER_CAPACITY: usize = 256;
+/// Maximum number of read operations to perform.
+const MAX_READS: usize = 20;
+
+#[derive(Debug)]
+struct FuzzInput {
+ /// Seed for deterministic execution.
+ seed: u64,
+ /// Logical page size (1-255).
+ page_size: u8,
+ /// Pool page cache capacity (1-10).
+ pool_capacity: u8,
+ /// Number of pages to write (1-10).
+ num_pages: u8,
+ /// Byte offset within the blob to corrupt (taken modulo the physical blob size).
+ corrupt_byte_offset: u16,
+ /// Bit position within the byte to flip (0-7).
+ corrupt_bit: u8,
+ /// Read operations to perform after corruption.
+ reads: Vec<ReadOp>,
+}
+
+impl<'a> Arbitrary<'a> for FuzzInput {
+ fn arbitrary(u: &mut Unstructured<'a>) -> arbitrary::Result<Self> {
+ let num_reads = u.int_in_range(0..=MAX_READS)?;
+ let reads = (0..num_reads)
+ .map(|_| ReadOp::arbitrary(u))
+ .collect::<Result<Vec<_>, _>>()?;
+
+ Ok(FuzzInput {
+ seed: u.arbitrary()?,
+ page_size: u.int_in_range(1..=255)?,
+ pool_capacity: u.int_in_range(1..=10)?,
+ num_pages: u.int_in_range(1..=10)?,
+ corrupt_byte_offset: u.arbitrary()?,
+ corrupt_bit: u.int_in_range(0..=7)?,
+ reads,
+ })
+ }
+}
+
+#[derive(Debug)]
+struct ReadOp {
+ /// Logical offset to read from.
+ offset: u16,
+ /// Number of bytes to read (1-256).
+ len: u16,
+ /// Whether to use the Read wrapper (true) or Append.read_at (false).
+ use_reader: bool,
+}
+
+impl<'a> Arbitrary<'a> for ReadOp {
+ fn arbitrary(u: &mut Unstructured<'a>) -> arbitrary::Result<Self> {
+ Ok(ReadOp {
+ offset: u.arbitrary()?,
+ len: u.int_in_range(1..=256)?,
+ use_reader: u.arbitrary()?,
+ })
+ }
+}
+
+fn fuzz(input: FuzzInput) {
+ let executor = deterministic::Runner::default();
+ executor.start(|context| async move {
+ let page_size = input.page_size as u64;
+ let physical_page_size = page_size + CRC_SIZE;
+ let pool_capacity = input.pool_capacity as usize;
+ let pool_ref = PoolRef::new(NZU16!(page_size as u16), NZUsize!(pool_capacity));
+
+ // Compute logical size from number of pages.
+ let logical_size = input.num_pages as u64 * page_size;
+
+ // Generate deterministic data based on seed.
+ let expected_data: Vec = (0..logical_size)
+ .map(|i| ((input.seed.wrapping_add(i)) & 0xFF) as u8)
+ .collect();
+
+ // Step 1: Write data to the blob.
+ let (blob, _) = context
+ .open("test_partition", b"integrity_test")
+ .await
+ .expect("cannot open blob");
+
+ let append = Append::new(blob.clone(), 0, BUFFER_CAPACITY, pool_ref.clone())
+ .await
+ .expect("cannot create append wrapper");
+
+ append
+ .append(&expected_data)
+ .await
+ .expect("cannot append data");
+ append.sync().await.expect("cannot sync");
+ drop(append);
+
+ // Step 2: Corrupt a single bit in the blob.
+ // Calculate physical size: full pages + partial page (if any).
+ let full_pages = logical_size / page_size;
+ let partial_bytes = logical_size % page_size;
+ let physical_size = if partial_bytes > 0 {
+ (full_pages + 1) * physical_page_size
+ } else {
+ full_pages * physical_page_size
+ };
+ let corrupt_offset = (input.corrupt_byte_offset as u64) % physical_size;
+ let corrupt_bit = input.corrupt_bit;
+
+ // Read the byte, flip the bit, write it back.
+ let byte_buf = blob
+ .read_at(vec![0u8; 1], corrupt_offset)
+ .await
+ .expect("cannot read byte to corrupt");
+ let corrupted_byte = byte_buf.as_ref()[0] ^ (1 << corrupt_bit);
+ blob.write_at(vec![corrupted_byte], corrupt_offset)
+ .await
+ .expect("cannot write corrupted byte");
+ blob.sync().await.expect("cannot sync corruption");
+
+ // Determine which logical page was corrupted.
+ let corrupted_page = corrupt_offset / physical_page_size;
+
+ // Step 3: Re-open and attempt reads.
+ let (blob, size) = context
+ .open("test_partition", b"integrity_test")
+ .await
+ .expect("cannot reopen blob");
+
+ // The append wrapper may truncate if the corruption affected the last page's CRC
+ // during initialization, so we handle both cases.
+ let append = match Append::new(blob, size, BUFFER_CAPACITY, pool_ref.clone()).await {
+ Ok(a) => a,
+ Err(_) => {
+ // Corruption was severe enough to fail initialization - this is acceptable.
+ return;
+ }
+ };
+
+ let reported_size = append.size().await;
+
+ // Step 4: Perform read operations and verify results.
+ for read_op in &input.reads {
+ let offset = read_op.offset as u64;
+ let len = read_op.len as usize;
+
+ // Skip reads that would be entirely out of bounds.
+ if offset >= reported_size {
+ continue;
+ }
+
+ // Clamp length to not exceed reported size.
+ let len = len.min((reported_size - offset) as usize);
+
+ // Determine which pages this read spans.
+ let start_page = offset / page_size;
+ let end_page = (offset + len as u64 - 1) / page_size;
+ let read_touches_corrupted_page =
+ start_page <= corrupted_page && corrupted_page <= end_page;
+
+ if read_op.use_reader {
+ // Use as_blob_reader.
+ // Note: The Read wrapper buffers multiple pages at once, so corruption on ANY
+ // page in the buffer can cause a read to fail - not just the page being accessed.
+ // We can only verify that successful reads return correct data.
+ let reader_result = append.as_blob_reader(NZUsize!(READER_BUFFER_CAPACITY)).await;
+ let mut reader = match reader_result {
+ Ok(r) => r,
+ Err(_) => continue, // Reader creation failed, skip.
+ };
+
+ // Seek to the read offset.
+ if reader.seek_to(offset).is_err() {
+ continue;
+ }
+
+ let mut buf = vec![0u8; len];
+ let read_result = reader.read_exact(&mut buf, len).await;
+
+ if let Ok(()) = read_result {
+ // Read succeeded - data must match expected.
+ let expected_slice = &expected_data[offset as usize..offset as usize + len];
+ assert_eq!(
+ &buf, expected_slice,
+ "Read via reader returned wrong data at offset {}, len {}",
+ offset, len
+ );
+ }
+ // Read failures are acceptable due to buffering behavior.
+ } else {
+ // Use Append.read_at directly.
+ let buf = vec![0u8; len];
+ let read_result = append.read_at(buf, offset).await;
+
+ match read_result {
+ Ok(buf) => {
+ // Read succeeded - data must match expected.
+ let buf: Vec<u8> = buf.into();
+ let expected_slice = &expected_data[offset as usize..offset as usize + len];
+ assert_eq!(
+ &buf, expected_slice,
+ "Read via Append returned wrong data at offset {}, len {}",
+ offset, len
+ );
+ }
+ Err(_) => {
+ // Read failed - this is only acceptable if the read touched
+ // the corrupted page.
+ assert!(
+ read_touches_corrupted_page,
+ "Read via Append failed at offset {}, len {} but didn't touch corrupted page {}",
+ offset, len, corrupted_page
+ );
+ }
+ }
+ }
+ }
+ });
+}
+
+fuzz_target!(|input: FuzzInput| {
+ fuzz(input);
+});
diff --git a/runtime/fuzz/fuzz_targets/buffer.rs b/runtime/fuzz/fuzz_targets/buffer.rs
index de596ae15b..6acc67df84 100644
--- a/runtime/fuzz/fuzz_targets/buffer.rs
+++ b/runtime/fuzz/fuzz_targets/buffer.rs
@@ -2,10 +2,13 @@
use arbitrary::Arbitrary;
use commonware_runtime::{
- buffer::{Append, PoolRef, Read, Write},
+ buffer::{
+ pool::{Append, PoolRef},
+ Read, Write,
+ },
deterministic, Blob, Runner, Storage,
};
-use commonware_utils::NZUsize;
+use commonware_utils::{NZUsize, NZU16};
use libfuzzer_sys::fuzz_target;
const MAX_SIZE: usize = 1024 * 1024;
@@ -77,7 +80,9 @@ enum FuzzOperation {
offset: u16,
},
AppendSize,
- AppendCloneBlob,
+ AppendAsReader {
+ buffer_size: u16,
+ },
AppendReadAt {
data_size: u16,
offset: u16,
@@ -148,8 +153,7 @@ fn fuzz(input: FuzzInput) {
pool_page_size,
pool_capacity,
} => {
- let buffer_size = NZUsize!((buffer_size as usize).clamp(1, MAX_SIZE));
- let pool_page_size = NZUsize!((pool_page_size as usize).clamp(1, MAX_SIZE));
+ let buffer_size = (buffer_size as usize).clamp(0, MAX_SIZE);
let pool_capacity = NZUsize!((pool_capacity as usize).clamp(1, MAX_SIZE));
let (blob, _) = context
@@ -157,8 +161,14 @@ fn fuzz(input: FuzzInput) {
.await
.expect("cannot open write blob");
- pool_ref = Some(PoolRef::new(pool_page_size, pool_capacity));
- pool_page_size_ref = Some(pool_page_size);
+ // Only create a new pool if one doesn't exist. Reusing the same blob with
+ // a different page size would corrupt reads since page size is embedded
+ // in the CRC records.
+ if pool_ref.is_none() {
+ let pool_page_size = pool_page_size.clamp(1, u16::MAX);
+ pool_ref = Some(PoolRef::new(NZU16!(pool_page_size), pool_capacity));
+ pool_page_size_ref = Some(pool_page_size);
+ }
if let Some(ref pool) = pool_ref {
append_buffer =
@@ -236,7 +246,7 @@ fn fuzz(input: FuzzInput) {
};
let current_size = append.size().await;
if current_size.checked_add(data.len() as u64).is_some() {
- let _ = append.append(data).await;
+ let _ = append.append(&data).await;
}
}
}
@@ -260,15 +270,13 @@ fn fuzz(input: FuzzInput) {
} => {
if let Some(ref pool) = pool_ref {
let offset = offset as u64;
- let data = if data.len() > MAX_SIZE {
- &data[..MAX_SIZE]
- } else {
- &data[..]
- };
- if let Some(pool_page_size) = pool_page_size_ref {
- let aligned_offset = (offset / pool_page_size.get() as u64)
- * pool_page_size.get() as u64;
- let _ = pool.cache(blob_id as u64, data, aligned_offset).await;
+ if data.len() >= pool.page_size() as usize {
+ let data = &data[..pool.page_size() as usize];
+ if let Some(pool_page_size) = pool_page_size_ref {
+ let aligned_offset =
+ (offset / pool_page_size as u64) * pool_page_size as u64;
+ let _ = pool.cache(blob_id as u64, data, aligned_offset).await;
+ }
}
}
}
@@ -320,9 +328,15 @@ fn fuzz(input: FuzzInput) {
}
}
- FuzzOperation::AppendCloneBlob => {
+ FuzzOperation::AppendAsReader { buffer_size } => {
if let Some(ref append) = append_buffer {
- let _ = append.clone_blob().await;
+ let buffer_size = NZUsize!((buffer_size as usize).clamp(1, MAX_SIZE));
+ // This fuzzer never corrupts data, so CRC validation in as_blob_reader
+ // should always succeed. A failure here indicates a bug.
+ let _ = append
+ .as_blob_reader(buffer_size)
+ .await
+ .expect("Failed to create blob reader");
}
}
diff --git a/runtime/src/lib.rs b/runtime/src/lib.rs
index 115f9ad939..6422f05efb 100644
--- a/runtime/src/lib.rs
+++ b/runtime/src/lib.rs
@@ -111,8 +111,12 @@ pub enum Error {
expected: std::ops::RangeInclusive<u16>,
found: u16,
},
+ #[error("invalid or missing checksum")]
+ InvalidChecksum,
#[error("offset overflow")]
OffsetOverflow,
+ #[error("immutable blob")]
+ ImmutableBlob,
#[error("io error: {0}")]
Io(#[from] IoError),
}
diff --git a/runtime/src/utils/buffer/append.rs b/runtime/src/utils/buffer/append.rs
deleted file mode 100644
index 27b35d7e17..0000000000
--- a/runtime/src/utils/buffer/append.rs
+++ /dev/null
@@ -1,545 +0,0 @@
-use crate::{
- buffer::{tip::Buffer, PoolRef},
- Blob, Error, RwLock,
-};
-use commonware_utils::{NZUsize, StableBuf};
-use std::{num::NonZeroUsize, sync::Arc};
-
-/// A [Blob] wrapper that supports appending new data that is both read and write cached, and
-/// provides buffer-pool managed read caching of older data.
-///
-/// # Concurrent Access
-///
-/// This implementation allows readers to proceed while flush I/O is in progress, as long as they
-/// are reading from the write buffer or the pool cache. Readers that need to access data from the
-/// underlying blob (cache miss) will wait for any in-progress write to complete.
-///
-/// The implementation involves two locks: one for the write buffer (and blob size metadata), and
-/// one for the underlying blob itself. To avoid deadlocks, the buffer lock is always acquired
-/// before the blob lock.
-#[derive(Clone)]
-pub struct Append<B: Blob> {
- /// The underlying blob being wrapped, protected by a lock for I/O coordination.
- blob: Arc<RwLock<B>>,
-
- /// Unique id assigned by the buffer pool.
- id: u64,
-
- /// Buffer pool to consult for caching.
- pool_ref: PoolRef,
-
- /// The buffer containing the data yet to be appended to the tip of the underlying blob, as well
- /// as up to the final page_size-1 bytes from the underlying blob (to ensure the buffer's offset
- /// is always at a page boundary), paired with the actual size of the underlying blob on disk.
- ///
- /// # Invariants
- ///
- /// - The buffer's `offset` into the blob is always page aligned.
- /// - The range of bytes in this buffer never overlaps with any page buffered by `pool`. (See
- /// the warning in [Self::resize] for one uncommon exception.)
- buffer: Arc<RwLock<(Buffer, u64)>>,
-}
-
-impl<B: Blob> Append<B> {
- /// Create a new [Append] of provided `size` using the provided `pool` for read caching, and a
- /// write buffer with capacity `buffer_size`.
- pub async fn new(
- blob: B,
- size: u64,
- buffer_size: NonZeroUsize,
- pool_ref: PoolRef,
- ) -> Result<Self, Error> {
- // Set a floor on the write buffer size to make sure we always write at least 1 page of new
- // data with each flush. We multiply page_size by two here since we could be storing up to
- // page_size-1 bytes of already written data in the append buffer to maintain page
- // alignment.
- let mut buffer_size = buffer_size.get();
- buffer_size = buffer_size.max(pool_ref.page_size * 2);
-
- // Initialize the append buffer to contain the last non-full page of bytes from the blob to
- // ensure its offset into the blob is always page aligned.
- let leftover_size = size % pool_ref.page_size as u64;
- let page_aligned_size = size - leftover_size;
- let mut buffer = Buffer::new(page_aligned_size, NZUsize!(buffer_size));
- if leftover_size != 0 {
- let page_buf = vec![0; leftover_size as usize];
- let buf = blob.read_at(page_buf, page_aligned_size).await?;
- assert!(!buffer.append(buf.as_ref()));
- }
-
- Ok(Self {
- blob: Arc::new(RwLock::new(blob)),
- id: pool_ref.next_id().await,
- pool_ref,
- buffer: Arc::new(RwLock::new((buffer, size))),
- })
- }
-
- /// Append all bytes in `buf` to the tip of the blob.
- pub async fn append(&self, buf: impl Into<StableBuf> + Send) -> Result<(), Error> {
- // Prepare `buf` to be written.
- let buf = buf.into();
-
- // Acquire a write lock on the buffer and blob_size.
- let mut guard = self.buffer.write().await;
- let (buffer, _) = &mut *guard;
-
- // Ensure the write doesn't overflow.
- buffer
- .size()
- .checked_add(buf.len() as u64)
- .ok_or(Error::OffsetOverflow)?;
-
- if buffer.append(buf.as_ref()) {
- // Buffer is over capacity, flush it to the underlying blob.
- return self.flush_internal(guard).await;
- }
-
- Ok(())
- }
-
- /// Returns the current logical size of the blob including any buffered data.
- ///
- /// This represents the total size of data that would be present after flushing.
- #[allow(clippy::len_without_is_empty)]
- pub async fn size(&self) -> u64 {
- self.buffer.read().await.0.size()
- }
-
- /// Flush the append buffer to the underlying blob, caching each page worth of written data in
- /// the buffer pool.
- ///
- /// This method acquires the blob write lock before releasing the buffer lock, ensuring readers
- /// that need blob access will wait for the write to complete.
- async fn flush_internal(
- &self,
- mut buf_guard: crate::RwLockWriteGuard<'_, (Buffer, u64)>,
- ) -> Result<(), Error> {
- let (buffer, blob_size) = &mut *buf_guard;
-
- // Prepare the data to be written.
- let Some(buf) = self.prepare_flush_data(buffer, *blob_size).await else {
- return Ok(());
- };
-
- // Update blob_size *before* releasing the lock. We do this optimistically; if the write
- // fails below, the program will return an error and likely abort/panic, so maintaining
- // exact consistency on error isn't strictly required.
- let write_offset = *blob_size;
- *blob_size += buf.len() as u64;
-
- // Acquire blob write lock BEFORE releasing buffer lock. This ensures no reader can access
- // the blob until the write completes.
- let blob_guard = self.blob.write().await;
-
- // Release buffer lock, allowing concurrent buffered reads while the write is in progress.
- // Any attempts to read from the blob will block until the write completes.
- drop(buf_guard);
-
- // Perform the write while holding only blob lock.
- blob_guard.write_at(buf, write_offset).await
- }
-
- /// Prepares data from the buffer to be flushed to the blob.
- ///
- /// This method:
- /// 1. Takes the data from the write buffer.
- /// 2. Caches it in the buffer pool.
- /// 3. Returns the data to be written and the offset to write it at (if any).
- async fn prepare_flush_data(&self, buffer: &mut Buffer, blob_size: u64) -> Option<Vec<u8>> {
- // Take the buffered data, if any.
- let (mut buf, offset) = buffer.take()?;
-
- // Insert the flushed data into the buffer pool.
- let remaining = self.pool_ref.cache(self.id, &buf, offset).await;
-
- // If there's any data left over that doesn't constitute an entire page, re-buffer it into
- // the append buffer to maintain its page-boundary alignment.
- if remaining != 0 {
- buffer.offset -= remaining as u64;
- buffer.data.extend_from_slice(&buf[buf.len() - remaining..])
- }
-
- // Calculate where new data starts in the buffer to skip already-written trailing bytes.
- let new_data_start = blob_size.saturating_sub(offset) as usize;
-
- // Early exit if there's no new data to write.
- if new_data_start >= buf.len() {
- return None;
- }
-
- if new_data_start > 0 {
- buf.drain(0..new_data_start);
- }
-
- // Return the data to write, and the offset where to write it within the blob.
- Some(buf)
- }
-
- /// Clones and returns the underlying blob.
- pub async fn clone_blob(&self) -> B {
- self.blob.read().await.clone()
- }
-}
-
-impl<B: Blob> Blob for Append<B> {
- async fn read_at(
- &self,
- buf: impl Into<StableBuf> + Send,
- offset: u64,
- ) -> Result<StableBuf, Error> {
- // Prepare `buf` to capture the read data.
- let mut buf = buf.into();
-
- // Ensure the read doesn't overflow.
- let end_offset = offset
- .checked_add(buf.len() as u64)
- .ok_or(Error::OffsetOverflow)?;
-
- // Acquire a read lock on the buffer.
- let guard = self.buffer.read().await;
- let (buffer, _) = &*guard;
-
- // If the data required is beyond the size of the blob, return an error.
- if end_offset > buffer.size() {
- return Err(Error::BlobInsufficientLength);
- }
-
- // Extract any bytes from the buffer that overlap with the requested range.
- let remaining = buffer.extract(buf.as_mut(), offset);
-
- // Release buffer lock before potential I/O.
- drop(guard);
-
- if remaining == 0 {
- return Ok(buf);
- }
-
- // Fast path: try to read *only* from pool cache without acquiring blob lock. This allows
- // concurrent reads even while a flush is in progress.
- let cached = self
- .pool_ref
- .read_cached(self.id, &mut buf.as_mut()[..remaining], offset)
- .await;
-
- if cached == remaining {
- // All bytes found in cache.
- return Ok(buf);
- }
-
- // Slow path: cache miss (partial or full), acquire blob read lock to ensure any in-flight
- // write completes before we read from the blob.
- let blob_guard = self.blob.read().await;
-
- // Read remaining bytes that were not already obtained from the earlier cache read.
- let uncached_offset = offset + cached as u64;
- let uncached_len = remaining - cached;
- self.pool_ref
- .read(
- &*blob_guard,
- self.id,
- &mut buf.as_mut()[cached..cached + uncached_len],
- uncached_offset,
- )
- .await?;
-
- Ok(buf)
- }
-
- /// This [Blob] trait method is unimplemented by [Append] and unconditionally panics.
- async fn write_at(&self, _buf: impl Into<StableBuf> + Send, _offset: u64) -> Result<(), Error> {
- // TODO(): Extend the buffer pool to
- // support arbitrary writes.
- unimplemented!("append-only blob type does not support write_at")
- }
-
- async fn sync(&self) -> Result<(), Error> {
- // Flush any buffered data. When flush_internal returns, the write_at has completed
- // and data is in the OS buffer.
- {
- let guard = self.buffer.write().await;
- self.flush_internal(guard).await?;
- }
- // Sync the OS buffer to disk. We need the blob read lock here since sync() requires
- // access to the blob, but only a read lock since we're not modifying blob state.
- self.blob.read().await.sync().await
- }
-
- /// Resize the blob to the provided `size`.
- ///
- /// # Warning
- ///
- /// Concurrent readers which try to read past the new size during the resize may error.
- async fn resize(&self, size: u64) -> Result<(), Error> {
- // Implementation note: rewinding the blob across a page boundary potentially results in
- // stale data remaining in the buffer pool's cache. We don't proactively purge the data
- // within this function since it would be inaccessible anyway. Instead we ensure it is
- // always updated should the blob grow back to the point where we have new data for the same
- // page, if any old data hasn't expired naturally by then.
-
- // Acquire buffer lock first.
- // NOTE: We MUST acquire the buffer lock before the blob lock to avoid deadlocks with
- // `append`, which acquires buffer then blob (via `flush_internal`).
- let mut buf_guard = self.buffer.write().await;
- let (buffer, blob_size) = &mut *buf_guard;
-
- let flush_data = self.prepare_flush_data(buffer, *blob_size).await;
-
- // Acquire blob write lock to prevent concurrent reads throughout the resize.
- let blob_guard = self.blob.write().await;
-
- // Flush any buffered bytes first, using the helper.
- // We hold both locks here, so no concurrent operations can happen.
- if let Some(buf) = flush_data {
- // Write the data to the blob.
- let len = buf.len() as u64;
- blob_guard.write_at(buf, *blob_size).await?;
-
- // Update blob_size to reflect the flush.
- *blob_size += len;
- }
-
- // Resize the underlying blob.
- blob_guard.resize(size).await?;
-
- // Update the blob size.
- *blob_size = size;
-
- // Reset the append buffer to the new size, ensuring its page alignment.
- let leftover_size = size % self.pool_ref.page_size as u64;
- buffer.offset = size - leftover_size; // page aligned size
- buffer.data.clear();
- if leftover_size != 0 {
- let page_buf = vec![0; leftover_size as usize];
- let buf = blob_guard.read_at(page_buf, buffer.offset).await?;
- assert!(!buffer.append(buf.as_ref()));
- }
-
- Ok(())
- }
-}
-
-#[cfg(test)]
-mod tests {
- use super::*;
- use crate::{deterministic, Runner, Storage as _};
- use commonware_macros::test_traced;
- use commonware_utils::NZUsize;
-
- const PAGE_SIZE: usize = 1024;
- const BUFFER_SIZE: usize = PAGE_SIZE * 2;
-
- #[test_traced]
- #[should_panic(expected = "not implemented")]
- fn test_append_blob_write_panics() {
- // Initialize the deterministic context
- let executor = deterministic::Runner::default();
- // Start the test within the executor
- executor.start(|context| async move {
- let (blob, size) = context
- .open("test", "blob".as_bytes())
- .await
- .expect("Failed to open blob");
- let pool_ref = PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(10));
- let blob = Append::new(blob, size, NZUsize!(BUFFER_SIZE), pool_ref.clone())
- .await
- .unwrap();
- assert_eq!(blob.size().await, 0);
- blob.write_at(vec![0], 0).await.unwrap();
- });
- }
-
- #[test_traced]
- fn test_append_blob_append() {
- // Initialize the deterministic context
- let executor = deterministic::Runner::default();
- // Start the test within the executor
- executor.start(|context| async move {
- let (blob, size) = context
- .open("test", "blob".as_bytes())
- .await
- .expect("Failed to open blob");
- assert_eq!(size, 0);
-
- // Wrap the blob, then append 11 consecutive pages of data.
- let pool_ref = PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(10));
- let blob = Append::new(blob, size, NZUsize!(BUFFER_SIZE), pool_ref.clone())
- .await
- .unwrap();
- for i in 0..11 {
- let buf = vec![i as u8; PAGE_SIZE];
- blob.append(buf).await.unwrap();
- }
- assert_eq!(blob.size().await, 11 * PAGE_SIZE as u64);
-
- blob.sync().await.expect("Failed to sync blob");
-
- // Make sure blob has expected size when reopened.
- let (blob, size) = context
- .open("test", "blob".as_bytes())
- .await
- .expect("Failed to open blob");
- assert_eq!(size, 11 * PAGE_SIZE as u64);
- blob.sync().await.expect("Failed to sync blob");
- });
- }
-
- #[test_traced]
- fn test_append_blob_read() {
- // Initialize the deterministic context
- let executor = deterministic::Runner::default();
- // Start the test within the executor
- executor.start(|context| async move {
- let (blob, size) = context
- .open("test", "blob".as_bytes())
- .await
- .expect("Failed to open blob");
- assert_eq!(size, 0);
-
- let pool_ref = PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(10));
- let blob = Append::new(blob, size, NZUsize!(BUFFER_SIZE), pool_ref.clone())
- .await
- .unwrap();
-
- // Append one byte & sync to ensure we have "trailing bytes".
- blob.append(vec![42]).await.unwrap();
- blob.sync().await.unwrap();
-
- // Append 11 consecutive pages of data.
- for i in 0..11 {
- let buf = vec![i as u8; PAGE_SIZE];
- blob.append(buf).await.unwrap();
- }
-
- // Read from the blob across a page boundary but well outside any write buffered data.
- let mut buf = vec![0; 100];
- buf = blob
- .read_at(buf, 1 + PAGE_SIZE as u64 - 50)
- .await
- .unwrap()
- .into();
- let mut expected = vec![0; 50];
- expected.extend_from_slice(&[1; 50]);
- assert_eq!(buf, expected);
-
- // Read from the blob across a page boundary but within the write buffered data.
- let mut buf = vec![0; 100];
- buf = blob
- .read_at(buf, 1 + (PAGE_SIZE as u64 * 10) - 50)
- .await
- .unwrap()
- .into();
- let mut expected = vec![9; 50];
- expected.extend_from_slice(&[10; 50]);
- assert_eq!(buf, expected);
-
- // Read across read-only and write-buffered section, all the way up to the very last
- // byte.
- let buf_size = PAGE_SIZE * 4;
- let mut buf = vec![0; buf_size];
- buf = blob
- .read_at(buf, blob.size().await - buf_size as u64)
- .await
- .unwrap()
- .into();
- let mut expected = vec![7; PAGE_SIZE];
- expected.extend_from_slice(&[8; PAGE_SIZE]);
- expected.extend_from_slice(&[9; PAGE_SIZE]);
- expected.extend_from_slice(&[10; PAGE_SIZE]);
- assert_eq!(buf, expected);
-
- // Exercise more boundary conditions by reading every possible 2-byte slice.
- for i in 0..blob.size().await - 1 {
- let mut buf = vec![0; 2];
- buf = blob.read_at(buf, i).await.unwrap().into();
- let page_num = (i / PAGE_SIZE as u64) as u8;
- if i == 0 {
- assert_eq!(buf, &[42, 0]);
- } else if i % PAGE_SIZE as u64 == 0 {
- assert_eq!(buf, &[page_num - 1, page_num], "i = {i}");
- } else {
- assert_eq!(buf, &[page_num; 2], "i = {i}");
- }
- }
-
- // Confirm all bytes are as expected after syncing the blob.
- blob.sync().await.unwrap();
- buf = blob.read_at(vec![0], 0).await.unwrap().into();
- assert_eq!(buf, &[42]);
-
- for i in 0..11 {
- let mut buf = vec![0; PAGE_SIZE];
- buf = blob
- .read_at(buf, 1 + i * PAGE_SIZE as u64)
- .await
- .unwrap()
- .into();
- assert_eq!(buf, &[i as u8; PAGE_SIZE]);
- }
-
- blob.sync().await.expect("Failed to sync blob");
- });
- }
-
- #[test_traced]
- fn test_append_blob_tracks_physical_size() {
- let executor = deterministic::Runner::default();
- executor.start(|context| async move {
- let (blob, size) = context
- .open("test", "blob".as_bytes())
- .await
- .expect("Failed to open blob");
-
- let pool_ref = PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(10));
- let blob = Append::new(blob, size, NZUsize!(BUFFER_SIZE), pool_ref.clone())
- .await
- .unwrap();
-
- // Initially blob_size should be 0.
- assert_eq!(blob.buffer.read().await.1, 0);
-
- // Write 100 bytes and sync.
- blob.append(vec![1u8; 100]).await.unwrap();
- blob.sync().await.unwrap();
- assert_eq!(blob.buffer.read().await.1, 100);
-
- // Append more data but don't sync yet, blob_size shouldn't change.
- blob.append(vec![2u8; 200]).await.unwrap();
- assert_eq!(blob.buffer.read().await.1, 100);
-
- // Force a flush by exceeding buffer.
- blob.append(vec![3u8; BUFFER_SIZE]).await.unwrap();
- assert_eq!(blob.buffer.read().await.1, 100 + 200 + BUFFER_SIZE as u64);
-
- // Test resize down and up.
- blob.resize(50).await.unwrap();
- assert_eq!(blob.buffer.read().await.1, 50);
-
- blob.resize(150).await.unwrap();
- assert_eq!(blob.buffer.read().await.1, 150);
-
- // Append after resize and sync.
- blob.append(vec![4u8; 100]).await.unwrap();
- blob.sync().await.unwrap();
- assert_eq!(blob.buffer.read().await.1, 250);
-
- // Close and reopen.
- let (blob, size) = context
- .open("test", "blob".as_bytes())
- .await
- .expect("Failed to reopen blob");
-
- let blob = Append::new(blob, size, NZUsize!(BUFFER_SIZE), pool_ref.clone())
- .await
- .unwrap();
- assert_eq!(blob.buffer.read().await.1, 250);
-
- // Verify data integrity after all operations.
- let mut buf = vec![0u8; 250];
- buf = blob.read_at(buf, 0).await.unwrap().into();
- assert_eq!(&buf[0..50], &vec![1u8; 50][..]);
- assert_eq!(&buf[50..150], &vec![0u8; 100][..]); // Zeros from resize up to 150
- assert_eq!(&buf[150..250], &vec![4u8; 100][..]);
- });
- }
-}
diff --git a/runtime/src/utils/buffer/mod.rs b/runtime/src/utils/buffer/mod.rs
index 87083648b8..1f35507ed7 100644
--- a/runtime/src/utils/buffer/mod.rs
+++ b/runtime/src/utils/buffer/mod.rs
@@ -1,13 +1,11 @@
//! Buffers for reading and writing to [crate::Blob]s.
-mod append;
pub mod pool;
mod read;
mod tip;
mod write;
-pub use append::Append;
-pub use pool::{Pool, PoolRef};
+pub use pool::PoolRef;
pub use read::Read;
pub use write::Write;
diff --git a/runtime/src/utils/buffer/pool/append.rs b/runtime/src/utils/buffer/pool/append.rs
new file mode 100644
index 0000000000..ab1ec84687
--- /dev/null
+++ b/runtime/src/utils/buffer/pool/append.rs
@@ -0,0 +1,2086 @@
+//! The [Append] wrapper combines a [Blob] with a write buffer, providing a logical view over an
+//! underlying blob that is stored in a page-oriented format with integrity guarantees. The wrapper
+//! also provides read caching managed by a buffer pool.
+//!
+//! # Warning
+//!
+//! Writing new data to the blob can only be done through `append`. The `write` function is not
+//! supported and will panic.
+//!
+//! # Immutability
+//!
+//! The wrapper can be created in (or converted to) an immutable state, which will prevent any
+//! modifications while still supporting cached reads. This can be used to reduce its memory
+//! footprint and/or to prevent unintended modifications.
+//!
+//! # Recovery
+//!
+//! On `sync`, this wrapper will durably write buffered data to the underlying blob in pages. All
+//! pages have a [Checksum] at the end. If no CRC record existed before for the page being written,
+//! then one of the checksums will be all zero. If a checksum already existed for the page being
+//! written, then the write will overwrite only the checksum with the lesser length value. Should
+//! this write fail, the previously committed page state can still be recovered.
+//!
+//! During non-immutable blob initialization, the wrapper will back up over any page that is not
+//! accompanied by a valid CRC, treating it as the result of an incomplete write that may be
+//! invalid. Immutable blob initialization will fail if any trailing data is detected that cannot be
+//! validated by a CRC.
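+//!
+//! A rough sketch of the implied physical layout (the 6-byte slot split is inferred from the
+//! offsets used in `flush_internal`): each physical page stores `page_size` logical bytes followed
+//! by a CHECKSUM_SIZE-byte record containing two 6-byte checksum slots. When a previously synced
+//! partial page is extended, the slot protecting the prior state is left untouched so that state
+//! remains recoverable if the new write fails.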
+
+use crate::{
+ buffer::{
+ pool::{Checksum, PoolRef, Read, CHECKSUM_SIZE},
+ tip::Buffer,
+ },
+ Blob, Error, RwLock, RwLockWriteGuard,
+};
+use commonware_utils::StableBuf;
+use std::{num::NonZeroUsize, sync::Arc};
+use tracing::warn;
+
+/// Indicates which CRC slot in a page record must not be overwritten.
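+///
+/// For example (inferred from `flush_internal`): if the last sync left the partial page's valid
+/// CRC in the first slot, a later flush of that page must leave bytes
+/// `[page_size, page_size + 6)` untouched and write the new CRC record into the second slot, and
+/// vice versa.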
+#[derive(Clone, Copy)]
+enum ProtectedCrc {
+ First,
+ Second,
+}
+
+/// Describes the state of the underlying blob with respect to the buffer.
+#[derive(Clone)]
+struct BlobState<B: Blob> {
+ blob: B,
+
+ /// The page where the next appended byte will be written to.
+ current_page: u64,
+
+ /// The state of the partial page in the blob. If it was written due to a sync call, then this
+ /// will contain its CRC record.
+ partial_page_state: Option,
+}
+
+/// A [Blob] wrapper that supports write-cached appending of data, with checksums for data integrity
+/// and buffer pool managed caching.
+#[derive(Clone)]
+pub struct Append<B: Blob> {
+ /// The underlying blob being wrapped.
+ blob_state: Arc<RwLock<BlobState<B>>>,
+
+ /// Unique id assigned to this blob by the buffer pool.
+ id: u64,
+
+ /// A reference to the buffer pool that manages read caching for this blob.
+ pool_ref: PoolRef,
+
+ /// The write buffer containing any logical bytes following the last full page boundary in the
+ /// underlying blob.
+ buffer: Arc<RwLock<Buffer>>,
+}
+
+/// Returns the capacity with a floor applied to ensure it can hold at least one full page of new
+/// data even when caching a nearly-full page of already written data.
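+///
+/// For example, with a 1024-byte page a requested capacity of 100 is raised to the floor of
+/// 2 * 1024 = 2048.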
+fn capacity_with_floor(capacity: usize, page_size: u64) -> usize {
+ let floor = page_size as usize * 2;
+ if capacity < floor {
+ warn!(
+ floor,
+ "requested buffer capacity is too low, increasing it to floor"
+ );
+ floor
+ } else {
+ capacity
+ }
+}
+
+impl<B: Blob> Append<B> {
+ /// Create a new [Append] wrapper of the provided `blob` that is known to have
+ /// `original_blob_size` underlying physical bytes, using the provided `pool_ref` for read
+ /// caching and a write buffer with capacity `capacity`. Rewinds the blob if necessary to
+ /// ensure it only contains checksum-validated data.
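+ ///
+ /// # Example
+ ///
+ /// A minimal usage sketch (not compiled here; it mirrors the fuzz targets and assumes a runtime
+ /// `context` implementing [crate::Storage]):
+ ///
+ /// ```ignore
+ /// let pool_ref = PoolRef::new(NZU16!(1024), NZUsize!(10));
+ /// let (blob, size) = context.open("partition", b"blob").await?;
+ /// let append = Append::new(blob, size, 4096, pool_ref).await?;
+ /// append.append(b"hello").await?;
+ /// append.sync().await?;
+ /// assert_eq!(append.size().await, 5);
+ /// ```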
+ pub async fn new(
+ blob: B,
+ original_blob_size: u64,
+ capacity: usize,
+ pool_ref: PoolRef,
+ ) -> Result<Self, Error> {
+ let (partial_page_state, pages, invalid_data_found) =
+ Self::read_last_valid_page(&blob, original_blob_size, pool_ref.page_size()).await?;
+ if invalid_data_found {
+ // Invalid data was detected, trim it from the blob.
+ let new_blob_size = pages * (pool_ref.page_size() + CHECKSUM_SIZE);
+ warn!(
+ original_blob_size,
+ new_blob_size, "truncating blob to remove invalid data"
+ );
+ blob.resize(new_blob_size).await?;
+ blob.sync().await?;
+ }
+
+ let capacity = capacity_with_floor(capacity, pool_ref.page_size());
+
+ let (blob_state, data) = match partial_page_state {
+ Some((mut partial_page, crc_record)) => {
+ // A partial page exists, make sure we buffer it.
+ partial_page.reserve(capacity - partial_page.len());
+ (
+ BlobState {
+ blob,
+ current_page: pages - 1,
+ partial_page_state: Some(crc_record),
+ },
+ partial_page,
+ )
+ }
+ None => (
+ BlobState {
+ blob,
+ current_page: pages,
+ partial_page_state: None,
+ },
+ Vec::with_capacity(capacity),
+ ),
+ };
+
+ let buffer = Buffer {
+ offset: blob_state.current_page * pool_ref.page_size(),
+ data,
+ capacity,
+ immutable: false,
+ };
+
+ Ok(Self {
+ blob_state: Arc::new(RwLock::new(blob_state)),
+ id: pool_ref.next_id().await,
+ pool_ref,
+ buffer: Arc::new(RwLock::new(buffer)),
+ })
+ }
+
+ /// Return a new [Append] wrapper of the provided `blob` that is known to have `blob_size`
+ /// underlying physical bytes, using the provided `pool_ref` for read caching. The wrapper is
+ /// for read-only data, and any append attempt will return an error. The provided `capacity` is
+ /// used only if the blob is later converted to a mutable one. Immutable blobs are assumed
+ /// consistent on disk, so any CRC verification failure results in an error without any
+ /// recovery attempt.
+ pub async fn new_immutable(
+ blob: B,
+ blob_size: u64,
+ capacity: usize,
+ pool_ref: PoolRef,
+ ) -> Result<Self, Error> {
+ let (partial_page_state, pages, invalid_data_found) =
+ Self::read_last_valid_page(&blob, blob_size, pool_ref.page_size()).await?;
+ if invalid_data_found {
+ // Invalid data was detected, so this blob is not consistent.
+ return Err(Error::InvalidChecksum);
+ }
+
+ let capacity = capacity_with_floor(capacity, pool_ref.page_size());
+
+ let (blob_state, data) = match partial_page_state {
+ Some((mut partial_page, crc_record)) => {
+ // A partial page exists, so put it in the buffer.
+ partial_page.shrink_to_fit();
+ (
+ BlobState {
+ blob,
+ current_page: pages - 1,
+ partial_page_state: Some(crc_record),
+ },
+ partial_page,
+ )
+ }
+ None => (
+ BlobState {
+ blob,
+ current_page: pages,
+ partial_page_state: None,
+ },
+ vec![],
+ ),
+ };
+ let buffer = Buffer {
+ data,
+ capacity,
+ offset: blob_state.current_page * pool_ref.page_size(),
+ immutable: true,
+ };
+
+ Ok(Self {
+ blob_state: Arc::new(RwLock::new(blob_state)),
+ id: pool_ref.next_id().await,
+ pool_ref,
+ buffer: Arc::new(RwLock::new(buffer)),
+ })
+ }
+
+ /// Returns `true` if this blob is in the immutable state.
+ pub async fn is_immutable(&self) -> bool {
+ let buffer = self.buffer.read().await;
+
+ buffer.immutable
+ }
+
+ /// Convert this blob to the immutable state if it's not already in it.
+ ///
+ /// If there is unwritten data in the buffer, it will be flushed and synced before returning.
+ pub async fn to_immutable(&self) -> Result<(), Error> {
+ // Flush any buffered data. When flush_internal returns, write_at has completed and data
+ // has been written to the underlying blob.
+ let mut buf_guard = self.buffer.write().await;
+ if buf_guard.immutable {
+ return Ok(());
+ }
+ buf_guard.immutable = true;
+ self.flush_internal(buf_guard, true).await?;
+
+ // Shrink the buffer capacity to minimum since we won't be adding to it. This requires
+ // re-acquiring the write lock.
+ {
+ let mut buf_guard = self.buffer.write().await;
+ buf_guard.data.shrink_to_fit();
+ }
+
+ // Sync the underlying blob to ensure new_immutable on restart will succeed even in the
+ // event of a crash.
+ let blob_state = self.blob_state.read().await;
+ blob_state.blob.sync().await
+ }
+
+ /// Convert this blob to the mutable state if it's not already in it.
+ pub async fn to_mutable(&self) {
+ let mut buffer = self.buffer.write().await;
+ if !buffer.immutable {
+ return;
+ }
+ buffer.immutable = false;
+ }
+
+ /// Scans backwards from the end of the blob, stopping when it finds a valid page.
+ ///
+ /// # Returns
+ ///
+ /// A tuple of `(partial_page, page_count, invalid_data_found)`:
+ ///
+ /// - `partial_page`: If the last valid page is partial (contains fewer than `page_size` logical
+ /// bytes), returns `Some((data, crc_record))` containing the logical data and its CRC record.
+ /// Returns `None` if the last valid page is full or if no valid pages exist.
+ ///
+ /// - `page_count`: The number of pages in the blob up to and including the last valid page
+ /// found (whether or not it's partial). Note that it's possible earlier pages may be invalid
+ /// since this function stops scanning when it finds one valid page.
+ ///
+ /// - `invalid_data_found`: `true` if there are any bytes in the blob that follow the last valid
+ /// page. Typically the blob should be resized to eliminate them since their integrity cannot
+ /// be guaranteed.
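+ ///
+ /// For example (illustrative values): with a logical page size of 4 the physical page size is
+ /// 16, so a 35-byte blob ends with 3 trailing bytes that cannot hold a complete CRC record;
+ /// they are reported as invalid data and the backwards scan starts from the page ending at
+ /// offset 32.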
+ async fn read_last_valid_page(
+ blob: &B,
+ blob_size: u64,
+ page_size: u64,
+ ) -> Result<(Option<(Vec<u8>, Checksum)>, u64, bool), Error> {
+ let physical_page_size = page_size + CHECKSUM_SIZE;
+ let partial_bytes = blob_size % physical_page_size;
+ let mut last_page_end = blob_size - partial_bytes;
+
+ // If the last physical page in the blob is truncated, it can't have a valid CRC record and
+ // must be invalid.
+ let mut invalid_data_found = partial_bytes != 0;
+
+ while last_page_end != 0 {
+ // Read the last page and parse its CRC record.
+ let page_start = last_page_end - physical_page_size;
+ let buf = vec![0; physical_page_size as usize];
+ let buf = blob.read_at(buf, page_start).await?;
+
+ match Checksum::validate_page(buf.as_ref()) {
+ Some(crc_record) => {
+ // Found a valid page.
+ let (len, _) = crc_record.get_crc();
+ let len = len as u64;
+ if len != page_size {
+ // The page is partial (logical data doesn't fill the page).
+ let buf: Vec<u8> = buf.into();
+ let logical_bytes = buf[..(len as usize)].to_vec();
+ return Ok((
+ Some((logical_bytes, crc_record)),
+ last_page_end / physical_page_size,
+ invalid_data_found,
+ ));
+ }
+ // The page is full.
+ return Ok((None, last_page_end / physical_page_size, invalid_data_found));
+ }
+ None => {
+ // The page is invalid.
+ last_page_end = page_start;
+ invalid_data_found = true;
+ }
+ }
+ }
+
+ // No valid page exists in the blob.
+ Ok((None, 0, invalid_data_found))
+ }
+
+ /// Append all bytes in `buf` to the tip of the blob.
+ ///
+ /// # Errors
+ ///
+ /// * `Error::ImmutableBlob` - The blob is in the immutable state.
+ pub async fn append(&self, buf: &[u8]) -> Result<(), Error> {
+ let mut buffer = self.buffer.write().await;
+ if buffer.immutable {
+ return Err(Error::ImmutableBlob);
+ }
+
+ if !buffer.append(buf) {
+ return Ok(());
+ }
+
+ // Buffer is over capacity, so we need to write data to the blob.
+ self.flush_internal(buffer, false).await
+ }
+
+ /// Flush all full pages from the buffer to disk, resetting the buffer to contain only the bytes
+ /// in any final partial page. If `write_partial_page` is true, the partial page will be written
+ /// to the blob as well along with a CRC record.
+ async fn flush_internal(
+ &self,
+ mut buf_guard: RwLockWriteGuard<'_, Buffer>,
+ write_partial_page: bool,
+ ) -> Result<(), Error> {
+ let buffer = &mut *buf_guard;
+
+ // Cache the pages we are writing in the buffer pool so they remain cached for concurrent
+ // reads while we flush the buffer.
+ let remaining_byte_count = self
+ .pool_ref
+ .cache(self.id, &buffer.data, buffer.offset)
+ .await;
+
+ // Read the old partial page state before doing the heavy work of preparing physical pages.
+ // This is safe because partial_page_state is only modified by flush_internal, and we hold
+ // the buffer write lock which prevents concurrent flushes.
+ let old_partial_page_state = {
+ let blob_state = self.blob_state.read().await;
+ blob_state.partial_page_state.clone()
+ };
+
+ // Prepare the *physical* pages corresponding to the data in the buffer.
+ // Pass the old partial page state so the CRC record is constructed correctly.
+ let (physical_pages, partial_page_state) = self.to_physical_pages(
+ &*buffer,
+ write_partial_page,
+ old_partial_page_state.as_ref(),
+ );
+
+ // If there's nothing to write, return early.
+ if physical_pages.is_empty() {
+ return Ok(());
+ }
+
+ // Drain the provided buffer of the full pages that are now cached in the buffer pool and
+ // will be written to the blob.
+ let bytes_to_drain = buffer.data.len() - remaining_byte_count;
+ buffer.data.drain(0..bytes_to_drain);
+ buffer.offset += bytes_to_drain as u64;
+ let new_offset = buffer.offset;
+
+ // Acquire a write lock on the blob state so nobody tries to read or modify the blob while
+ // we're writing to it.
+ let mut blob_state = self.blob_state.write().await;
+
+ // Release the buffer lock to allow for concurrent reads & buffered writes while we write
+ // the physical pages.
+ drop(buf_guard);
+
+ let logical_page_size = self.pool_ref.page_size() as usize;
+ let physical_page_size = logical_page_size + CHECKSUM_SIZE as usize;
+ let write_at_offset = blob_state.current_page * physical_page_size as u64;
+
+ // Count only FULL pages for advancing current_page. A partial page (if included) takes
+ // up a full physical page on disk, but it's not complete - the next byte still goes to
+ // that same logical page.
+ let total_pages_in_buffer = physical_pages.len() / physical_page_size;
+ let full_pages_written = if partial_page_state.is_some() {
+ total_pages_in_buffer.saturating_sub(1)
+ } else {
+ total_pages_in_buffer
+ };
+
+ // Identify protected regions based on the OLD partial page state
+ let protected_regions = Self::identify_protected_regions(old_partial_page_state.as_ref());
+
+ // Update state before writing. This may appear to risk data loss if writes fail,
+ // but write failures are fatal per this codebase's design - callers must not use
+ // the blob after any mutable method returns an error.
+ blob_state.current_page += full_pages_written as u64;
+ blob_state.partial_page_state = partial_page_state;
+
+ // Make sure the buffer offset and underlying blob agree on the state of the tip.
+ assert_eq!(
+ blob_state.current_page * self.pool_ref.page_size(),
+ new_offset
+ );
+
+ // Write the physical pages to the blob.
+ // If there are protected regions in the first page, we need to write around them.
+ if let Some((prefix_len, protected_crc)) = protected_regions {
+ match protected_crc {
+ ProtectedCrc::First => {
+ // Protected CRC is first: [page_size..page_size+6]
+ // Write 1: New data in first page [prefix_len..page_size]
+ if prefix_len < logical_page_size {
+ blob_state
+ .blob
+ .write_at(
+ physical_pages[prefix_len..logical_page_size].to_vec(),
+ write_at_offset + prefix_len as u64,
+ )
+ .await?;
+ }
+ // Write 2: Second CRC of first page + all remaining pages [page_size+6..end]
+ let second_crc_start = logical_page_size + 6;
+ blob_state
+ .blob
+ .write_at(
+ physical_pages[second_crc_start..].to_vec(),
+ write_at_offset + second_crc_start as u64,
+ )
+ .await?;
+ }
+ ProtectedCrc::Second => {
+ // Protected CRC is second: [page_size+6..page_size+12]
+ // Write 1: New data + first CRC of first page [prefix_len..page_size+6]
+ let first_crc_end = logical_page_size + 6;
+ if prefix_len < first_crc_end {
+ blob_state
+ .blob
+ .write_at(
+ physical_pages[prefix_len..first_crc_end].to_vec(),
+ write_at_offset + prefix_len as u64,
+ )
+ .await?;
+ }
+ // Write 2: All remaining pages (if any) [physical_page_size..end]
+ if physical_pages.len() > physical_page_size {
+ blob_state
+ .blob
+ .write_at(
+ physical_pages[physical_page_size..].to_vec(),
+ write_at_offset + physical_page_size as u64,
+ )
+ .await?;
+ }
+ }
+ }
+ } else {
+ // No protected regions, write everything in one operation
+ blob_state
+ .blob
+ .write_at(physical_pages, write_at_offset)
+ .await?;
+ }
+
+ Ok(())
+ }
+
+ /// Returns the logical size of the blob. This accounts for both written and buffered data.
+ pub async fn size(&self) -> u64 {
+ let buffer = self.buffer.read().await;
+ buffer.size()
+ }
+
+ /// Reads up to `buf.len()` bytes starting at `logical_offset`, but only as many as are
+ /// available.
+ ///
+ /// This is useful for reading variable-length prefixes (like varints) where you want to read
+ /// up to a maximum number of bytes but the actual data might be shorter.
+ ///
+ /// Returns the number of bytes actually read into the buffer. Returns an error if no bytes
+ /// are available at the given offset.
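+ ///
+ /// A usage sketch (illustrative only; `offset` is a hypothetical variable): reading up to 10
+ /// bytes near the tip, where fewer bytes may remain:
+ ///
+ /// ```ignore
+ /// let (buf, n) = append.read_up_to(vec![0u8; 10], offset).await?;
+ /// let prefix = &buf.as_ref()[..n]; // only the first n bytes are valid
+ /// ```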
+ pub async fn read_up_to(
+ &self,
+ buf: impl Into<StableBuf> + Send,
+ logical_offset: u64,
+ ) -> Result<(StableBuf, usize), Error> {
+ let mut buf = buf.into();
+ if buf.is_empty() {
+ return Ok((buf, 0));
+ }
+ let blob_size = self.size().await;
+ let available = (blob_size.saturating_sub(logical_offset) as usize).min(buf.len());
+ if available == 0 {
+ return Err(Error::BlobInsufficientLength);
+ }
+ if buf.len() > available {
+ buf.truncate(available);
+ }
+ self.read_into(buf.as_mut(), logical_offset).await?;
+
+ Ok((buf, available))
+ }
+
+ /// Reads bytes starting at `logical_offset` into `buf`.
+ ///
+ /// This method allows reading directly into a mutable slice without taking ownership of the
+ /// buffer or requiring a specific buffer type.
+ pub async fn read_into(&self, buf: &mut [u8], logical_offset: u64) -> Result<(), Error> {
+ // Ensure the read doesn't overflow.
+ let end_offset = logical_offset
+ .checked_add(buf.len() as u64)
+ .ok_or(Error::OffsetOverflow)?;
+
+ // Acquire a read lock on the buffer.
+ let buffer = self.buffer.read().await;
+
+ // If the data required is beyond the size of the blob, return an error.
+ if end_offset > buffer.size() {
+ return Err(Error::BlobInsufficientLength);
+ }
+
+ // Extract any bytes from the buffer that overlap with the requested range.
+ let remaining = buffer.extract(buf.as_mut(), logical_offset);
+
+ // Release buffer lock before potential I/O.
+ drop(buffer);
+
+ if remaining == 0 {
+ return Ok(());
+ }
+
+ // Fast path: try to read *only* from pool cache without acquiring blob lock. This allows
+ // concurrent reads even while a flush is in progress.
+ let cached = self
+ .pool_ref
+ .read_cached(self.id, &mut buf[..remaining], logical_offset)
+ .await;
+
+ if cached == remaining {
+ // All bytes found in cache.
+ return Ok(());
+ }
+
+ // Slow path: cache miss (partial or full), acquire blob read lock to ensure any in-flight
+ // write completes before we read from the blob.
+ let blob_guard = self.blob_state.read().await;
+
+ // Read remaining bytes that were not already obtained from the earlier cache read.
+ let uncached_offset = logical_offset + cached as u64;
+ let uncached_len = remaining - cached;
+ self.pool_ref
+ .read(
+ &blob_guard.blob,
+ self.id,
+ &mut buf[cached..cached + uncached_len],
+ uncached_offset,
+ )
+ .await
+ }
+
+ /// Returns the protected region info for a partial page, if any.
+ ///
+ /// # Returns
+ ///
+ /// `None` if there's no existing partial page.
+ ///
+ /// `Some((prefix_len, protected_crc))` where:
+ /// - `prefix_len`: bytes `[0..prefix_len]` were already written and can be substituted with
+ /// zeros (skip writing)
+ /// - `protected_crc`: which CRC slot must not be overwritten
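+ ///
+ /// A sketch of the expected behavior (field values for illustration only):
+ ///
+ /// ```ignore
+ /// let record = Checksum { len1: 50, crc1: 0xAAAA_AAAA, len2: 30, crc2: 0xBBBB_BBBB };
+ /// let (prefix_len, protected) = Self::identify_protected_regions(Some(&record)).unwrap();
+ /// assert_eq!(prefix_len, 50);
+ /// assert!(matches!(protected, ProtectedCrc::First)); // slot 0 holds the larger length
+ /// ```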
+ fn identify_protected_regions(
+ partial_page_state: Option<&Checksum>,
+ ) -> Option<(usize, ProtectedCrc)> {
+ let crc_record = partial_page_state?;
+ let (old_len, _) = crc_record.get_crc();
+ // The protected CRC is the one with the larger (authoritative) length.
+ let protected_crc = if crc_record.len1 >= crc_record.len2 {
+ ProtectedCrc::First
+ } else {
+ ProtectedCrc::Second
+ };
+ Some((old_len as usize, protected_crc))
+ }
+
+ /// Prepare a buffer containing the result of converting each buffered logical page in the input
+ /// into a physical page (meaning each page has a CRC record). If the last page is not yet full,
+ /// it will be included only if `include_partial_page` is true.
+ ///
+ /// # Arguments
+ ///
+ /// * `buffer` - The buffer containing logical page data
+ /// * `include_partial_page` - Whether to include a partial page if one exists
+ /// * `old_crc_record` - The CRC record from a previously committed partial page, if any.
+ /// When present, the first page's CRC record will preserve the old CRC in its original slot
+ /// and place the new CRC in the other slot.
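+ ///
+ /// For example, with a 103-byte logical page and a buffer holding 250 bytes, this produces two
+ /// full 115-byte physical pages (206 logical bytes); with `include_partial_page` set, it also
+ /// emits a third, zero-padded physical page whose CRC record stores a length of 44.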
+ fn to_physical_pages(
+ &self,
+ buffer: &Buffer,
+ include_partial_page: bool,
+ old_crc_record: Option<&Checksum>,
+ ) -> (Vec<u8>, Option<Checksum>) {
+ let logical_page_size = self.pool_ref.page_size() as usize;
+ let physical_page_size = logical_page_size + CHECKSUM_SIZE as usize;
+ let pages_to_write = buffer.data.len() / logical_page_size;
+ let mut write_buffer = Vec::with_capacity(pages_to_write * physical_page_size);
+
+ // For each logical page, copy over the data and then write a crc record for it.
+ for page in 0..pages_to_write {
+ let start_read_idx = page * logical_page_size;
+ let end_read_idx = start_read_idx + logical_page_size;
+ let logical_page = &buffer.data[start_read_idx..end_read_idx];
+ write_buffer.extend_from_slice(logical_page);
+
+ let crc = crc32fast::hash(logical_page);
+ let logical_page_size_u16 =
+ u16::try_from(logical_page_size).expect("page size must fit in u16 for CRC record");
+
+ // For the first page, if there's an old partial page CRC, construct the record
+ // to preserve the old CRC in its original slot.
+ let crc_record = if let (0, Some(old_crc)) = (page, old_crc_record) {
+ Self::build_crc_record_preserving_old(logical_page_size_u16, crc, old_crc)
+ } else {
+ Checksum::new(logical_page_size_u16, crc)
+ };
+ write_buffer.extend_from_slice(&crc_record.to_bytes());
+ }
+
+ if !include_partial_page {
+ return (write_buffer, None);
+ }
+
+ let partial_page = &buffer.data[pages_to_write * logical_page_size..];
+ if partial_page.is_empty() {
+ // No partial page data to write.
+ return (write_buffer, None);
+ }
+
+ // If there are no full pages and the partial page length matches what was already
+ // written, there's nothing new to write.
+ if pages_to_write == 0 {
+ if let Some(old_crc) = old_crc_record {
+ let (old_len, _) = old_crc.get_crc();
+ if partial_page.len() == old_len as usize {
+ return (write_buffer, None);
+ }
+ }
+ }
+ write_buffer.extend_from_slice(partial_page);
+ let partial_len = partial_page.len();
+ let crc = crc32fast::hash(partial_page);
+
+ // Pad with zeros to fill up to logical_page_size.
+ write_buffer.resize(write_buffer.len() + (logical_page_size - partial_len), 0);
+
+ // For partial pages: if this is the first page and there's an old CRC, preserve it.
+ // Otherwise just use the new CRC in slot 0.
+ let crc_record = if let (0, Some(old_crc)) = (pages_to_write, old_crc_record) {
+ Self::build_crc_record_preserving_old(partial_len as u16, crc, old_crc)
+ } else {
+ Checksum::new(partial_len as u16, crc)
+ };
+
+ write_buffer.extend_from_slice(&crc_record.to_bytes());
+
+ // Return the CRC record that matches what we wrote to disk, so that future flushes
+ // correctly identify which slot is protected.
+ (write_buffer, Some(crc_record))
+ }
+
+ /// Build a CRC record that preserves the old CRC in its original slot and places
+ /// the new CRC in the other slot.
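+ ///
+ /// A sketch (values for illustration only):
+ ///
+ /// ```ignore
+ /// let old = Checksum { len1: 10, crc1: 0x1111_1111, len2: 0, crc2: 0 };
+ /// // The old CRC stays in slot 0; the new 30-byte CRC lands in slot 1.
+ /// let rec = Self::build_crc_record_preserving_old(30, 0x2222_2222, &old);
+ /// assert_eq!((rec.len1, rec.len2), (10, 30));
+ /// ```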
+ const fn build_crc_record_preserving_old(
+ new_len: u16,
+ new_crc: u32,
+ old_crc: &Checksum,
+ ) -> Checksum {
+ let (old_len, old_crc_val) = old_crc.get_crc();
+ // The old CRC is in the slot with the larger length value (first slot wins ties).
+ if old_crc.len1 >= old_crc.len2 {
+ // Old CRC is in slot 0, put new CRC in slot 1
+ Checksum {
+ len1: old_len,
+ crc1: old_crc_val,
+ len2: new_len,
+ crc2: new_crc,
+ }
+ } else {
+ // Old CRC is in slot 1, put new CRC in slot 0
+ Checksum {
+ len1: new_len,
+ crc1: new_crc,
+ len2: old_len,
+ crc2: old_crc_val,
+ }
+ }
+ }
+
+ /// Flushes any buffered data, then returns a [Read] wrapper for the underlying blob.
+ ///
+ /// The returned reader can be used to sequentially read all data from the blob while ensuring
+ /// all data passes integrity verification.
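+ ///
+ /// Usage sketch (illustrative; see `test_non_last_page_rejects_partial_fallback` below for a
+ /// concrete call):
+ ///
+ /// ```ignore
+ /// let mut reader = append.as_blob_reader(NZUsize!(1024)).await?;
+ /// let chunk = reader.read_up_to(vec![0u8; 1024]).await?; // verified against page CRCs
+ /// ```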
+ pub async fn as_blob_reader(&self, capacity: NonZeroUsize) -> Result<Read<B>, Error> {
+ let logical_page_size = self.pool_ref.page_size();
+ let logical_page_size_nz =
+ NonZeroUsize::new(logical_page_size as usize).expect("page_size is non-zero");
+
+ // Flush any buffered data (without fsync) so the Read wrapper sees all written data.
+ // We don't need fsync here since we just want to ensure data has been written to the
+ // underlying blob, not durably persisted.
+ {
+ let buf_guard = self.buffer.write().await;
+ if !buf_guard.immutable {
+ self.flush_internal(buf_guard, true).await?;
+ }
+ }
+
+ let physical_page_size = logical_page_size + CHECKSUM_SIZE;
+ let blob_guard = self.blob_state.read().await;
+
+ // Compute both physical and logical blob sizes.
+ let (physical_blob_size, logical_blob_size) =
+ blob_guard.partial_page_state.as_ref().map_or_else(
+ || {
+ // All pages are full.
+ let physical = physical_page_size * blob_guard.current_page;
+ let logical = logical_page_size * blob_guard.current_page;
+ (physical, logical)
+ },
+ |crc_record| {
+ // There's a partial page with a checksum.
+ let (partial_len, _) = crc_record.get_crc();
+ let partial_len = partial_len as u64;
+ // Physical: all pages including the partial one (which is padded to full size).
+ let physical = physical_page_size * (blob_guard.current_page + 1);
+ // Logical: full pages before this + partial page's actual data length.
+ let logical = logical_page_size * blob_guard.current_page + partial_len;
+ (physical, logical)
+ },
+ );
+
+ Ok(Read::new(
+ blob_guard.blob.clone(),
+ physical_blob_size,
+ logical_blob_size,
+ capacity,
+ logical_page_size_nz,
+ ))
+ }
+}
+
+impl<B: Blob> Blob for Append<B> {
+ async fn read_at(
+ &self,
+ buf: impl Into<StableBuf> + Send,
+ logical_offset: u64,
+ ) -> Result<StableBuf, Error> {
+ let mut buf = buf.into();
+ self.read_into(buf.as_mut(), logical_offset).await?;
+ Ok(buf)
+ }
+
+ async fn sync(&self) -> Result<(), Error> {
+ // Flush any buffered data, including any partial page. When flush_internal returns,
+ // write_at has completed and data has been written to the underlying blob.
+ let buf_guard = self.buffer.write().await;
+ if buf_guard.immutable {
+ return Ok(());
+ }
+ self.flush_internal(buf_guard, true).await?;
+
+ // Sync the underlying blob. We need the blob read lock here since sync() requires access
+ // to the blob, but only a read lock since we're not modifying blob state.
+ let blob_state = self.blob_state.read().await;
+ blob_state.blob.sync().await
+ }
+
+ /// This [Blob] trait method is unimplemented by [Append] and unconditionally panics.
+ async fn write_at(&self, _buf: impl Into<StableBuf> + Send, _offset: u64) -> Result<(), Error> {
+ // TODO(): Extend the buffer pool to
+ // support arbitrary writes.
+ unimplemented!("append-only blob type does not support write_at")
+ }
+
+ /// Resize the blob to the provided logical `size`.
+ ///
+ /// This truncates the blob to contain only `size` logical bytes. The physical blob size will
+ /// be adjusted to include the necessary CRC records for the remaining pages.
+ ///
+ /// # Warning
+ ///
+ /// - Concurrent mutable operations (append, resize) are not supported and will cause data loss.
+ /// - Concurrent readers which try to read past the new size during the resize may error.
+ /// - The resize is not guaranteed durable until the next sync.
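+ ///
+ /// For example, with a hypothetical 103-byte logical page and the 12-byte CRC record,
+ /// `resize(150)` keeps one full page plus a 47-byte partial page, so the underlying blob is
+ /// resized to `2 * (103 + 12) = 230` physical bytes.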
+ async fn resize(&self, size: u64) -> Result<(), Error> {
+ let current_size = self.size().await;
+
+ // Handle growing by appending zero bytes.
+ if size > current_size {
+ let zeros_needed = (size - current_size) as usize;
+ let zeros = vec![0u8; zeros_needed];
+ self.append(&zeros).await?;
+ return Ok(());
+ }
+
+ // Implementation note: rewinding the blob across a page boundary can leave stale data for the
+ // rewound pages in the buffer pool's cache. We don't proactively purge that data here since it
+ // is unreachable anyway; instead, we ensure the cache is updated if the blob later grows back
+ // over the same page before any stale entry expires naturally.
+
+ let logical_page_size = self.pool_ref.page_size();
+ let physical_page_size = logical_page_size + CHECKSUM_SIZE;
+
+ // Flush any buffered data first to ensure we have a consistent state on disk.
+ self.sync().await?;
+
+ // Acquire both locks to prevent concurrent operations.
+ let mut buf_guard = self.buffer.write().await;
+ if buf_guard.immutable {
+ return Err(Error::ImmutableBlob);
+ }
+ let mut blob_guard = self.blob_state.write().await;
+
+ // Calculate the physical size needed for the new logical size.
+ let full_pages = size / logical_page_size;
+ let partial_bytes = size % logical_page_size;
+ let new_physical_size = if partial_bytes > 0 {
+ // We need full_pages + 1 physical pages to hold the partial data.
+ // The partial page will be padded to full physical page size.
+ (full_pages + 1) * physical_page_size
+ } else {
+ // No partial page needed.
+ full_pages * physical_page_size
+ };
+
+ // Resize the underlying blob.
+ blob_guard.blob.resize(new_physical_size).await?;
+ blob_guard.partial_page_state = None;
+
+ // Update blob state and buffer based on the desired logical size. The partial page data is
+ // read with CRC validation; the validated length may exceed partial_bytes (reflecting the
+ // old data length), but we only load the prefix we need. The next sync will write the
+ // correct CRC for the new length.
+ //
+ // Note: This updates state before validation completes, which could leave state
+ // inconsistent if validation fails. This is acceptable because failures from mutable
+ // methods are fatal - callers must not use the blob after any error.
+
+ blob_guard.current_page = full_pages;
+ buf_guard.offset = full_pages * logical_page_size;
+
+ if partial_bytes > 0 {
+ // There's a partial page. Read its data from disk with CRC validation.
+ let page_data =
+ super::get_page_from_blob(&blob_guard.blob, full_pages, logical_page_size).await?;
+
+ // Ensure the validated data covers what we need.
+ if (page_data.len() as u64) < partial_bytes {
+ return Err(Error::InvalidChecksum);
+ }
+
+ buf_guard.data = page_data.as_ref()[..partial_bytes as usize].to_vec();
+ } else {
+ // No partial page - all pages are full or blob is empty.
+ buf_guard.data = vec![];
+ }
+
+ Ok(())
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use crate::{deterministic, Runner as _, Storage as _};
+ use commonware_codec::ReadExt;
+ use commonware_macros::test_traced;
+ use commonware_utils::{NZUsize, NZU16};
+ use std::num::NonZeroU16;
+
+ const PAGE_SIZE: NonZeroU16 = NZU16!(103); // janky size to ensure we test page alignment
+ const BUFFER_SIZE: usize = PAGE_SIZE.get() as usize * 2;
+
+ #[test_traced("DEBUG")]
+ fn test_append_crc_empty() {
+ let executor = deterministic::Runner::default();
+ executor.start(|context: deterministic::Context| async move {
+ // Open a new blob.
+ let (blob, blob_size) = context.open("test_partition", b"test_blob").await.unwrap();
+ assert_eq!(blob_size, 0);
+
+ // Create a buffer pool reference.
+ let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(BUFFER_SIZE));
+
+ // Create an Append wrapper.
+ let append = Append::new(blob, blob_size, BUFFER_SIZE, pool_ref.clone())
+ .await
+ .unwrap();
+
+ // Verify initial size is 0.
+ assert_eq!(append.size().await, 0);
+
+ // Close & re-open.
+ append.sync().await.unwrap();
+ drop(append);
+
+ let (blob, blob_size) = context.open("test_partition", b"test_blob").await.unwrap();
+ assert_eq!(blob_size, 0); // There was no need to write a crc since there was no data.
+
+ let append = Append::new(blob, blob_size, BUFFER_SIZE, pool_ref.clone())
+ .await
+ .unwrap();
+
+ assert_eq!(append.size().await, 0);
+ });
+ }
+
+ #[test_traced("DEBUG")]
+ fn test_append_crc_basic() {
+ let executor = deterministic::Runner::default();
+ executor.start(|context: deterministic::Context| async move {
+ // Open a new blob.
+ let (blob, blob_size) = context.open("test_partition", b"test_blob").await.unwrap();
+ assert_eq!(blob_size, 0);
+
+ // Create a buffer pool reference.
+ let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(BUFFER_SIZE));
+
+ // Create an Append wrapper.
+ let append = Append::new(blob, blob_size, BUFFER_SIZE, pool_ref.clone())
+ .await
+ .unwrap();
+
+ // Verify initial size is 0.
+ assert_eq!(append.size().await, 0);
+
+ // Append some bytes.
+ let data = vec![1, 2, 3, 4, 5];
+ append.append(&data).await.unwrap();
+
+ // Verify size reflects appended data.
+ assert_eq!(append.size().await, 5);
+
+ // Append more bytes.
+ let more_data = vec![6, 7, 8, 9, 10];
+ append.append(&more_data).await.unwrap();
+
+ // Verify size is cumulative.
+ assert_eq!(append.size().await, 10);
+
+ // Read back the first chunk and verify.
+ let read_buf = vec![0u8; 5];
+ let read_buf = append.read_at(read_buf, 0).await.unwrap();
+ assert_eq!(read_buf.as_ref(), &data[..]);
+
+ // Read back the second chunk and verify.
+ let read_buf = vec![0u8; 5];
+ let read_buf = append.read_at(read_buf, 5).await.unwrap();
+ assert_eq!(read_buf.as_ref(), &more_data[..]);
+
+ // Read all data at once and verify.
+ let read_buf = vec![0u8; 10];
+ let read_buf = append.read_at(read_buf, 0).await.unwrap();
+ assert_eq!(read_buf.as_ref(), &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
+
+ // Close and reopen the blob and make sure the data is still there and the trailing
+ // checksum is written & stripped as expected.
+ append.sync().await.unwrap();
+ drop(append);
+
+ let (blob, blob_size) = context.open("test_partition", b"test_blob").await.unwrap();
+ // Physical page = 103 logical + 12 Checksum = 115 bytes (padded partial page)
+ assert_eq!(blob_size, 115);
+ let append = Append::new(blob, blob_size, BUFFER_SIZE, pool_ref.clone())
+ .await
+ .unwrap();
+ assert_eq!(append.size().await, 10); // CRC should be stripped after verification
+
+ // Append data that spans a page boundary.
+ // PAGE_SIZE=103 is the logical page size. We have 10 bytes, so writing
+ // 100 more bytes (total 110) will cross the page boundary at byte 103.
+ let spanning_data: Vec<u8> = (11..=110).collect();
+ append.append(&spanning_data).await.unwrap();
+ assert_eq!(append.size().await, 110);
+
+ // Read back data that spans the page boundary.
+ let read_buf = vec![0u8; 100];
+ let read_buf = append.read_at(read_buf, 10).await.unwrap();
+ assert_eq!(read_buf.as_ref(), &spanning_data[..]);
+
+ // Read all 110 bytes at once.
+ let read_buf = vec![0u8; 110];
+ let read_buf = append.read_at(read_buf, 0).await.unwrap();
+ let expected: Vec<u8> = (1..=110).collect();
+ assert_eq!(read_buf.as_ref(), &expected[..]);
+
+ // Drop and re-open and make sure bytes are still there.
+ append.sync().await.unwrap();
+ drop(append);
+
+ let (blob, blob_size) = context.open("test_partition", b"test_blob").await.unwrap();
+ // 2 physical pages: 2 * 115 = 230 bytes
+ assert_eq!(blob_size, 230);
+ let append = Append::new(blob, blob_size, BUFFER_SIZE, pool_ref.clone())
+ .await
+ .unwrap();
+ assert_eq!(append.size().await, 110);
+
+ // Append data to reach exactly a page boundary.
+ // Logical page size is 103. We have 110 bytes, next boundary is 206 (103 * 2).
+ // So we need 96 more bytes.
+ let boundary_data: Vec<u8> = (111..=206).collect();
+ assert_eq!(boundary_data.len(), 96);
+ append.append(&boundary_data).await.unwrap();
+ assert_eq!(append.size().await, 206);
+
+ // Verify we can read it back.
+ let read_buf = vec![0u8; 206];
+ let read_buf = append.read_at(read_buf, 0).await.unwrap();
+ let expected: Vec<u8> = (1..=206).collect();
+ assert_eq!(read_buf.as_ref(), &expected[..]);
+
+ // Drop and re-open at the page boundary.
+ append.sync().await.unwrap();
+ drop(append);
+
+ let (blob, blob_size) = context.open("test_partition", b"test_blob").await.unwrap();
+ // Physical size should be exactly 2 pages: 115 * 2 = 230 bytes
+ assert_eq!(blob_size, 230);
+ let append = Append::new(blob, blob_size, BUFFER_SIZE, pool_ref)
+ .await
+ .unwrap();
+ assert_eq!(append.size().await, 206);
+
+ // Verify data is still readable after reopen.
+ let read_buf = vec![0u8; 206];
+ let read_buf = append.read_at(read_buf, 0).await.unwrap();
+ assert_eq!(read_buf.as_ref(), &expected[..]);
+ });
+ }
+
+ /// Helper to read the CRC record from raw blob bytes at the end of a physical page.
+ fn read_crc_record_from_page(page_bytes: &[u8]) -> Checksum {
+ let crc_start = page_bytes.len() - CHECKSUM_SIZE as usize;
+ Checksum::read(&mut &page_bytes[crc_start..]).unwrap()
+ }
+
+ /// Dummy marker bytes with len=0 so the mangled slot is never authoritative.
+ /// Format: [len_hi=0, len_lo=0, 0xDE, 0xAD, 0xBE, 0xEF]
+ const DUMMY_MARKER: [u8; 6] = [0x00, 0x00, 0xDE, 0xAD, 0xBE, 0xEF];
+
+ #[test]
+ fn test_identify_protected_regions_equal_lengths() {
+ // When lengths are equal, the first CRC should be protected (tie-breaking rule).
+ let record = Checksum {
+ len1: 50,
+ crc1: 0xAAAAAAAA,
+ len2: 50,
+ crc2: 0xBBBBBBBB,
+ };
+
+ let result =
+ Append::::identify_protected_regions(Some(&record));
+ assert!(result.is_some());
+ let (prefix_len, protected_crc) = result.unwrap();
+ assert_eq!(prefix_len, 50);
+ assert!(
+ matches!(protected_crc, ProtectedCrc::First),
+ "First CRC should be protected when lengths are equal"
+ );
+ }
+
+ #[test]
+ fn test_identify_protected_regions_len1_larger() {
+ // When len1 > len2, the first CRC should be protected.
+ let record = Checksum {
+ len1: 100,
+ crc1: 0xAAAAAAAA,
+ len2: 50,
+ crc2: 0xBBBBBBBB,
+ };
+
+ let result =
+ Append::::identify_protected_regions(Some(&record));
+ assert!(result.is_some());
+ let (prefix_len, protected_crc) = result.unwrap();
+ assert_eq!(prefix_len, 100);
+ assert!(
+ matches!(protected_crc, ProtectedCrc::First),
+ "First CRC should be protected when len1 > len2"
+ );
+ }
+
+ #[test]
+ fn test_identify_protected_regions_len2_larger() {
+ // When len2 > len1, the second CRC should be protected.
+ let record = Checksum {
+ len1: 50,
+ crc1: 0xAAAAAAAA,
+ len2: 100,
+ crc2: 0xBBBBBBBB,
+ };
+
+ let result =
+ Append::::identify_protected_regions(Some(&record));
+ assert!(result.is_some());
+ let (prefix_len, protected_crc) = result.unwrap();
+ assert_eq!(prefix_len, 100);
+ assert!(
+ matches!(protected_crc, ProtectedCrc::Second),
+ "Second CRC should be protected when len2 > len1"
+ );
+ }
+
+ /// Test that slot 1 is NOT overwritten when it's the protected slot.
+ ///
+ /// Strategy: After extending twice (so slot 1 becomes authoritative with larger len),
+ /// mangle the non-authoritative slot 0. Then extend again - slot 0 should be overwritten
+ /// with the new CRC, while slot 1 (protected) should remain untouched.
+ #[test_traced("DEBUG")]
+ fn test_crc_slot1_protected() {
+ let executor = deterministic::Runner::default();
+ executor.start(|context: deterministic::Context| async move {
+ let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(BUFFER_SIZE));
+ let physical_page_size = PAGE_SIZE.get() as usize + CHECKSUM_SIZE as usize;
+ let slot0_offset = PAGE_SIZE.get() as u64;
+ let slot1_offset = PAGE_SIZE.get() as u64 + 6;
+
+ // === Step 1: Write 10 bytes → slot 0 authoritative (len=10) ===
+ let (blob, _) = context.open("test_partition", b"slot1_prot").await.unwrap();
+ let append = Append::new(blob, 0, BUFFER_SIZE, pool_ref.clone())
+ .await
+ .unwrap();
+ append.append(&(1..=10).collect::<Vec<u8>>()).await.unwrap();
+ append.sync().await.unwrap();
+ drop(append);
+
+ // === Step 2: Extend to 30 bytes → slot 1 authoritative (len=30) ===
+ let (blob, size) = context.open("test_partition", b"slot1_prot").await.unwrap();
+ let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone())
+ .await
+ .unwrap();
+ append
+ .append(&(11..=30).collect::<Vec<u8>>())
+ .await
+ .unwrap();
+ append.sync().await.unwrap();
+ drop(append);
+
+ // Verify slot 1 is now authoritative
+ let (blob, size) = context.open("test_partition", b"slot1_prot").await.unwrap();
+ let page = blob
+ .read_at(vec![0u8; physical_page_size], 0)
+ .await
+ .unwrap();
+ let crc = read_crc_record_from_page(page.as_ref());
+ assert!(
+ crc.len2 > crc.len1,
+ "Slot 1 should be authoritative (len2={} > len1={})",
+ crc.len2,
+ crc.len1
+ );
+
+ // Capture slot 1 bytes before mangling slot 0
+ let slot1_before: Vec<u8> = blob
+ .read_at(vec![0u8; 6], slot1_offset)
+ .await
+ .unwrap()
+ .into();
+
+ // === Step 3: Mangle slot 0 (non-authoritative) ===
+ blob.write_at(DUMMY_MARKER.to_vec(), slot0_offset)
+ .await
+ .unwrap();
+ blob.sync().await.unwrap();
+
+ // Verify mangle worked
+ let slot0_mangled: Vec<u8> = blob
+ .read_at(vec![0u8; 6], slot0_offset)
+ .await
+ .unwrap()
+ .into();
+ assert_eq!(slot0_mangled, DUMMY_MARKER, "Mangle failed");
+
+ // === Step 4: Extend to 50 bytes → new CRC goes to slot 0, slot 1 protected ===
+ let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone())
+ .await
+ .unwrap();
+ append
+ .append(&(31..=50).collect::<Vec<u8>>())
+ .await
+ .unwrap();
+ append.sync().await.unwrap();
+ drop(append);
+
+ // === Step 5: Verify slot 0 was overwritten, slot 1 unchanged ===
+ let (blob, _) = context.open("test_partition", b"slot1_prot").await.unwrap();
+
+ // Slot 0 should have new CRC (not our dummy marker)
+ let slot0_after: Vec<u8> = blob
+ .read_at(vec![0u8; 6], slot0_offset)
+ .await
+ .unwrap()
+ .into();
+ assert_ne!(
+ slot0_after, DUMMY_MARKER,
+ "Slot 0 should have been overwritten with new CRC"
+ );
+
+ // Slot 1 should be UNCHANGED (protected)
+ let slot1_after: Vec<u8> = blob
+ .read_at(vec![0u8; 6], slot1_offset)
+ .await
+ .unwrap()
+ .into();
+ assert_eq!(
+ slot1_before, slot1_after,
+ "Slot 1 was modified! Protected region violated."
+ );
+
+ // Verify the new CRC in slot 0 has len=50
+ let page = blob
+ .read_at(vec![0u8; physical_page_size], 0)
+ .await
+ .unwrap();
+ let crc = read_crc_record_from_page(page.as_ref());
+ assert_eq!(crc.len1, 50, "Slot 0 should have len=50");
+ });
+ }
+
+ /// Test that slot 0 is NOT overwritten when it's the protected slot.
+ ///
+ /// Strategy: After extending three times (slot 0 becomes authoritative again with largest len),
+ /// mangle the non-authoritative slot 1. Then extend again - slot 1 should be overwritten
+ /// with the new CRC, while slot 0 (protected) should remain untouched.
+ #[test_traced("DEBUG")]
+ fn test_crc_slot0_protected() {
+ let executor = deterministic::Runner::default();
+ executor.start(|context: deterministic::Context| async move {
+ let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(BUFFER_SIZE));
+ let physical_page_size = PAGE_SIZE.get() as usize + CHECKSUM_SIZE as usize;
+ let slot0_offset = PAGE_SIZE.get() as u64;
+ let slot1_offset = PAGE_SIZE.get() as u64 + 6;
+
+ // === Step 1: Write 10 bytes → slot 0 authoritative (len=10) ===
+ let (blob, _) = context.open("test_partition", b"slot0_prot").await.unwrap();
+ let append = Append::new(blob, 0, BUFFER_SIZE, pool_ref.clone())
+ .await
+ .unwrap();
+ append.append(&(1..=10).collect::<Vec<u8>>()).await.unwrap();
+ append.sync().await.unwrap();
+ drop(append);
+
+ // === Step 2: Extend to 30 bytes → slot 1 authoritative (len=30) ===
+ let (blob, size) = context.open("test_partition", b"slot0_prot").await.unwrap();
+ let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone())
+ .await
+ .unwrap();
+ append
+ .append(&(11..=30).collect::<Vec<u8>>())
+ .await
+ .unwrap();
+ append.sync().await.unwrap();
+ drop(append);
+
+ // === Step 3: Extend to 50 bytes → slot 0 authoritative (len=50) ===
+ let (blob, size) = context.open("test_partition", b"slot0_prot").await.unwrap();
+ let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone())
+ .await
+ .unwrap();
+ append
+ .append(&(31..=50).collect::<Vec<u8>>())
+ .await
+ .unwrap();
+ append.sync().await.unwrap();
+ drop(append);
+
+ // Verify slot 0 is now authoritative
+ let (blob, size) = context.open("test_partition", b"slot0_prot").await.unwrap();
+ let page = blob
+ .read_at(vec![0u8; physical_page_size], 0)
+ .await
+ .unwrap();
+ let crc = read_crc_record_from_page(page.as_ref());
+ assert!(
+ crc.len1 > crc.len2,
+ "Slot 0 should be authoritative (len1={} > len2={})",
+ crc.len1,
+ crc.len2
+ );
+
+ // Capture slot 0 bytes before mangling slot 1
+ let slot0_before: Vec<u8> = blob
+ .read_at(vec![0u8; 6], slot0_offset)
+ .await
+ .unwrap()
+ .into();
+
+ // === Step 4: Mangle slot 1 (non-authoritative) ===
+ blob.write_at(DUMMY_MARKER.to_vec(), slot1_offset)
+ .await
+ .unwrap();
+ blob.sync().await.unwrap();
+
+ // Verify mangle worked
+ let slot1_mangled: Vec<u8> = blob
+ .read_at(vec![0u8; 6], slot1_offset)
+ .await
+ .unwrap()
+ .into();
+ assert_eq!(slot1_mangled, DUMMY_MARKER, "Mangle failed");
+
+ // === Step 5: Extend to 70 bytes → new CRC goes to slot 1, slot 0 protected ===
+ let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone())
+ .await
+ .unwrap();
+ append
+ .append(&(51..=70).collect::<Vec<u8>>())
+ .await
+ .unwrap();
+ append.sync().await.unwrap();
+ drop(append);
+
+ // === Step 6: Verify slot 1 was overwritten, slot 0 unchanged ===
+ let (blob, _) = context.open("test_partition", b"slot0_prot").await.unwrap();
+
+ // Slot 1 should have new CRC (not our dummy marker)
+ let slot1_after: Vec<u8> = blob
+ .read_at(vec![0u8; 6], slot1_offset)
+ .await
+ .unwrap()
+ .into();
+ assert_ne!(
+ slot1_after, DUMMY_MARKER,
+ "Slot 1 should have been overwritten with new CRC"
+ );
+
+ // Slot 0 should be UNCHANGED (protected)
+ let slot0_after: Vec<u8> = blob
+ .read_at(vec![0u8; 6], slot0_offset)
+ .await
+ .unwrap()
+ .into();
+ assert_eq!(
+ slot0_before, slot0_after,
+ "Slot 0 was modified! Protected region violated."
+ );
+
+ // Verify the new CRC in slot 1 has len=70
+ let page = blob
+ .read_at(vec![0u8; physical_page_size], 0)
+ .await
+ .unwrap();
+ let crc = read_crc_record_from_page(page.as_ref());
+ assert_eq!(crc.len2, 70, "Slot 1 should have len=70");
+ });
+ }
+
+ /// Test that the data prefix is NOT overwritten when extending a partial page.
+ ///
+ /// Strategy: Write data, then mangle the padding area (between data end and CRC start).
+ /// After extending, the original data should be unchanged but the mangled padding
+ /// should be overwritten with new data.
+ #[test_traced("DEBUG")]
+ fn test_data_prefix_not_overwritten() {
+ let executor = deterministic::Runner::default();
+ executor.start(|context: deterministic::Context| async move {
+ let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(BUFFER_SIZE));
+ let physical_page_size = PAGE_SIZE.get() as usize + CHECKSUM_SIZE as usize;
+
+ // === Step 1: Write 20 bytes ===
+ let (blob, _) = context
+ .open("test_partition", b"prefix_test")
+ .await
+ .unwrap();
+ let append = Append::new(blob, 0, BUFFER_SIZE, pool_ref.clone())
+ .await
+ .unwrap();
+ let data1: Vec<u8> = (1..=20).collect();
+ append.append(&data1).await.unwrap();
+ append.sync().await.unwrap();
+ drop(append);
+
+ // === Step 2: Capture the first 20 bytes and mangle bytes 25-30 (in padding area) ===
+ let (blob, size) = context
+ .open("test_partition", b"prefix_test")
+ .await
+ .unwrap();
+ assert_eq!(size, physical_page_size as u64);
+
+ let prefix_before: Vec<u8> = blob.read_at(vec![0u8; 20], 0).await.unwrap().into();
+
+ // Mangle bytes 25-30 (safely in the padding area, after our 20 bytes of data)
+ blob.write_at(DUMMY_MARKER.to_vec(), 25).await.unwrap();
+ blob.sync().await.unwrap();
+
+ // === Step 3: Extend to 40 bytes ===
+ let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone())
+ .await
+ .unwrap();
+ append
+ .append(&(21..=40).collect::<Vec<u8>>())
+ .await
+ .unwrap();
+ append.sync().await.unwrap();
+ drop(append);
+
+ // === Step 4: Verify prefix unchanged, mangled area overwritten ===
+ let (blob, _) = context
+ .open("test_partition", b"prefix_test")
+ .await
+ .unwrap();
+
+ // Original 20 bytes should be unchanged
+ let prefix_after: Vec<u8> = blob.read_at(vec![0u8; 20], 0).await.unwrap().into();
+ assert_eq!(prefix_before, prefix_after, "Data prefix was modified!");
+
+ // Bytes at offset 25-30: data (21..=40) starts at offset 20, so offset 25 has value 26
+ let overwritten: Vec<u8> = blob.read_at(vec![0u8; 6], 25).await.unwrap().into();
+ assert_eq!(
+ overwritten,
+ vec![26, 27, 28, 29, 30, 31],
+ "New data should overwrite padding area"
+ );
+ });
+ }
+
+ /// Test CRC slot protection when extending past a page boundary.
+ ///
+ /// Strategy: Write a partial page, mangle slot 0 (non-authoritative after the first extend),
+ /// then extend past the page boundary. Verify slot 0 gets the new full-page CRC (overwriting
+ /// the mangled marker) and that the second page is written correctly.
+ #[test_traced("DEBUG")]
+ fn test_crc_slot_protection_across_page_boundary() {
+ let executor = deterministic::Runner::default();
+ executor.start(|context: deterministic::Context| async move {
+ let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(BUFFER_SIZE));
+ let physical_page_size = PAGE_SIZE.get() as usize + CHECKSUM_SIZE as usize;
+ let slot0_offset = PAGE_SIZE.get() as u64;
+ let slot1_offset = PAGE_SIZE.get() as u64 + 6;
+
+ // === Step 1: Write 50 bytes → slot 0 authoritative ===
+ let (blob, _) = context.open("test_partition", b"boundary").await.unwrap();
+ let append = Append::new(blob, 0, BUFFER_SIZE, pool_ref.clone())
+ .await
+ .unwrap();
+ append.append(&(1..=50).collect::<Vec<u8>>()).await.unwrap();
+ append.sync().await.unwrap();
+ drop(append);
+
+ // === Step 2: Extend to 80 bytes → slot 1 authoritative ===
+ let (blob, size) = context.open("test_partition", b"boundary").await.unwrap();
+ let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone())
+ .await
+ .unwrap();
+ append
+ .append(&(51..=80).collect::<Vec<u8>>())
+ .await
+ .unwrap();
+ append.sync().await.unwrap();
+ drop(append);
+
+ // Verify slot 1 is authoritative
+ let (blob, size) = context.open("test_partition", b"boundary").await.unwrap();
+ let page = blob
+ .read_at(vec![0u8; physical_page_size], 0)
+ .await
+ .unwrap();
+ let crc = read_crc_record_from_page(page.as_ref());
+ assert!(crc.len2 > crc.len1, "Slot 1 should be authoritative");
+
+ // Capture slot 1 before extending past page boundary
+ let slot1_before: Vec<u8> = blob
+ .read_at(vec![0u8; 6], slot1_offset)
+ .await
+ .unwrap()
+ .into();
+
+ // Mangle slot 0 (non-authoritative)
+ blob.write_at(DUMMY_MARKER.to_vec(), slot0_offset)
+ .await
+ .unwrap();
+ blob.sync().await.unwrap();
+
+ // === Step 3: Extend past page boundary (80 + 40 = 120, PAGE_SIZE=103) ===
+ let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone())
+ .await
+ .unwrap();
+ append
+ .append(&(81..=120).collect::<Vec<u8>>())
+ .await
+ .unwrap();
+ append.sync().await.unwrap();
+ drop(append);
+
+ // === Step 4: Verify results ===
+ let (blob, size) = context.open("test_partition", b"boundary").await.unwrap();
+ assert_eq!(size, (physical_page_size * 2) as u64, "Should have 2 pages");
+
+ // Slot 0 should have been overwritten with full-page CRC (not dummy marker)
+ let slot0_after: Vec<u8> = blob
+ .read_at(vec![0u8; 6], slot0_offset)
+ .await
+ .unwrap()
+ .into();
+ assert_ne!(
+ slot0_after, DUMMY_MARKER,
+ "Slot 0 should have full-page CRC"
+ );
+
+ // Slot 1 should be UNCHANGED (protected during boundary crossing)
+ let slot1_after: Vec<u8> = blob
+ .read_at(vec![0u8; 6], slot1_offset)
+ .await
+ .unwrap()
+ .into();
+ assert_eq!(
+ slot1_before, slot1_after,
+ "Slot 1 was modified during page boundary crossing!"
+ );
+
+ // Verify page 0 has correct CRC structure
+ let page0 = blob
+ .read_at(vec![0u8; physical_page_size], 0)
+ .await
+ .unwrap();
+ let crc0 = read_crc_record_from_page(page0.as_ref());
+ assert_eq!(
+ crc0.len1,
+ PAGE_SIZE.get(),
+ "Slot 0 should have full page length"
+ );
+
+ // Verify data integrity
+ let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone())
+ .await
+ .unwrap();
+ assert_eq!(append.size().await, 120);
+ let all_data: Vec<u8> = append.read_at(vec![0u8; 120], 0).await.unwrap().into();
+ let expected: Vec<u8> = (1..=120).collect();
+ assert_eq!(all_data, expected);
+ });
+ }
+
+ /// Test that corrupting the primary CRC (but not its length) causes fallback to the previous
+ /// partial page contents.
+ ///
+ /// Strategy:
+ /// 1. Write 10 bytes → slot 0 authoritative (len=10, valid crc)
+ /// 2. Extend to 30 bytes → slot 1 authoritative (len=30, valid crc)
+ /// 3. Corrupt ONLY the crc2 value in slot 1 (not the length)
+ /// 4. Re-open and verify we fall back to slot 0's 10 bytes
+ #[test_traced("DEBUG")]
+ fn test_crc_fallback_on_corrupted_primary() {
+ let executor = deterministic::Runner::default();
+ executor.start(|context: deterministic::Context| async move {
+ let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(BUFFER_SIZE));
+ let physical_page_size = PAGE_SIZE.get() as usize + CHECKSUM_SIZE as usize;
+ // crc2 is at offset PAGE_SIZE + 8: skip len1 (2 bytes) + crc1 (4 bytes) + len2 (2 bytes).
+ let crc2_offset = PAGE_SIZE.get() as u64 + 8;
+
+ // === Step 1: Write 10 bytes → slot 0 authoritative (len=10) ===
+ let (blob, _) = context
+ .open("test_partition", b"crc_fallback")
+ .await
+ .unwrap();
+ let append = Append::new(blob, 0, BUFFER_SIZE, pool_ref.clone())
+ .await
+ .unwrap();
+ let data1: Vec<u8> = (1..=10).collect();
+ append.append(&data1).await.unwrap();
+ append.sync().await.unwrap();
+ drop(append);
+
+ // === Step 2: Extend to 30 bytes → slot 1 authoritative (len=30) ===
+ let (blob, size) = context
+ .open("test_partition", b"crc_fallback")
+ .await
+ .unwrap();
+ let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone())
+ .await
+ .unwrap();
+ append
+ .append(&(11..=30).collect::<Vec<u8>>())
+ .await
+ .unwrap();
+ append.sync().await.unwrap();
+ drop(append);
+
+ // Verify slot 1 is now authoritative and data reads correctly
+ let (blob, size) = context
+ .open("test_partition", b"crc_fallback")
+ .await
+ .unwrap();
+ assert_eq!(size, physical_page_size as u64);
+
+ let page = blob
+ .read_at(vec![0u8; physical_page_size], 0)
+ .await
+ .unwrap();
+ let crc = read_crc_record_from_page(page.as_ref());
+ assert!(
+ crc.len2 > crc.len1,
+ "Slot 1 should be authoritative (len2={} > len1={})",
+ crc.len2,
+ crc.len1
+ );
+ assert_eq!(crc.len2, 30, "Slot 1 should have len=30");
+ assert_eq!(crc.len1, 10, "Slot 0 should have len=10");
+
+ // Verify we can read all 30 bytes before corruption
+ let append = Append::new(blob.clone(), size, BUFFER_SIZE, pool_ref.clone())
+ .await
+ .unwrap();
+ assert_eq!(append.size().await, 30);
+ let all_data: Vec<u8> = append.read_at(vec![0u8; 30], 0).await.unwrap().into();
+ let expected: Vec<u8> = (1..=30).collect();
+ assert_eq!(all_data, expected);
+ drop(append);
+
+ // === Step 3: Corrupt ONLY crc2 (not len2) ===
+ // crc2 is 4 bytes at offset PAGE_SIZE + 8
+ blob.write_at(vec![0xDE, 0xAD, 0xBE, 0xEF], crc2_offset)
+ .await
+ .unwrap();
+ blob.sync().await.unwrap();
+
+ // Verify corruption: len2 should still be 30, but crc2 is now garbage
+ let page = blob
+ .read_at(vec![0u8; physical_page_size], 0)
+ .await
+ .unwrap();
+ let crc = read_crc_record_from_page(page.as_ref());
+ assert_eq!(crc.len2, 30, "len2 should still be 30 after corruption");
+ assert_eq!(crc.crc2, 0xDEADBEEF, "crc2 should be our corrupted value");
+
+ // === Step 4: Re-open and verify fallback to slot 0's 10 bytes ===
+ let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone())
+ .await
+ .unwrap();
+
+ // Should fall back to 10 bytes (slot 0's length)
+ assert_eq!(
+ append.size().await,
+ 10,
+ "Should fall back to slot 0's 10 bytes after primary CRC corruption"
+ );
+
+ // Verify the data is the original 10 bytes
+ let fallback_data: Vec<u8> = append.read_at(vec![0u8; 10], 0).await.unwrap().into();
+ assert_eq!(
+ fallback_data, data1,
+ "Fallback data should match original 10 bytes"
+ );
+
+ // Reading beyond 10 bytes should fail
+ let result = append.read_at(vec![0u8; 11], 0).await;
+ assert!(result.is_err(), "Reading beyond fallback size should fail");
+ });
+ }
+
+ /// Test that corrupting a non-last page's primary CRC fails even if fallback is valid.
+ ///
+ /// Non-last pages must always be full. If the primary CRC is corrupted and the fallback
+ /// indicates a partial page, validation should fail entirely (not fall back to partial).
+ ///
+ /// Strategy:
+ /// 1. Write 10 bytes → slot 0 has len=10 (partial)
+ /// 2. Extend to full page (103 bytes) → slot 1 has len=103 (full, authoritative)
+ /// 3. Extend past page boundary (e.g., 110 bytes) → page 0 is now non-last
+ /// 4. Corrupt the primary CRC of page 0 (slot 1's crc, which has len=103)
+ /// 5. Re-open and verify that reading from page 0 fails (fallback has len=10, not full)
+ #[test_traced("DEBUG")]
+ fn test_non_last_page_rejects_partial_fallback() {
+ let executor = deterministic::Runner::default();
+ executor.start(|context: deterministic::Context| async move {
+ let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(BUFFER_SIZE));
+ let physical_page_size = PAGE_SIZE.get() as usize + CHECKSUM_SIZE as usize;
+ // crc2 for page 0 is at offset: PAGE_SIZE + 8
+ let page0_crc2_offset = PAGE_SIZE.get() as u64 + 8;
+
+ // === Step 1: Write 10 bytes → slot 0 has len=10 ===
+ let (blob, _) = context
+ .open("test_partition", b"non_last_page")
+ .await
+ .unwrap();
+ let append = Append::new(blob, 0, BUFFER_SIZE, pool_ref.clone())
+ .await
+ .unwrap();
+ append.append(&(1..=10).collect::<Vec<u8>>()).await.unwrap();
+ append.sync().await.unwrap();
+ drop(append);
+
+ // === Step 2: Extend to exactly full page (103 bytes) → slot 1 has len=103 ===
+ let (blob, size) = context
+ .open("test_partition", b"non_last_page")
+ .await
+ .unwrap();
+ let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone())
+ .await
+ .unwrap();
+ // Add bytes 11 through 103 (93 more bytes)
+ append
+ .append(&(11..=PAGE_SIZE.get() as u8).collect::<Vec<u8>>())
+ .await
+ .unwrap();
+ append.sync().await.unwrap();
+ drop(append);
+
+ // Verify page 0 slot 1 is authoritative with len=103 (full page)
+ let (blob, size) = context
+ .open("test_partition", b"non_last_page")
+ .await
+ .unwrap();
+ let page = blob
+ .read_at(vec![0u8; physical_page_size], 0)
+ .await
+ .unwrap();
+ let crc = read_crc_record_from_page(page.as_ref());
+ assert_eq!(crc.len1, 10, "Slot 0 should have len=10");
+ assert_eq!(
+ crc.len2,
+ PAGE_SIZE.get(),
+ "Slot 1 should have len=103 (full page)"
+ );
+ assert!(crc.len2 > crc.len1, "Slot 1 should be authoritative");
+
+ // === Step 3: Extend past page boundary (add 10 more bytes for total of 113) ===
+ let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone())
+ .await
+ .unwrap();
+ // Add bytes 104 through 113 (10 more bytes, now on page 1)
+ append
+ .append(&(104..=113).collect::<Vec<u8>>())
+ .await
+ .unwrap();
+ append.sync().await.unwrap();
+ drop(append);
+
+ // Verify we now have 2 pages
+ let (blob, size) = context
+ .open("test_partition", b"non_last_page")
+ .await
+ .unwrap();
+ assert_eq!(
+ size,
+ (physical_page_size * 2) as u64,
+ "Should have 2 physical pages"
+ );
+
+ // Verify data is readable before corruption
+ let append = Append::new(blob.clone(), size, BUFFER_SIZE, pool_ref.clone())
+ .await
+ .unwrap();
+ assert_eq!(append.size().await, 113);
+ let all_data: Vec<u8> = append.read_at(vec![0u8; 113], 0).await.unwrap().into();
+ let expected: Vec<u8> = (1..=113).collect();
+ assert_eq!(all_data, expected);
+ drop(append);
+
+ // === Step 4: Corrupt page 0's primary CRC (slot 1's crc2) ===
+ blob.write_at(vec![0xDE, 0xAD, 0xBE, 0xEF], page0_crc2_offset)
+ .await
+ .unwrap();
+ blob.sync().await.unwrap();
+
+ // Verify corruption: page 0's slot 1 still has len=103 but bad CRC
+ let page = blob
+ .read_at(vec![0u8; physical_page_size], 0)
+ .await
+ .unwrap();
+ let crc = read_crc_record_from_page(page.as_ref());
+ assert_eq!(crc.len2, PAGE_SIZE.get(), "len2 should still be 103");
+ assert_eq!(crc.crc2, 0xDEADBEEF, "crc2 should be corrupted");
+ // Slot 0 fallback has len=10 (partial), which is invalid for non-last page
+ assert_eq!(crc.len1, 10, "Fallback slot 0 has partial length");
+
+ // === Step 5: Re-open and try to read from page 0 ===
+ // The first page's primary CRC is bad, and fallback indicates partial (len=10).
+ // Since page 0 is not the last page, a partial fallback is invalid.
+ // Reading from page 0 should fail because the fallback CRC indicates a partial
+ // page, which is not allowed for non-last pages.
+ let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone())
+ .await
+ .unwrap();
+
+ // The blob still reports 113 bytes because init only validates the last page.
+ // But reading from page 0 should fail because the CRC fallback is partial.
+ assert_eq!(append.size().await, 113);
+
+ // Try to read from page 0 - this should fail with InvalidChecksum because
+ // the fallback CRC has len=10 (partial), which is invalid for a non-last page.
+ let result = append.read_at(vec![0u8; 10], 0).await;
+ assert!(
+ result.is_err(),
+ "Reading from corrupted non-last page via Append should fail, but got: {:?}",
+ result
+ );
+ drop(append);
+
+ // Also verify that reading via a Read wrapper fails the same way.
+ let (blob, size) = context
+ .open("test_partition", b"non_last_page")
+ .await
+ .unwrap();
+ let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone())
+ .await
+ .unwrap();
+ let mut reader = append.as_blob_reader(NZUsize!(1024)).await.unwrap();
+
+ // Try to read from offset 0 (page 0) via the Read wrapper.
+ let result = reader.read_up_to(vec![0u8; 10]).await;
+ assert!(
+ result.is_err(),
+ "Reading from corrupted non-last page via Read wrapper should fail, but got: {:?}",
+ result
+ );
+ });
+ }
+
+ #[test]
+ fn test_resize_shrink_validates_crc() {
+ // Verify that shrinking a blob to a partial page validates the CRC, rather than
+ // blindly reading raw bytes which could silently load corrupted data.
+ let executor = deterministic::Runner::default();
+
+ executor.start(|context| async move {
+ let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(BUFFER_SIZE));
+ let physical_page_size = PAGE_SIZE.get() as usize + CHECKSUM_SIZE as usize;
+
+ let (blob, size) = context
+ .open("test_partition", b"resize_crc_test")
+ .await
+ .unwrap();
+
+ let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone())
+ .await
+ .unwrap();
+
+ // Write data across 3 pages: page 0 (full), page 1 (full), page 2 (partial).
+ // PAGE_SIZE = 103, so 250 bytes = 103 + 103 + 44.
+ let data: Vec<u8> = (0..=249).collect();
+ append.append(&data).await.unwrap();
+ append.sync().await.unwrap();
+ assert_eq!(append.size().await, 250);
+ drop(append);
+
+ // Corrupt the CRC record of page 1 (middle page).
+ let (blob, size) = context
+ .open("test_partition", b"resize_crc_test")
+ .await
+ .unwrap();
+ assert_eq!(size as usize, physical_page_size * 3);
+
+ // Page 1 CRC record is at the end of the second physical page.
+ let page1_crc_offset = (physical_page_size * 2 - CHECKSUM_SIZE as usize) as u64;
+ blob.write_at(vec![0xFF; CHECKSUM_SIZE as usize], page1_crc_offset)
+ .await
+ .unwrap();
+ blob.sync().await.unwrap();
+
+ // Open the blob - Append::new() validates the LAST page (page 2), which is still valid.
+ // So it should open successfully with size 250.
+ let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone())
+ .await
+ .unwrap();
+ assert_eq!(append.size().await, 250);
+
+ // Try to shrink to 150 bytes, which ends in page 1 (the corrupted page).
+ // 150 bytes = page 0 (103 full) + page 1 (47 partial).
+ // This should fail because page 1's CRC is corrupted.
+ let result = append.resize(150).await;
+ assert!(
+ matches!(result, Err(crate::Error::InvalidChecksum)),
+ "Expected InvalidChecksum when shrinking to corrupted page, got: {:?}",
+ result
+ );
+ });
+ }
+
+ #[test]
+ fn test_immutable_blob_rejects_append_and_resize() {
+ let executor = deterministic::Runner::default();
+
+ executor.start(|context| async move {
+ const PAGE_SIZE: NonZeroU16 = NZU16!(64);
+ const BUFFER_SIZE: usize = 256;
+
+ let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(4));
+
+ let (blob, size) = context
+ .open("test_partition", b"immutable_test")
+ .await
+ .unwrap();
+
+ let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone())
+ .await
+ .unwrap();
+
+ // Write some initial data.
+ append.append(&[1, 2, 3, 4, 5]).await.unwrap();
+ append.sync().await.unwrap();
+ assert_eq!(append.size().await, 5);
+
+ // Convert to immutable.
+ append.to_immutable().await.unwrap();
+ assert!(append.is_immutable().await);
+
+ // Verify append() returns ImmutableBlob error.
+ let result = append.append(&[6, 7, 8]).await;
+ assert!(
+ matches!(result, Err(crate::Error::ImmutableBlob)),
+ "Expected ImmutableBlob error from append(), got: {:?}",
+ result
+ );
+
+ // Verify resize() returns ImmutableBlob error.
+ let result = append.resize(100).await;
+ assert!(
+ matches!(result, Err(crate::Error::ImmutableBlob)),
+ "Expected ImmutableBlob error from resize(), got: {:?}",
+ result
+ );
+
+ // Verify sync() returns Ok.
+ let result = append.sync().await;
+ assert!(
+ result.is_ok(),
+ "sync() on immutable blob should return Ok, got: {:?}",
+ result
+ );
+
+ // Verify data is still readable.
+ let data: Vec<u8> = append.read_at(vec![0u8; 5], 0).await.unwrap().into();
+ assert_eq!(data, vec![1, 2, 3, 4, 5]);
+ });
+ }
+
+ #[test]
+ fn test_corrupted_crc_len_too_large() {
+ let executor = deterministic::Runner::default();
+
+ executor.start(|context| async move {
+ let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(BUFFER_SIZE));
+ let physical_page_size = PAGE_SIZE.get() as usize + CHECKSUM_SIZE as usize;
+
+ // Step 1: Create blob with valid data
+ let (blob, size) = context
+ .open("test_partition", b"crc_len_test")
+ .await
+ .unwrap();
+
+ let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone())
+ .await
+ .unwrap();
+
+ append.append(&[0x42; 50]).await.unwrap();
+ append.sync().await.unwrap();
+ drop(append);
+
+ // Step 2: Corrupt the CRC record to have len > page_size
+ let (blob, size) = context
+ .open("test_partition", b"crc_len_test")
+ .await
+ .unwrap();
+ assert_eq!(size as usize, physical_page_size);
+
+ // CRC record is at the end of the physical page
+ let crc_offset = PAGE_SIZE.get() as u64;
+
+ // Create a CRC record with len1 = 0xFFFF (65535), which is >> page_size (103)
+ // Format: [len1_hi, len1_lo, crc1 (4 bytes), len2_hi, len2_lo, crc2 (4 bytes)]
+ let bad_crc_record: [u8; 12] = [
+ 0xFF, 0xFF, // len1 = 65535 (way too large)
+ 0xDE, 0xAD, 0xBE, 0xEF, // crc1 (garbage)
+ 0x00, 0x00, // len2 = 0
+ 0x00, 0x00, 0x00, 0x00, // crc2 = 0
+ ];
+ blob.write_at(bad_crc_record.to_vec(), crc_offset)
+ .await
+ .unwrap();
+ blob.sync().await.unwrap();
+
+ // Step 3: Try to open the blob - should NOT panic, should return error or handle gracefully
+ let result = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone()).await;
+
+ // Either returns InvalidChecksum error OR truncates the corrupted data
+ // (both are acceptable behaviors - panicking is NOT acceptable)
+ match result {
+ Ok(append) => {
+ // If it opens successfully, the corrupted page should have been truncated
+ let recovered_size = append.size().await;
+ assert_eq!(
+ recovered_size, 0,
+ "Corrupted page should be truncated, size should be 0"
+ );
+ }
+ Err(e) => {
+ // Error is also acceptable (for immutable blobs)
+ assert!(
+ matches!(e, crate::Error::InvalidChecksum),
+ "Expected InvalidChecksum error, got: {:?}",
+ e
+ );
+ }
+ }
+ });
+ }
+
+ #[test]
+ fn test_corrupted_crc_both_slots_len_too_large() {
+ let executor = deterministic::Runner::default();
+
+ executor.start(|context| async move {
+ let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(BUFFER_SIZE));
+
+ // Step 1: Create blob with valid data
+ let (blob, size) = context
+ .open("test_partition", b"crc_both_bad")
+ .await
+ .unwrap();
+
+ let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone())
+ .await
+ .unwrap();
+
+ append.append(&[0x42; 50]).await.unwrap();
+ append.sync().await.unwrap();
+ drop(append);
+
+ // Step 2: Corrupt BOTH CRC slots to have len > page_size
+ let (blob, size) = context
+ .open("test_partition", b"crc_both_bad")
+ .await
+ .unwrap();
+
+ let crc_offset = PAGE_SIZE.get() as u64;
+
+ // Both slots have len > page_size
+ let bad_crc_record: [u8; 12] = [
+ 0x01, 0x00, // len1 = 256 (> 103)
+ 0xDE, 0xAD, 0xBE, 0xEF, // crc1 (garbage)
+ 0x02, 0x00, // len2 = 512 (> 103)
+ 0xCA, 0xFE, 0xBA, 0xBE, // crc2 (garbage)
+ ];
+ blob.write_at(bad_crc_record.to_vec(), crc_offset)
+ .await
+ .unwrap();
+ blob.sync().await.unwrap();
+
+ // Step 3: Try to open - should NOT panic
+ let result = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone()).await;
+
+ match result {
+ Ok(append) => {
+ // Corrupted page truncated
+ assert_eq!(append.size().await, 0);
+ }
+ Err(e) => {
+ assert!(
+ matches!(e, crate::Error::InvalidChecksum),
+ "Expected InvalidChecksum, got: {:?}",
+ e
+ );
+ }
+ }
+ });
+ }
+}
diff --git a/runtime/src/utils/buffer/pool/mod.rs b/runtime/src/utils/buffer/pool/mod.rs
new file mode 100644
index 0000000000..c7094b8e79
--- /dev/null
+++ b/runtime/src/utils/buffer/pool/mod.rs
@@ -0,0 +1,505 @@
+//! Blob wrappers for reading and writing data with integrity guarantees, plus a buffer pool that
+//! manages read caching over the data.
+//!
+//! # Page-oriented structure
+//!
+//! Blob data is stored in _pages_ having a logical `page_size` dictated by the managing buffer
+//! pool. A _physical page_ consists of `page_size` bytes of data followed by a 12-byte _CRC
+//! record_ containing:
+//!
+//! ```text
+//! | len1 (2 bytes) | crc1 (4 bytes) | len2 (2 bytes) | crc2 (4 bytes) |
+//! ```
+//!
+//! Two checksums are stored so that a partial page can be re-written without overwriting a valid
+//! checksum over its previously committed contents. A page's checksum is computed over the byte
+//! range [0, len), with all other bytes in the page ignored; this implementation always 0-pads
+//! the range [len, page_size). A checksum with length 0 is never considered valid. If both
+//! checksums are valid for the page, the one with the larger `len` is authoritative.
+//!
+//! A _full_ page is one whose CRC record stores a len equal to the logical page size. Otherwise the page
+//! is called _partial_. All pages in a blob are full except for the very last page, which can be
+//! full or partial. A partial page's logical bytes are immutable on commit, and if it's re-written,
+//! it's only to add more bytes after the existing ones.
+
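
To make the layout above concrete, here is a minimal standalone sketch (not part of the diff) that builds one physical page for a 50-byte partial write. It assumes only the `crc32fast` crate; `encode_record` and the constants are illustrative, not this module's API.

```rust
/// Illustrative encoding of the 12-byte CRC record: two (len, crc) slots, big-endian.
fn encode_record(len1: u16, crc1: u32, len2: u16, crc2: u32) -> [u8; 12] {
    let mut out = [0u8; 12];
    out[0..2].copy_from_slice(&len1.to_be_bytes());
    out[2..6].copy_from_slice(&crc1.to_be_bytes());
    out[6..8].copy_from_slice(&len2.to_be_bytes());
    out[8..12].copy_from_slice(&crc2.to_be_bytes());
    out
}

fn main() {
    // Hypothetical logical page size; the checksum covers only the first `len` bytes.
    const LOGICAL_PAGE_SIZE: usize = 103;
    let mut logical = vec![0u8; LOGICAL_PAGE_SIZE]; // zero-padded in [len, page_size)
    logical[..50].copy_from_slice(&[0x42; 50]);

    // Partial page: slot 1 holds (50, crc), slot 2 is still empty (len == 0 is never valid).
    let crc = crc32fast::hash(&logical[..50]);
    let mut physical = logical;
    physical.extend_from_slice(&encode_record(50, crc, 0, 0));
    assert_eq!(physical.len(), LOGICAL_PAGE_SIZE + 12);

    // If the page is later extended to 80 bytes, the new (80, crc') pair would go in the
    // other slot, so a torn write can still fall back to the committed 50-byte prefix.
    println!("partial page committed with len = 50, crc = {crc:#010x}");
}
```
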
+use crate::{Blob, Error};
+use bytes::{Buf, BufMut};
+use commonware_codec::{EncodeFixed, FixedSize, Read as CodecRead, ReadExt, Write};
+use commonware_utils::StableBuf;
+
+mod append;
+mod page_cache;
+mod read;
+
+pub use append::Append;
+pub use page_cache::PoolRef;
+pub use read::Read;
+use tracing::{debug, error};
+
+// A checksum record contains two u16 lengths and two CRCs (each 4 bytes).
+const CHECKSUM_SIZE: u64 = 12;
+
+/// Read the designated page from the underlying blob and return its logical bytes if it passes the
+/// integrity check, or an error otherwise. Safely handles partial pages. The caller can check the
+/// length of the returned buffer to determine whether the page was partial or full.
+async fn get_page_from_blob(
+ blob: &impl Blob,
+ page_num: u64,
+ logical_page_size: u64,
+) -> Result<StableBuf, Error> {
+ let physical_page_size = logical_page_size + CHECKSUM_SIZE;
+ let physical_page_start = page_num * physical_page_size;
+
+ let mut page = blob
+ .read_at(vec![0; physical_page_size as usize], physical_page_start)
+ .await?;
+
+ let Some(record) = Checksum::validate_page(page.as_ref()) else {
+ return Err(Error::InvalidChecksum);
+ };
+ let (len, _) = record.get_crc();
+
+ page.truncate(len as usize);
+
+ Ok(page)
+}
+
+/// Describes a CRC record stored at the end of a page.
+///
+/// The CRC accompanied by the larger length is the one that should be treated as authoritative for
+/// the page. Two checksums are stored so that partial pages can be written without overwriting a
+/// valid checksum for a previously committed partial page.
+#[derive(Clone)]
+struct Checksum {
+ len1: u16,
+ crc1: u32,
+ len2: u16,
+ crc2: u32,
+}
+
+impl Checksum {
+ /// Create a new CRC record with the given length and CRC.
+ /// The new CRC is stored in the first slot (len1/crc1), with the second slot zeroed.
+ const fn new(len: u16, crc: u32) -> Self {
+ Self {
+ len1: len,
+ crc1: crc,
+ len2: 0,
+ crc2: 0,
+ }
+ }
+
+ /// Return the CRC record for the page if it is valid. The provided slice is assumed to be
+ /// exactly the size of a physical page. The returned record may not exactly match the bytes on
+ /// disk: if the CRC that should be the most recent one fails validation, that slot is zeroed
+ /// and the other CRC is used as a fallback.
+ fn validate_page(buf: &[u8]) -> Option<Self> {
+ let page_size = buf.len() as u64;
+ if page_size < CHECKSUM_SIZE {
+ error!(
+ page_size,
+ required = CHECKSUM_SIZE,
+ "read page smaller than CRC record"
+ );
+ return None;
+ }
+
+ let crc_start_idx = (page_size - CHECKSUM_SIZE) as usize;
+ let mut crc_bytes = &buf[crc_start_idx..];
+ let mut crc_record = Self::read(&mut crc_bytes).expect("CRC record read should not fail");
+ let (len, crc) = crc_record.get_crc();
+
+ // Validate that len is in the valid range [1, logical_page_size].
+ // A page with len=0 is invalid (e.g., all-zero pages from unwritten data).
+ let len_usize = len as usize;
+ if len_usize == 0 {
+ // Both CRCs have 0 length, so there is no fallback possible.
+ debug!("Invalid CRC: len==0");
+ return None;
+ }
+
+ if len_usize > crc_start_idx {
+ // len is too large so this CRC isn't valid. Fall back to the other CRC.
+ debug!("Invalid CRC: len too long. Using fallback CRC");
+ if crc_record.validate_fallback(buf, crc_start_idx) {
+ return Some(crc_record);
+ }
+ return None;
+ }
+
+ let computed_crc = crc32fast::hash(&buf[..len_usize]);
+ if computed_crc != crc {
+ debug!("Invalid CRC: doesn't match page contents. Using fallback CRC");
+ if crc_record.validate_fallback(buf, crc_start_idx) {
+ return Some(crc_record);
+ }
+ return None;
+ }
+
+ Some(crc_record)
+ }
+
+ /// Attempts to validate a CRC record based on its fallback CRC because the primary CRC failed
+ /// validation. The primary CRC is zeroed in the process. Returns false if the fallback CRC
+ /// fails validation.
+ fn validate_fallback(&mut self, buf: &[u8], crc_start_idx: usize) -> bool {
+ let (len, crc) = self.get_fallback_crc();
+ if len == 0 {
+ // No fallback available (only one CRC was ever written to this page).
+ debug!("Invalid fallback CRC: len==0");
+ return false;
+ }
+
+ let len_usize = len as usize;
+
+ if len_usize > crc_start_idx {
+ // len is too large so this CRC isn't valid.
+ debug!("Invalid fallback CRC: len too long.");
+ return false;
+ }
+
+ let computed_crc = crc32fast::hash(&buf[..len_usize]);
+ if computed_crc != crc {
+ debug!("Invalid fallback CRC: doesn't match page contents.");
+ return false;
+ }
+
+ true
+ }
+
+ /// Returns the length/CRC pair with the longer (authoritative) length, without performing any
+ /// validation. If both slots have the same length (which should only happen due to data
+ /// corruption), the first is returned.
+ const fn get_crc(&self) -> (u16, u32) {
+ if self.len1 >= self.len2 {
+ (self.len1, self.crc1)
+ } else {
+ (self.len2, self.crc2)
+ }
+ }
+
+ /// Zeroes the primary CRC and returns the other slot. This should only be called after the
+ /// primary CRC has failed validation; once it returns, get_crc will no longer report the
+ /// invalidated primary CRC.
+ const fn get_fallback_crc(&mut self) -> (u16, u32) {
+ if self.len1 >= self.len2 {
+ // First CRC was primary, and must have been invalid. Zero it and return the second.
+ self.len1 = 0;
+ self.crc1 = 0;
+ (self.len2, self.crc2)
+ } else {
+ // Second CRC was primary, and must have been invalid. Zero it and return the first.
+ self.len2 = 0;
+ self.crc2 = 0;
+ (self.len1, self.crc1)
+ }
+ }
+
+ /// Returns the CRC record in its storage representation.
+ fn to_bytes(&self) -> [u8; CHECKSUM_SIZE as usize] {
+ self.encode_fixed()
+ }
+}
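
The selection and fallback rules above boil down to: prefer the slot with the larger `len`, accept it only if its CRC verifies over the page's first `len` logical bytes, and otherwise try the other slot. A reduced restatement of that rule, for illustration only (it operates on the logical bytes with the CRC record already stripped, and is not this module's exact control flow):

```rust
/// Illustration only: pick the authoritative (len, crc) slot for a page's logical bytes.
fn select_slot(logical: &[u8], slots: [(u16, u32); 2]) -> Option<(u16, u32)> {
    let mut ordered = slots;
    // The larger length is primary; ties keep the first slot, matching `get_crc`.
    if ordered[1].0 > ordered[0].0 {
        ordered.swap(0, 1);
    }
    for (len, crc) in ordered {
        let len = len as usize;
        // len == 0 is never valid, and len must fit within the logical bytes.
        if len == 0 || len > logical.len() {
            continue;
        }
        if crc32fast::hash(&logical[..len]) == crc {
            return Some((len as u16, crc));
        }
    }
    None
}
```

For example, a torn re-write that corrupts the newer, longer slot falls back to the older, shorter one; if both slots fail, the page is rejected.
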
+
+impl Write for Checksum {
+ fn write(&self, buf: &mut impl BufMut) {
+ self.len1.write(buf);
+ self.crc1.write(buf);
+ self.len2.write(buf);
+ self.crc2.write(buf);
+ }
+}
+
+impl CodecRead for Checksum {
+ type Cfg = ();
+
+ fn read_cfg(buf: &mut impl Buf, _: &Self::Cfg) -> Result<Self, commonware_codec::Error> {
+ Ok(Self {
+ len1: u16::read(buf)?,
+ crc1: u32::read(buf)?,
+ len2: u16::read(buf)?,
+ crc2: u32::read(buf)?,
+ })
+ }
+}
+
+impl FixedSize for Checksum {
+ const SIZE: usize = CHECKSUM_SIZE as usize;
+}
+
+#[cfg(feature = "arbitrary")]
+impl arbitrary::Arbitrary<'_> for Checksum {
+ fn arbitrary(u: &mut arbitrary::Unstructured<'_>) -> arbitrary::Result<Self> {
+ Ok(Self {
+ len1: u.arbitrary()?,
+ crc1: u.arbitrary()?,
+ len2: u.arbitrary()?,
+ crc2: u.arbitrary()?,
+ })
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ const CHECKSUM_SIZE_USIZE: usize = CHECKSUM_SIZE as usize;
+
+ #[test]
+ fn test_crc_record_encode_read_roundtrip() {
+ let record = Checksum {
+ len1: 0x1234,
+ crc1: 0xAABBCCDD,
+ len2: 0x5678,
+ crc2: 0x11223344,
+ };
+
+ let bytes = record.to_bytes();
+ let restored = Checksum::read(&mut &bytes[..]).unwrap();
+
+ assert_eq!(restored.len1, 0x1234);
+ assert_eq!(restored.crc1, 0xAABBCCDD);
+ assert_eq!(restored.len2, 0x5678);
+ assert_eq!(restored.crc2, 0x11223344);
+ }
+
+ #[test]
+ fn test_crc_record_encoding() {
+ let record = Checksum {
+ len1: 0x0102,
+ crc1: 0x03040506,
+ len2: 0x0708,
+ crc2: 0x090A0B0C,
+ };
+
+ let bytes = record.to_bytes();
+ // Verify big-endian encoding
+ assert_eq!(
+ bytes,
+ [0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C]
+ );
+ }
+
+ #[test]
+ fn test_crc_record_get_crc_len1_larger() {
+ let record = Checksum {
+ len1: 200,
+ crc1: 0xAAAAAAAA,
+ len2: 100,
+ crc2: 0xBBBBBBBB,
+ };
+
+ let (len, crc) = record.get_crc();
+ assert_eq!(len, 200);
+ assert_eq!(crc, 0xAAAAAAAA);
+ }
+
+ #[test]
+ fn test_crc_record_get_crc_len2_larger() {
+ let record = Checksum {
+ len1: 100,
+ crc1: 0xAAAAAAAA,
+ len2: 200,
+ crc2: 0xBBBBBBBB,
+ };
+
+ let (len, crc) = record.get_crc();
+ assert_eq!(len, 200);
+ assert_eq!(crc, 0xBBBBBBBB);
+ }
+
+ #[test]
+ fn test_crc_record_get_crc_equal_lengths() {
+ // When lengths are equal, len1/crc1 is returned (first slot wins ties).
+ let record = Checksum {
+ len1: 100,
+ crc1: 0xAAAAAAAA,
+ len2: 100,
+ crc2: 0xBBBBBBBB,
+ };
+
+ let (len, crc) = record.get_crc();
+ assert_eq!(len, 100);
+ assert_eq!(crc, 0xAAAAAAAA);
+ }
+
+ #[test]
+ fn test_validate_page_valid() {
+ let logical_page_size = 64usize;
+ let physical_page_size = logical_page_size + CHECKSUM_SIZE_USIZE;
+ let mut page = vec![0u8; physical_page_size];
+
+ // Write some data
+ let data = b"hello world";
+ page[..data.len()].copy_from_slice(data);
+
+ // Compute CRC of the data portion
+ let crc = crc32fast::hash(&page[..data.len()]);
+ let record = Checksum::new(data.len() as u16, crc);
+
+ // Write the CRC record at the end
+ let crc_start = physical_page_size - CHECKSUM_SIZE_USIZE;
+ page[crc_start..].copy_from_slice(&record.to_bytes());
+
+ // Validate - should return Some with the Checksum
+ let validated = Checksum::validate_page(&page);
+ assert!(validated.is_some());
+ let (len, _) = validated.unwrap().get_crc();
+ assert_eq!(len as usize, data.len());
+ }
+
+ #[test]
+ fn test_validate_page_invalid_crc() {
+ let logical_page_size = 64usize;
+ let physical_page_size = logical_page_size + CHECKSUM_SIZE_USIZE;
+ let mut page = vec![0u8; physical_page_size];
+
+ // Write some data
+ let data = b"hello world";
+ page[..data.len()].copy_from_slice(data);
+
+ // Write a record with wrong CRC
+ let wrong_crc = 0xBADBADBA;
+ let record = Checksum::new(data.len() as u16, wrong_crc);
+
+ let crc_start = physical_page_size - CHECKSUM_SIZE_USIZE;
+ page[crc_start..].copy_from_slice(&record.to_bytes());
+
+ // Should fail validation (return None)
+ let validated = Checksum::validate_page(&page);
+ assert!(validated.is_none());
+ }
+
+ #[test]
+ fn test_validate_page_corrupted_data() {
+ let logical_page_size = 64usize;
+ let physical_page_size = logical_page_size + CHECKSUM_SIZE_USIZE;
+ let mut page = vec![0u8; physical_page_size];
+
+ // Write some data and compute correct CRC
+ let data = b"hello world";
+ page[..data.len()].copy_from_slice(data);
+ let crc = crc32fast::hash(&page[..data.len()]);
+ let record = Checksum::new(data.len() as u16, crc);
+
+ let crc_start = physical_page_size - CHECKSUM_SIZE_USIZE;
+ page[crc_start..].copy_from_slice(&record.to_bytes());
+
+ // Corrupt the data
+ page[0] = 0xFF;
+
+ // Should fail validation (return None)
+ let validated = Checksum::validate_page(&page);
+ assert!(validated.is_none());
+ }
+
+ #[test]
+ fn test_validate_page_uses_larger_len() {
+ let logical_page_size = 64usize;
+ let physical_page_size = logical_page_size + CHECKSUM_SIZE_USIZE;
+ let mut page = vec![0u8; physical_page_size];
+
+ // Write data and compute CRC for the larger portion
+ let data = b"hello world, this is longer";
+ page[..data.len()].copy_from_slice(data);
+ let crc = crc32fast::hash(&page[..data.len()]);
+
+ // Create a record where len2 has the valid CRC for longer data
+ let record = Checksum {
+ len1: 5,
+ crc1: 0xDEADBEEF, // Invalid CRC for shorter data
+ len2: data.len() as u16,
+ crc2: crc,
+ };
+
+ let crc_start = physical_page_size - CHECKSUM_SIZE_USIZE;
+ page[crc_start..].copy_from_slice(&record.to_bytes());
+
+ // Should validate using len2/crc2 since len2 > len1
+ let validated = Checksum::validate_page(&page);
+ assert!(validated.is_some());
+ let (len, _) = validated.unwrap().get_crc();
+ assert_eq!(len as usize, data.len());
+ }
+
+ #[test]
+ fn test_validate_page_uses_fallback() {
+ let logical_page_size = 64usize;
+ let physical_page_size = logical_page_size + CHECKSUM_SIZE_USIZE;
+ let mut page = vec![0u8; physical_page_size];
+
+ // Write data
+ let data = b"fallback data";
+ page[..data.len()].copy_from_slice(data);
+ let valid_crc = crc32fast::hash(&page[..data.len()]);
+ let valid_len = data.len() as u16;
+
+ // Create a record where:
+ // len1 is larger (primary) but INVALID
+ // len2 is smaller (fallback) but VALID
+ let record = Checksum {
+ len1: valid_len + 10, // Larger, so it's primary
+ crc1: 0xBAD1DEA, // Invalid CRC
+ len2: valid_len, // Smaller, so it's fallback
+ crc2: valid_crc, // Valid CRC
+ };
+
+ let crc_start = physical_page_size - CHECKSUM_SIZE_USIZE;
+ page[crc_start..].copy_from_slice(&record.to_bytes());
+
+ // Should validate using the fallback (len2)
+ let validated = Checksum::validate_page(&page);
+
+ assert!(validated.is_some(), "Should have validated using fallback");
+ let validated = validated.unwrap();
+ let (len, crc) = validated.get_crc();
+ assert_eq!(len, valid_len);
+ assert_eq!(crc, valid_crc);
+
+ // Verify that the invalid primary was zeroed out
+ assert_eq!(validated.len1, 0);
+ assert_eq!(validated.crc1, 0);
+ }
+
+ #[test]
+ fn test_validate_page_no_fallback_available() {
+ let logical_page_size = 64usize;
+ let physical_page_size = logical_page_size + CHECKSUM_SIZE_USIZE;
+ let mut page = vec![0u8; physical_page_size];
+
+ // Write some data
+ let data = b"some data";
+ page[..data.len()].copy_from_slice(data);
+
+ // Create a record where:
+ // len1 > 0 (primary) but with INVALID CRC
+ // len2 = 0 (no fallback available)
+ let record = Checksum {
+ len1: data.len() as u16,
+ crc1: 0xBAD1DEA, // Invalid CRC
+ len2: 0, // No fallback
+ crc2: 0,
+ };
+
+ let crc_start = physical_page_size - CHECKSUM_SIZE_USIZE;
+ page[crc_start..].copy_from_slice(&record.to_bytes());
+
+ // Should fail validation since primary is invalid and no fallback exists
+ let validated = Checksum::validate_page(&page);
+ assert!(
+ validated.is_none(),
+ "Should fail when primary is invalid and fallback has len=0"
+ );
+ }
+
+ #[cfg(feature = "arbitrary")]
+ mod conformance {
+ use super::*;
+ use commonware_codec::conformance::CodecConformance;
+
+ commonware_conformance::conformance_tests! {
+ CodecConformance<Checksum>,
+ }
+ }
+}
diff --git a/runtime/src/utils/buffer/pool.rs b/runtime/src/utils/buffer/pool/page_cache.rs
similarity index 58%
rename from runtime/src/utils/buffer/pool.rs
rename to runtime/src/utils/buffer/pool/page_cache.rs
index 7ab0381b24..c7ddaf31ba 100644
--- a/runtime/src/utils/buffer/pool.rs
+++ b/runtime/src/utils/buffer/pool/page_cache.rs
@@ -1,25 +1,29 @@
+//! A buffer pool for caching _logical_ pages of [Blob] data in memory. The buffer pool is unaware
+//! of the physical page format used by the blob, which is left to the blob implementation.
+
+use super::get_page_from_blob;
use crate::{Blob, Error, RwLock};
use commonware_utils::StableBuf;
use futures::{future::Shared, FutureExt};
use std::{
collections::{hash_map::Entry, HashMap},
future::Future,
- num::NonZeroUsize,
+ num::{NonZeroU16, NonZeroUsize},
pin::Pin,
sync::{
atomic::{AtomicBool, AtomicU64, Ordering},
Arc,
},
};
-use tracing::{debug, trace};
+use tracing::{debug, error, trace};
// Type alias for the future we'll be storing for each in-flight page fetch.
//
// We wrap [Error] in an Arc so it will be cloneable, which is required for the future to be
-// [Shared].
+// [Shared]. The StableBuf contains only the logical (validated) bytes of the page.
type PageFetchFut = Shared<Pin<Box<dyn Future<Output = Result<StableBuf, Arc<Error>>> + Send>>>;
-/// A [Pool] caches pages of [Blob] data in memory.
+/// A [Pool] caches pages of [Blob] data in memory after verifying the integrity of each.
///
/// A single buffer pool can be used to cache data from multiple blobs by assigning a unique id to
/// each.
@@ -63,17 +67,23 @@ struct CacheEntry {
/// A bit indicating whether this page was recently referenced.
referenced: AtomicBool,
- /// The cached page itself.
+ /// The cached page itself. Only logical bytes are cached, so the vector will be 12 bytes shorter
+ /// than the physical page size.
data: Vec<u8>,
}
-/// A reference to a [Pool] that can be shared across threads via cloning, along with the page size
-/// that will be used with it. Provides the API for interacting with the buffer pool in a
+/// A reference to a page cache that can be shared across threads via cloning, along with the page
+/// size that will be used with it. Provides the API for interacting with the buffer pool in a
/// thread-safe manner.
#[derive(Clone)]
pub struct PoolRef {
- /// The size of each page in the buffer pool.
- pub(super) page_size: usize,
+ /// The size of each page in the underlying blobs managed by this buffer pool.
+ ///
+ /// # Warning
+ ///
+ /// You cannot change the page size once data has been written without invalidating it. (Reads
+ /// on blobs that were written with a different page size will fail their integrity check.)
+ page_size: u64,
/// The next id to assign to a blob that will be managed by this pool.
next_id: Arc<AtomicU64>,
@@ -83,49 +93,52 @@ pub struct PoolRef {
}
impl PoolRef {
- /// Returns a new [PoolRef] with the given `page_size` and `capacity`.
- pub fn new(page_size: NonZeroUsize, capacity: NonZeroUsize) -> Self {
+ /// Returns a new [PoolRef] that will buffer up to `capacity` pages with the
+ /// given `page_size`.
+ pub fn new(page_size: NonZeroU16, capacity: NonZeroUsize) -> Self {
+ let page_size = page_size.get() as u64;
+
Self {
- page_size: page_size.get(),
+ page_size,
next_id: Arc::new(AtomicU64::new(0)),
pool: Arc::new(RwLock::new(Pool::new(capacity.get()))),
}
}
+ /// The page size used by this buffer pool.
+ #[inline]
+ pub const fn page_size(&self) -> u64 {
+ self.page_size
+ }
+
/// Returns a unique id for the next blob that will use this buffer pool.
pub async fn next_id(&self) -> u64 {
self.next_id.fetch_add(1, Ordering::Relaxed)
}
- /// Convert an offset into the number of the page it belongs to and the offset within that page.
- pub const fn offset_to_page(&self, offset: u64) -> (u64, usize) {
+ /// Convert a logical offset into the number of the page it belongs to and the offset within
+ /// that page.
+ pub const fn offset_to_page(&self, offset: u64) -> (u64, u64) {
Pool::offset_to_page(self.page_size, offset)
}
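
As a quick, hypothetical check of this arithmetic with a 103-byte logical page (the size used by the reader tests later in this diff): logical offset 250 falls 44 bytes into page 2.

```rust
// Hypothetical sanity check of the logical-offset-to-page arithmetic.
fn offset_to_page(page_size: u64, offset: u64) -> (u64, u64) {
    (offset / page_size, offset % page_size)
}

fn main() {
    assert_eq!(offset_to_page(103, 0), (0, 0));
    assert_eq!(offset_to_page(103, 103), (1, 0)); // first byte of page 1
    assert_eq!(offset_to_page(103, 250), (2, 44)); // 250 = 2 * 103 + 44
}
```
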
/// Try to read the specified bytes from the buffer pool cache only. Returns the number of
/// bytes successfully read from cache and copied to `buf` before a page fault, if any.
- ///
- /// This method never reads from the underlying blob - it only checks the cache.
- ///
- /// # Warning
- ///
- /// Attempts to read any of the last (blob_size % page_size) "trailing bytes" of the blob will
- /// always return 0 since the buffer pool only deals with page sized chunks.
pub(super) async fn read_cached(
&self,
blob_id: u64,
mut buf: &mut [u8],
- mut offset: u64,
+ mut logical_offset: u64,
) -> usize {
let original_len = buf.len();
let buffer_pool = self.pool.read().await;
while !buf.is_empty() {
- let count = buffer_pool.read_at(self.page_size, blob_id, buf, offset);
+ let count = buffer_pool.read_at(self.page_size, blob_id, buf, logical_offset);
if count == 0 {
// Cache miss - return how many bytes we successfully read
break;
}
- offset += count as u64;
+ logical_offset += count as u64;
buf = &mut buf[count..];
}
original_len - buf.len()
@@ -133,13 +146,6 @@ impl PoolRef {
/// Read the specified bytes, preferentially from the buffer pool cache. Bytes not found in the
/// buffer pool will be read from the provided `blob` and cached for future reads.
- ///
- /// # Warning
- ///
- /// Attempts to read any of the last (blob_size % page_size) "trailing bytes" of the blob will
- /// result in a ReadFailed error since the buffer pool only deals with page sized chunks.
- /// Trailing bytes need to be dealt with outside of the buffer pool. For example,
- /// [crate::buffer::Append] uses a [crate::buffer::tip::Buffer] to buffer them.
pub(super) async fn read(
&self,
blob: &B,
@@ -172,10 +178,10 @@ impl PoolRef {
Ok(())
}
- /// Fetch the specified page after encountering a page fault, which may involve retrieving it
+ /// Fetch the requested page after encountering a page fault, which may involve retrieving it
/// from `blob` & caching the result in `pool`. Returns the number of bytes read, which should
/// always be non-zero.
- async fn read_after_page_fault(
+ pub(super) async fn read_after_page_fault(
&self,
blob: &B,
blob_id: u64,
@@ -185,7 +191,7 @@ impl PoolRef {
assert!(!buf.is_empty());
let (page_num, offset_in_page) = Pool::offset_to_page(self.page_size, offset);
- let page_size = self.page_size;
+ let offset_in_page = offset_in_page as usize;
trace!(page_num, blob_id, "page fault");
// Create or clone a future that retrieves the desired page from the underlying blob. This
@@ -196,7 +202,7 @@ impl PoolRef {
// There's a (small) chance the page was fetched & buffered by another task before we
// were able to acquire the write lock, so check the cache before doing anything else.
- let count = pool.read_at(page_size, blob_id, buf, offset);
+ let count = pool.read_at(self.page_size, blob_id, buf, offset);
if count != 0 {
return Ok(count);
}
@@ -208,12 +214,27 @@ impl PoolRef {
(o.get().clone(), false)
}
Entry::Vacant(v) => {
- // Nobody is currently fetching this page, so create a future that will do the work.
+ // Nobody is currently fetching this page, so create a future that will do the
+ // work. get_page_from_blob handles CRC validation and returns only logical bytes.
let blob = blob.clone();
+ let page_size = self.page_size;
let future = async move {
- blob.read_at(vec![0; page_size], page_num * page_size as u64)
+ let page = get_page_from_blob(&blob, page_num, page_size)
.await
- .map_err(Arc::new)
+ .map_err(Arc::new)?;
+ // We should never be fetching partial pages through the buffer pool. This can happen
+ // if a non-last page is corrupted and falls back to a partial CRC.
+ let len = page.as_ref().len();
+ if len != page_size as usize {
+ error!(
+ page_num,
+ expected = page_size,
+ actual = len,
+ "attempted to fetch partial page from blob"
+ );
+ return Err(Arc::new(Error::InvalidChecksum));
+ }
+ Ok(page)
};
// Make the future shareable and insert it into the map.
@@ -231,10 +252,11 @@ impl PoolRef {
let fetch_result = fetch_future.await;
if !is_first_fetcher {
// Copy the requested portion of the page into the buffer and return immediately.
- let page_buf: Vec<u8> = fetch_result.map_err(|_| Error::ReadFailed)?.into();
- let bytes_to_copy = std::cmp::min(buf.len(), page_size - offset_in_page);
- buf[..bytes_to_copy]
- .copy_from_slice(&page_buf[offset_in_page..offset_in_page + bytes_to_copy]);
+ let page_buf = fetch_result.map_err(|_| Error::ReadFailed)?;
+ let bytes_to_copy = std::cmp::min(buf.len(), page_buf.as_ref().len() - offset_in_page);
+ buf[..bytes_to_copy].copy_from_slice(
+ &page_buf.as_ref()[offset_in_page..offset_in_page + bytes_to_copy],
+ );
return Ok(bytes_to_copy);
}
@@ -247,43 +269,42 @@ impl PoolRef {
// Remove the entry from `page_fetches`.
let _ = pool.page_fetches.remove(&(blob_id, page_num));
- // Cache the result in the buffer pool.
- let Ok(page_buf) = fetch_result else {
- return Err(Error::ReadFailed);
+ // Cache the result in the buffer pool. get_page_from_blob already validated the CRC.
+ let page_buf = match fetch_result {
+ Ok(page_buf) => page_buf,
+ Err(err) => {
+ error!(page_num, ?err, "Page fetch failed");
+ return Err(Error::ReadFailed);
+ }
};
- pool.cache(page_size, blob_id, page_buf.as_ref(), page_num);
+
+ pool.cache(self.page_size, blob_id, page_buf.as_ref(), page_num);
// Copy the requested portion of the page into the buffer.
- let page_buf: Vec<u8> = page_buf.into();
- let bytes_to_copy = std::cmp::min(buf.len(), page_size - offset_in_page);
+ let bytes_to_copy = std::cmp::min(buf.len(), page_buf.as_ref().len() - offset_in_page);
buf[..bytes_to_copy]
- .copy_from_slice(&page_buf[offset_in_page..offset_in_page + bytes_to_copy]);
+ .copy_from_slice(&page_buf.as_ref()[offset_in_page..offset_in_page + bytes_to_copy]);
Ok(bytes_to_copy)
}
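
The fault path above leans on `futures::future::Shared` so that concurrent readers of the same uncached page await a single fetch rather than issuing duplicate blob reads. A reduced sketch of that pattern, assuming only the `futures` crate (the map, error type, and placeholder fetch are stand-ins, not the pool's actual fields):

```rust
use futures::future::{FutureExt, Shared};
use std::{collections::HashMap, pin::Pin, sync::Arc};

// Stand-in for the pool's in-flight map value: a cloneable, shareable page fetch.
type PageFuture =
    Shared<Pin<Box<dyn std::future::Future<Output = Result<Arc<Vec<u8>>, Arc<String>>> + Send>>>;

/// The first caller for a page inserts a shared future; later callers clone and await it.
fn fetch_shared(in_flight: &mut HashMap<u64, PageFuture>, page_num: u64) -> PageFuture {
    in_flight
        .entry(page_num)
        .or_insert_with(|| {
            async move {
                // Placeholder for the real blob read + CRC validation.
                let _ = page_num;
                Ok::<_, Arc<String>>(Arc::new(vec![0u8; 1024]))
            }
            .boxed()
            .shared()
        })
        .clone()
}
```

Because the output is `Clone` (with the error behind an `Arc`), every waiter receives its own copy of the result, which is why the real code wraps `Error` in an `Arc`.
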
- /// Cache the provided slice of data in the buffer pool, returning the remaining bytes that
+ /// Cache the provided pages of data in the buffer pool, returning the remaining bytes that
/// didn't fill a whole page. `offset` must be page aligned.
///
- /// If the next page index would overflow `u64`, caching stops and the uncached bytes are
- /// returned. This can only occur with 1-byte pages on 64-bit architectures. On 32-bit
- /// architectures it cannot occur because the buffer length is bounded by `usize::MAX` (2^32-1),
- /// so even starting at page `u64::MAX` with 1-byte pages, at most 2^32-1 pages can be cached.
- /// On 64-bit architectures with page_size >= 2, the maximum starting page (`u64::MAX / 2`)
- /// plus maximum cacheable pages (`usize::MAX / 2`) equals `u64::MAX - 1`.
- ///
/// # Panics
///
- /// Panics if `offset` is not page aligned.
+ /// - Panics if `offset` is not page aligned.
+ /// - Panics if the buffer is not the size of a page.
pub async fn cache(&self, blob_id: u64, mut buf: &[u8], offset: u64) -> usize {
let (mut page_num, offset_in_page) = self.offset_to_page(offset);
assert_eq!(offset_in_page, 0);
{
// Write lock the buffer pool.
+ let page_size = self.page_size as usize;
let mut buffer_pool = self.pool.write().await;
- while buf.len() >= self.page_size {
- buffer_pool.cache(self.page_size, blob_id, &buf[..self.page_size], page_num);
- buf = &buf[self.page_size..];
+ while buf.len() >= page_size {
+ buffer_pool.cache(self.page_size, blob_id, &buf[..page_size], page_num);
+ buf = &buf[page_size..];
page_num = match page_num.checked_add(1) {
Some(next) => next,
None => break,
@@ -314,11 +335,8 @@ impl Pool {
}
/// Convert an offset into the number of the page it belongs to and the offset within that page.
- const fn offset_to_page(page_size: usize, offset: u64) -> (u64, usize) {
- (
- offset / page_size as u64,
- (offset % page_size as u64) as usize,
- )
+ const fn offset_to_page(page_size: u64, offset: u64) -> (u64, u64) {
+ (offset / page_size, offset % page_size)
}
/// Attempt to fetch blob data starting at `offset` from the buffer pool. Returns the number of
@@ -326,8 +344,8 @@ impl Pool {
/// never more than `self.page_size` or the length of `buf`. The returned bytes won't cross a
/// page boundary, so multiple reads may be required even if all data in the desired range is
/// buffered.
- fn read_at(&self, page_size: usize, blob_id: u64, buf: &mut [u8], offset: u64) -> usize {
- let (page_num, offset_in_page) = Self::offset_to_page(page_size, offset);
+ fn read_at(&self, page_size: u64, blob_id: u64, buf: &mut [u8], logical_offset: u64) -> usize {
+ let (page_num, offset_in_page) = Self::offset_to_page(page_size, logical_offset);
let page_index = self.index.get(&(blob_id, page_num));
let Some(&page_index) = page_index else {
return 0;
@@ -337,20 +355,18 @@ impl Pool {
page.referenced.store(true, Ordering::Relaxed);
let page = &page.data;
- let bytes_to_copy = std::cmp::min(buf.len(), page_size - offset_in_page);
- buf[..bytes_to_copy].copy_from_slice(&page[offset_in_page..offset_in_page + bytes_to_copy]);
+ let logical_page_size = page_size as usize;
+ let bytes_to_copy = std::cmp::min(buf.len(), logical_page_size - offset_in_page as usize);
+ buf[..bytes_to_copy].copy_from_slice(
+ &page[offset_in_page as usize..offset_in_page as usize + bytes_to_copy],
+ );
bytes_to_copy
}
/// Put the given `page` into the buffer pool.
- ///
- /// # Panics
- ///
- /// Panics if the provided page is not exactly PAGE_SIZE bytes long.
- fn cache(&mut self, page_size: usize, blob_id: u64, page: &[u8], page_num: u64) {
- assert_eq!(page.len(), page_size);
-
+ fn cache(&mut self, page_size: u64, blob_id: u64, page: &[u8], page_num: u64) {
+ assert_eq!(page.len(), page_size as usize);
let key = (blob_id, page_num);
let index_entry = self.index.entry(key);
if let Entry::Occupied(index_entry) = index_entry {
@@ -400,52 +416,59 @@ impl Pool {
#[cfg(test)]
mod tests {
- use super::*;
- use crate::{deterministic, Runner as _, Storage as _};
+ use super::{super::Checksum, *};
+ use crate::{buffer::pool::CHECKSUM_SIZE, deterministic, Runner as _, Storage as _};
use commonware_macros::test_traced;
- use commonware_utils::NZUsize;
+ use commonware_utils::{NZUsize, NZU16};
+ use std::num::NonZeroU16;
- const PAGE_SIZE: usize = 1024;
+ // Logical page size (what PoolRef uses and what gets cached).
+ const PAGE_SIZE: NonZeroU16 = NZU16!(1024);
+ const PAGE_SIZE_U64: u64 = PAGE_SIZE.get() as u64;
#[test_traced]
fn test_pool_basic() {
let mut pool: Pool = Pool::new(10);
- let mut buf = vec![0; PAGE_SIZE];
- let bytes_read = pool.read_at(PAGE_SIZE, 0, &mut buf, 0);
+ // Cache stores logical-sized pages.
+ let mut buf = vec![0; PAGE_SIZE.get() as usize];
+ let bytes_read = pool.read_at(PAGE_SIZE_U64, 0, &mut buf, 0);
assert_eq!(bytes_read, 0);
- pool.cache(PAGE_SIZE, 0, &[1; PAGE_SIZE], 0);
- let bytes_read = pool.read_at(PAGE_SIZE, 0, &mut buf, 0);
- assert_eq!(bytes_read, PAGE_SIZE);
- assert_eq!(buf, [1; PAGE_SIZE]);
+ pool.cache(PAGE_SIZE_U64, 0, &[1; PAGE_SIZE.get() as usize], 0);
+ let bytes_read = pool.read_at(PAGE_SIZE_U64, 0, &mut buf, 0);
+ assert_eq!(bytes_read, PAGE_SIZE.get() as usize);
+ assert_eq!(buf, [1; PAGE_SIZE.get() as usize]);
// Test replacement -- should log a duplicate page warning but still work.
- pool.cache(PAGE_SIZE, 0, &[2; PAGE_SIZE], 0);
- let bytes_read = pool.read_at(PAGE_SIZE, 0, &mut buf, 0);
- assert_eq!(bytes_read, PAGE_SIZE);
- assert_eq!(buf, [2; PAGE_SIZE]);
+ pool.cache(PAGE_SIZE_U64, 0, &[2; PAGE_SIZE.get() as usize], 0);
+ let bytes_read = pool.read_at(PAGE_SIZE_U64, 0, &mut buf, 0);
+ assert_eq!(bytes_read, PAGE_SIZE.get() as usize);
+ assert_eq!(buf, [2; PAGE_SIZE.get() as usize]);
// Test exceeding the cache capacity.
for i in 0u64..11 {
- pool.cache(PAGE_SIZE, 0, &[i as u8; PAGE_SIZE], i);
+ pool.cache(PAGE_SIZE_U64, 0, &[i as u8; PAGE_SIZE.get() as usize], i);
}
// Page 0 should have been evicted.
- let bytes_read = pool.read_at(PAGE_SIZE, 0, &mut buf, 0);
+ let bytes_read = pool.read_at(PAGE_SIZE_U64, 0, &mut buf, 0);
assert_eq!(bytes_read, 0);
// Page 1-10 should be in the cache.
for i in 1u64..11 {
- let bytes_read = pool.read_at(PAGE_SIZE, 0, &mut buf, i * PAGE_SIZE as u64);
- assert_eq!(bytes_read, PAGE_SIZE);
- assert_eq!(buf, [i as u8; PAGE_SIZE]);
+ let bytes_read = pool.read_at(PAGE_SIZE_U64, 0, &mut buf, i * PAGE_SIZE_U64);
+ assert_eq!(bytes_read, PAGE_SIZE.get() as usize);
+ assert_eq!(buf, [i as u8; PAGE_SIZE.get() as usize]);
}
// Test reading from an unaligned offset by adding 2 to an aligned offset. The read
- // should be 2 bytes short of a full page.
- let mut buf = vec![0; PAGE_SIZE];
- let bytes_read = pool.read_at(PAGE_SIZE, 0, &mut buf, PAGE_SIZE as u64 + 2);
- assert_eq!(bytes_read, PAGE_SIZE - 2);
- assert_eq!(&buf[..PAGE_SIZE - 2], [1; PAGE_SIZE - 2]);
+ // should be 2 bytes short of a full logical page.
+ let mut buf = vec![0; PAGE_SIZE.get() as usize];
+ let bytes_read = pool.read_at(PAGE_SIZE_U64, 0, &mut buf, PAGE_SIZE_U64 + 2);
+ assert_eq!(bytes_read, PAGE_SIZE.get() as usize - 2);
+ assert_eq!(
+ &buf[..PAGE_SIZE.get() as usize - 2],
+ [1; PAGE_SIZE.get() as usize - 2]
+ );
}
#[test_traced]
@@ -454,39 +477,50 @@ mod tests {
let executor = deterministic::Runner::default();
// Start the test within the executor
executor.start(|context| async move {
- // Populate a blob with 11 consecutive pages of data.
+ // Physical page size = logical + CRC record.
+ let physical_page_size = PAGE_SIZE_U64 + CHECKSUM_SIZE;
+
+ // Populate a blob with 11 consecutive pages of CRC-protected data.
let (blob, size) = context
.open("test", "blob".as_bytes())
.await
.expect("Failed to open blob");
assert_eq!(size, 0);
for i in 0..11 {
- let buf = vec![i as u8; PAGE_SIZE];
- blob.write_at(buf, i * PAGE_SIZE as u64).await.unwrap();
+ // Write logical data followed by Checksum.
+ let logical_data = vec![i as u8; PAGE_SIZE.get() as usize];
+ let crc = crc32fast::hash(&logical_data);
+ let record = Checksum::new(PAGE_SIZE.get(), crc);
+ let mut page_data = logical_data;
+ page_data.extend_from_slice(&record.to_bytes());
+ blob.write_at(page_data, i * physical_page_size)
+ .await
+ .unwrap();
}
- // Fill the buffer pool with the blob's data.
- let pool_ref = PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(10));
+ // Fill the buffer pool with the blob's data via PoolRef::read.
+ let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(10));
assert_eq!(pool_ref.next_id().await, 0);
assert_eq!(pool_ref.next_id().await, 1);
for i in 0..11 {
- let mut buf = vec![0; PAGE_SIZE];
+ // Read expects logical bytes only (CRCs are stripped).
+ let mut buf = vec![0; PAGE_SIZE.get() as usize];
pool_ref
- .read(&blob, 0, &mut buf, i * PAGE_SIZE as u64)
+ .read(&blob, 0, &mut buf, i * PAGE_SIZE_U64)
.await
.unwrap();
- assert_eq!(buf, [i as u8; PAGE_SIZE]);
+ assert_eq!(buf, [i as u8; PAGE_SIZE.get() as usize]);
}
// Repeat the read to exercise reading from the buffer pool. Must start at 1 because
// page 0 should be evicted.
for i in 1..11 {
- let mut buf = vec![0; PAGE_SIZE];
+ let mut buf = vec![0; PAGE_SIZE.get() as usize];
pool_ref
- .read(&blob, 0, &mut buf, i * PAGE_SIZE as u64)
+ .read(&blob, 0, &mut buf, i * PAGE_SIZE_U64)
.await
.unwrap();
- assert_eq!(buf, [i as u8; PAGE_SIZE]);
+ assert_eq!(buf, [i as u8; PAGE_SIZE.get() as usize]);
}
// Cleanup.
@@ -498,41 +532,63 @@ mod tests {
fn test_pool_cache_max_page() {
let executor = deterministic::Runner::default();
executor.start(|_context| async move {
- let pool_ref = PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(2));
+ let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(2));
// Use the largest page-aligned offset representable for the configured PAGE_SIZE.
- let aligned_max_offset = u64::MAX - (u64::MAX % PAGE_SIZE as u64);
+ let aligned_max_offset = u64::MAX - (u64::MAX % PAGE_SIZE_U64);
+
+ // PoolRef::cache expects only logical bytes (no CRC).
+ let logical_data = vec![42u8; PAGE_SIZE.get() as usize];
// Caching exactly one page at the maximum offset should succeed.
let remaining = pool_ref
- .cache(0, vec![42; PAGE_SIZE].as_slice(), aligned_max_offset)
+ .cache(0, logical_data.as_slice(), aligned_max_offset)
.await;
assert_eq!(remaining, 0);
- let mut buf = vec![0u8; PAGE_SIZE];
+ // Reading from the pool should return the logical bytes.
+ let mut buf = vec![0u8; PAGE_SIZE.get() as usize];
let pool = pool_ref.pool.read().await;
- let bytes_read = pool.read_at(PAGE_SIZE, 0, &mut buf, aligned_max_offset);
- assert_eq!(bytes_read, PAGE_SIZE);
+ let bytes_read = pool.read_at(PAGE_SIZE_U64, 0, &mut buf, aligned_max_offset);
+ assert_eq!(bytes_read, PAGE_SIZE.get() as usize);
assert!(buf.iter().all(|b| *b == 42));
});
}
#[test_traced]
- fn test_pool_cache_page_overflow_partial() {
+ fn test_pool_cache_at_high_offset() {
let executor = deterministic::Runner::default();
executor.start(|_context| async move {
- // Use the minimum page size to force the page index to reach u64::MAX and trigger the
- // overflow guard.
- let pool_ref = PoolRef::new(NZUsize!(1), NZUsize!(2));
-
- // Caching across the maximum page should stop before overflow and report the remainder.
- let remaining = pool_ref.cache(0, &[1, 2], u64::MAX).await;
- assert_eq!(remaining, 1);
+ // Use the minimum page size (CHECKSUM_SIZE + 1 = 13) with high offset.
+ const MIN_PAGE_SIZE: u64 = CHECKSUM_SIZE + 1;
+ let pool_ref = PoolRef::new(NZU16!(MIN_PAGE_SIZE as u16), NZUsize!(2));
+
+ // Create two pages worth of logical data (no CRCs - PoolRef::cache expects logical only).
+ let data = vec![1u8; MIN_PAGE_SIZE as usize * 2];
+
+ // Cache pages at a high (but not max) aligned offset so we can verify both pages.
+ // Use an offset that's a few pages below max to avoid overflow when verifying.
+ let aligned_max_offset = u64::MAX - (u64::MAX % MIN_PAGE_SIZE);
+ let high_offset = aligned_max_offset - (MIN_PAGE_SIZE * 2);
+ let remaining = pool_ref.cache(0, &data, high_offset).await;
+ // Both pages should be cached.
+ assert_eq!(remaining, 0);
- let mut buf = [0u8; 1];
+ // Verify the first page was cached correctly.
+ let mut buf = vec![0u8; MIN_PAGE_SIZE as usize];
let pool = pool_ref.pool.read().await;
- assert_eq!(pool.read_at(1, 0, &mut buf, u64::MAX), 1);
- assert_eq!(buf, [1]);
+ assert_eq!(
+ pool.read_at(MIN_PAGE_SIZE, 0, &mut buf, high_offset),
+ MIN_PAGE_SIZE as usize
+ );
+ assert!(buf.iter().all(|b| *b == 1));
+
+ // Verify the second page was cached correctly.
+ assert_eq!(
+ pool.read_at(MIN_PAGE_SIZE, 0, &mut buf, high_offset + MIN_PAGE_SIZE),
+ MIN_PAGE_SIZE as usize
+ );
+ assert!(buf.iter().all(|b| *b == 1));
});
}
}
diff --git a/runtime/src/utils/buffer/pool/read.rs b/runtime/src/utils/buffer/pool/read.rs
new file mode 100644
index 0000000000..95a13d1647
--- /dev/null
+++ b/runtime/src/utils/buffer/pool/read.rs
@@ -0,0 +1,470 @@
+use super::{Checksum, CHECKSUM_SIZE};
+use crate::{Blob, Error};
+use commonware_utils::StableBuf;
+use std::num::NonZeroUsize;
+use tracing::{debug, error};
+
+const CHECKSUM_SIZE_USIZE: usize = CHECKSUM_SIZE as usize;
+
+/// A reader that buffers content from a [Blob] with page-level CRCs to optimize the performance of
+/// a full scan of contents.
+pub struct Read<B: Blob> {
+ /// The underlying blob to read from.
+ blob: B,
+ /// The physical size of the blob (always a multiple of physical page size).
+ physical_blob_size: u64,
+ /// The logical size of the blob (actual data bytes, not including CRCs or padding).
+ logical_blob_size: u64,
+ /// The buffer storing the data read from the blob. The buffer stores logical bytes only.
+ buffer: Vec<u8>,
+ /// The current page in the blob from where the buffer was filled (the buffer always starts at a
+ /// page boundary).
+ blob_page: u64,
+ /// The current position within the buffer containing the next byte to be read.
+ buffer_position: usize,
+ /// The capacity of the buffer. We always fully fill the buffer, unless we are at the end of
+ /// the blob. The buffer capacity must be a multiple of the page size.
+ buffer_capacity: usize,
+ /// The physical page size of each full page in the blob, including its 12-byte Checksum.
+ page_size: usize,
+}
+
+impl<B: Blob> Read<B> {
+ /// Creates a new `Read` that reads from the given blob with the specified buffer size. The
+ /// `logical_page_size` is the size of the logical data portion of each page (not including the
+ /// Checksum). If the buffer capacity is not a multiple of the physical page size, it will be
+ /// rounded up to the nearest multiple.
+ ///
+ /// The `physical_blob_size` is the size of the underlying blob on disk (must be a multiple of
+ /// the physical page size). The `logical_blob_size` is the actual data size (not including
+ /// CRCs or padding in partial pages).
+ pub fn new(
+ blob: B,
+ physical_blob_size: u64,
+ logical_blob_size: u64,
+ capacity: NonZeroUsize,
+ logical_page_size: NonZeroUsize,
+ ) -> Self {
+ let page_size = logical_page_size.get() + CHECKSUM_SIZE_USIZE;
+ let mut capacity = capacity.get();
+ if !capacity.is_multiple_of(page_size) {
+ capacity += page_size - capacity % page_size;
+ debug!(
+ capacity,
+ "rounded buffer capacity up to nearest multiple of page_size"
+ );
+ }
+
+ Self {
+ blob,
+ physical_blob_size,
+ logical_blob_size,
+ buffer: Vec::with_capacity(capacity),
+ blob_page: 0,
+ buffer_position: 0,
+ buffer_capacity: capacity,
+ page_size,
+ }
+ }
+
+ /// Returns the logical size of the blob in bytes.
+ pub const fn blob_size(&self) -> u64 {
+ self.logical_blob_size
+ }
+
+ /// Returns the current logical position in the blob.
+ pub const fn position(&self) -> u64 {
+ let logical_page_size = (self.page_size - CHECKSUM_SIZE_USIZE) as u64;
+ self.blob_page * logical_page_size + self.buffer_position as u64
+ }
+
+ /// Reads up to `buf.len()` bytes from the current position, but only as many as are available.
+ ///
+ /// This is useful for reading variable-length prefixes (like varints) where you want to read up
+ /// to a maximum number of bytes but the actual remaining bytes in the blob might be less.
+ ///
+ /// Returns the number of bytes actually read into the buffer, which will be in the range
+ /// [1, buf.len()] (or 0 if buf is empty).
+ pub async fn read_up_to(
+ &mut self,
+ buf: impl Into + Send,
+ ) -> Result<(StableBuf, usize), Error> {
+ let mut buf = buf.into();
+ if buf.is_empty() {
+ return Ok((buf, 0));
+ }
+ let current_pos = self.position();
+ let blob_size = self.blob_size();
+ let available = (blob_size.saturating_sub(current_pos) as usize).min(buf.len());
+ if available == 0 {
+ return Err(Error::BlobInsufficientLength);
+ }
+ self.read_exact(buf.as_mut(), available).await?;
+
+ Ok((buf, available))
+ }
+
+ /// Reads exactly `size` bytes into the provided buffer. Returns [Error::BlobInsufficientLength]
+ /// if not enough bytes are available.
+ ///
+ /// # Panics
+ ///
+ /// Panics if `size` is greater than the length of `buf`.
+ pub async fn read_exact(&mut self, buf: &mut [u8], size: usize) -> Result<(), Error> {
+ assert!(size <= buf.len());
+
+ let mut bytes_copied = 0;
+ while bytes_copied < size {
+ // Refill buffer if exhausted
+ if self.buffer_position >= self.buffer.len() {
+ self.fill_buffer().await?;
+ }
+
+ // Copy logical bytes
+ let available = self.buffer.len() - self.buffer_position;
+ // The buffer might be empty if we're at the end of the blob.
+ if available == 0 {
+ return Err(Error::BlobInsufficientLength);
+ }
+
+ let bytes_to_copy = (size - bytes_copied).min(available);
+ buf[bytes_copied..bytes_copied + bytes_to_copy].copy_from_slice(
+ &self.buffer[self.buffer_position..self.buffer_position + bytes_to_copy],
+ );
+
+ bytes_copied += bytes_to_copy;
+ self.buffer_position += bytes_to_copy;
+ }
+
+ Ok(())
+ }
+
+ /// Fills the buffer from the blob starting at the current physical position and verifies the
+ /// CRC of each page (including any trailing partial page).
+ async fn fill_buffer(&mut self) -> Result<(), Error> {
+ let logical_page_size = self.page_size - CHECKSUM_SIZE_USIZE;
+
+ // Advance blob_page based on how much of the buffer we've consumed. We use ceiling division
+ // because even a partial page counts as a "page" read from the blob.
+ let pages_consumed = self.buffer.len().div_ceil(logical_page_size);
+ self.blob_page += pages_consumed as u64;
+
+ // Reset position to the offset within the new page. If the buffer was not empty, we are
+ // continuing a sequential read, so we start at the beginning of the next page. If the
+ // buffer was empty (e.g. after a seek), we preserve the offset set by seek_to.
+ if !self.buffer.is_empty() {
+ self.buffer_position = 0;
+ }
+
+ // Calculate physical read parameters
+ let start_offset = match self.blob_page.checked_mul(self.page_size as u64) {
+ Some(o) => o,
+ None => return Err(Error::OffsetOverflow),
+ };
+
+ if start_offset >= self.physical_blob_size {
+ return Err(Error::BlobInsufficientLength);
+ }
+
+ let bytes_to_read =
+ ((self.physical_blob_size - start_offset) as usize).min(self.buffer_capacity);
+ if bytes_to_read == 0 {
+ return Err(Error::BlobInsufficientLength);
+ }
+
+ // Read physical data directly into the main buffer, then validate CRCs and compact in-place.
+ // This avoids allocating a separate staging buffer.
+ self.buffer.clear();
+ self.buffer.resize(bytes_to_read, 0);
+ let buf = std::mem::take(&mut self.buffer);
+ let buf = self.blob.read_at(buf, start_offset).await?;
+ self.buffer = buf.into();
+
+ // Validate CRCs and compact by removing CRC records in-place.
+ let mut read_offset = 0;
+ let mut write_offset = 0;
+ let physical_len = self.buffer.len();
+
+ while read_offset < physical_len {
+ let remaining = physical_len - read_offset;
+
+ // Check if full page or partial
+ if remaining >= self.page_size {
+ let page_slice = &self.buffer[read_offset..read_offset + self.page_size];
+ let Some(record) = Checksum::validate_page(page_slice) else {
+ error!(
+ page = self.blob_page + (read_offset / self.page_size) as u64,
+ "CRC mismatch"
+ );
+ return Err(Error::InvalidChecksum);
+ };
+ // For non-last pages, the validated length must equal logical_page_size.
+ let (len, _) = record.get_crc();
+ let len = len as usize;
+ let is_last_page = start_offset + read_offset as u64 + self.page_size as u64
+ >= self.physical_blob_size;
+ if !is_last_page && len != logical_page_size {
+ error!(
+ page = self.blob_page + (read_offset / self.page_size) as u64,
+ expected = logical_page_size,
+ actual = len,
+ "non-last page has partial length"
+ );
+ return Err(Error::InvalidChecksum);
+ }
+ // Compact: move logical data to remove CRC record gap
+ if write_offset != read_offset {
+ self.buffer
+ .copy_within(read_offset..read_offset + len, write_offset);
+ }
+ write_offset += len;
+ read_offset += self.page_size;
+ continue;
+ }
+
+ // Partial page - must have at least CHECKSUM_SIZE bytes
+ if remaining < CHECKSUM_SIZE_USIZE {
+ error!(
+ page = self.blob_page + (read_offset / self.page_size) as u64,
+ "short page"
+ );
+ return Err(Error::InvalidChecksum);
+ }
+ let page_slice = &self.buffer[read_offset..];
+ let Some(record) = Checksum::validate_page(page_slice) else {
+ error!(
+ page = self.blob_page + (read_offset / self.page_size) as u64,
+ "CRC mismatch"
+ );
+ return Err(Error::InvalidChecksum);
+ };
+ let (len, _) = record.get_crc();
+ let logical_len = len as usize;
+ // Compact: move logical data
+ if write_offset != read_offset {
+ self.buffer
+ .copy_within(read_offset..read_offset + logical_len, write_offset);
+ }
+ write_offset += logical_len;
+ break;
+ }
+
+ // Truncate buffer to only contain logical data
+ self.buffer.truncate(write_offset);
+
+ // If we sought to a position that is beyond the end of what we just read, error.
+ if self.buffer_position >= self.buffer.len() {
+ return Err(Error::BlobInsufficientLength);
+ }
+
+ Ok(())
+ }
+
+ /// Repositions the buffer to read from the specified logical position in the blob.
+ pub fn seek_to(&mut self, position: u64) -> Result<(), Error> {
+ let logical_page_size = (self.page_size - CHECKSUM_SIZE_USIZE) as u64;
+
+ // Check if the position is within the current buffer.
+ let buffer_start = self.blob_page * logical_page_size;
+ let buffer_end = buffer_start + self.buffer.len() as u64;
+ if position >= buffer_start && position < buffer_end {
+ self.buffer_position = (position - buffer_start) as usize;
+ return Ok(());
+ }
+
+ self.blob_page = position / logical_page_size;
+ self.buffer_position = (position % logical_page_size) as usize;
+ self.buffer.clear(); // Invalidate buffer, will be refilled on next read
+
+ Ok(())
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::super::{append::Append, PoolRef};
+ use crate::{deterministic, Blob, Error, Runner as _, Storage as _};
+ use commonware_macros::test_traced;
+ use commonware_utils::{NZUsize, NZU16};
+ use std::num::NonZeroU16;
+
+ const PAGE_SIZE: NonZeroU16 = NZU16!(103); // Logical page size (intentionally odd to test alignment)
+ const BUFFER_SIZE: usize = PAGE_SIZE.get() as usize * 2;
+
+ #[test_traced("DEBUG")]
+ fn test_read_after_append() {
+ let executor = deterministic::Runner::default();
+ executor.start(|context: deterministic::Context| async move {
+ // Create a blob and write data using Append
+ let (blob, blob_size) = context.open("test_partition", b"test_blob").await.unwrap();
+ assert_eq!(blob_size, 0);
+
+ let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(BUFFER_SIZE));
+ let append = Append::new(blob.clone(), blob_size, BUFFER_SIZE, pool_ref)
+ .await
+ .unwrap();
+
+ // Write data that spans multiple pages
+ let data: Vec<u8> = (0u8..=255).cycle().take(300).collect();
+ append.append(&data).await.unwrap();
+
+ // Create a Read to read the data back
+ let mut reader = append.as_blob_reader(NZUsize!(BUFFER_SIZE)).await.unwrap();
+
+ // Verify initial position
+ assert_eq!(reader.position(), 0);
+
+ // Read all data back
+ let mut read_buf = vec![0u8; 300];
+ reader.read_exact(&mut read_buf, 300).await.unwrap();
+ assert_eq!(read_buf, data);
+
+ // Verify position after read
+ assert_eq!(reader.position(), 300);
+ });
+ }
+
+ #[test_traced("DEBUG")]
+ fn test_read_with_seek() {
+ let executor = deterministic::Runner::default();
+ executor.start(|context: deterministic::Context| async move {
+ // Create a blob and write data using Append
+ let (blob, blob_size) = context.open("test_partition", b"test_blob").await.unwrap();
+
+ let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(BUFFER_SIZE));
+ let append = Append::new(blob.clone(), blob_size, BUFFER_SIZE, pool_ref)
+ .await
+ .unwrap();
+
+ // Write data that spans multiple pages (300 bytes = ~3 logical pages)
+ let data: Vec<u8> = (0u8..=255).cycle().take(300).collect();
+ append.append(&data).await.unwrap();
+
+ let mut reader = append.as_blob_reader(NZUsize!(BUFFER_SIZE)).await.unwrap();
+
+ // Read first 50 bytes
+ let mut buf = vec![0u8; 50];
+ reader.read_exact(&mut buf, 50).await.unwrap();
+ assert_eq!(buf, &data[0..50]);
+ assert_eq!(reader.position(), 50);
+
+ // Seek to middle of second page (position 150)
+ reader.seek_to(150).unwrap();
+ assert_eq!(reader.position(), 150);
+
+ // Read 50 bytes from position 150
+ reader.read_exact(&mut buf, 50).await.unwrap();
+ assert_eq!(buf, &data[150..200]);
+ assert_eq!(reader.position(), 200);
+
+ // Seek back to beginning
+ reader.seek_to(0).unwrap();
+ assert_eq!(reader.position(), 0);
+
+ // Read all data to verify seek worked
+ let mut full_buf = vec![0u8; 300];
+ reader.read_exact(&mut full_buf, 300).await.unwrap();
+ assert_eq!(full_buf, data);
+ });
+ }
+
+ #[test_traced("DEBUG")]
+ fn test_read_partial_page() {
+ let executor = deterministic::Runner::default();
+ executor.start(|context: deterministic::Context| async move {
+ // Create a blob and write data that doesn't fill the last page
+ let (blob, blob_size) = context.open("test_partition", b"test_blob").await.unwrap();
+
+ let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(BUFFER_SIZE));
+ let append = Append::new(blob.clone(), blob_size, BUFFER_SIZE, pool_ref)
+ .await
+ .unwrap();
+
+ // Write exactly one full logical page plus 10 more bytes
+ let data: Vec<u8> = (1u8..=(PAGE_SIZE.get() + 10) as u8).collect();
+ assert_eq!(data.len(), PAGE_SIZE.get() as usize + 10);
+ append.append(&data).await.unwrap();
+
+ let mut reader = append.as_blob_reader(NZUsize!(BUFFER_SIZE)).await.unwrap();
+
+ // Read all data back
+ let mut read_buf = vec![0u8; data.len()];
+ reader.read_exact(&mut read_buf, data.len()).await.unwrap();
+ assert_eq!(read_buf, data);
+
+ // Verify we can seek to partial page and read
+ reader.seek_to(PAGE_SIZE.get() as u64).unwrap();
+ let mut partial_buf = vec![0u8; 10];
+ reader.read_exact(&mut partial_buf, 10).await.unwrap();
+ assert_eq!(partial_buf, &data[PAGE_SIZE.get() as usize..]);
+ });
+ }
+
+ #[test_traced("DEBUG")]
+ fn test_read_across_page_boundary() {
+ let executor = deterministic::Runner::default();
+ executor.start(|context: deterministic::Context| async move {
+ let (blob, blob_size) = context.open("test_partition", b"test_blob").await.unwrap();
+
+ let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(BUFFER_SIZE));
+ let append = Append::new(blob.clone(), blob_size, BUFFER_SIZE, pool_ref)
+ .await
+ .unwrap();
+
+ // Write 200 bytes spanning multiple pages
+ let data: Vec<u8> = (0u8..200).collect();
+ append.append(&data).await.unwrap();
+
+ let mut reader = append.as_blob_reader(NZUsize!(BUFFER_SIZE)).await.unwrap();
+
+ // Seek to position 90 (13 bytes before first page boundary at 103)
+ reader.seek_to(90).unwrap();
+
+ // Read 20 bytes across the page boundary
+ let mut buf = vec![0u8; 20];
+ reader.read_exact(&mut buf, 20).await.unwrap();
+ assert_eq!(buf, &data[90..110]);
+ });
+ }
+
+ #[test_traced("DEBUG")]
+ fn test_read_rejects_partial_crc_on_non_last_page() {
+ let executor = deterministic::Runner::default();
+ executor.start(|context: deterministic::Context| async move {
+ let (blob, blob_size) = context.open("test_partition", b"test_blob").await.unwrap();
+
+ let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(BUFFER_SIZE));
+ let append = Append::new(blob.clone(), blob_size, BUFFER_SIZE, pool_ref)
+ .await
+ .unwrap();
+
+ // Two full pages.
+ let data: Vec<u8> = (0u8..=255)
+ .cycle()
+ .take(PAGE_SIZE.get() as usize * 2)
+ .collect();
+ append.append(&data).await.unwrap();
+ append.sync().await.unwrap();
+
+ // Corrupt page 0 to claim a shorter (partial) length with a valid CRC.
+ let page_size = PAGE_SIZE.get() as u64;
+ let short_len = page_size / 2;
+ let crc = crc32fast::hash(&data[..short_len as usize]);
+ let record = super::Checksum::new(short_len as u16, crc);
+ let crc_offset = page_size; // CRC record starts after logical page bytes
+ blob.write_at(record.to_bytes().to_vec(), crc_offset)
+ .await
+ .unwrap();
+ blob.sync().await.unwrap();
+
+ // Capacity of one page => bug reproduces if last-page check is buffer-based.
+ let mut reader = append
+ .as_blob_reader(NZUsize!(page_size as usize))
+ .await
+ .unwrap();
+ let mut buf = vec![0u8; page_size as usize];
+ let result = reader.read_exact(&mut buf, page_size as usize).await;
+
+ assert!(matches!(result, Err(Error::InvalidChecksum)));
+ });
+ }
+}
diff --git a/runtime/src/utils/buffer/tip.rs b/runtime/src/utils/buffer/tip.rs
index 7061e69dfd..ac68e77945 100644
--- a/runtime/src/utils/buffer/tip.rs
+++ b/runtime/src/utils/buffer/tip.rs
@@ -1,5 +1,3 @@
-use std::num::NonZeroUsize;
-
/// A buffer for caching data written to the tip of a blob.
///
/// The buffer always represents data at the "tip" of the logical blob, starting at `offset` and
@@ -16,15 +14,20 @@ pub(super) struct Buffer {
/// The maximum size of the buffer.
pub(super) capacity: usize,
+
+ /// Whether this buffer should allow new data.
+ // TODO(#2371): Use a distinct state type for mutable vs immutable.
+ pub(super) immutable: bool,
}
impl Buffer {
- /// Creates a new buffer with the provided `size` and `capacity`.
- pub(super) fn new(size: u64, capacity: NonZeroUsize) -> Self {
+ /// Creates a new buffer with the provided `offset` and `capacity`.
+ pub(super) fn new(offset: u64, capacity: usize) -> Self {
Self {
- data: Vec::with_capacity(capacity.get()),
- offset: size,
- capacity: capacity.get(),
+ data: Vec::with_capacity(capacity),
+ offset,
+ capacity,
+ immutable: false,
}
}
@@ -75,11 +78,11 @@ impl Buffer {
}
}
- /// Returns the buffered data and its blob offset, or returns `None` if the buffer is
- /// already empty.
+ /// Returns the buffered data and its blob offset, or returns `None` if the buffer is already
+ /// empty.
///
- /// The buffer is reset to the empty state with an updated offset positioned at
- /// the end of the logical blob.
+ /// The buffer is reset to the empty state with an updated offset positioned at the end of the
+ /// logical blob.
pub(super) fn take(&mut self) -> Option<(Vec<u8>, u64)> {
if self.is_empty() {
return None;
@@ -153,11 +156,19 @@ impl Buffer {
true
}
- /// Appends the provided `data` to the buffer, and returns `true` if the buffer is now above
- /// capacity. If above capacity, the caller is responsible for using `take` to bring it back
- /// under.
+ /// Appends the provided `data` to the buffer, and returns `true` if the buffer is over capacity
+ /// after the append.
+ ///
+ /// If the buffer is above capacity, the caller is responsible for using `take` to bring it back
+ /// under. Further appends are safe, but will continue growing the buffer beyond its capacity.
pub(super) fn append(&mut self, data: &[u8]) -> bool {
self.data.extend_from_slice(data);
+
+ self.over_capacity()
+ }
+
+ /// Whether the buffer is over capacity and should be taken & flushed to the underlying blob.
+ const fn over_capacity(&self) -> bool {
self.data.len() > self.capacity
}
}
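
To illustrate the contract described above (append reports over-capacity, and the caller drains with `take` and writes the drained bytes at the returned offset), here is a hypothetical, self-contained caller loop; `TipBuffer` and the `Vec<u8>` standing in for the blob are illustrative, not the runtime's types:

```rust
/// Hypothetical stand-in for the tip buffer, showing only the append/take contract.
struct TipBuffer {
    data: Vec<u8>,
    offset: u64,
    capacity: usize,
}

impl TipBuffer {
    /// Appends and reports whether the buffer is now over capacity.
    fn append(&mut self, bytes: &[u8]) -> bool {
        self.data.extend_from_slice(bytes);
        self.data.len() > self.capacity
    }

    /// Drains the buffered bytes, advancing the offset to the end of the logical blob.
    fn take(&mut self) -> Option<(Vec<u8>, u64)> {
        if self.data.is_empty() {
            return None;
        }
        let taken = std::mem::take(&mut self.data);
        let at = self.offset;
        self.offset += taken.len() as u64;
        Some((taken, at))
    }
}

fn main() {
    let mut tip = TipBuffer { data: Vec::new(), offset: 0, capacity: 8 };
    let mut blob: Vec<u8> = Vec::new(); // stands in for the underlying blob

    for chunk in ["hello".as_bytes(), " world".as_bytes(), "!".as_bytes()] {
        if tip.append(chunk) {
            // Over capacity: the caller drains the tip and flushes it at the returned offset.
            let (bytes, at) = tip.take().expect("buffer was non-empty");
            assert_eq!(at as usize, blob.len());
            blob.extend_from_slice(&bytes);
        }
    }
    // Any bytes still in `tip` have not been flushed yet ("!" in this run).
}
```
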
@@ -165,11 +176,10 @@ impl Buffer {
#[cfg(test)]
mod tests {
use super::*;
- use commonware_utils::NZUsize;
#[test]
fn test_tip_append() {
- let mut buffer = Buffer::new(50, NZUsize!(100));
+ let mut buffer = Buffer::new(50, 100);
assert_eq!(buffer.size(), 50);
assert!(buffer.is_empty());
assert_eq!(buffer.take(), None);
@@ -198,7 +208,7 @@ mod tests {
#[test]
fn test_tip_resize() {
- let mut buffer = Buffer::new(50, NZUsize!(100));
+ let mut buffer = Buffer::new(50, 100);
buffer.append(&[1, 2, 3]);
assert_eq!(buffer.size(), 53);
diff --git a/runtime/src/utils/buffer/write.rs b/runtime/src/utils/buffer/write.rs
index e18189f3c2..b923995717 100644
--- a/runtime/src/utils/buffer/write.rs
+++ b/runtime/src/utils/buffer/write.rs
@@ -2,8 +2,8 @@ use crate::{buffer::tip::Buffer, Blob, Error, RwLock};
use commonware_utils::StableBuf;
use std::{num::NonZeroUsize, sync::Arc};
-/// A writer that buffers content to a [Blob] to optimize the performance
-/// of appending or updating data.
+/// A writer that buffers the raw content of a [Blob] to optimize the performance of appending or
+/// updating data.
///
/// # Example
///
@@ -54,7 +54,7 @@ impl<B: Blob> Write<B> {
pub fn new(blob: B, size: u64, capacity: NonZeroUsize) -> Self {
Self {
blob,
- buffer: Arc::new(RwLock::new(Buffer::new(size, capacity))),
+ buffer: Arc::new(RwLock::new(Buffer::new(size, capacity.get()))),
}
}
diff --git a/storage/conformance.toml b/storage/conformance.toml
index 02770d0057..ab29dff0ac 100644
--- a/storage/conformance.toml
+++ b/storage/conformance.toml
@@ -1,10 +1,10 @@
["commonware_storage::archive::conformance::ArchiveImmutable"]
n_cases = 128
-hash = "8e578ed38733486716d072e565e62fe5d9ba7185ffb6e26ec7db8611c69b90b8"
+hash = "6acfa1bc0c17920b5c0e0437af106e09ee57dcd37459091402192f2c146afdb5"
["commonware_storage::archive::conformance::ArchivePrunable"]
n_cases = 128
-hash = "674e81c769c06a3965dc691b1f8c0327374f427e8a4bf67895c6ad4e566fed20"
+hash = "cb063a05c6a75902893f790e9802b4906be506f2f7e5d10b46dff90a92e40819"
["commonware_storage::archive::immutable::storage::conformance::CodecConformance"]
n_cases = 65536
@@ -48,15 +48,15 @@ hash = "13b3e99a8c74b50dc18150194a92306de670b94e6642758feb6d9b6e9881f827"
["commonware_storage::journal::conformance::ContiguousFixed"]
n_cases = 512
-hash = "134bb8b838241c2dedf98d96130f014bea19f1bc7580307c9798540466eb81c6"
+hash = "4c786b6b7f91b9924a62a7b9a1c32a8d47398f1c8a3d5bf06fe1a90998e86aab"
["commonware_storage::journal::conformance::ContiguousVariable"]
n_cases = 512
-hash = "29d37f2309943dd27d4344710a900bb3b992c0a1089ff9734cddbfa78c039200"
+hash = "973ebd77804d2ea346574d377f39bd29350063098f2e6fab9596783ba43664e5"
["commonware_storage::journal::conformance::SegmentedFixed"]
n_cases = 512
-hash = "505611ba11d6380254c159eb6234f87cc19a62b0919bc96d59e83de498b458fa"
+hash = "e077ce8c6d9a79c87cf9b48866c65f387ffbec9d8e8c65dd40c46b4296cfc050"
["commonware_storage::journal::conformance::SegmentedGlob"]
n_cases = 512
@@ -64,11 +64,11 @@ hash = "adb1efeef12c203c05879ce4d1d03ef443c767737a6c6b57433189100eec9197"
["commonware_storage::journal::conformance::SegmentedOversized"]
n_cases = 512
-hash = "b98d56d2eb039657b3452135666795eeeefdc83e9d6f3cb070e7ca114b4621a1"
+hash = "b815138329a06cbe235cf547ed62774165bd2108e68c65bf15ae152bedf84b3a"
["commonware_storage::journal::conformance::SegmentedVariable"]
n_cases = 512
-hash = "cd79e09ca53917f78c290e67efe08bf17b3ec0d0faf1b5f6507d4665749574b1"
+hash = "418dafd67008cb74d34fe58b9be8747cfaf86e345a71eb4c35ae0e43a4c077ef"
["commonware_storage::mmr::proof::tests::conformance::CodecConformance>"]
n_cases = 65536
diff --git a/storage/fuzz/fuzz_targets/archive_operations.rs b/storage/fuzz/fuzz_targets/archive_operations.rs
index 1a2ffe1fa6..70761c41fc 100644
--- a/storage/fuzz/fuzz_targets/archive_operations.rs
+++ b/storage/fuzz/fuzz_targets/archive_operations.rs
@@ -9,9 +9,9 @@ use commonware_storage::{
},
translator::EightCap,
};
-use commonware_utils::{sequence::FixedBytes, NZUsize, NZU64};
+use commonware_utils::{sequence::FixedBytes, NZUsize, NZU16, NZU64};
use libfuzzer_sys::fuzz_target;
-use std::num::NonZeroUsize;
+use std::num::{NonZeroU16, NonZeroUsize};
type Key = FixedBytes<16>;
type Value = FixedBytes<32>;
@@ -40,7 +40,7 @@ struct FuzzInput {
operations: Vec,
}
-const PAGE_SIZE: NonZeroUsize = NZUsize!(555);
+const PAGE_SIZE: NonZeroU16 = NZU16!(456);
const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(100);
fn fuzz(data: FuzzInput) {
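
Most of the fuzz-target and test churn in this part of the diff is the same mechanical change: `PoolRef::new` now takes its page size as a `NonZeroU16` built with `NZU16!`, while the page-cache capacity stays a `NonZeroUsize`. A minimal before/after sketch, assuming only the import paths and signatures visible in this diff:

```rust
use commonware_runtime::buffer::PoolRef;
use commonware_utils::{NZUsize, NZU16};

fn main() {
    // After this change: the page size is a NonZeroU16, capping pages at
    // u16::MAX (65_535) bytes; the page-cache capacity is still a NonZeroUsize.
    let pool = PoolRef::new(NZU16!(555), NZUsize!(100));

    // Before this change, the same call site read:
    //   let pool = PoolRef::new(NZUsize!(555), NZUsize!(100));
    let _ = pool;
}
```
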
diff --git a/storage/fuzz/fuzz_targets/cache_operations.rs b/storage/fuzz/fuzz_targets/cache_operations.rs
index d780f426a0..bfa3c4b894 100644
--- a/storage/fuzz/fuzz_targets/cache_operations.rs
+++ b/storage/fuzz/fuzz_targets/cache_operations.rs
@@ -8,7 +8,7 @@ use libfuzzer_sys::{
fuzz_target,
};
use rand::{rngs::StdRng, SeedableRng};
-use std::collections::BTreeMap;
+use std::{collections::BTreeMap, num::NonZeroU16};
const MAX_OPERATIONS: usize = 50;
const MAX_INDEX: u64 = 10000;
@@ -21,8 +21,8 @@ const MIN_REPLAY_BUFFER: usize = 256;
const MAX_REPLAY_BUFFER: usize = 2 * 8192;
const MIN_COMPRESSION_LEVEL: u8 = 1;
const MAX_COMPRESSION_LEVEL: u8 = 21;
-const MIN_BUFFER_POOL_PAGE_SIZE: usize = 512;
-const MAX_BUFFER_POOL_PAGE_SIZE: usize = 4096;
+const MIN_BUFFER_POOL_PAGE_SIZE: u16 = 511;
+const MAX_BUFFER_POOL_PAGE_SIZE: u16 = 4097;
const MIN_BUFFER_POOL_CAPACITY: usize = 10;
const MAX_BUFFER_POOL_CAPACITY: usize = 64;
@@ -45,7 +45,7 @@ struct CacheConfig {
write_buffer: usize,
replay_buffer: usize,
compression: Option,
- buffer_pool_pages_size: usize,
+ buffer_pool_pages_size: NonZeroU16,
buffer_pool_capacity: usize,
}
@@ -71,7 +71,8 @@ impl<'a> Arbitrary<'a> for FuzzInput {
None
};
let buffer_pool_pages_size =
- u.int_in_range(MIN_BUFFER_POOL_PAGE_SIZE..=MAX_BUFFER_POOL_PAGE_SIZE)?;
+ NonZeroU16::new(u.int_in_range(MIN_BUFFER_POOL_PAGE_SIZE..=MAX_BUFFER_POOL_PAGE_SIZE)?)
+ .unwrap();
let buffer_pool_capacity =
u.int_in_range(MIN_BUFFER_POOL_CAPACITY..=MAX_BUFFER_POOL_CAPACITY)?;
@@ -137,7 +138,7 @@ fn fuzz(input: FuzzInput) {
replay_buffer: NZUsize!(input.config.replay_buffer),
items_per_blob: NZU64!(input.config.items_per_blob),
buffer_pool: PoolRef::new(
- NZUsize!(input.config.buffer_pool_pages_size),
+ input.config.buffer_pool_pages_size,
NZUsize!(input.config.buffer_pool_capacity),
),
};
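
One non-mechanical detail above: the generated page size is now wrapped in `NonZeroU16::new(...).unwrap()`, which cannot panic because `MIN_BUFFER_POOL_PAGE_SIZE` is well above zero. A small sketch of the same pattern (names are illustrative; only `arbitrary` and the standard library are assumed):

```rust
use arbitrary::{Result, Unstructured};
use std::num::NonZeroU16;

const MIN_PAGE_SIZE: u16 = 511;
const MAX_PAGE_SIZE: u16 = 4097;

/// Draw a page size in [MIN_PAGE_SIZE, MAX_PAGE_SIZE]; the expect cannot fire
/// because the lower bound of the range is non-zero.
fn arbitrary_page_size(u: &mut Unstructured<'_>) -> Result<NonZeroU16> {
    let raw = u.int_in_range(MIN_PAGE_SIZE..=MAX_PAGE_SIZE)?;
    Ok(NonZeroU16::new(raw).expect("range lower bound is non-zero"))
}
```
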
diff --git a/storage/fuzz/fuzz_targets/current_ordered_operations.rs b/storage/fuzz/fuzz_targets/current_ordered_operations.rs
index 81bda24787..0b3a6b2a27 100644
--- a/storage/fuzz/fuzz_targets/current_ordered_operations.rs
+++ b/storage/fuzz/fuzz_targets/current_ordered_operations.rs
@@ -11,9 +11,12 @@ use commonware_storage::{
},
translator::TwoCap,
};
-use commonware_utils::{sequence::FixedBytes, NZUsize, NZU64};
+use commonware_utils::{sequence::FixedBytes, NZUsize, NZU16, NZU64};
use libfuzzer_sys::fuzz_target;
-use std::{collections::HashMap, num::NonZeroU64};
+use std::{
+ collections::HashMap,
+ num::{NonZeroU16, NonZeroU64},
+};
type Key = FixedBytes<32>;
type Value = FixedBytes<32>;
@@ -74,7 +77,7 @@ impl<'a> Arbitrary<'a> for FuzzInput {
}
}
-const PAGE_SIZE: usize = 88;
+const PAGE_SIZE: NonZeroU16 = NZU16!(91);
const PAGE_CACHE_SIZE: usize = 8;
const MMR_ITEMS_PER_BLOB: u64 = 11;
const LOG_ITEMS_PER_BLOB: u64 = 7;
@@ -95,7 +98,7 @@ fn fuzz(data: FuzzInput) {
log_write_buffer: NZUsize!(WRITE_BUFFER_SIZE),
bitmap_metadata_partition: "fuzz_current_bitmap_metadata".into(),
translator: TwoCap,
- buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+ buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(PAGE_CACHE_SIZE)),
thread_pool: None,
};
diff --git a/storage/fuzz/fuzz_targets/current_unordered_operations.rs b/storage/fuzz/fuzz_targets/current_unordered_operations.rs
index f8220562e7..465e76f61d 100644
--- a/storage/fuzz/fuzz_targets/current_unordered_operations.rs
+++ b/storage/fuzz/fuzz_targets/current_unordered_operations.rs
@@ -11,9 +11,12 @@ use commonware_storage::{
},
translator::TwoCap,
};
-use commonware_utils::{sequence::FixedBytes, NZUsize, NZU64};
+use commonware_utils::{sequence::FixedBytes, NZUsize, NZU16, NZU64};
use libfuzzer_sys::fuzz_target;
-use std::{collections::HashMap, num::NonZeroU64};
+use std::{
+ collections::HashMap,
+ num::{NonZeroU16, NonZeroU64},
+};
type Key = FixedBytes<32>;
type Value = FixedBytes<32>;
@@ -68,7 +71,7 @@ impl<'a> Arbitrary<'a> for FuzzInput {
}
}
-const PAGE_SIZE: usize = 88;
+const PAGE_SIZE: NonZeroU16 = NZU16!(88);
const PAGE_CACHE_SIZE: usize = 8;
const MMR_ITEMS_PER_BLOB: u64 = 11;
const LOG_ITEMS_PER_BLOB: u64 = 7;
@@ -89,7 +92,7 @@ fn fuzz(data: FuzzInput) {
log_write_buffer: NZUsize!(WRITE_BUFFER_SIZE),
bitmap_metadata_partition: "fuzz_current_bitmap_metadata".into(),
translator: TwoCap,
- buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+ buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(PAGE_CACHE_SIZE)),
thread_pool: None,
};
diff --git a/storage/fuzz/fuzz_targets/fixed_journal_operations.rs b/storage/fuzz/fuzz_targets/fixed_journal_operations.rs
index 65c2f27a6a..486c966730 100644
--- a/storage/fuzz/fuzz_targets/fixed_journal_operations.rs
+++ b/storage/fuzz/fuzz_targets/fixed_journal_operations.rs
@@ -4,9 +4,10 @@ use arbitrary::{Arbitrary, Result, Unstructured};
use commonware_cryptography::{Hasher as _, Sha256};
use commonware_runtime::{buffer::PoolRef, deterministic, Runner};
use commonware_storage::journal::contiguous::fixed::{Config as JournalConfig, Journal};
-use commonware_utils::{NZUsize, NZU64};
+use commonware_utils::{NZUsize, NZU16, NZU64};
use futures::{pin_mut, StreamExt};
use libfuzzer_sys::fuzz_target;
+use std::num::NonZeroU16;
const MAX_REPLAY_BUF: usize = 2048;
const MAX_WRITE_BUF: usize = 2048;
@@ -51,7 +52,7 @@ struct FuzzInput {
operations: Vec,
}
-const PAGE_SIZE: usize = 128;
+const PAGE_SIZE: NonZeroU16 = NZU16!(57);
const PAGE_CACHE_SIZE: usize = 1;
fn fuzz(input: FuzzInput) {
@@ -62,7 +63,7 @@ fn fuzz(input: FuzzInput) {
partition: "fixed_journal_operations_fuzz_test".to_string(),
items_per_blob: NZU64!(3),
write_buffer: NZUsize!(MAX_WRITE_BUF),
- buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+ buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(PAGE_CACHE_SIZE)),
};
let mut journal = Journal::init(context.clone(), cfg.clone()).await.unwrap();
diff --git a/storage/fuzz/fuzz_targets/freezer_operations.rs b/storage/fuzz/fuzz_targets/freezer_operations.rs
index 44a4ae4e89..9764bafa64 100644
--- a/storage/fuzz/fuzz_targets/freezer_operations.rs
+++ b/storage/fuzz/fuzz_targets/freezer_operations.rs
@@ -3,9 +3,12 @@
use arbitrary::Arbitrary;
use commonware_runtime::{buffer::PoolRef, deterministic, Runner};
use commonware_storage::freezer::{Config, Freezer, Identifier};
-use commonware_utils::{sequence::FixedBytes, NZUsize};
+use commonware_utils::{sequence::FixedBytes, NZUsize, NZU16};
use libfuzzer_sys::fuzz_target;
-use std::{collections::HashMap, num::NonZeroUsize};
+use std::{
+ collections::HashMap,
+ num::{NonZeroU16, NonZeroUsize},
+};
#[derive(Arbitrary, Debug)]
enum Op {
@@ -40,7 +43,7 @@ fn vec_to_key(v: &[u8]) -> FixedBytes<32> {
FixedBytes::<32>::new(buf)
}
-const PAGE_SIZE: NonZeroUsize = NZUsize!(555);
+const PAGE_SIZE: NonZeroU16 = NZU16!(393);
const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(100);
fn fuzz(input: FuzzInput) {
diff --git a/storage/fuzz/fuzz_targets/mmr_journaled.rs b/storage/fuzz/fuzz_targets/mmr_journaled.rs
index 07c5cbbe04..4906ef58e8 100644
--- a/storage/fuzz/fuzz_targets/mmr_journaled.rs
+++ b/storage/fuzz/fuzz_targets/mmr_journaled.rs
@@ -8,12 +8,13 @@ use commonware_storage::mmr::{
location::{Location, LocationRangeExt},
Position, StandardHasher as Standard,
};
-use commonware_utils::{NZUsize, NZU64};
+use commonware_utils::{NZUsize, NZU16, NZU64};
use libfuzzer_sys::fuzz_target;
+use std::num::NonZeroU16;
const MAX_OPERATIONS: usize = 200;
const MAX_DATA_SIZE: usize = 64;
-const PAGE_SIZE: usize = 111;
+const PAGE_SIZE: NonZeroU16 = NZU16!(111);
const PAGE_CACHE_SIZE: usize = 5;
const ITEMS_PER_BLOB: u64 = 7;
@@ -88,7 +89,7 @@ fn test_config(partition_suffix: &str) -> Config {
items_per_blob: NZU64!(ITEMS_PER_BLOB),
write_buffer: NZUsize!(1024),
thread_pool: None,
- buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+ buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(PAGE_CACHE_SIZE)),
}
}
diff --git a/storage/fuzz/fuzz_targets/oversized_recovery.rs b/storage/fuzz/fuzz_targets/oversized_recovery.rs
index bb52d6348f..549a4caaa7 100644
--- a/storage/fuzz/fuzz_targets/oversized_recovery.rs
+++ b/storage/fuzz/fuzz_targets/oversized_recovery.rs
@@ -10,9 +10,9 @@ use bytes::{Buf, BufMut};
use commonware_codec::{FixedSize, Read, ReadExt, Write};
use commonware_runtime::{buffer::PoolRef, deterministic, Blob as _, Runner, Storage as _};
use commonware_storage::journal::segmented::oversized::{Config, Oversized, Record};
-use commonware_utils::NZUsize;
+use commonware_utils::{NZUsize, NZU16};
use libfuzzer_sys::fuzz_target;
-use std::num::NonZeroUsize;
+use std::num::{NonZeroU16, NonZeroUsize};
/// Test index entry that stores a u64 id and references a value.
#[derive(Debug, Clone, PartialEq)]
@@ -154,7 +154,7 @@ struct FuzzInput {
sync_before_corrupt: bool,
}
-const PAGE_SIZE: NonZeroUsize = NZUsize!(128);
+const PAGE_SIZE: NonZeroU16 = NZU16!(128);
const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(4);
const INDEX_PARTITION: &str = "fuzz_index";
const VALUE_PARTITION: &str = "fuzz_values";
diff --git a/storage/fuzz/fuzz_targets/qmdb_any_fixed_sync.rs b/storage/fuzz/fuzz_targets/qmdb_any_fixed_sync.rs
index 156801031d..f752165cbc 100644
--- a/storage/fuzz/fuzz_targets/qmdb_any_fixed_sync.rs
+++ b/storage/fuzz/fuzz_targets/qmdb_any_fixed_sync.rs
@@ -13,9 +13,9 @@ use commonware_storage::{
},
translator::TwoCap,
};
-use commonware_utils::{sequence::FixedBytes, NZUsize, NZU64};
+use commonware_utils::{sequence::FixedBytes, NZUsize, NZU16, NZU64};
use libfuzzer_sys::fuzz_target;
-use std::sync::Arc;
+use std::{num::NonZeroU16, sync::Arc};
type Key = FixedBytes<32>;
type Value = FixedBytes<32>;
@@ -86,7 +86,7 @@ impl<'a> Arbitrary<'a> for FuzzInput {
}
}
-const PAGE_SIZE: usize = 128;
+const PAGE_SIZE: NonZeroU16 = NZU16!(129);
fn test_config(test_name: &str) -> Config {
Config {
@@ -99,7 +99,7 @@ fn test_config(test_name: &str) -> Config {
log_write_buffer: NZUsize!(1024),
translator: TwoCap,
thread_pool: None,
- buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(1)),
+ buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(1)),
}
}
diff --git a/storage/fuzz/fuzz_targets/qmdb_any_variable_sync.rs b/storage/fuzz/fuzz_targets/qmdb_any_variable_sync.rs
index 13f4bf1e83..5e767d80ef 100644
--- a/storage/fuzz/fuzz_targets/qmdb_any_variable_sync.rs
+++ b/storage/fuzz/fuzz_targets/qmdb_any_variable_sync.rs
@@ -11,10 +11,13 @@ use commonware_storage::{
},
translator::TwoCap,
};
-use commonware_utils::{sequence::FixedBytes, NZUsize, NZU64};
+use commonware_utils::{sequence::FixedBytes, NZUsize, NZU16, NZU64};
use libfuzzer_sys::fuzz_target;
use mmr::location::Location;
-use std::{collections::HashMap, num::NonZeroU64};
+use std::{
+ collections::HashMap,
+ num::{NonZeroU16, NonZeroU64},
+};
const MAX_OPERATIONS: usize = 50;
@@ -129,7 +132,7 @@ impl<'a> Arbitrary<'a> for FuzzInput {
}
}
-const PAGE_SIZE: usize = 128;
+const PAGE_SIZE: NonZeroU16 = NZU16!(128);
fn test_config(test_name: &str) -> Config, ())> {
Config {
@@ -144,7 +147,7 @@ fn test_config(test_name: &str) -> Config Config, ())> {
log_write_buffer: NZUsize!(1024),
translator: TwoCap,
thread_pool: None,
- buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+ buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(PAGE_CACHE_SIZE)),
}
}
diff --git a/storage/fuzz/fuzz_targets/qmdb_keyless.rs b/storage/fuzz/fuzz_targets/qmdb_keyless.rs
index 66db54d449..1390ae3326 100644
--- a/storage/fuzz/fuzz_targets/qmdb_keyless.rs
+++ b/storage/fuzz/fuzz_targets/qmdb_keyless.rs
@@ -10,8 +10,9 @@ use commonware_storage::{
verify_proof,
},
};
-use commonware_utils::{NZUsize, NZU64};
+use commonware_utils::{NZUsize, NZU16, NZU64};
use libfuzzer_sys::fuzz_target;
+use std::num::NonZeroU16;
const MAX_OPERATIONS: usize = 50;
const MAX_PROOF_OPS: u64 = 100;
@@ -117,7 +118,7 @@ impl<'a> Arbitrary<'a> for FuzzInput {
}
}
-const PAGE_SIZE: usize = 128;
+const PAGE_SIZE: NonZeroU16 = NZU16!(127);
const PAGE_CACHE_SIZE: usize = 8;
type CleanDb = Keyless, Sha256>;
@@ -134,7 +135,7 @@ fn test_config(test_name: &str) -> Config<(commonware_codec::RangeCfg, ()
log_codec_config: ((0..=10000).into(), ()),
log_items_per_section: NZU64!(7),
thread_pool: None,
- buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+ buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(PAGE_CACHE_SIZE)),
}
}
diff --git a/storage/fuzz/fuzz_targets/qmdb_ordered_batching.rs b/storage/fuzz/fuzz_targets/qmdb_ordered_batching.rs
index b103ca9a12..d799aeaece 100644
--- a/storage/fuzz/fuzz_targets/qmdb_ordered_batching.rs
+++ b/storage/fuzz/fuzz_targets/qmdb_ordered_batching.rs
@@ -8,10 +8,11 @@ use commonware_storage::{
qmdb::any::{ordered::fixed::Db, FixedConfig as Config},
translator::EightCap,
};
-use commonware_utils::{sequence::FixedBytes, NZUsize, NZU64};
+use commonware_utils::{sequence::FixedBytes, NZUsize, NZU16, NZU64};
use libfuzzer_sys::fuzz_target;
use std::{
collections::{BTreeMap, HashSet},
+ num::NonZeroU16,
ops::Bound::{Excluded, Unbounded},
};
@@ -36,7 +37,7 @@ struct FuzzInput {
operations: Vec,
}
-const PAGE_SIZE: usize = 555;
+const PAGE_SIZE: NonZeroU16 = NZU16!(111);
const PAGE_CACHE_SIZE: usize = 100;
fn fuzz(data: FuzzInput) {
@@ -53,7 +54,7 @@ fn fuzz(data: FuzzInput) {
log_write_buffer: NZUsize!(1024),
translator: EightCap,
thread_pool: None,
- buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+ buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(PAGE_CACHE_SIZE)),
};
let mut db = OrderedDb::init(context.clone(), cfg.clone())
diff --git a/storage/fuzz/fuzz_targets/qmdb_ordered_operations.rs b/storage/fuzz/fuzz_targets/qmdb_ordered_operations.rs
index 02e0125c06..bd7fc96db7 100644
--- a/storage/fuzz/fuzz_targets/qmdb_ordered_operations.rs
+++ b/storage/fuzz/fuzz_targets/qmdb_ordered_operations.rs
@@ -11,11 +11,11 @@ use commonware_storage::{
},
translator::EightCap,
};
-use commonware_utils::{sequence::FixedBytes, NZUsize, NZU64};
+use commonware_utils::{sequence::FixedBytes, NZUsize, NZU16, NZU64};
use libfuzzer_sys::fuzz_target;
use std::{
collections::{HashMap, HashSet},
- num::NonZeroU64,
+ num::{NonZeroU16, NonZeroU64},
};
type Key = FixedBytes<32>;
@@ -60,7 +60,7 @@ struct FuzzInput {
operations: Vec,
}
-const PAGE_SIZE: usize = 555;
+const PAGE_SIZE: NonZeroU16 = NZU16!(555);
const PAGE_CACHE_SIZE: usize = 100;
fn fuzz(data: FuzzInput) {
@@ -78,7 +78,7 @@ fn fuzz(data: FuzzInput) {
log_write_buffer: NZUsize!(1024),
translator: EightCap,
thread_pool: None,
- buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+ buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(PAGE_CACHE_SIZE)),
};
let mut db = Db::<_, Key, Value, Sha256, EightCap>::init(context.clone(), cfg.clone())
diff --git a/storage/fuzz/fuzz_targets/qmdb_unordered_operations.rs b/storage/fuzz/fuzz_targets/qmdb_unordered_operations.rs
index 98ed0bf025..8eed64abf8 100644
--- a/storage/fuzz/fuzz_targets/qmdb_unordered_operations.rs
+++ b/storage/fuzz/fuzz_targets/qmdb_unordered_operations.rs
@@ -11,9 +11,12 @@ use commonware_storage::{
},
translator::EightCap,
};
-use commonware_utils::{sequence::FixedBytes, NZUsize, NZU64};
+use commonware_utils::{sequence::FixedBytes, NZUsize, NZU16, NZU64};
use libfuzzer_sys::fuzz_target;
-use std::collections::{HashMap, HashSet};
+use std::{
+ collections::{HashMap, HashSet},
+ num::NonZeroU16,
+};
type Key = FixedBytes<32>;
type Value = FixedBytes<64>;
@@ -36,7 +39,7 @@ struct FuzzInput {
operations: Vec,
}
-const PAGE_SIZE: usize = 555;
+const PAGE_SIZE: NonZeroU16 = NZU16!(223);
const PAGE_CACHE_SIZE: usize = 100;
fn fuzz(data: FuzzInput) {
@@ -54,7 +57,7 @@ fn fuzz(data: FuzzInput) {
log_write_buffer: NZUsize!(1024),
translator: EightCap,
thread_pool: None,
- buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+ buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(PAGE_CACHE_SIZE)),
};
let mut db = Db::<_, Key, Value, Sha256, EightCap>::init(context.clone(), cfg.clone())
diff --git a/storage/fuzz/fuzz_targets/store_operations.rs b/storage/fuzz/fuzz_targets/store_operations.rs
index 6a1ac79507..a7dd7c633b 100644
--- a/storage/fuzz/fuzz_targets/store_operations.rs
+++ b/storage/fuzz/fuzz_targets/store_operations.rs
@@ -7,8 +7,9 @@ use commonware_storage::{
qmdb::store::db::{Config, Db},
translator::TwoCap,
};
-use commonware_utils::{NZUsize, NZU64};
+use commonware_utils::{NZUsize, NZU16, NZU64};
use libfuzzer_sys::fuzz_target;
+use std::num::NonZeroU16;
const MAX_OPERATIONS: usize = 50;
@@ -86,7 +87,7 @@ impl<'a> Arbitrary<'a> for FuzzInput {
}
}
-const PAGE_SIZE: usize = 128;
+const PAGE_SIZE: NonZeroU16 = NZU16!(125);
const PAGE_CACHE_SIZE: usize = 8;
fn test_config(test_name: &str) -> Config, ())> {
@@ -97,7 +98,7 @@ fn test_config(test_name: &str) -> Config FixedBytes<64> {
diff --git a/storage/src/archive/prunable/mod.rs b/storage/src/archive/prunable/mod.rs
index a6a7c2f550..a244077fb1 100644
--- a/storage/src/archive/prunable/mod.rs
+++ b/storage/src/archive/prunable/mod.rs
@@ -120,7 +120,7 @@
//! prunable::{Archive, Config},
//! },
//! };
-//! use commonware_utils::{NZUsize, NZU64};
+//! use commonware_utils::{NZUsize, NZU16, NZU64};
//!
//! let executor = deterministic::Runner::default();
//! executor.start(|context| async move {
@@ -128,7 +128,7 @@
//! let cfg = Config {
//! translator: FourCap,
//! key_partition: "demo_index".into(),
-//! key_buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)),
+//! key_buffer_pool: PoolRef::new(NZU16!(1024), NZUsize!(10)),
//! value_partition: "demo_value".into(),
//! compression: Some(3),
//! codec_config: (),
@@ -203,15 +203,15 @@ mod tests {
};
use commonware_codec::{DecodeExt, Error as CodecError};
use commonware_macros::{test_group, test_traced};
- use commonware_runtime::{deterministic, Blob, Metrics, Runner, Storage};
- use commonware_utils::{sequence::FixedBytes, NZUsize, NZU64};
+ use commonware_runtime::{deterministic, Metrics, Runner};
+ use commonware_utils::{sequence::FixedBytes, NZUsize, NZU16, NZU64};
use rand::Rng;
- use std::collections::BTreeMap;
+ use std::{collections::BTreeMap, num::NonZeroU16};
const DEFAULT_ITEMS_PER_SECTION: u64 = 65536;
const DEFAULT_WRITE_BUFFER: usize = 1024;
const DEFAULT_REPLAY_BUFFER: usize = 4096;
- const PAGE_SIZE: NonZeroUsize = NZUsize!(1024);
+ const PAGE_SIZE: NonZeroU16 = NZU16!(1024);
const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(10);
fn test_key(key: &str) -> FixedBytes<64> {
@@ -288,79 +288,6 @@ mod tests {
});
}
- #[test_traced]
- fn test_archive_record_corruption() {
- // Initialize the deterministic context
- let executor = deterministic::Runner::default();
- executor.start(|context| async move {
- // Initialize the archive
- let cfg = Config {
- translator: FourCap,
- key_partition: "test_index".into(),
- key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
- value_partition: "test_value".into(),
- codec_config: (),
- compression: None,
- key_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER),
- value_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER),
- replay_buffer: NZUsize!(DEFAULT_REPLAY_BUFFER),
- items_per_section: NZU64!(DEFAULT_ITEMS_PER_SECTION),
- };
- let mut archive = Archive::init(context.clone(), cfg.clone())
- .await
- .expect("Failed to initialize archive");
-
- let index = 1u64;
- let key = test_key("testkey");
- let data = 1;
-
- // Put the key-data pair
- archive
- .put(index, key.clone(), data)
- .await
- .expect("Failed to put data");
-
- // Sync and drop the archive
- archive.sync().await.expect("Failed to sync archive");
- drop(archive);
-
- // Corrupt the index journal
- let section = (index / DEFAULT_ITEMS_PER_SECTION) * DEFAULT_ITEMS_PER_SECTION;
- let (blob, _) = context
- .open("test_index", §ion.to_be_bytes())
- .await
- .unwrap();
- blob.write_at(b"corrupt!".to_vec(), 8).await.unwrap();
- blob.sync().await.unwrap();
-
- // Initialize the archive again
- let archive = Archive::<_, _, FixedBytes<64>, i32>::init(
- context,
- Config {
- translator: FourCap,
- key_partition: "test_index".into(),
- key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
- value_partition: "test_value".into(),
- codec_config: (),
- compression: None,
- key_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER),
- value_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER),
- replay_buffer: NZUsize!(DEFAULT_REPLAY_BUFFER),
- items_per_section: NZU64!(DEFAULT_ITEMS_PER_SECTION),
- },
- )
- .await
- .expect("Failed to initialize archive");
-
- // Check that the archive is empty
- let retrieved: Option = archive
- .get(Identifier::Index(index))
- .await
- .expect("Failed to get data");
- assert!(retrieved.is_none());
- });
- }
-
#[test_traced]
fn test_archive_overlapping_key_basic() {
// Initialize the deterministic context
diff --git a/storage/src/cache/mod.rs b/storage/src/cache/mod.rs
index 0c4f170707..5f4dfd1061 100644
--- a/storage/src/cache/mod.rs
+++ b/storage/src/cache/mod.rs
@@ -42,7 +42,7 @@
//! ```rust
//! use commonware_runtime::{Spawner, Runner, deterministic, buffer::PoolRef};
//! use commonware_storage::cache::{Cache, Config};
-//! use commonware_utils::{NZUsize, NZU64};
+//! use commonware_utils::{NZUsize, NZU16, NZU64};
//!
//! let executor = deterministic::Runner::default();
//! executor.start(|context| async move {
@@ -54,7 +54,7 @@
//! items_per_blob: NZU64!(1024),
//! write_buffer: NZUsize!(1024 * 1024),
//! replay_buffer: NZUsize!(4096),
-//! buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)),
+//! buffer_pool: PoolRef::new(NZU16!(1024), NZUsize!(10)),
//! };
//! let mut cache = Cache::init(context, cfg).await.unwrap();
//!
@@ -126,17 +126,16 @@ pub struct Config {
mod tests {
use super::*;
use crate::journal::Error as JournalError;
- use commonware_codec::{varint::UInt, EncodeSize};
use commonware_macros::{test_group, test_traced};
- use commonware_runtime::{deterministic, Blob, Metrics, Runner, Storage};
- use commonware_utils::{NZUsize, NZU64};
+ use commonware_runtime::{deterministic, Metrics, Runner};
+ use commonware_utils::{NZUsize, NZU16, NZU64};
use rand::Rng;
- use std::collections::BTreeMap;
+ use std::{collections::BTreeMap, num::NonZeroU16};
const DEFAULT_ITEMS_PER_BLOB: u64 = 65536;
const DEFAULT_WRITE_BUFFER: usize = 1024;
const DEFAULT_REPLAY_BUFFER: usize = 4096;
- const PAGE_SIZE: NonZeroUsize = NZUsize!(1024);
+ const PAGE_SIZE: NonZeroU16 = NZU16!(1024);
const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(10);
#[test_traced]
@@ -185,72 +184,6 @@ mod tests {
});
}
- #[test_traced]
- fn test_cache_record_corruption() {
- // Initialize the deterministic context
- let executor = deterministic::Runner::default();
- executor.start(|context| async move {
- // Initialize the cache
- let cfg = Config {
- partition: "test_partition".into(),
- codec_config: (),
- compression: None,
- write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER),
- replay_buffer: NZUsize!(DEFAULT_REPLAY_BUFFER),
- items_per_blob: NZU64!(DEFAULT_ITEMS_PER_BLOB),
- buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
- };
- let mut cache = Cache::init(context.clone(), cfg.clone())
- .await
- .expect("Failed to initialize cache");
-
- let index = 1u64;
- let data = 1;
-
- // Put the data
- cache
- .put(index, data)
- .await
- .expect("Failed to put data");
-
- // Sync and drop the cache
- cache.sync().await.expect("Failed to sync cache");
- drop(cache);
-
- // Corrupt the value
- let section = (index / DEFAULT_ITEMS_PER_BLOB) * DEFAULT_ITEMS_PER_BLOB;
- let (blob, _) = context
- .open("test_partition", §ion.to_be_bytes())
- .await
- .unwrap();
- let value_location = 4 /* journal size */ + UInt(1u64).encode_size() as u64 /* index */ + 4 /* value length */;
- blob.write_at(b"testdaty".to_vec(), value_location).await.unwrap();
- blob.sync().await.unwrap();
-
- // Initialize the cache again
- let cache = Cache::<_, i32>::init(
- context,
- Config {
- partition: "test_partition".into(),
- codec_config: (),
- compression: None,
- write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER),
- replay_buffer: NZUsize!(DEFAULT_REPLAY_BUFFER),
- items_per_blob: NZU64!(DEFAULT_ITEMS_PER_BLOB),
- buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
- },
- )
- .await.expect("Failed to initialize cache");
-
- // Check that the cache is empty
- let retrieved: Option = cache
- .get(index)
- .await
- .expect("Failed to get data");
- assert!(retrieved.is_none());
- });
- }
-
#[test_traced]
fn test_cache_prune() {
// Initialize the deterministic context
diff --git a/storage/src/freezer/benches/utils.rs b/storage/src/freezer/benches/utils.rs
index 87ebc0d555..fcdbc64144 100644
--- a/storage/src/freezer/benches/utils.rs
+++ b/storage/src/freezer/benches/utils.rs
@@ -2,9 +2,9 @@
use commonware_runtime::{buffer::PoolRef, tokio::Context};
use commonware_storage::freezer::{Config, Freezer};
-use commonware_utils::{sequence::FixedBytes, NZUsize};
+use commonware_utils::{sequence::FixedBytes, NZUsize, NZU16};
use rand::{rngs::StdRng, RngCore, SeedableRng};
-use std::num::NonZeroUsize;
+use std::num::{NonZeroU16, NonZeroUsize};
/// Number of bytes that can be buffered before being written to disk.
const WRITE_BUFFER: usize = 1024 * 1024; // 1MB
@@ -34,7 +34,7 @@ pub const VALUE_PARTITION: &str = "freezer_bench_value";
pub const TABLE_PARTITION: &str = "freezer_bench_table";
/// Use a "prod sized" page size to test the performance of the journal.
-const PAGE_SIZE: NonZeroUsize = NZUsize!(16_384);
+const PAGE_SIZE: NonZeroU16 = NZU16!(16_384);
/// The number of pages to cache in the buffer pool.
const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(10_000);
diff --git a/storage/src/freezer/mod.rs b/storage/src/freezer/mod.rs
index f30492cedc..4ebbeb848b 100644
--- a/storage/src/freezer/mod.rs
+++ b/storage/src/freezer/mod.rs
@@ -164,7 +164,7 @@
//! ```rust
//! use commonware_runtime::{Spawner, Runner, deterministic, buffer::PoolRef};
//! use commonware_storage::freezer::{Freezer, Config, Identifier};
-//! use commonware_utils::{sequence::FixedBytes, NZUsize};
+//! use commonware_utils::{sequence::FixedBytes, NZUsize, NZU16};
//!
//! let executor = deterministic::Runner::default();
//! executor.start(|context| async move {
@@ -172,7 +172,7 @@
//! let cfg = Config {
//! key_partition: "freezer_key_index".into(),
//! key_write_buffer: NZUsize!(1024 * 1024), // 1MB
-//! key_buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)),
+//! key_buffer_pool: PoolRef::new(NZU16!(1024), NZUsize!(10)),
//! value_partition: "freezer_value_journal".into(),
//! value_compression: Some(3),
//! value_write_buffer: NZUsize!(1024 * 1024), // 1MB
@@ -276,8 +276,9 @@ mod tests {
use commonware_codec::DecodeExt;
use commonware_macros::{test_group, test_traced};
use commonware_runtime::{deterministic, Blob, Metrics, Runner, Storage};
- use commonware_utils::{hex, sequence::FixedBytes, NZUsize};
+ use commonware_utils::{hex, sequence::FixedBytes, NZUsize, NZU16};
use rand::{Rng, RngCore};
+ use std::num::NonZeroU16;
const DEFAULT_WRITE_BUFFER: usize = 1024;
const DEFAULT_VALUE_TARGET_SIZE: u64 = 10 * 1024 * 1024;
@@ -285,7 +286,7 @@ mod tests {
const DEFAULT_TABLE_RESIZE_FREQUENCY: u8 = 4;
const DEFAULT_TABLE_RESIZE_CHUNK_SIZE: u32 = 128; // force multiple chunks
const DEFAULT_TABLE_REPLAY_BUFFER: usize = 64 * 1024; // 64KB
- const PAGE_SIZE: NonZeroUsize = NZUsize!(1024);
+ const PAGE_SIZE: NonZeroU16 = NZU16!(1024);
const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(10);
fn test_key(key: &str) -> FixedBytes<64> {
diff --git a/storage/src/journal/authenticated.rs b/storage/src/journal/authenticated.rs
index 174c683c5d..a5a516011f 100644
--- a/storage/src/journal/authenticated.rs
+++ b/storage/src/journal/authenticated.rs
@@ -637,11 +637,12 @@ mod tests {
deterministic::{self, Context},
Runner as _,
};
- use commonware_utils::{NZUsize, NZU64};
+ use commonware_utils::{NZUsize, NZU16, NZU64};
use futures::StreamExt as _;
+ use std::num::NonZeroU16;
- const PAGE_SIZE: usize = 101;
- const PAGE_CACHE_SIZE: usize = 11;
+ const PAGE_SIZE: NonZeroU16 = NZU16!(101);
+ const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(11);
/// Create MMR configuration for tests.
fn mmr_config(suffix: &str) -> MmrConfig {
@@ -651,7 +652,7 @@ mod tests {
items_per_blob: NZU64!(11),
write_buffer: NZUsize!(1024),
thread_pool: None,
- buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+ buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
}
}
@@ -661,7 +662,7 @@ mod tests {
partition: format!("journal_{suffix}"),
items_per_blob: NZU64!(7),
write_buffer: NZUsize!(1024),
- buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+ buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
}
}
diff --git a/storage/src/journal/benches/bench.rs b/storage/src/journal/benches/bench.rs
index a9f19c499a..d8e6181cfb 100644
--- a/storage/src/journal/benches/bench.rs
+++ b/storage/src/journal/benches/bench.rs
@@ -1,9 +1,9 @@
use commonware_runtime::{buffer::PoolRef, tokio::Context};
use commonware_storage::journal::contiguous::fixed::{Config as JConfig, Journal};
-use commonware_utils::{sequence::FixedBytes, NZUsize};
+use commonware_utils::{sequence::FixedBytes, NZUsize, NZU16};
use criterion::criterion_main;
use rand::{rngs::StdRng, RngCore, SeedableRng};
-use std::num::{NonZeroU64, NonZeroUsize};
+use std::num::{NonZeroU16, NonZeroU64, NonZeroUsize};
mod fixed_append;
mod fixed_read_random;
@@ -21,7 +21,7 @@ criterion_main!(
const WRITE_BUFFER: NonZeroUsize = NZUsize!(1_024 * 1024); // 1MB
/// Use a "prod sized" page size to test the performance of the journal.
-const PAGE_SIZE: NonZeroUsize = NZUsize!(16_384);
+const PAGE_SIZE: NonZeroU16 = NZU16!(16_384);
/// The number of pages to cache in the buffer pool. Make it big enough to be
/// fast, but not so big we avoid any page faults for the larger benchmarks.
diff --git a/storage/src/journal/benches/fixed_replay.rs b/storage/src/journal/benches/fixed_replay.rs
index 4c4b93cab3..c4d01ebd0a 100644
--- a/storage/src/journal/benches/fixed_replay.rs
+++ b/storage/src/journal/benches/fixed_replay.rs
@@ -59,7 +59,7 @@ fn bench_fixed_replay(c: &mut Criterion) {
// Run the benchmarks
let runner = tokio::Runner::new(cfg.clone());
- for buffer in [128, 16_384, 65_536, 1_048_576] {
+ for buffer in [16_384, 65_536, 1_048_576] {
c.bench_function(
&format!(
"{}/items={} buffer={} size={}",
diff --git a/storage/src/journal/conformance.rs b/storage/src/journal/conformance.rs
index 48936ef478..4df130368c 100644
--- a/storage/src/journal/conformance.rs
+++ b/storage/src/journal/conformance.rs
@@ -8,14 +8,14 @@ use bytes::{Buf, BufMut};
use commonware_codec::{FixedSize, RangeCfg, Read, ReadExt, Write};
use commonware_conformance::{conformance_tests, Conformance};
use commonware_runtime::{buffer::PoolRef, deterministic, Metrics, Runner};
-use commonware_utils::{NZUsize, NZU64};
-use core::num::{NonZeroU64, NonZeroUsize};
+use commonware_utils::{NZUsize, NZU16, NZU64};
+use core::num::{NonZeroU16, NonZeroU64, NonZeroUsize};
use oversized::Record;
use rand::Rng;
const WRITE_BUFFER: NonZeroUsize = NZUsize!(1024);
const ITEMS_PER_BLOB: NonZeroU64 = NZU64!(4096);
-const PAGE_SIZE: NonZeroUsize = NZUsize!(1024);
+const PAGE_SIZE: NonZeroU16 = NZU16!(1024);
const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(10);
struct ContiguousFixed;
diff --git a/storage/src/journal/contiguous/fixed.rs b/storage/src/journal/contiguous/fixed.rs
index cf88ea435d..8501e7593b 100644
--- a/storage/src/journal/contiguous/fixed.rs
+++ b/storage/src/journal/contiguous/fixed.rs
@@ -9,22 +9,22 @@
//! # Format
//!
//! Data stored in a `fixed::Journal` is persisted in one of many Blobs within a caller-provided
-//! `partition`. Each `Blob` contains a configurable maximum of `items_per_blob`, with each item
-//! followed by its checksum (CRC32):
+//! `partition`. Each `Blob` contains a configurable maximum of `items_per_blob`, with page-level
+//! data integrity provided by a buffer pool.
//!
//! ```text
-//! +--------+-----------+--------+-----------+--------+----------+-------------+
-//! | item_0 | C(Item_0) | item_1 | C(Item_1) | ... | item_n-1 | C(Item_n-1) |
-//! +--------+-----------+--------+----0------+--------+----------+-------------+
+//! +--------+--------+-----+----------+
+//! | item_0 | item_1 | ... | item_n-1 |
+//! +--------+--------+-----+----------+
//!
-//! n = config.items_per_blob, C = CRC32
+//! n = config.items_per_blob
//! ```
//!
//! The most recent blob may not necessarily be full, in which case it will contain fewer than the
//! maximum number of items.
//!
-//! A fetched or replayed item's checksum is always computed and checked against the stored value
-//! before it is returned. If the checksums do not match, an error is returned instead.
+//! Data fetched from disk is always checked for integrity before being returned. If the data is
+//! found to be invalid, an error is returned instead.
//!
//! # Open Blobs
//!
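
With the per-item CRC32 gone from the `# Format` section above, an item occupies exactly `A::SIZE` bytes (`CHUNK_SIZE` below drops the `u32::SIZE` term), so locating an absolute position is pure arithmetic. A worked sketch that mirrors the math used by `append` and `read` in this file; the concrete numbers are illustrative:

```rust
/// Map an absolute item position to (blob index, byte offset within that blob),
/// mirroring the arithmetic in this file now that items are packed back to back.
fn locate(pos: u64, item_size: u64, items_per_blob: u64) -> (u64, u64) {
    let blob_index = pos / items_per_blob;
    let offset_in_blob = (pos % items_per_blob) * item_size;
    (blob_index, offset_in_blob)
}

fn main() {
    // With 32-byte items and 7 items per blob, absolute position 20 is
    // item 6 of blob 2, i.e. byte offset 192 within that blob.
    assert_eq!(locate(20, 32, 7), (2, 192));
}
```

Under the old format the per-item stride was `A::SIZE + u32::SIZE` because each item carried its own checksum; integrity checking now happens per page in the buffer pool instead.
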
@@ -44,12 +44,11 @@
//!
//! # State Sync
//!
-//! `Journal::init_sync` allows for initializing a journal for use in state sync.
-//! When opened in this mode, we attempt to populate the journal within the given range
-//! with persisted data.
-//! If the journal is empty, we create a fresh journal at the specified position.
-//! If the journal is not empty, we prune the journal to the specified lower bound and rewind to
-//! the specified upper bound.
+//! `Journal::init_sync` allows for initializing a journal for use in state sync. When opened in
+//! this mode, we attempt to populate the journal within the given range with persisted data. If the
+//! journal is empty, we create a fresh journal at the specified position. If the journal is not
+//! empty, we prune the journal to the specified lower bound and rewind to the specified upper
+//! bound.
//!
//! # Replay
//!
@@ -59,10 +58,9 @@ use crate::{
journal::{contiguous::MutableContiguous, Error},
Persistable,
};
-use bytes::BufMut;
-use commonware_codec::{CodecFixed, DecodeExt as _, FixedSize};
+use commonware_codec::{CodecFixed, DecodeExt as _};
use commonware_runtime::{
- buffer::{Append, PoolRef, Read},
+ buffer::pool::{Append, PoolRef},
telemetry::metrics::status::GaugeExt,
Blob, Error as RError, Metrics, Storage,
};
@@ -137,7 +135,7 @@ pub struct Journal {
}
impl> Journal {
- const CHUNK_SIZE: usize = u32::SIZE + A::SIZE;
+ pub(crate) const CHUNK_SIZE: usize = A::SIZE;
pub(crate) const CHUNK_SIZE_U64: u64 = Self::CHUNK_SIZE as u64;
/// Initialize a new `Journal` instance.
@@ -147,10 +145,12 @@ impl> Journal {
///
/// # Repair
///
- /// Like [sqlite](https://github.com/sqlite/sqlite/blob/8658a8df59f00ec8fcfea336a2a6a4b5ef79d2ee/src/wal.c#L1504-L1505)
- /// and [rocksdb](https://github.com/facebook/rocksdb/blob/0c533e61bc6d89fdf1295e8e0bcee4edb3aef401/include/rocksdb/options.h#L441-L445),
- /// the first invalid data read will be considered the new end of the journal (and the underlying [Blob] will be truncated to the last
- /// valid item).
+ /// Like
+ /// [sqlite](https://github.com/sqlite/sqlite/blob/8658a8df59f00ec8fcfea336a2a6a4b5ef79d2ee/src/wal.c#L1504-L1505)
+ /// and
+ /// [rocksdb](https://github.com/facebook/rocksdb/blob/0c533e61bc6d89fdf1295e8e0bcee4edb3aef401/include/rocksdb/options.h#L441-L445),
+ /// the first invalid data read will be considered the new end of the journal (and the
+ /// underlying [Blob] will be truncated to the last valid item).
pub async fn init(context: E, cfg: Config) -> Result {
// Iterate over blobs in partition
let mut blobs = BTreeMap::new();
@@ -172,21 +172,16 @@ impl> Journal {
blobs.insert(index, (blob, size));
}
- // Check that there are no gaps in the historical blobs and that they are all full.
+ // Check that there are no gaps in the historical blobs.
let full_size = cfg.items_per_blob.get() * Self::CHUNK_SIZE_U64;
if !blobs.is_empty() {
let mut it = blobs.keys().rev();
let mut prev_index = *it.next().unwrap();
for index in it {
- let (_, size) = blobs.get(index).unwrap();
if *index != prev_index - 1 {
return Err(Error::MissingBlob(prev_index - 1));
}
prev_index = *index;
- if *size != full_size {
- // Non-final blobs that have invalid sizes are not recoverable.
- return Err(Error::InvalidBlobSize(*index, *size));
- }
}
} else {
debug!("no blobs found");
@@ -204,15 +199,43 @@ impl> Journal {
context.register("pruned", "Number of blobs pruned", pruned.clone());
let _ = tracked.try_set(blobs.len());
- // Initialize the tail blob.
- let (mut tail_index, (mut tail, mut tail_size)) = blobs.pop_last().unwrap();
+ // Wrap all blobs with Append wrappers, starting with the tail.
+ let (mut tail_index, (blob, blob_size)) = blobs.pop_last().unwrap();
+ let mut tail = Append::new(
+ blob,
+ blob_size,
+ cfg.write_buffer.get(),
+ cfg.buffer_pool.clone(),
+ )
+ .await?;
+ let mut tail_size = tail.size().await;
- // Trim invalid items from the tail blob.
- tail_size = Self::trim_tail(&tail, tail_size, tail_index).await?;
- if tail_size > full_size {
- return Err(Error::InvalidBlobSize(tail_index, tail_size));
+ // Trim the tail blob if necessary.
+ if !tail_size.is_multiple_of(Self::CHUNK_SIZE_U64) {
+ warn!(
+ blob = tail_index,
+ invalid_size = tail_size,
+ "last blob size is not a multiple of item size, truncating"
+ );
+ tail_size -= tail_size % Self::CHUNK_SIZE_U64;
+ tail.resize(tail_size).await?;
}
+ // Non-tail blobs can be immutable.
+ let mut blobs = try_join_all(blobs.into_iter().map(|(index, (blob, size))| {
+ let pool = cfg.buffer_pool.clone();
+ async move {
+ let blob = Append::new_immutable(blob, size, cfg.write_buffer.get(), pool).await?;
+ let logical_size = blob.size().await;
+ // Verify the non-tail blobs are full as expected.
+ if logical_size != full_size {
+ return Err(Error::InvalidBlobSize(index, logical_size));
+ }
+ Ok::<_, Error>((index, (blob, logical_size)))
+ }
+ }))
+ .await?;
+
// If the tail blob is full we need to start a new one to maintain its invariant that there
// is always room for another item.
if tail_size == full_size {
@@ -220,33 +243,30 @@ impl> Journal {
blob = tail_index,
"tail blob is full, creating a new empty one"
);
- blobs.insert(tail_index, (tail, tail_size));
+ tail.to_immutable().await?;
+ blobs.push((tail_index, (tail, tail_size)));
tail_index += 1;
- (tail, tail_size) = context
+ let (blob, blob_size) = context
.open(&cfg.partition, &tail_index.to_be_bytes())
.await?;
- assert_eq!(tail_size, 0);
+ assert_eq!(blob_size, 0);
+ tail = Append::new(
+ blob,
+ blob_size,
+ cfg.write_buffer.get(),
+ cfg.buffer_pool.clone(),
+ )
+ .await?;
+ tail_size = 0;
tracked.inc();
}
- // Wrap all blobs with Append wrappers.
- // TODO(https://github.com/commonwarexyz/monorepo/issues/1219): Consider creating an
- // Immutable wrapper which doesn't allocate a write buffer for these.
- let blobs = try_join_all(blobs.into_iter().map(|(index, (blob, size))| {
- let pool = cfg.buffer_pool.clone();
- async move {
- let blob = Append::new(blob, size, cfg.write_buffer, pool).await?;
- Ok::<_, Error>((index, (blob, size)))
- }
- }))
- .await?;
- let tail = Append::new(tail, tail_size, cfg.write_buffer, cfg.buffer_pool.clone()).await?;
- let size = tail_index * cfg.items_per_blob.get() + (tail_size / Self::CHUNK_SIZE_U64);
let pruning_boundary = if blobs.is_empty() {
tail_index * cfg.items_per_blob.get()
} else {
blobs[0].0 * cfg.items_per_blob.get()
};
+ let size = tail_index * cfg.items_per_blob.get() + (tail_size / Self::CHUNK_SIZE_U64);
assert!(size >= pruning_boundary);
Ok(Self {
@@ -267,51 +287,6 @@ impl> Journal {
})
}
- /// Trim any invalid data found at the end of the tail blob and return the new size. The new
- /// size will be less than or equal to the originally provided size, and a multiple of the item
- /// size.
- async fn trim_tail(
- tail: &::Blob,
- mut tail_size: u64,
- tail_index: u64,
- ) -> Result {
- let mut truncated = false;
- if !tail_size.is_multiple_of(Self::CHUNK_SIZE_U64) {
- warn!(
- blob = tail_index,
- invalid_size = tail_size,
- "last blob size is not a multiple of item size, truncating"
- );
- tail_size -= tail_size % Self::CHUNK_SIZE_U64;
- tail.resize(tail_size).await?;
- truncated = true;
- }
-
- // Truncate any records with failing checksums. This can happen if the file system allocated
- // extra space for a blob but there was a crash before any data was written to that space.
- while tail_size > 0 {
- let offset = tail_size - Self::CHUNK_SIZE_U64;
- let read = tail.read_at(vec![0u8; Self::CHUNK_SIZE], offset).await?;
- match Self::verify_integrity(read.as_ref()) {
- Ok(_) => break, // Valid item found, we can stop truncating.
- Err(Error::ChecksumMismatch(_, _)) => {
- warn!(blob = tail_index, offset, "checksum mismatch: truncating",);
- tail_size -= Self::CHUNK_SIZE_U64;
- tail.resize(tail_size).await?;
- truncated = true;
- }
- Err(err) => return Err(err),
- }
- }
-
- // If we truncated the blob, make sure to sync it.
- if truncated {
- tail.sync().await?;
- }
-
- Ok(tail_size)
- }
-
/// Sync any pending updates to disk.
pub async fn sync(&mut self) -> Result<(), Error> {
self.synced.inc();
@@ -332,17 +307,12 @@ impl> Journal {
let mut size = self.tail.size().await;
assert!(size < self.cfg.items_per_blob.get() * Self::CHUNK_SIZE_U64);
assert_eq!(size % Self::CHUNK_SIZE_U64, 0);
-
- // Pre-allocate exact size and write directly to avoid copying
- let mut buf: Vec = Vec::with_capacity(Self::CHUNK_SIZE);
- item.write(&mut buf);
- let checksum = crc32fast::hash(&buf);
- buf.put_u32(checksum);
+ let item = item.encode_mut();
// Write the item to the blob
let item_pos =
(size / Self::CHUNK_SIZE_U64) + self.cfg.items_per_blob.get() * self.tail_index;
- self.tail.append(buf).await?;
+ self.tail.append(&item).await?;
trace!(blob = self.tail_index, pos = item_pos, "appended item");
size += Self::CHUNK_SIZE_U64;
@@ -351,7 +321,7 @@ impl> Journal {
if size == self.cfg.items_per_blob.get() * Self::CHUNK_SIZE_U64 {
// Sync the tail blob before creating a new one so if we crash we don't end up with a
// non-full historical blob.
- self.tail.sync().await?;
+ self.tail.to_immutable().await?;
// Create a new empty blob.
let next_blob_index = self.tail_index + 1;
@@ -364,7 +334,7 @@ impl> Journal {
let next_blob = Append::new(
next_blob,
size,
- self.cfg.write_buffer,
+ self.cfg.write_buffer.get(),
self.cfg.buffer_pool.clone(),
)
.await?;
@@ -406,6 +376,7 @@ impl> Journal {
let (blob_index, mut new_tail) = self.blobs.pop_last().unwrap();
assert_eq!(blob_index, self.tail_index - 1);
std::mem::swap(&mut self.tail, &mut new_tail);
+ self.tail.to_mutable().await;
self.remove_blob(self.tail_index, new_tail).await?;
self.tail_index -= 1;
}
@@ -459,21 +430,14 @@ impl> Journal {
};
let read = blob.read_at(vec![0u8; Self::CHUNK_SIZE], offset).await?;
- Self::verify_integrity(read.as_ref())
+ Self::decode_buf(read.as_ref())
}
- /// Verify the integrity of the Array + checksum in `buf`, returning:
- /// - The array if it is valid,
- /// - Error::ChecksumMismatch if the checksum is invalid, or
- /// - Error::Codec if the array could not be decoded after passing the checksum check.
+ /// Decode the array from `buf`, returning:
+ /// - The decoded array on success, or Error::Codec if it could not be decoded.
///
/// Error::Codec likely indicates a logic error rather than a corruption issue.
- fn verify_integrity(buf: &[u8]) -> Result {
- let stored_checksum = u32::from_be_bytes(buf[A::SIZE..].try_into().unwrap());
- let checksum = crc32fast::hash(&buf[..A::SIZE]);
- if checksum != stored_checksum {
- return Err(Error::ChecksumMismatch(stored_checksum, checksum));
- }
+ fn decode_buf(buf: &[u8]) -> Result {
A::decode(&buf[..A::SIZE]).map_err(Error::Codec)
}
@@ -498,24 +462,20 @@ impl> Journal {
let start_blob = start_pos / items_per_blob;
assert!(start_blob <= self.tail_index);
let blobs = self.blobs.range(start_blob..).collect::>();
- let full_size = items_per_blob * Self::CHUNK_SIZE_U64;
- let mut blob_plus = Vec::with_capacity(blobs.len() + 1);
+ let mut readers = Vec::with_capacity(blobs.len() + 1);
for (blob_index, blob) in blobs {
- blob_plus.push((*blob_index, blob.clone_blob().await, full_size));
+ let reader = blob.as_blob_reader(buffer).await?;
+ readers.push((*blob_index, reader));
}
// Include the tail blob.
- self.tail.sync().await?; // make sure no data is buffered
- let tail_size = self.tail.size().await;
- blob_plus.push((self.tail_index, self.tail.clone_blob().await, tail_size));
+ let tail_reader = self.tail.as_blob_reader(buffer).await?;
+ readers.push((self.tail_index, tail_reader));
let start_offset = (start_pos % items_per_blob) * Self::CHUNK_SIZE_U64;
// Replay all blobs in order and stream items as they are read (to avoid occupying too much
// memory with buffered data).
- let stream = stream::iter(blob_plus).flat_map(move |(blob_index, blob, size)| {
- // Create a new reader and buffer for each blob. Preallocating the buffer here to avoid
- // a per-iteration allocation improves performance by ~20%.
- let mut reader = Read::new(blob, size, buffer);
+ let stream = stream::iter(readers).flat_map(move |(blob_index, mut reader)| {
let buf = vec![0u8; Self::CHUNK_SIZE];
let initial_offset = if blob_index == start_blob {
// If this is the very first blob then we need to seek to the starting position.
@@ -538,7 +498,7 @@ impl> Journal {
match reader.read_exact(&mut buf, Self::CHUNK_SIZE).await {
Ok(()) => {
let next_offset = offset + Self::CHUNK_SIZE_U64;
- let result = Self::verify_integrity(&buf).map(|item| (item_pos, item));
+ let result = Self::decode_buf(&buf).map(|item| (item_pos, item));
if result.is_err() {
warn!("corrupted item at {item_pos}");
}
@@ -550,7 +510,8 @@ impl> Journal {
err = err.to_string(),
"error reading item during replay"
);
- Some((Err(Error::Runtime(err)), (buf, reader, size)))
+ let blob_size = reader.blob_size();
+ Some((Err(Error::Runtime(err)), (buf, reader, blob_size)))
}
}
},
@@ -698,11 +659,15 @@ mod tests {
use super::*;
use commonware_cryptography::{sha256::Digest, Hasher as _, Sha256};
use commonware_macros::test_traced;
- use commonware_runtime::{deterministic, Blob, Runner, Storage};
- use commonware_utils::{NZUsize, NZU64};
+ use commonware_runtime::{
+ deterministic::{self, Context},
+ Blob, Runner, Storage,
+ };
+ use commonware_utils::{NZUsize, NZU16, NZU64};
use futures::{pin_mut, StreamExt};
+ use std::num::NonZeroU16;
- const PAGE_SIZE: NonZeroUsize = NZUsize!(44);
+ const PAGE_SIZE: NonZeroU16 = NZU16!(44);
const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(3);
/// Generate a SHA-256 digest for the given value.
@@ -966,19 +931,16 @@ mod tests {
journal.sync().await.expect("Failed to sync journal");
drop(journal);
- // Corrupt one of the checksums and make sure it's detected.
- let checksum_offset = Digest::SIZE as u64
- + (ITEMS_PER_BLOB.get() / 2) * (Digest::SIZE + u32::SIZE) as u64;
+ // Corrupt a few bytes and make sure the corruption is detected.
let (blob, _) = context
.open(&cfg.partition, &40u64.to_be_bytes())
.await
.expect("Failed to open blob");
- // Write incorrect checksum
- let bad_checksum = 123456789u32;
- blob.write_at(bad_checksum.to_be_bytes().to_vec(), checksum_offset)
+ // Write junk bytes.
+ let bad_bytes = 123456789u32;
+ blob.write_at(bad_bytes.to_be_bytes().to_vec(), 1)
.await
- .expect("Failed to write incorrect checksum");
- let corrupted_item_pos = 40 * ITEMS_PER_BLOB.get() + ITEMS_PER_BLOB.get() / 2;
+ .expect("Failed to write bad bytes");
blob.sync().await.expect("Failed to sync blob");
// Re-initialize the journal to simulate a restart
@@ -986,19 +948,22 @@ mod tests {
.await
.expect("Failed to re-initialize journal");
- // Make sure reading the corrupted item fails with appropriate error.
- let err = journal.read(corrupted_item_pos).await.unwrap_err();
- assert!(matches!(err, Error::ChecksumMismatch(x, _) if x == bad_checksum));
+ // Make sure reading an item that resides in the corrupted page fails.
+ let err = journal
+ .read(40 * ITEMS_PER_BLOB.get() + 1)
+ .await
+ .unwrap_err();
+ assert!(matches!(err, Error::Runtime(_)));
- // Replay all items, making sure the checksum mismatch error is handled correctly.
+ // Replay all items.
{
+ let mut error_found = false;
let stream = journal
.replay(NZUsize!(1024), 0)
.await
.expect("failed to replay journal");
let mut items = Vec::new();
pin_mut!(stream);
- let mut error_count = 0;
while let Some(result) = stream.next().await {
match result {
Ok((pos, item)) => {
@@ -1006,17 +971,13 @@ mod tests {
items.push(pos);
}
Err(err) => {
- error_count += 1;
- assert!(matches!(err, Error::ChecksumMismatch(_, _)));
+ error_found = true;
+ assert!(matches!(err, Error::Runtime(_)));
+ break;
}
}
}
- assert_eq!(error_count, 1);
- // Result will be missing only the one corrupted value.
- assert_eq!(
- items.len(),
- ITEMS_PER_BLOB.get() as usize * 100 + ITEMS_PER_BLOB.get() as usize / 2 - 1
- );
+ assert!(error_found); // error should abort replay
}
});
}
@@ -1056,10 +1017,7 @@ mod tests {
blob.resize(size - 1).await.expect("Failed to corrupt blob");
blob.sync().await.expect("Failed to sync blob");
let result = Journal::<_, Digest>::init(context.clone(), cfg.clone()).await;
- assert!(matches!(
- result.err().unwrap(),
- Error::InvalidBlobSize(_, _)
- ));
+ assert!(matches!(result.err().unwrap(), Error::Runtime(_)));
// Delete a blob and make sure the gap is detected during initialization.
context
@@ -1096,32 +1054,9 @@ mod tests {
journal.sync().await.expect("Failed to sync journal");
drop(journal);
- // Truncate the tail blob by one byte, which should result in the 3rd item being
- // trimmed.
- let (blob, size) = context
- .open(&cfg.partition, &1u64.to_be_bytes())
- .await
- .expect("Failed to open blob");
- blob.resize(size - 1).await.expect("Failed to corrupt blob");
-
- // Write incorrect checksum into the second item in the blob, which should result in the
- // second item being trimmed.
- let checksum_offset = Digest::SIZE + u32::SIZE + Digest::SIZE;
-
- let bad_checksum = 123456789u32;
- blob.write_at(bad_checksum.to_be_bytes().to_vec(), checksum_offset as u64)
- .await
- .expect("Failed to write incorrect checksum");
- blob.sync().await.expect("Failed to sync blob");
-
- let journal = Journal::<_, Digest>::init(context.clone(), cfg.clone())
- .await
- .unwrap();
-
- // Confirm 2 items were trimmed.
- assert_eq!(journal.size(), item_count - 2);
-
- // Corrupt the last item, ensuring last blob is trimmed to empty state.
+ // Truncate the tail blob by one byte, which should result in the last page's worth of
+ // data being discarded due to an invalid checksum. This will result in one item being
+ // lost.
let (blob, size) = context
.open(&cfg.partition, &1u64.to_be_bytes())
.await
@@ -1133,8 +1068,8 @@ mod tests {
.await
.unwrap();
- // Confirm last item in blob was trimmed.
- assert_eq!(journal.size(), item_count - 3);
+ // Confirm 1 item was lost.
+ assert_eq!(journal.size(), item_count - 1);
// Cleanup.
journal.destroy().await.expect("Failed to destroy journal");
@@ -1339,14 +1274,13 @@ mod tests {
journal.sync().await.expect("Failed to sync journal");
drop(journal);
- // Manually extend the blob by an amount at least some multiple of the chunk size to
- // simulate a failure where the file was extended, but no bytes were written due to
- // failure.
+ // Manually extend the blob to simulate a crash where the file was extended, but no
+ // bytes were actually written.
let (blob, size) = context
.open(&cfg.partition, &0u64.to_be_bytes())
.await
.expect("Failed to open blob");
- blob.write_at(vec![0u8; Digest::SIZE * 3 - 1], size)
+ blob.write_at(vec![0u8; PAGE_SIZE.get() as usize * 3], size)
.await
.expect("Failed to extend blob");
blob.sync().await.expect("Failed to sync blob");
@@ -1356,7 +1290,7 @@ mod tests {
.await
.expect("Failed to re-initialize journal");
- // Ensure we've recovered to the state of a single item.
+ // No items should be lost since we called sync.
assert_eq!(journal.size(), 1);
assert_eq!(journal.oldest_retained_pos(), Some(0));
@@ -1367,10 +1301,6 @@ mod tests {
.expect("failed to append data");
assert_eq!(journal.size(), 2);
- // Get the value of the first item
- let item = journal.read(0).await.unwrap();
- assert_eq!(item, test_digest(0));
-
// Get the value of new item
let item = journal.read(1).await.unwrap();
assert_eq!(item, test_digest(1));
@@ -1487,4 +1417,87 @@ mod tests {
journal.destroy().await.unwrap();
});
}
+
+ /// Test recovery when blob is truncated to a page boundary with item size not dividing page size.
+ ///
+ /// This tests the scenario where:
+ /// 1. Items (32 bytes) don't divide evenly into page size (44 bytes)
+ /// 2. Data spans multiple pages
+ /// 3. Blob is truncated to a page boundary (simulating crash before last page was written)
+ /// 4. Journal should recover correctly on reopen
+ #[test_traced]
+ fn test_fixed_journal_recover_from_page_boundary_truncation() {
+ let executor = deterministic::Runner::default();
+ executor.start(|context: Context| async move {
+ // Use a small items_per_blob to keep the test focused on a single blob
+ let cfg = test_cfg(NZU64!(100));
+ let mut journal = Journal::init(context.clone(), cfg.clone())
+ .await
+ .expect("failed to initialize journal");
+
+ // Item size is 32 bytes (Digest), page size is 44 bytes.
+ // 32 doesn't divide 44, so items will cross page boundaries.
+ // Physical page size = 44 + 12 (CRC) = 56 bytes.
+ //
+ // Write enough items to span multiple pages:
+ // - 10 items = 320 logical bytes
+ // - This spans ceil(320/44) = 8 logical pages
+ for i in 0u64..10 {
+ journal
+ .append(test_digest(i))
+ .await
+ .expect("failed to append data");
+ }
+ assert_eq!(journal.size(), 10);
+ journal.sync().await.expect("Failed to sync journal");
+ drop(journal);
+
+ // Open the blob directly and truncate to a page boundary.
+ // Physical page size = PAGE_SIZE + CHECKSUM_SIZE = 44 + 12 = 56
+ let physical_page_size = PAGE_SIZE.get() as u64 + 12;
+ let (blob, size) = context
+ .open(&cfg.partition, &0u64.to_be_bytes())
+ .await
+ .expect("Failed to open blob");
+
+ // Calculate how many full physical pages we have and truncate to lose the last one.
+ let full_pages = size / physical_page_size;
+ assert!(full_pages >= 2, "need at least 2 pages for this test");
+ let truncate_to = (full_pages - 1) * physical_page_size;
+
+ blob.resize(truncate_to)
+ .await
+ .expect("Failed to truncate blob");
+ blob.sync().await.expect("Failed to sync blob");
+
+ // Re-initialize the journal - it should recover by truncating to valid data
+ let journal = Journal::<_, Digest>::init(context.clone(), cfg.clone())
+ .await
+ .expect("Failed to re-initialize journal after page truncation");
+
+ // The journal should have fewer items now (those that fit in the remaining pages).
+ // With logical page size 44 and item size 32:
+ // - After truncating to (full_pages-1) physical pages, we have (full_pages-1)*44 logical bytes
+ // - Number of complete items = floor(logical_bytes / 32)
+ let remaining_logical_bytes = (full_pages - 1) * PAGE_SIZE.get() as u64;
+ let expected_items = remaining_logical_bytes / 32; // 32 = Digest::SIZE
+ assert_eq!(
+ journal.size(),
+ expected_items,
+ "Journal should recover to {} items after truncation",
+ expected_items
+ );
+
+ // Verify we can still read the remaining items
+ for i in 0..expected_items {
+ let item = journal
+ .read(i)
+ .await
+ .expect("failed to read recovered item");
+ assert_eq!(item, test_digest(i), "item {} mismatch after recovery", i);
+ }
+
+ journal.destroy().await.expect("Failed to destroy journal");
+ });
+ }
}
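
The recovery arithmetic asserted in the new test follows directly from the page layout: each physical page is the logical page plus a 12-byte CRC record, and recovery keeps however many whole items fit in the surviving logical bytes. Below is a minimal standalone sketch of that calculation, using the test's assumed sizes (44-byte logical pages, 12-byte CRC records, 32-byte digests) rather than library constants:

```rust
/// Sketch of the recovery math used in the test above. The 44/12/32 values
/// mirror the test's assumptions; they are not exported library constants.
const LOGICAL_PAGE: u64 = 44; // PAGE_SIZE in the test
const CRC_RECORD: u64 = 12; // per-page CRC record appended on disk
const ITEM_SIZE: u64 = 32; // Digest::SIZE

/// How many items survive after dropping the last physical page of a blob.
fn expected_items_after_truncation(blob_size: u64) -> u64 {
    let physical_page = LOGICAL_PAGE + CRC_RECORD; // 56 bytes on disk per page
    let full_pages = blob_size / physical_page;
    let surviving_logical_bytes = full_pages.saturating_sub(1) * LOGICAL_PAGE;
    surviving_logical_bytes / ITEM_SIZE // only complete items are recovered
}

fn main() {
    // 10 items = 320 logical bytes = 8 logical pages = 8 * 56 = 448 physical bytes.
    // Dropping the last page leaves 7 * 44 = 308 logical bytes, i.e. 9 whole items.
    assert_eq!(expected_items_after_truncation(448), 9);
}
```
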
diff --git a/storage/src/journal/contiguous/variable.rs b/storage/src/journal/contiguous/variable.rs
index 990cf32f7f..fddddaecf6 100644
--- a/storage/src/journal/contiguous/variable.rs
+++ b/storage/src/journal/contiguous/variable.rs
@@ -856,13 +856,17 @@ mod tests {
use super::*;
use crate::journal::contiguous::tests::run_contiguous_tests;
use commonware_macros::test_traced;
- use commonware_runtime::{buffer::PoolRef, deterministic, Runner};
- use commonware_utils::{NZUsize, NZU64};
+ use commonware_runtime::{buffer::pool::PoolRef, deterministic, Runner};
+ use commonware_utils::{NZUsize, NZU16, NZU64};
use futures::FutureExt as _;
+ use std::num::NonZeroU16;
// Use some jank sizes to exercise boundary conditions.
- const PAGE_SIZE: usize = 101;
+ const PAGE_SIZE: NonZeroU16 = NZU16!(101);
const PAGE_CACHE_SIZE: usize = 2;
+ // Alternative page sizes for tests that need larger or smaller pages than the default.
+ const LARGE_PAGE_SIZE: NonZeroU16 = NZU16!(1024);
+ const SMALL_PAGE_SIZE: NonZeroU16 = NZU16!(512);
/// Test that complete offsets partition loss after pruning is detected as unrecoverable.
///
@@ -878,7 +882,7 @@ mod tests {
items_per_section: NZU64!(10),
compression: None,
codec_config: (),
- buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)),
+ buffer_pool: PoolRef::new(LARGE_PAGE_SIZE, NZUsize!(10)),
write_buffer: NZUsize!(1024),
};
@@ -928,7 +932,7 @@ mod tests {
items_per_section: NZU64!(10),
compression: None,
codec_config: (),
- buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)),
+ buffer_pool: PoolRef::new(LARGE_PAGE_SIZE, NZUsize!(10)),
write_buffer: NZUsize!(1024),
};
@@ -993,7 +997,7 @@ mod tests {
items_per_section: NZU64!(10),
compression: None,
codec_config: (),
- buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)),
+ buffer_pool: PoolRef::new(LARGE_PAGE_SIZE, NZUsize!(10)),
write_buffer: NZUsize!(1024),
},
)
@@ -1015,7 +1019,7 @@ mod tests {
items_per_section: NZU64!(10),
compression: None,
codec_config: (),
- buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)),
+ buffer_pool: PoolRef::new(LARGE_PAGE_SIZE, NZUsize!(10)),
write_buffer: NZUsize!(1024),
};
@@ -1103,7 +1107,7 @@ mod tests {
items_per_section: NZU64!(10),
compression: None,
codec_config: (),
- buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)),
+ buffer_pool: PoolRef::new(LARGE_PAGE_SIZE, NZUsize!(10)),
write_buffer: NZUsize!(1024),
};
@@ -1186,7 +1190,7 @@ mod tests {
items_per_section: NZU64!(10),
compression: None,
codec_config: (),
- buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)),
+ buffer_pool: PoolRef::new(LARGE_PAGE_SIZE, NZUsize!(10)),
write_buffer: NZUsize!(1024),
};
@@ -1248,7 +1252,7 @@ mod tests {
items_per_section: NZU64!(10),
compression: None,
codec_config: (),
- buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)),
+ buffer_pool: PoolRef::new(LARGE_PAGE_SIZE, NZUsize!(10)),
write_buffer: NZUsize!(1024),
};
@@ -1285,7 +1289,7 @@ mod tests {
items_per_section: NZU64!(10),
compression: None,
codec_config: (),
- buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)),
+ buffer_pool: PoolRef::new(LARGE_PAGE_SIZE, NZUsize!(10)),
write_buffer: NZUsize!(1024),
};
@@ -1341,7 +1345,7 @@ mod tests {
items_per_section: NZU64!(10),
compression: None,
codec_config: (),
- buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)),
+ buffer_pool: PoolRef::new(LARGE_PAGE_SIZE, NZUsize!(10)),
write_buffer: NZUsize!(1024),
};
@@ -1409,7 +1413,7 @@ mod tests {
items_per_section: NZU64!(10),
compression: None,
codec_config: (),
- buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)),
+ buffer_pool: PoolRef::new(LARGE_PAGE_SIZE, NZUsize!(10)),
write_buffer: NZUsize!(1024),
};
@@ -1469,7 +1473,7 @@ mod tests {
items_per_section: NZU64!(10),
compression: None,
codec_config: (),
- buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)),
+ buffer_pool: PoolRef::new(LARGE_PAGE_SIZE, NZUsize!(10)),
write_buffer: NZUsize!(1024),
};
@@ -1536,7 +1540,7 @@ mod tests {
items_per_section: NZU64!(10),
compression: None,
codec_config: (),
- buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)),
+ buffer_pool: PoolRef::new(LARGE_PAGE_SIZE, NZUsize!(10)),
write_buffer: NZUsize!(1024),
};
@@ -1578,7 +1582,7 @@ mod tests {
items_per_section: NZU64!(5),
compression: None,
codec_config: (),
- buffer_pool: PoolRef::new(NZUsize!(512), NZUsize!(2)),
+ buffer_pool: PoolRef::new(SMALL_PAGE_SIZE, NZUsize!(2)),
write_buffer: NZUsize!(1024),
};
@@ -1611,7 +1615,7 @@ mod tests {
items_per_section: NZU64!(5),
compression: None,
codec_config: (),
- buffer_pool: PoolRef::new(NZUsize!(512), NZUsize!(2)),
+ buffer_pool: PoolRef::new(SMALL_PAGE_SIZE, NZUsize!(2)),
write_buffer: NZUsize!(1024),
};
@@ -1650,7 +1654,7 @@ mod tests {
items_per_section: NZU64!(5),
compression: None,
codec_config: (),
- buffer_pool: PoolRef::new(NZUsize!(512), NZUsize!(2)),
+ buffer_pool: PoolRef::new(SMALL_PAGE_SIZE, NZUsize!(2)),
write_buffer: NZUsize!(1024),
};
@@ -1684,7 +1688,7 @@ mod tests {
items_per_section: NZU64!(5),
compression: None,
codec_config: (),
- buffer_pool: PoolRef::new(NZUsize!(512), NZUsize!(2)),
+ buffer_pool: PoolRef::new(SMALL_PAGE_SIZE, NZUsize!(2)),
write_buffer: NZUsize!(1024),
};
@@ -1736,7 +1740,7 @@ mod tests {
items_per_section: NZU64!(5),
compression: None,
codec_config: (),
- buffer_pool: PoolRef::new(NZUsize!(512), NZUsize!(2)),
+ buffer_pool: PoolRef::new(SMALL_PAGE_SIZE, NZUsize!(2)),
write_buffer: NZUsize!(1024),
};
@@ -1767,7 +1771,7 @@ mod tests {
items_per_section: NZU64!(5),
compression: None,
codec_config: (),
- buffer_pool: PoolRef::new(NZUsize!(512), NZUsize!(2)),
+ buffer_pool: PoolRef::new(SMALL_PAGE_SIZE, NZUsize!(2)),
write_buffer: NZUsize!(1024),
};
@@ -1813,7 +1817,7 @@ mod tests {
compression: None,
codec_config: (),
write_buffer: NZUsize!(1024),
- buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+ buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(PAGE_CACHE_SIZE)),
};
// Initialize journal with sync boundaries when no existing data exists
@@ -1851,7 +1855,7 @@ mod tests {
compression: None,
codec_config: (),
write_buffer: NZUsize!(1024),
- buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+ buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(PAGE_CACHE_SIZE)),
};
// Create initial journal with data in multiple sections
@@ -1921,7 +1925,7 @@ mod tests {
compression: None,
codec_config: (),
write_buffer: NZUsize!(1024),
- buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+ buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(PAGE_CACHE_SIZE)),
};
#[allow(clippy::reversed_empty_ranges)]
@@ -1946,7 +1950,7 @@ mod tests {
compression: None,
codec_config: (),
write_buffer: NZUsize!(1024),
- buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+ buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(PAGE_CACHE_SIZE)),
};
// Create initial journal with data exactly matching sync range
@@ -2016,7 +2020,7 @@ mod tests {
compression: None,
codec_config: (),
write_buffer: NZUsize!(1024),
- buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+ buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(PAGE_CACHE_SIZE)),
};
// Create initial journal with data beyond sync range
@@ -2060,7 +2064,7 @@ mod tests {
compression: None,
codec_config: (),
write_buffer: NZUsize!(1024),
- buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+ buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(PAGE_CACHE_SIZE)),
};
// Create initial journal with stale data
@@ -2113,7 +2117,7 @@ mod tests {
compression: None,
codec_config: (),
write_buffer: NZUsize!(1024),
- buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+ buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(PAGE_CACHE_SIZE)),
};
// Create journal with data at section boundaries
@@ -2182,7 +2186,7 @@ mod tests {
compression: None,
codec_config: (),
write_buffer: NZUsize!(1024),
- buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+ buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(PAGE_CACHE_SIZE)),
};
// Create journal with data in multiple sections
diff --git a/storage/src/journal/mod.rs b/storage/src/journal/mod.rs
index 9f57cce711..1f72e813a0 100644
--- a/storage/src/journal/mod.rs
+++ b/storage/src/journal/mod.rs
@@ -29,8 +29,6 @@ pub enum Error {
InvalidBlobName(String),
#[error("invalid blob size: index={0} size={1}")]
InvalidBlobSize(u64, u64),
- #[error("checksum mismatch: expected={0} actual={1}")]
- ChecksumMismatch(u32, u32),
#[error("item too large: size={0}")]
ItemTooLarge(usize),
#[error("already pruned to section: {0}")]
@@ -61,4 +59,6 @@ pub enum Error {
Corruption(String),
#[error("invalid configuration: {0}")]
InvalidConfiguration(String),
+ #[error("checksum mismatch: expected={0}, found={1}")]
+ ChecksumMismatch(u32, u32),
}
diff --git a/storage/src/journal/segmented/fixed.rs b/storage/src/journal/segmented/fixed.rs
index 05cc2e0a68..a38d666a76 100644
--- a/storage/src/journal/segmented/fixed.rs
+++ b/storage/src/journal/segmented/fixed.rs
@@ -2,15 +2,12 @@
//!
//! # Format
//!
-//! Data is stored in one blob per section. Within each blob, items are stored with
-//! their checksum (CRC32):
+//! Data is stored in one blob per section. Items are stored sequentially:
//!
//! ```text
-//! +--------+-----------+--------+-----------+--------+----------+-------------+
-//! | item_0 | C(Item_0) | item_1 | C(Item_1) | ... | item_n-1 | C(Item_n-1) |
-//! +--------+-----------+--------+-----------+--------+----------+-------------+
-//!
-//! C = CRC32
+//! +--------+--------+--------+----------+
+//! | item_0 | item_1 | ... | item_n-1 |
+//! +--------+--------+--------+----------+
//! ```
//!
//! # Sync
@@ -25,12 +22,8 @@
use super::manager::{AppendFactory, Config as ManagerConfig, Manager};
use crate::journal::Error;
-use bytes::BufMut;
-use commonware_codec::{CodecFixed, DecodeExt as _, FixedSize};
-use commonware_runtime::{
- buffer::{PoolRef, Read},
- Blob, Error as RError, Metrics, Storage,
-};
+use commonware_codec::{CodecFixed, DecodeExt as _};
+use commonware_runtime::{buffer::PoolRef, Blob, Error as RError, Metrics, Storage};
use futures::{
stream::{self, Stream},
StreamExt,
@@ -54,15 +47,15 @@ pub struct Config {
/// A segmented journal with fixed-size entries.
///
/// Each section is stored in a separate blob. Within each blob, items are
-/// fixed-size with a CRC32 checksum appended.
+/// fixed-size.
pub struct Journal {
manager: Manager,
_array: PhantomData,
}
impl> Journal {
- /// Size of each entry: item + CRC32 checksum.
- pub const CHUNK_SIZE: usize = A::SIZE + u32::SIZE;
+ /// Size of each entry.
+ pub const CHUNK_SIZE: usize = A::SIZE;
const CHUNK_SIZE_U64: u64 = Self::CHUNK_SIZE as u64;
/// Initialize a new `Journal` instance.
@@ -100,13 +93,9 @@ impl> Journal {
}
let position = size / Self::CHUNK_SIZE_U64;
- // Pre-allocate exact size and write directly to avoid copying
- let mut buf: Vec<u8> = Vec::with_capacity(Self::CHUNK_SIZE);
- item.write(&mut buf);
- let checksum = crc32fast::hash(&buf);
- buf.put_u32(checksum);
-
- blob.append(buf).await?;
+ // Encode the item
+ let buf = item.encode_mut();
+ blob.append(&buf).await?;
trace!(section, position, "appended item");
Ok(position)
@@ -136,7 +125,7 @@ impl> Journal {
}
let buf = blob.read_at(vec![0u8; Self::CHUNK_SIZE], offset).await?;
- Self::verify_integrity(buf.as_ref())
+ A::decode(buf.as_ref()).map_err(Error::Codec)
}
/// Read the last item in a section, if any.
@@ -154,18 +143,7 @@ impl> Journal {
let last_position = (size / Self::CHUNK_SIZE_U64) - 1;
let offset = last_position * Self::CHUNK_SIZE_U64;
let buf = blob.read_at(vec![0u8; Self::CHUNK_SIZE], offset).await?;
- Self::verify_integrity(buf.as_ref()).map(Some)
- }
-
- /// Verify the integrity of the item + checksum in `buf`.
- fn verify_integrity(buf: &[u8]) -> Result<A, Error> {
- let stored_checksum =
- u32::from_be_bytes(buf[A::SIZE..].try_into().expect("checksum is 4 bytes"));
- let checksum = crc32fast::hash(&buf[..A::SIZE]);
- if checksum != stored_checksum {
- return Err(Error::ChecksumMismatch(stored_checksum, checksum));
- }
- A::decode(&buf[..A::SIZE]).map_err(Error::Codec)
+ A::decode(buf.as_ref()).map_err(Error::Codec).map(Some)
}
/// Returns a stream of all items starting from the given section.
@@ -180,21 +158,22 @@ impl> Journal {
start_section: u64,
buffer: NonZeroUsize,
) -> Result> + '_, Error> {
+ // Pre-create readers from blobs (async operation)
let mut blob_info = Vec::new();
for (&section, blob) in self.manager.sections_from(start_section) {
- let size = blob.size().await;
- blob_info.push((section, blob.clone(), size));
+ let blob_size = blob.size().await;
+ let reader = blob.as_blob_reader(buffer).await?;
+ blob_info.push((section, blob.clone(), reader, blob_size));
}
- Ok(
- stream::iter(blob_info).flat_map(move |(section, blob, blob_size)| {
- let reader = Read::new(blob, blob_size, buffer);
+ Ok(stream::iter(blob_info).flat_map(
+ move |(section, blob, reader, blob_size)| {
let buf = vec![0u8; Self::CHUNK_SIZE];
stream::unfold(
- (section, buf, reader, 0u64, 0u64),
- move |(section, mut buf, mut reader, offset, valid_size)| async move {
- if offset >= reader.blob_size() {
+ (section, buf, blob, reader, 0u64, 0u64, blob_size),
+ move |(section, mut buf, blob, mut reader, offset, valid_size, blob_size)| async move {
+ if offset >= blob_size {
return None;
}
@@ -202,25 +181,13 @@ impl> Journal {
match reader.read_exact(&mut buf, Self::CHUNK_SIZE).await {
Ok(()) => {
let next_offset = offset + Self::CHUNK_SIZE_U64;
- match Self::verify_integrity(&buf) {
+ match A::decode(buf.as_slice()).map_err(Error::Codec) {
Ok(item) => Some((
Ok((section, position, item)),
- (section, buf, reader, next_offset, next_offset),
+ (section, buf, blob, reader, next_offset, next_offset, blob_size),
)),
- Err(Error::ChecksumMismatch(expected, found)) => {
- warn!(
- section,
- position,
- expected,
- found,
- new_size = valid_size,
- "corruption detected: truncating"
- );
- reader.resize(valid_size).await.ok()?;
- None
- }
Err(err) => {
- Some((Err(err), (section, buf, reader, offset, valid_size)))
+ Some((Err(err), (section, buf, blob, reader, offset, valid_size, blob_size)))
}
}
}
@@ -231,21 +198,21 @@ impl> Journal {
new_size = valid_size,
"trailing bytes detected: truncating"
);
- reader.resize(valid_size).await.ok()?;
+ blob.resize(valid_size).await.ok()?;
None
}
Err(err) => {
warn!(section, position, ?err, "unexpected error");
Some((
Err(Error::Runtime(err)),
- (section, buf, reader, offset, valid_size),
+ (section, buf, blob, reader, offset, valid_size, blob_size),
))
}
}
},
)
- }),
- )
+ },
+ ))
}
/// Sync the given section to storage.
@@ -316,10 +283,11 @@ mod tests {
use commonware_cryptography::{sha256::Digest, Hasher as _, Sha256};
use commonware_macros::test_traced;
use commonware_runtime::{buffer::PoolRef, deterministic, Runner};
- use commonware_utils::NZUsize;
+ use commonware_utils::{NZUsize, NZU16};
+ use core::num::NonZeroU16;
use futures::{pin_mut, StreamExt};
- const PAGE_SIZE: NonZeroUsize = NZUsize!(44);
+ const PAGE_SIZE: NonZeroU16 = NZU16!(44);
const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(3);
fn test_digest(value: u64) -> Digest {
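
Since this file's change drops the per-item CRC32, an entry in the fixed segmented journal is now just the encoded item, so all offset and position math is in multiples of `A::SIZE`. The following is a brief sketch of the before/after layout, assuming a 32-byte item (the `Digest` these tests use); integrity now comes from the page-level CRC record rather than a per-item checksum:

```rust
// Offset math before and after the CHUNK_SIZE change, with an assumed
// 32-byte item; these are illustrative numbers, not library constants.
const ITEM_SIZE: u64 = 32;

// Old on-disk chunk: item followed by a 4-byte CRC32.
fn old_offset(position: u64) -> u64 {
    position * (ITEM_SIZE + 4)
}

// New on-disk chunk: just the item; corruption is caught by the buffer
// pool's page-level CRC record instead of a per-item checksum.
fn new_offset(position: u64) -> u64 {
    position * ITEM_SIZE
}

fn main() {
    assert_eq!(old_offset(3), 108);
    assert_eq!(new_offset(3), 96);
}
```
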
diff --git a/storage/src/journal/segmented/manager.rs b/storage/src/journal/segmented/manager.rs
index 3c4ca526e5..e424e3ffe8 100644
--- a/storage/src/journal/segmented/manager.rs
+++ b/storage/src/journal/segmented/manager.rs
@@ -5,7 +5,7 @@
use crate::journal::Error;
use commonware_runtime::{
- buffer::{Append, PoolRef, Write},
+ buffer::{pool::Append, PoolRef, Write},
telemetry::metrics::status::GaugeExt,
Blob, Error as RError, Metrics, Storage,
};
@@ -61,7 +61,7 @@ impl BufferFactory for AppendFactory {
type Buffer = Append;
async fn create(&self, blob: B, size: u64) -> Result {
- Append::new(blob, size, self.write_buffer, self.pool_ref.clone()).await
+ Append::new(blob, size, self.write_buffer.get(), self.pool_ref.clone()).await
}
}
diff --git a/storage/src/journal/segmented/oversized.rs b/storage/src/journal/segmented/oversized.rs
index 18718f9397..baa8c864df 100644
--- a/storage/src/journal/segmented/oversized.rs
+++ b/storage/src/journal/segmented/oversized.rs
@@ -444,7 +444,7 @@ mod tests {
use commonware_codec::{FixedSize, Read, ReadExt, Write};
use commonware_macros::test_traced;
use commonware_runtime::{buffer::PoolRef, deterministic, Blob as _, Runner};
- use commonware_utils::NZUsize;
+ use commonware_utils::{NZUsize, NZU16};
/// Convert offset + size to byte end position (for truncation tests).
fn byte_end(offset: u64, size: u32) -> u64 {
@@ -512,7 +512,7 @@ mod tests {
Config {
index_partition: "test_index".to_string(),
value_partition: "test_values".to_string(),
- index_buffer_pool: PoolRef::new(NZUsize!(64), NZUsize!(8)),
+ index_buffer_pool: PoolRef::new(NZU16!(64), NZUsize!(8)),
index_write_buffer: NZUsize!(1024),
value_write_buffer: NZUsize!(1024),
compression: None,
@@ -900,7 +900,18 @@ mod tests {
fn test_recovery_corrupted_last_index_entry() {
let executor = deterministic::Runner::default();
executor.start(|context| async move {
- let cfg = test_cfg();
+ // Use page size = entry size so each entry is on its own page.
+ // This allows corrupting just the last entry's page without affecting others.
+ // Physical page size = TestEntry::SIZE (20) + 12 (CRC record) = 32 bytes.
+ let cfg = Config {
+ index_partition: "test_index".to_string(),
+ value_partition: "test_values".to_string(),
+ index_buffer_pool: PoolRef::new(NZU16!(TestEntry::SIZE as u16), NZUsize!(8)),
+ index_write_buffer: NZUsize!(1024),
+ value_write_buffer: NZUsize!(1024),
+ compression: None,
+ codec_config: (),
+ };
// Create and populate
let mut oversized: Oversized<_, TestEntry, TestValue> =
@@ -908,7 +919,7 @@ mod tests {
.await
.expect("Failed to init");
- // Append 5 entries
+ // Append 5 entries (each on its own page)
for i in 0..5u8 {
let value: TestValue = [i; 16];
let entry = TestEntry::new(i as u64, 0, 0);
@@ -920,34 +931,36 @@ mod tests {
oversized.sync(1).await.expect("Failed to sync");
drop(oversized);
- // Corrupt the last index entry's checksum
+ // Corrupt the last page's CRC to trigger page-level integrity failure
let (blob, size) = context
.open(&cfg.index_partition, &1u64.to_be_bytes())
.await
.expect("Failed to open blob");
- // Each entry is TestEntry::SIZE (16) + 4 (CRC32) = 20 bytes
- // Corrupt the CRC of the last entry
- let last_entry_crc_offset = size - 4;
- blob.write_at(vec![0xFF, 0xFF, 0xFF, 0xFF], last_entry_crc_offset)
+ // Physical page size = 20 + 12 = 32 bytes
+ // 5 entries = 5 pages = 160 bytes total
+ // Last page CRC starts at offset 160 - 12 = 148
+ assert_eq!(size, 160);
+ let last_page_crc_offset = size - 12;
+ blob.write_at(vec![0xFF; 12], last_page_crc_offset)
.await
.expect("Failed to corrupt");
blob.sync().await.expect("Failed to sync");
drop(blob);
- // Reinitialize - should detect corruption and scan backwards
+ // Reinitialize - should detect page corruption and truncate
let mut oversized: Oversized<_, TestEntry, TestValue> =
Oversized::init(context.clone(), cfg)
.await
.expect("Failed to reinit");
- // First 4 entries should be valid
+ // First 4 entries should be valid (on pages 0-3)
for i in 0..4u8 {
let entry = oversized.get(1, i as u64).await.expect("Failed to get");
assert_eq!(entry.id, i as u64);
}
- // Entry 4 should be gone (corrupted and rewound)
+ // Entry 4 should be gone (its page was corrupted)
assert!(oversized.get(1, 4).await.is_err());
// Should be able to append after recovery
@@ -1423,7 +1436,18 @@ mod tests {
fn test_recovery_glob_synced_but_index_not() {
let executor = deterministic::Runner::default();
executor.start(|context| async move {
- let cfg = test_cfg();
+ // Use page size = entry size so each entry is exactly one page.
+ // This makes truncating by entry count equivalent to truncating by whole pages,
+ // preserving page-level integrity.
+ let cfg = Config {
+ index_partition: "test_index".to_string(),
+ value_partition: "test_values".to_string(),
+ index_buffer_pool: PoolRef::new(NZU16!(TestEntry::SIZE as u16), NZUsize!(8)),
+ index_write_buffer: NZUsize!(1024),
+ value_write_buffer: NZUsize!(1024),
+ compression: None,
+ codec_config: (),
+ };
// Create and populate
let mut oversized: Oversized<_, TestEntry, TestValue> =
@@ -1452,9 +1476,10 @@ mod tests {
.await
.expect("Failed to open blob");
- // Keep only first 2 index entries
- let chunk_size = (TestEntry::SIZE + u32::SIZE) as u64; // entry + CRC32
- blob.resize(2 * chunk_size)
+ // Keep only first 2 index entries (2 full pages)
+ // Physical page size = logical (20) + CRC record (12) = 32 bytes
+ let physical_page_size = (TestEntry::SIZE + 12) as u64;
+ blob.resize(2 * physical_page_size)
.await
.expect("Failed to truncate");
blob.sync().await.expect("Failed to sync");
@@ -1692,7 +1717,18 @@ mod tests {
// Simulates crash where index was rewound but glob wasn't
let executor = deterministic::Runner::default();
executor.start(|context| async move {
- let cfg = test_cfg();
+ // Use page size = entry size so each entry is exactly one page.
+ // This makes truncating by entry count equivalent to truncating by whole pages,
+ // preserving page-level integrity.
+ let cfg = Config {
+ index_partition: "test_index".to_string(),
+ value_partition: "test_values".to_string(),
+ index_buffer_pool: PoolRef::new(NZU16!(TestEntry::SIZE as u16), NZUsize!(8)),
+ index_write_buffer: NZUsize!(1024),
+ value_write_buffer: NZUsize!(1024),
+ compression: None,
+ codec_config: (),
+ };
// Create and populate
let mut oversized: Oversized<_, TestEntry, TestValue> =
@@ -1719,8 +1755,9 @@ mod tests {
.open(&cfg.index_partition, &1u64.to_be_bytes())
.await
.expect("Failed to open blob");
- let chunk_size = (TestEntry::SIZE + u32::SIZE) as u64;
- blob.resize(2 * chunk_size)
+ // Physical page size = logical (20) + CRC record (12) = 32 bytes
+ let physical_page_size = (TestEntry::SIZE + 12) as u64;
+ blob.resize(2 * physical_page_size)
.await
.expect("Failed to truncate");
blob.sync().await.expect("Failed to sync");
@@ -1845,15 +1882,20 @@ mod tests {
// Size 0 - should fail
assert!(oversized.get_value(1, offset, 0).await.is_err());
- // Size < CRC_SIZE (1, 2, 3 bytes) - should fail with BlobInsufficientLength
+ // Size < value size - should fail with codec error, checksum mismatch, or
+ // insufficient length (if size < 4 bytes for checksum)
for size in 1..4u32 {
let result = oversized.get_value(1, offset, size).await;
- assert!(matches!(
- result,
- Err(Error::Runtime(
- commonware_runtime::Error::BlobInsufficientLength
- ))
- ));
+ assert!(
+ matches!(
+ result,
+ Err(Error::Codec(_))
+ | Err(Error::ChecksumMismatch(_, _))
+ | Err(Error::Runtime(_))
+ ),
+ "expected error, got: {:?}",
+ result
+ );
}
oversized.destroy().await.expect("Failed to destroy");
@@ -1877,9 +1919,17 @@ mod tests {
.expect("Failed to append");
oversized.sync(1).await.expect("Failed to sync");
- // Size too small (but >= CRC_SIZE) - checksum mismatch
+ // Size too small - will fail to decode or report a checksum mismatch
+ // (a checksum mismatch can occur because we read the wrong bytes as the checksum)
let result = oversized.get_value(1, offset, correct_size - 1).await;
- assert!(matches!(result, Err(Error::ChecksumMismatch(_, _))));
+ assert!(
+ matches!(
+ result,
+ Err(Error::Codec(_)) | Err(Error::ChecksumMismatch(_, _))
+ ),
+ "expected Codec or ChecksumMismatch error, got: {:?}",
+ result
+ );
oversized.destroy().await.expect("Failed to destroy");
});
@@ -2394,7 +2444,16 @@ mod tests {
// when added to size is detected as invalid during recovery.
let executor = deterministic::Runner::default();
executor.start(|context| async move {
- let cfg = test_cfg();
+ // Use page size = entry size so one entry per page
+ let cfg = Config {
+ index_partition: "test_index".to_string(),
+ value_partition: "test_values".to_string(),
+ index_buffer_pool: PoolRef::new(NZU16!(TestEntry::SIZE as u16), NZUsize!(8)),
+ index_write_buffer: NZUsize!(1024),
+ value_write_buffer: NZUsize!(1024),
+ compression: None,
+ codec_config: (),
+ };
// Create and populate with valid entry
let mut oversized: Oversized<_, TestEntry, TestValue> =
@@ -2411,25 +2470,38 @@ mod tests {
oversized.sync(1).await.expect("Failed to sync");
drop(oversized);
- // Corrupt the index entry to have offset near u64::MAX
- // Entry format: id (8) + value_offset (8) + value_size (4) + CRC32 (4) = 24 bytes
+ // Build a corrupted entry with offset near u64::MAX that would overflow.
+ // We need to write a valid page (with correct page-level CRC) containing
+ // the semantically-invalid entry data.
let (blob, _) = context
.open(&cfg.index_partition, &1u64.to_be_bytes())
.await
.expect("Failed to open blob");
- // Write a corrupted entry with offset = u64::MAX - 10 and size = 100
- // This would overflow when computing offset + size
- let mut corrupted_entry = Vec::new();
- 1u64.write(&mut corrupted_entry); // id
- (u64::MAX - 10).write(&mut corrupted_entry); // value_offset (near max)
- 100u32.write(&mut corrupted_entry); // value_size
- let checksum = crc32fast::hash(&corrupted_entry);
- corrupted_entry.put_u32(checksum);
-
- blob.write_at(corrupted_entry, 0)
- .await
- .expect("Failed to write corrupted entry");
+ // Build entry data: id (8) + value_offset (8) + value_size (4) = 20 bytes
+ let mut entry_data = Vec::new();
+ 1u64.write(&mut entry_data); // id
+ (u64::MAX - 10).write(&mut entry_data); // value_offset (near max)
+ 100u32.write(&mut entry_data); // value_size (offset + size overflows)
+ assert_eq!(entry_data.len(), TestEntry::SIZE);
+
+ // Build page-level CRC record (12 bytes):
+ // len1 (2) + crc1 (4) + len2 (2) + crc2 (4)
+ let crc = crc32fast::hash(&entry_data);
+ let len1 = TestEntry::SIZE as u16;
+ let mut crc_record = Vec::new();
+ crc_record.extend_from_slice(&len1.to_be_bytes()); // len1
+ crc_record.extend_from_slice(&crc.to_be_bytes()); // crc1
+ crc_record.extend_from_slice(&0u16.to_be_bytes()); // len2 (unused)
+ crc_record.extend_from_slice(&0u32.to_be_bytes()); // crc2 (unused)
+ assert_eq!(crc_record.len(), 12);
+
+ // Write the complete physical page: entry_data + crc_record
+ let mut page = entry_data;
+ page.extend_from_slice(&crc_record);
+ blob.write_at(page, 0)
+ .await
+ .expect("Failed to write corrupted page");
blob.sync().await.expect("Failed to sync");
drop(blob);
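
The corrupted-entry tests above hand-assemble a full physical page so the page-level CRC still validates while the entry payload itself is invalid. Here is a small helper capturing that construction; the 12-byte trailer layout (two big-endian (len, crc) pairs) is taken from the test's comments and is not a public buffer-pool API:

```rust
/// Build one physical page for a page-sized payload: the payload followed by
/// the 12-byte CRC record (len1, crc1, len2, crc2), exactly as the test
/// writes it inline. A sketch of the assumed on-disk layout, not a library helper.
fn build_physical_page(payload: &[u8]) -> Vec<u8> {
    let mut page = payload.to_vec();
    let crc = crc32fast::hash(payload);
    page.extend_from_slice(&(payload.len() as u16).to_be_bytes()); // len1
    page.extend_from_slice(&crc.to_be_bytes()); // crc1
    page.extend_from_slice(&0u16.to_be_bytes()); // len2 (unused second slot)
    page.extend_from_slice(&0u32.to_be_bytes()); // crc2 (unused second slot)
    page
}

fn main() {
    let entry = [0u8; 20]; // TestEntry::SIZE in these tests
    let page = build_physical_page(&entry);
    assert_eq!(page.len(), 20 + 12); // logical bytes + CRC record = 32
}
```
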
diff --git a/storage/src/journal/segmented/variable.rs b/storage/src/journal/segmented/variable.rs
index f545f33c9e..2a040540a5 100644
--- a/storage/src/journal/segmented/variable.rs
+++ b/storage/src/journal/segmented/variable.rs
@@ -11,21 +11,13 @@
//! Within a `section`, data is appended as an `item` with the following format:
//!
//! ```text
-//! +---+---+---+---+---+---+---+---+---+---+---+---+
-//! | 0 ~ 4 | ... | 8 | 9 |10 |11 |
-//! +---+---+---+---+---+---+---+---+---+---+---+---+
-//! | Size (varint u32) | Data | C(u32) |
-//! +---+---+---+---+---+---+---+---+---+---+---+---+
-//!
-//! Size = u32 as varint (1 to 5 bytes)
-//! C = CRC32(Size | Data)
+//! +---+---+---+---+---+---+---+---+
+//! | 0 ~ 4 | ... |
+//! +---+---+---+---+---+---+---+---+
+//! | Size (varint u32) | Data |
+//! +---+---+---+---+---+---+---+---+
//! ```
//!
-//! _To ensure data returned by `Journal` is correct, a checksum (CRC32) is stored at the end of
-//! each item. If the checksum of the read data does not match the stored checksum, an error is
-//! returned. This checksum is only verified when data is accessed and not at startup (which would
-//! require reading all data in `Journal`)._
-//!
//! # Open Blobs
//!
//! `Journal` uses 1 `commonware-storage::Blob` per `section` to store data. All `Blobs` in a given
@@ -64,7 +56,7 @@
//! ```rust
//! use commonware_runtime::{Spawner, Runner, deterministic, buffer::PoolRef};
//! use commonware_storage::journal::segmented::variable::{Journal, Config};
-//! use commonware_utils::NZUsize;
+//! use commonware_utils::{NZUsize, NZU16};
//!
//! let executor = deterministic::Runner::default();
//! executor.start(|context| async move {
@@ -73,7 +65,7 @@
//! partition: "partition".to_string(),
//! compression: None,
//! codec_config: (),
-//! buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)),
+//! buffer_pool: PoolRef::new(NZU16!(1024), NZUsize!(10)),
//! write_buffer: NZUsize!(1024 * 1024),
//! }).await.unwrap();
//!
@@ -88,13 +80,13 @@
use super::manager::{AppendFactory, Config as ManagerConfig, Manager};
use crate::journal::Error;
use bytes::{Buf, BufMut};
-use commonware_codec::{varint::UInt, Codec, EncodeSize, FixedSize, ReadExt, Write as CodecWrite};
+use commonware_codec::{varint::UInt, Codec, EncodeSize, ReadExt, Write as CodecWrite};
use commonware_runtime::{
- buffer::{Append, PoolRef, Read},
+ buffer::pool::{Append, PoolRef, Read},
Blob, Error as RError, Metrics, Storage,
};
use futures::stream::{self, Stream, StreamExt};
-use std::{io::Cursor, num::NonZeroUsize};
+use std::{borrow::Cow, io::Cursor, num::NonZeroUsize};
use tracing::{trace, warn};
use zstd::{bulk::compress, decode_all};
@@ -118,10 +110,69 @@ pub struct Config {
pub write_buffer: NonZeroUsize,
}
-/// Minimum size of any item: 1 byte varint (size=0) + 0 bytes data + 4 bytes checksum.
-/// This is also the max varint size for u32, so we can always read this many bytes
-/// at the start of an item to get the complete varint.
-const MIN_ITEM_SIZE: usize = 5;
+/// Maximum size of a varint for u32 (also the minimum useful read size for parsing item headers).
+const MAX_VARINT_SIZE: usize = 5;
+
+/// Decodes a varint length prefix from a buffer.
+/// Returns (item_size, varint_len).
+#[inline]
+fn decode_length_prefix(buf: &[u8]) -> Result<(usize, usize), Error> {
+ let mut cursor = buf;
+ let size = UInt::<u32>::read(&mut cursor)?.0 as usize;
+ let varint_len = buf.len() - cursor.remaining();
+ Ok((size, varint_len))
+}
+
+/// Result of finding an item in a buffer.
+enum Item<'a> {
+ /// All item data is available in the buffer.
+ Complete(&'a [u8]),
+ /// Need to read more bytes. Buffer has been allocated and prefix copied.
+ Incomplete {
+ buffer: Vec<u8>,
+ filled: usize,
+ /// Offset to read remaining bytes from (for offset-based readers).
+ read_offset: u64,
+ },
+}
+
+/// Find an item in a buffer by decoding its length prefix.
+///
+/// Returns (next_offset, size, item).
+fn find_item(buf: &[u8], available: usize, offset: u64) -> Result<(u64, u32, Item<'_>), Error> {
+ let (size, varint_len) = decode_length_prefix(&buf[..available])?;
+ let next_offset = offset
+ .checked_add(varint_len as u64)
+ .ok_or(Error::OffsetOverflow)?
+ .checked_add(size as u64)
+ .ok_or(Error::OffsetOverflow)?;
+ let buffered = available.saturating_sub(varint_len);
+
+ let item = if buffered >= size {
+ Item::Complete(&buf[varint_len..varint_len + size])
+ } else {
+ let mut buffer = vec![0u8; size];
+ buffer[..buffered].copy_from_slice(&buf[varint_len..varint_len + buffered]);
+ Item::Incomplete {
+ buffer,
+ filled: buffered,
+ read_offset: offset + varint_len as u64 + buffered as u64,
+ }
+ };
+
+ Ok((next_offset, size as u32, item))
+}
+
+/// Decode item data with optional decompression.
+fn decode_item<V: Codec>(item_data: &[u8], cfg: &V::Cfg, compressed: bool) -> Result<V, Error> {
+ if compressed {
+ let decompressed =
+ decode_all(Cursor::new(item_data)).map_err(|_| Error::DecompressionFailed)?;
+ V::decode_cfg(decompressed.as_ref(), cfg).map_err(Error::Codec)
+ } else {
+ V::decode_cfg(item_data, cfg).map_err(Error::Codec)
+ }
+}
/// Implementation of `Journal` storage.
pub struct Journal {
@@ -164,108 +215,77 @@ impl Journal {
blob: &Append,
offset: u64,
) -> Result<(u64, u32, V), Error> {
- // Read varint size (max 5 bytes for u32)
- let mut hasher = crc32fast::Hasher::new();
- let varint_buf = blob.read_at(vec![0; MIN_ITEM_SIZE], offset).await?;
- let mut varint = varint_buf.as_ref();
- let size = UInt::<u32>::read(&mut varint).map_err(Error::Codec)?.0 as usize;
- let varint_len = MIN_ITEM_SIZE - varint.remaining();
- hasher.update(&varint_buf.as_ref()[..varint_len]);
- let offset = offset
- .checked_add(varint_len as u64)
- .ok_or(Error::OffsetOverflow)?;
-
- // Read remaining
- let buf_size = size.checked_add(u32::SIZE).ok_or(Error::OffsetOverflow)?;
- let buf = blob.read_at(vec![0u8; buf_size], offset).await?;
- let buf = buf.as_ref();
- let next_offset = offset
- .checked_add(buf_size as u64)
- .ok_or(Error::OffsetOverflow)?;
-
- // Read item
- let item = &buf[..size];
- hasher.update(item);
-
- // Verify integrity
- let checksum = hasher.finalize();
- let stored_checksum = u32::from_be_bytes(buf[size..].try_into().unwrap());
- if checksum != stored_checksum {
- return Err(Error::ChecksumMismatch(stored_checksum, checksum));
- }
-
- // If compression is enabled, decompress the item
- let item = if compressed {
- let decompressed =
- decode_all(Cursor::new(&item)).map_err(|_| Error::DecompressionFailed)?;
- V::decode_cfg(decompressed.as_ref(), cfg).map_err(Error::Codec)?
- } else {
- V::decode_cfg(item, cfg).map_err(Error::Codec)?
+ // Read varint header (max 5 bytes for u32)
+ let buf = vec![0u8; MAX_VARINT_SIZE];
+ let (buf, available) = blob.read_up_to(buf, offset).await?;
+ let (next_offset, size, item) = find_item(buf.as_ref(), available, offset)?;
+
+ // Get item bytes - either from buffer directly or by reading more
+ let item_data: Cow<'_, [u8]> = match item {
+ Item::Complete(data) => Cow::Borrowed(data),
+ Item::Incomplete {
+ mut buffer,
+ filled,
+ read_offset,
+ } => {
+ blob.read_into(&mut buffer[filled..], read_offset).await?;
+ Cow::Owned(buffer)
+ }
};
- // Return item
- Ok((next_offset, size as u32, item))
+ // Decode item (with optional decompression)
+ let decoded = decode_item::<V>(item_data.as_ref(), cfg, compressed)?;
+ Ok((next_offset, size, decoded))
}
/// Helper function to read an item from a [Read].
+ ///
+ /// The `varint_buf` parameter is a reusable buffer for reading the varint header to avoid
+ /// allocating a new buffer for every item.
async fn read_buffered(
- reader: &mut Read>,
+ reader: &mut Read,
offset: u64,
cfg: &V::Cfg,
compressed: bool,
+ varint_buf: &mut Vec<u8>,
) -> Result<(u64, u64, u32, V), Error> {
// If we're not at the right position, seek to it
if reader.position() != offset {
reader.seek_to(offset).map_err(Error::Runtime)?;
}
- // Read varint size (max 5 bytes for u32, and min item size is 5 bytes)
- let mut hasher = crc32fast::Hasher::new();
- let mut varint_buf = [0u8; MIN_ITEM_SIZE];
- reader
- .read_exact(&mut varint_buf, MIN_ITEM_SIZE)
- .await
- .map_err(Error::Runtime)?;
- let mut varint = varint_buf.as_ref();
- let size = UInt::<u32>::read(&mut varint).map_err(Error::Codec)?.0 as usize;
- let varint_len = MIN_ITEM_SIZE - varint.remaining();
- hasher.update(&varint_buf[..varint_len]);
-
- // Read remaining data+checksum (we already have some bytes from the varint read)
- let buf_size = size.checked_add(u32::SIZE).ok_or(Error::OffsetOverflow)?;
- let already_read = MIN_ITEM_SIZE - varint_len;
- let mut buf = vec![0u8; buf_size];
- buf[..already_read].copy_from_slice(&varint_buf[varint_len..]);
- if buf_size > already_read {
- reader
- .read_exact(&mut buf[already_read..], buf_size - already_read)
- .await
- .map_err(Error::Runtime)?;
- }
-
- // Read item
- let item = &buf[..size];
- hasher.update(item);
-
- // Verify integrity
- let checksum = hasher.finalize();
- let stored_checksum = u32::from_be_bytes(buf[size..].try_into().unwrap());
- if checksum != stored_checksum {
- return Err(Error::ChecksumMismatch(stored_checksum, checksum));
- }
-
- // If compression is enabled, decompress the item
- let item = if compressed {
- let decompressed =
- decode_all(Cursor::new(&item)).map_err(|_| Error::DecompressionFailed)?;
- V::decode_cfg(decompressed.as_ref(), cfg).map_err(Error::Codec)?
- } else {
- V::decode_cfg(item, cfg).map_err(Error::Codec)?
+ // Read varint header (max 5 bytes for u32). Reuse the provided buffer.
+ varint_buf.clear();
+ varint_buf.resize(MAX_VARINT_SIZE, 0);
+ let buf = std::mem::take(varint_buf);
+ let (buf, available) = reader.read_up_to(buf).await?;
+ let (next_offset, size, item) = find_item(buf.as_ref(), available, offset)?;
+
+ // Get item bytes - either from buffer directly or by reading more
+ let item_data: Cow<'_, [u8]> = match item {
+ Item::Complete(data) => {
+ // We already have all the data we need, but reader position may be ahead.
+ // Seek to the correct next position.
+ reader.seek_to(next_offset).map_err(Error::Runtime)?;
+ Cow::Borrowed(data)
+ }
+ Item::Incomplete {
+ mut buffer, filled, ..
+ } => {
+ reader
+ .read_exact(&mut buffer[filled..], size as usize - filled)
+ .await
+ .map_err(Error::Runtime)?;
+ Cow::Owned(buffer)
+ }
};
- // Calculate next offset
- let next_offset = reader.position();
- Ok((next_offset, next_offset, size as u32, item))
+ // Decode item (with optional decompression)
+ let decoded = decode_item::<V>(item_data.as_ref(), cfg, compressed)?;
+
+ // Restore the buffer for reuse in the next iteration
+ *varint_buf = buf.into();
+ Ok((next_offset, next_offset, size, decoded))
}
/// Returns an ordered stream of all items in the journal starting with the item at the given
@@ -286,7 +306,7 @@ impl Journal {
mut offset: u64,
buffer: NonZeroUsize,
) -> Result> + '_, Error> {
- // Collect all blobs to replay
+ // Collect all blobs to replay (keeping blob reference for potential resize)
let codec_config = self.codec_config.clone();
let compressed = self.compression.is_some();
let mut blobs = Vec::new();
@@ -295,6 +315,7 @@ impl Journal {
blobs.push((
section,
blob.clone(),
+ blob.as_blob_reader(buffer).await?,
blob_size,
codec_config.clone(),
compressed,
@@ -304,9 +325,7 @@ impl Journal {
// Replay all blobs in order and stream items as they are read (to avoid occupying too much
// memory with buffered data)
Ok(stream::iter(blobs).flat_map(
- move |(section, blob, blob_size, codec_config, compressed)| {
- // Created buffered reader
- let mut reader = Read::new(blob, blob_size, buffer);
+ move |(section, blob, mut reader, blob_size, codec_config, compressed)| {
if section == start_section && offset != 0 {
if let Err(err) = reader.seek_to(offset) {
warn!(section, offset, ?err, "failed to seek to offset");
@@ -317,25 +336,29 @@ impl Journal {
offset = 0;
}
- // Read over the blob
+ // Read over the blob. Include a reusable buffer for varint parsing.
stream::unfold(
(
section,
+ blob,
reader,
offset,
0u64,
blob_size,
codec_config,
compressed,
+ Vec::with_capacity(MAX_VARINT_SIZE),
),
move |(
section,
+ blob,
mut reader,
offset,
valid_size,
blob_size,
codec_config,
compressed,
+ mut varint_buf,
)| async move {
// Check if we are at the end of the blob
if offset >= blob_size {
@@ -343,8 +366,14 @@ impl Journal {
}
// Read an item from the buffer
- match Self::read_buffered(&mut reader, offset, &codec_config, compressed)
- .await
+ match Self::read_buffered(
+ &mut reader,
+ offset,
+ &codec_config,
+ compressed,
+ &mut varint_buf,
+ )
+ .await
{
Ok((next_offset, next_valid_size, size, item)) => {
trace!(blob = section, cursor = offset, "replayed item");
@@ -352,30 +381,17 @@ impl Journal {
Ok((section, offset, size, item)),
(
section,
+ blob,
reader,
next_offset,
next_valid_size,
blob_size,
codec_config,
compressed,
+ varint_buf,
),
))
}
- Err(Error::ChecksumMismatch(expected, found)) => {
- // If we encounter corruption, we prune to the last valid item. This
- // can happen during an unclean file close (where pending data is not
- // fully synced to disk).
- warn!(
- blob = section,
- bad_offset = offset,
- new_size = valid_size,
- expected,
- found,
- "corruption detected: truncating"
- );
- reader.resize(valid_size).await.ok()?;
- None
- }
Err(Error::Runtime(RError::BlobInsufficientLength)) => {
// If we encounter trailing bytes, we prune to the last
// valid item. This can happen during an unclean file close (where
@@ -386,7 +402,7 @@ impl Journal {
new_size = valid_size,
"trailing bytes detected: truncating"
);
- reader.resize(valid_size).await.ok()?;
+ blob.resize(valid_size).await.ok()?;
None
}
Err(err) => {
@@ -397,12 +413,14 @@ impl Journal {
Err(err),
(
section,
+ blob,
reader,
offset,
valid_size,
blob_size,
codec_config,
compressed,
+ varint_buf,
),
))
}
@@ -417,16 +435,8 @@ impl Journal {
/// Appends an item to `Journal` in a given `section`, returning the offset
/// where the item was written and the size of the item (which may now be smaller
/// than the encoded size from the codec, if compression is enabled).
- ///
- /// # Warning
- ///
- /// If there exist trailing bytes in the `Blob` of a particular `section` and
- /// `replay` is not called before this, it is likely that subsequent data added
- /// to the `Blob` will be considered corrupted (as the trailing bytes will fail
- /// the checksum verification). It is recommended to call `replay` before calling
- /// `append` to prevent this.
pub async fn append(&mut self, section: u64, item: V) -> Result<(u64, u32), Error> {
- // Create buffer with item data
+ // Create buffer with item data (no checksum, no alignment)
let (buf, item_len) = if let Some(compression) = self.compression {
// Compressed: encode first, then compress
let encoded = item.encode();
@@ -440,14 +450,11 @@ impl Journal {
let size_len = UInt(item_len_u32).encode_size();
let entry_len = size_len
.checked_add(item_len)
- .and_then(|v| v.checked_add(4))
.ok_or(Error::OffsetOverflow)?;
let mut buf = Vec::with_capacity(entry_len);
UInt(item_len_u32).write(&mut buf);
buf.put_slice(&compressed);
- let checksum = crc32fast::hash(&buf);
- buf.put_u32(checksum);
(buf, item_len)
} else {
@@ -460,14 +467,11 @@ impl Journal {
let size_len = UInt(item_len_u32).encode_size();
let entry_len = size_len
.checked_add(item_len)
- .and_then(|v| v.checked_add(4))
.ok_or(Error::OffsetOverflow)?;
let mut buf = Vec::with_capacity(entry_len);
UInt(item_len_u32).write(&mut buf);
item.write(&mut buf);
- let checksum = crc32fast::hash(&buf);
- buf.put_u32(checksum);
(buf, item_len)
};
@@ -475,11 +479,11 @@ impl Journal {
// Get or create blob
let blob = self.manager.get_or_create(section).await?;
- // Get current position - this is where we'll write
+ // Get current position - this is where we'll write (no alignment)
let offset = blob.size().await;
// Append item to blob
- blob.append(buf).await?;
+ blob.append(&buf).await?;
trace!(blob = section, offset, "appended item");
Ok((offset, item_len as u32))
}
@@ -596,11 +600,12 @@ mod tests {
use super::*;
use bytes::BufMut;
use commonware_macros::test_traced;
- use commonware_runtime::{deterministic, Runner};
- use commonware_utils::NZUsize;
+ use commonware_runtime::{deterministic, Blob, Runner, Storage};
+ use commonware_utils::{NZUsize, NZU16};
use futures::{pin_mut, StreamExt};
+ use std::num::NonZeroU16;
- const PAGE_SIZE: NonZeroUsize = NZUsize!(1024);
+ const PAGE_SIZE: NonZeroU16 = NZU16!(1024);
const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(10);
#[test_traced]
@@ -1110,7 +1115,7 @@ mod tests {
let item_size: u32 = 10; // Size indicates 10 bytes of data
let mut buf = Vec::new();
UInt(item_size).write(&mut buf); // Varint encoding
- let data = [2u8; 5]; // Only 5 bytes, not 10 + 4 (checksum)
+ let data = [2u8; 5]; // Only 5 bytes, not the 10 promised by the size prefix
BufMut::put_slice(&mut buf, &data);
blob.write_at(buf, 0)
.await
@@ -1343,23 +1348,17 @@ mod tests {
}
drop(journal);
- // Verify that only non-corrupted items were replayed
- assert_eq!(items.len(), 3);
+ // Verify that replay stopped once corruption was detected (in the second blob).
+ assert_eq!(items.len(), 1);
assert_eq!(items[0].0, 1);
assert_eq!(items[0].1, 1);
- assert_eq!(items[1].0, data_items[0].0);
- assert_eq!(items[1].1, data_items[0].1);
- assert_eq!(items[2].0, data_items[1].0);
- assert_eq!(items[2].1, data_items[1].1);
- // Confirm blob is expected length
- // entry = 1 (varint for 4) + 4 (data) + 4 (checksum) = 9 bytes
- // Item 2 ends at position 9 + 9 = 18
+ // Confirm second blob was truncated.
let (_, blob_size) = context
.open(&cfg.partition, &2u64.to_be_bytes())
.await
.expect("Failed to open blob");
- assert_eq!(blob_size, 18);
+ assert_eq!(blob_size, 0);
// Attempt to replay journal after truncation
let mut journal = Journal::init(context.clone(), cfg.clone())
@@ -1383,34 +1382,21 @@ mod tests {
}
// Verify that only non-corrupted items were replayed
- assert_eq!(items.len(), 3);
+ assert_eq!(items.len(), 1);
assert_eq!(items[0].0, 1);
assert_eq!(items[0].1, 1);
- assert_eq!(items[1].0, data_items[0].0);
- assert_eq!(items[1].1, data_items[0].1);
- assert_eq!(items[2].0, data_items[1].0);
- assert_eq!(items[2].1, data_items[1].1);
// Append a new item to truncated partition
- let (offset, _) = journal.append(2, 5).await.expect("Failed to append data");
+ let (_offset, _) = journal.append(2, 5).await.expect("Failed to append data");
journal.sync(2).await.expect("Failed to sync blob");
- // Get the new item
- let item = journal.get(2, offset).await.expect("Failed to get item");
+ // Get the new item (offset is 0 since blob was truncated)
+ let item = journal.get(2, 0).await.expect("Failed to get item");
assert_eq!(item, 5);
// Drop the journal (data already synced)
drop(journal);
- // Confirm blob is expected length
- // Items 1 and 2 at positions 0 and 9, item 3 (value 5) at position 18
- // Item 3 = 1 (varint) + 4 (data) + 4 (checksum) = 9 bytes, ends at 27
- let (_, blob_size) = context
- .open(&cfg.partition, &2u64.to_be_bytes())
- .await
- .expect("Failed to open blob");
- assert_eq!(blob_size, 27);
-
// Re-initialize the journal to simulate a restart
let journal = Journal::init(context.clone(), cfg.clone())
.await
@@ -1433,15 +1419,11 @@ mod tests {
}
// Verify that only non-corrupted items were replayed
- assert_eq!(items.len(), 4);
+ assert_eq!(items.len(), 2);
assert_eq!(items[0].0, 1);
assert_eq!(items[0].1, 1);
- assert_eq!(items[1].0, data_items[0].0);
- assert_eq!(items[1].1, data_items[0].1);
- assert_eq!(items[2].0, data_items[1].0);
- assert_eq!(items[2].1, data_items[1].1);
- assert_eq!(items[3].0, 2);
- assert_eq!(items[3].1, 5);
+ assert_eq!(items[1].0, 2);
+ assert_eq!(items[1].1, 5);
});
}
@@ -1624,6 +1606,67 @@ mod tests {
});
}
+ #[test_traced]
+ fn test_journal_small_items() {
+ let executor = deterministic::Runner::default();
+ executor.start(|context| async move {
+ let cfg = Config {
+ partition: "test_partition".into(),
+ compression: None,
+ codec_config: (),
+ buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
+ write_buffer: NZUsize!(1024),
+ };
+
+ let mut journal = Journal::init(context.clone(), cfg.clone())
+ .await
+ .expect("Failed to initialize journal");
+
+ // Append many small (1-byte) items to the same section
+ let num_items = 100;
+ let mut offsets = Vec::new();
+ for i in 0..num_items {
+ let (offset, size) = journal
+ .append(1, i as u8)
+ .await
+ .expect("Failed to append data");
+ assert_eq!(size, 1, "u8 should encode to 1 byte");
+ offsets.push(offset);
+ }
+ journal.sync(1).await.expect("Failed to sync");
+
+ // Read each item back via random access
+ for (i, &offset) in offsets.iter().enumerate() {
+ let item: u8 = journal.get(1, offset).await.expect("Failed to get item");
+ assert_eq!(item, i as u8, "Item mismatch at offset {offset}");
+ }
+
+ // Drop and reopen to test replay
+ drop(journal);
+ let journal = Journal::<_, u8>::init(context, cfg)
+ .await
+ .expect("Failed to re-initialize journal");
+
+ // Replay and verify all items
+ let stream = journal
+ .replay(0, 0, NZUsize!(1024))
+ .await
+ .expect("Failed to setup replay");
+ pin_mut!(stream);
+
+ let mut count = 0;
+ while let Some(result) = stream.next().await {
+ let (section, offset, size, item) = result.expect("Failed to replay item");
+ assert_eq!(section, 1);
+ assert_eq!(offset, offsets[count]);
+ assert_eq!(size, 1);
+ assert_eq!(item, count as u8);
+ count += 1;
+ }
+ assert_eq!(count, num_items, "Should replay all items");
+ });
+ }
+
#[test_traced]
fn test_journal_rewind_many_sections() {
let executor = deterministic::Runner::default();
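
The new `find_item` path in this file splits every read into two steps: decode the varint length prefix from a small header read, then either slice the item out of the bytes already in hand or read the remainder from the computed offset. The following is a standalone illustration of that Complete/Incomplete decision (names and numbers are for illustration only; this is not the library code):

```rust
// Standalone sketch of the decision `find_item` makes after decoding the
// varint length prefix: is the item fully buffered, or must the caller read
// the missing tail starting at `read_offset`?
enum ItemBytes {
    Complete { start: usize, end: usize },
    Incomplete { missing: usize, read_offset: u64 },
}

fn classify(offset: u64, available: usize, varint_len: usize, size: usize) -> ItemBytes {
    let buffered = available.saturating_sub(varint_len);
    if buffered >= size {
        ItemBytes::Complete { start: varint_len, end: varint_len + size }
    } else {
        ItemBytes::Incomplete {
            missing: size - buffered,
            read_offset: offset + (varint_len + buffered) as u64,
        }
    }
}

fn main() {
    // 3-byte item, 1-byte prefix, 5 bytes available: fully buffered.
    assert!(matches!(classify(0, 5, 1, 3), ItemBytes::Complete { start: 1, end: 4 }));
    // 10-byte item, only 4 payload bytes buffered: 6 bytes left to read at offset 5.
    assert!(matches!(
        classify(0, 5, 1, 10),
        ItemBytes::Incomplete { missing: 6, read_offset: 5 }
    ));
}
```
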
diff --git a/storage/src/mmr/journaled.rs b/storage/src/mmr/journaled.rs
index 4ddb5cc82f..eeed74c371 100644
--- a/storage/src/mmr/journaled.rs
+++ b/storage/src/mmr/journaled.rs
@@ -835,14 +835,15 @@ mod tests {
};
use commonware_macros::test_traced;
use commonware_runtime::{buffer::PoolRef, deterministic, Blob as _, Runner};
- use commonware_utils::{hex, NZUsize, NZU64};
+ use commonware_utils::{hex, NZUsize, NZU16, NZU64};
+ use std::num::NonZeroU16;
fn test_digest(v: usize) -> Digest {
Sha256::hash(&v.to_be_bytes())
}
- const PAGE_SIZE: usize = 111;
- const PAGE_CACHE_SIZE: usize = 5;
+ const PAGE_SIZE: NonZeroU16 = NZU16!(111);
+ const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(5);
fn test_config() -> Config {
Config {
@@ -851,7 +852,7 @@ mod tests {
items_per_blob: NZU64!(7),
write_buffer: NZUsize!(1024),
thread_pool: None,
- buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+ buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
}
}
@@ -1134,16 +1135,17 @@ mod tests {
drop(mmr);
// The very last element we added (pos=495) resulted in new parents at positions 496 &
- // 497. Simulate a partial write by corrupting the last parent's checksum by truncating
+ // 497. Simulate a partial write by corrupting the last page's checksum by truncating
// the last blob by a single byte.
let partition: String = "journal_partition".into();
let (blob, len) = context
.open(&partition, &71u64.to_be_bytes())
.await
.expect("Failed to open blob");
- assert_eq!(len, 36); // N+4 = 36 bytes per node, 1 node in the last blob
+ // A full page w/ CRC should have been written on sync.
+ assert_eq!(len, PAGE_SIZE.get() as u64 + 12);
- // truncate the blob by one byte to corrupt the checksum of the last parent node.
+ // truncate the blob by one byte to corrupt the page CRC.
blob.resize(len - 1).await.expect("Failed to corrupt blob");
blob.sync().await.expect("Failed to sync blob");
@@ -1161,33 +1163,6 @@ mod tests {
.await
.unwrap();
assert_eq!(mmr.size(), 498);
- drop(mmr);
-
- // Repeat partial write test though this time truncate the leaf itself not just some
- // parent. The leaf is in the *previous* blob so we'll have to delete the most recent
- // blob, then appropriately truncate the previous one.
- context
- .remove(&partition, Some(&71u64.to_be_bytes()))
- .await
- .expect("Failed to remove blob");
- let (blob, len) = context
- .open(&partition, &70u64.to_be_bytes())
- .await
- .expect("Failed to open blob");
- assert_eq!(len, 36 * 7); // this blob should be full.
-
- // The last leaf should be in slot 5 of this blob, truncate last byte of its checksum.
- blob.resize(36 * 5 + 35)
- .await
- .expect("Failed to corrupt blob");
- blob.sync().await.expect("Failed to sync blob");
-
- let mmr = Mmr::init(context.clone(), &mut hasher, test_config())
- .await
- .unwrap();
- // Since the leaf was corrupted, it should not have been recovered, and the journal's
- // size will be the last-valid size.
- assert_eq!(mmr.size(), 495);
mmr.destroy().await.unwrap();
});
@@ -1464,7 +1439,7 @@ mod tests {
items_per_blob: NZU64!(7),
write_buffer: NZUsize!(1024),
thread_pool: None,
- buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+ buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
},
)
.await
@@ -1515,7 +1490,7 @@ mod tests {
items_per_blob: NZU64!(7),
write_buffer: NZUsize!(1024),
thread_pool: None,
- buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+ buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
},
)
.await
@@ -1540,7 +1515,7 @@ mod tests {
items_per_blob: NZU64!(7),
write_buffer: NZUsize!(1024),
thread_pool: None,
- buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+ buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
},
)
.await
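
The updated MMR assertion relies on the same page-level layout as the journal changes above: a sync flushes whole physical pages, so even a single trailing node produces `PAGE_SIZE + 12` bytes on disk, and chopping one byte invalidates that page's CRC. A quick check of the two expected lengths, under the sizes this test assumes (32-byte nodes, 111-byte pages, 12-byte CRC records):

```rust
fn main() {
    // Assumed sizes from this test; not exported constants.
    let node_size: u64 = 32;
    let page_size: u64 = 111; // PAGE_SIZE
    let crc_record: u64 = 12; // per-page CRC record

    // Old layout: one node in the last blob -> node + 4-byte CRC32 = 36 bytes.
    assert_eq!(node_size + 4, 36);

    // New layout: sync writes a full physical page, so a single node still
    // occupies PAGE_SIZE + 12 = 123 bytes; truncating any byte corrupts the page.
    assert_eq!(page_size + crc_record, 123);
}
```
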
diff --git a/storage/src/qmdb/any/mod.rs b/storage/src/qmdb/any/mod.rs
index d2c5b05273..c682e2c9e1 100644
--- a/storage/src/qmdb/any/mod.rs
+++ b/storage/src/qmdb/any/mod.rs
@@ -155,11 +155,12 @@ pub(crate) mod test {
qmdb::any::{FixedConfig, VariableConfig},
translator::TwoCap,
};
- use commonware_utils::{NZUsize, NZU64};
+ use commonware_utils::{NZUsize, NZU16, NZU64};
+ use std::num::NonZeroU16;
// Janky page & cache sizes to exercise boundary conditions.
- const PAGE_SIZE: usize = 101;
- const PAGE_CACHE_SIZE: usize = 11;
+ const PAGE_SIZE: NonZeroU16 = NZU16!(101);
+ const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(11);
pub(super) fn fixed_db_config(suffix: &str) -> FixedConfig {
FixedConfig {
@@ -172,7 +173,7 @@ pub(crate) mod test {
log_write_buffer: NZUsize!(1024),
translator: TwoCap,
thread_pool: None,
- buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+ buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
}
}
@@ -189,7 +190,7 @@ pub(crate) mod test {
log_codec_config: (),
translator: TwoCap,
thread_pool: None,
- buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+ buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
}
}
diff --git a/storage/src/qmdb/any/ordered/fixed.rs b/storage/src/qmdb/any/ordered/fixed.rs
index 14e10fcc0a..969d2cf577 100644
--- a/storage/src/qmdb/any/ordered/fixed.rs
+++ b/storage/src/qmdb/any/ordered/fixed.rs
@@ -87,13 +87,16 @@ mod test {
deterministic::{self, Context},
Runner as _,
};
- use commonware_utils::{sequence::FixedBytes, NZUsize, NZU64};
+ use commonware_utils::{sequence::FixedBytes, NZUsize, NZU16, NZU64};
use rand::{rngs::StdRng, seq::IteratorRandom, RngCore, SeedableRng};
- use std::collections::{BTreeMap, HashMap};
+ use std::{
+ collections::{BTreeMap, HashMap},
+ num::{NonZeroU16, NonZeroUsize},
+ };
// Janky page & cache sizes to exercise boundary conditions.
- const PAGE_SIZE: usize = 103;
- const PAGE_CACHE_SIZE: usize = 13;
+ const PAGE_SIZE: NonZeroU16 = NZU16!(103);
+ const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(13);
fn any_db_config(suffix: &str) -> Config {
Config {
@@ -106,7 +109,7 @@ mod test {
log_write_buffer: NZUsize!(1024),
translator: TwoCap,
thread_pool: None,
- buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+ buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
}
}
@@ -138,7 +141,7 @@ mod test {
log_write_buffer: NZUsize!(64),
translator: t,
thread_pool: None,
- buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+ buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
}
}
diff --git a/storage/src/qmdb/any/unordered/fixed/mod.rs b/storage/src/qmdb/any/unordered/fixed/mod.rs
index 33e7c05291..a840b74b24 100644
--- a/storage/src/qmdb/any/unordered/fixed/mod.rs
+++ b/storage/src/qmdb/any/unordered/fixed/mod.rs
@@ -92,12 +92,13 @@ pub(super) mod test {
deterministic::{self, Context},
Runner as _,
};
- use commonware_utils::{NZUsize, NZU64};
+ use commonware_utils::{NZUsize, NZU16, NZU64};
use rand::{rngs::StdRng, RngCore, SeedableRng};
+ use std::num::{NonZeroU16, NonZeroUsize};
// Janky page & cache sizes to exercise boundary conditions.
- const PAGE_SIZE: usize = 101;
- const PAGE_CACHE_SIZE: usize = 11;
+ const PAGE_SIZE: NonZeroU16 = NZU16!(101);
+ const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(11);
pub(crate) fn any_db_config(suffix: &str) -> Config {
Config {
@@ -110,7 +111,7 @@ pub(super) mod test {
log_write_buffer: NZUsize!(1024),
translator: TwoCap,
thread_pool: None,
- buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+ buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
}
}
@@ -143,7 +144,7 @@ pub(super) mod test {
log_write_buffer: NZUsize!(64),
translator: TwoCap,
thread_pool: None,
- buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+ buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
}
}
diff --git a/storage/src/qmdb/any/unordered/fixed/sync.rs b/storage/src/qmdb/any/unordered/fixed/sync.rs
index d3462c390c..7f10afc0e2 100644
--- a/storage/src/qmdb/any/unordered/fixed/sync.rs
+++ b/storage/src/qmdb/any/unordered/fixed/sync.rs
@@ -12,7 +12,7 @@ use crate::{
use commonware_codec::CodecFixed;
use commonware_cryptography::{DigestOf, Hasher};
use commonware_runtime::{
- buffer::Append, telemetry::metrics::status::GaugeExt, Blob, Clock, Metrics, Storage,
+ buffer::pool::Append, telemetry::metrics::status::GaugeExt, Blob, Clock, Metrics, Storage,
};
use commonware_utils::Array;
use prometheus_client::metrics::{counter::Counter, gauge::Gauge};
@@ -225,7 +225,13 @@ pub(crate) async fn init_journal_at_size