diff --git a/Cargo.lock b/Cargo.lock index 81a6cd1531..db2dc1e4fb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1461,6 +1461,7 @@ dependencies = [ "commonware-parallel", "commonware-utils", "console-subscriber", + "crc32fast", "criterion", "futures", "getrandom 0.2.16", diff --git a/consensus/fuzz/src/lib.rs b/consensus/fuzz/src/lib.rs index 988491fa6a..409203cc80 100644 --- a/consensus/fuzz/src/lib.rs +++ b/consensus/fuzz/src/lib.rs @@ -28,14 +28,20 @@ use commonware_cryptography::{ }; use commonware_p2p::simulated::{Config as NetworkConfig, Link, Network}; use commonware_runtime::{buffer::PoolRef, deterministic, Clock, Metrics, Runner, Spawner}; -use commonware_utils::{max_faults, NZUsize}; +use commonware_utils::{max_faults, NZUsize, NZU16}; use futures::{channel::mpsc::Receiver, future::join_all, StreamExt}; use rand::{rngs::StdRng, RngCore, SeedableRng}; -use std::{cell::RefCell, num::NonZeroUsize, panic, sync::Arc, time::Duration}; +use std::{ + cell::RefCell, + num::{NonZeroU16, NonZeroUsize}, + panic, + sync::Arc, + time::Duration, +}; pub const EPOCH: u64 = 333; -const PAGE_SIZE: NonZeroUsize = NZUsize!(1024); +const PAGE_SIZE: NonZeroU16 = NZU16!(1024); const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(10); const MIN_REQUIRED_CONTAINERS: u64 = 10; const MAX_REQUIRED_CONTAINERS: u64 = 50; diff --git a/consensus/src/aggregation/mod.rs b/consensus/src/aggregation/mod.rs index 7eacb9a938..7400ba0a1d 100644 --- a/consensus/src/aggregation/mod.rs +++ b/consensus/src/aggregation/mod.rs @@ -103,19 +103,19 @@ mod tests { deterministic::{self, Context}, Clock, Metrics, Quota, Runner, Spawner, }; - use commonware_utils::{test_rng, NZUsize, NonZeroDuration}; + use commonware_utils::{test_rng, NZUsize, NonZeroDuration, NZU16}; use futures::{channel::oneshot, future::join_all}; use rand::{rngs::StdRng, Rng}; use std::{ collections::BTreeMap, - num::{NonZeroU32, NonZeroUsize}, + num::{NonZeroU16, NonZeroU32, NonZeroUsize}, time::Duration, }; use tracing::debug; type Registrations
<P> = BTreeMap<P, (Sender<P>, Receiver<P>
)>; - const PAGE_SIZE: NonZeroUsize = NZUsize!(1024); + const PAGE_SIZE: NonZeroU16 = NZU16!(1024); const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(10); const TEST_QUOTA: Quota = Quota::per_second(NonZeroU32::MAX); const TEST_NAMESPACE: &[u8] = b"my testing namespace"; diff --git a/consensus/src/marshal/mod.rs b/consensus/src/marshal/mod.rs index 3e483a9a39..1de1edb636 100644 --- a/consensus/src/marshal/mod.rs +++ b/consensus/src/marshal/mod.rs @@ -130,7 +130,7 @@ mod tests { }; use commonware_runtime::{buffer::PoolRef, deterministic, Clock, Metrics, Quota, Runner}; use commonware_storage::archive::immutable; - use commonware_utils::{vec::NonEmptyVec, NZUsize, NZU64}; + use commonware_utils::{vec::NonEmptyVec, NZUsize, NZU16, NZU64}; use futures::StreamExt; use rand::{ seq::{IteratorRandom, SliceRandom}, @@ -138,7 +138,7 @@ mod tests { }; use std::{ collections::BTreeMap, - num::{NonZeroU32, NonZeroU64, NonZeroUsize}, + num::{NonZeroU16, NonZeroU32, NonZeroU64, NonZeroUsize}, time::{Duration, Instant}, }; use tracing::info; @@ -150,7 +150,7 @@ mod tests { type S = bls12381_threshold::Scheme; type P = ConstantProvider; - const PAGE_SIZE: NonZeroUsize = NZUsize!(1024); + const PAGE_SIZE: NonZeroU16 = NZU16!(1024); const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(10); const NAMESPACE: &[u8] = b"test"; const NUM_VALIDATORS: u32 = 4; diff --git a/consensus/src/ordered_broadcast/mod.rs b/consensus/src/ordered_broadcast/mod.rs index d65f1bba7a..75dcb5e06f 100644 --- a/consensus/src/ordered_broadcast/mod.rs +++ b/consensus/src/ordered_broadcast/mod.rs @@ -97,16 +97,16 @@ mod tests { deterministic::{self, Context}, Clock, Metrics, Quota, Runner, Spawner, }; - use commonware_utils::NZUsize; + use commonware_utils::{NZUsize, NZU16}; use futures::{channel::oneshot, future::join_all}; use std::{ collections::{BTreeMap, HashMap}, - num::{NonZeroU32, NonZeroUsize}, + num::{NonZeroU16, NonZeroU32, NonZeroUsize}, time::Duration, }; use tracing::debug; - const PAGE_SIZE: NonZeroUsize = NZUsize!(1024); + const PAGE_SIZE: NonZeroU16 = NZU16!(1024); const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(10); const TEST_QUOTA: Quota = Quota::per_second(NonZeroU32::MAX); const TEST_NAMESPACE: &[u8] = b"ordered_broadcast_test"; diff --git a/consensus/src/simplex/actors/voter/mod.rs b/consensus/src/simplex/actors/voter/mod.rs index 7b13cb5293..4ca1bcbd4b 100644 --- a/consensus/src/simplex/actors/voter/mod.rs +++ b/consensus/src/simplex/actors/voter/mod.rs @@ -71,15 +71,15 @@ mod tests { use commonware_macros::{select, test_traced}; use commonware_p2p::simulated::{Config as NConfig, Network}; use commonware_runtime::{deterministic, Clock, Metrics, Quota, Runner}; - use commonware_utils::{quorum, NZUsize}; + use commonware_utils::{quorum, NZUsize, NZU16}; use futures::{channel::mpsc, FutureExt, StreamExt}; use std::{ - num::NonZeroU32, + num::{NonZeroU16, NonZeroU32}, sync::{Arc, Mutex}, time::Duration, }; - const PAGE_SIZE: NonZeroUsize = NZUsize!(1024); + const PAGE_SIZE: NonZeroU16 = NZU16!(1024); const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(10); const TEST_QUOTA: Quota = Quota::per_second(NonZeroU32::MAX); diff --git a/consensus/src/simplex/mod.rs b/consensus/src/simplex/mod.rs index be2cad6b7d..0d2439eac4 100644 --- a/consensus/src/simplex/mod.rs +++ b/consensus/src/simplex/mod.rs @@ -320,20 +320,20 @@ mod tests { use commonware_runtime::{ buffer::PoolRef, deterministic, Clock, Metrics, Quota, Runner, Spawner, }; - use commonware_utils::{max_faults, quorum, test_rng, NZUsize}; + use commonware_utils::{max_faults, 
quorum, test_rng, NZUsize, NZU16}; use engine::Engine; use futures::{future::join_all, StreamExt}; use rand::{rngs::StdRng, Rng as _}; use std::{ collections::{BTreeMap, HashMap}, - num::{NonZeroU32, NonZeroUsize}, + num::{NonZeroU16, NonZeroU32, NonZeroUsize}, sync::{Arc, Mutex}, time::Duration, }; use tracing::{debug, info, warn}; use types::Activity; - const PAGE_SIZE: NonZeroUsize = NZUsize!(1024); + const PAGE_SIZE: NonZeroU16 = NZU16!(1024); const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(10); const TEST_QUOTA: Quota = Quota::per_second(NonZeroU32::MAX); diff --git a/examples/bridge/src/bin/validator.rs b/examples/bridge/src/bin/validator.rs index fb290f00b6..420adce080 100644 --- a/examples/bridge/src/bin/validator.rs +++ b/examples/bridge/src/bin/validator.rs @@ -20,7 +20,7 @@ use commonware_runtime::{ buffer::PoolRef, tokio, Metrics, Network, Quota, RayonPoolSpawner, Runner, }; use commonware_stream::{dial, Config as StreamConfig}; -use commonware_utils::{from_hex, ordered::Set, union, NZUsize, TryCollect, NZU32}; +use commonware_utils::{from_hex, ordered::Set, union, NZUsize, TryCollect, NZU16, NZU32}; use std::{ net::{IpAddr, Ipv4Addr, SocketAddr}, str::FromStr, @@ -264,7 +264,7 @@ fn main() { activity_timeout: ViewDelta::new(10), skip_timeout: ViewDelta::new(5), fetch_concurrent: 32, - buffer_pool: PoolRef::new(NZUsize!(16_384), NZUsize!(10_000)), + buffer_pool: PoolRef::new(NZU16!(16_384), NZUsize!(10_000)), }, ); diff --git a/examples/log/src/main.rs b/examples/log/src/main.rs index db93410646..b61ad3775c 100644 --- a/examples/log/src/main.rs +++ b/examples/log/src/main.rs @@ -55,7 +55,7 @@ use commonware_consensus::{ use commonware_cryptography::{ed25519, Sha256, Signer as _}; use commonware_p2p::{authenticated::discovery, Manager}; use commonware_runtime::{buffer::PoolRef, tokio, Metrics, Quota, Runner}; -use commonware_utils::{ordered::Set, union, NZUsize, TryCollect, NZU32}; +use commonware_utils::{ordered::Set, union, NZUsize, TryCollect, NZU16, NZU32}; use std::{ net::{IpAddr, Ipv4Addr, SocketAddr}, str::FromStr, @@ -223,7 +223,7 @@ fn main() { activity_timeout: ViewDelta::new(10), skip_timeout: ViewDelta::new(5), fetch_concurrent: 32, - buffer_pool: PoolRef::new(NZUsize!(16_384), NZUsize!(10_000)), + buffer_pool: PoolRef::new(NZU16!(16_384), NZUsize!(10_000)), }; let engine = simplex::Engine::new(context.with_label("engine"), cfg); diff --git a/examples/reshare/src/dkg/state.rs b/examples/reshare/src/dkg/state.rs index 447f0b258b..20e1656398 100644 --- a/examples/reshare/src/dkg/state.rs +++ b/examples/reshare/src/dkg/state.rs @@ -30,15 +30,15 @@ use commonware_storage::journal::{ contiguous::variable::{Config as CVConfig, Journal as CVJournal}, segmented::variable::{Config as SVConfig, Journal as SVJournal}, }; -use commonware_utils::{NZUsize, NZU64}; +use commonware_utils::{NZUsize, NZU16, NZU64}; use futures::StreamExt; use std::{ collections::BTreeMap, - num::{NonZeroU32, NonZeroUsize}, + num::{NonZeroU16, NonZeroU32, NonZeroUsize}, }; use tracing::debug; -const PAGE_SIZE: NonZeroUsize = NZUsize!(1 << 12); +const PAGE_SIZE: NonZeroU16 = NZU16!(1 << 12); const POOL_CAPACITY: NonZeroUsize = NZUsize!(1 << 20); const WRITE_BUFFER: NonZeroUsize = NZUsize!(1 << 12); const READ_BUFFER: NonZeroUsize = NZUsize!(1 << 20); diff --git a/examples/reshare/src/engine.rs b/examples/reshare/src/engine.rs index 331f3dc9aa..48edc03173 100644 --- a/examples/reshare/src/engine.rs +++ b/examples/reshare/src/engine.rs @@ -27,10 +27,14 @@ use commonware_runtime::{ buffer::PoolRef, 
spawn_cell, Clock, ContextCell, Handle, Metrics, Network, Spawner, Storage, }; use commonware_storage::archive::immutable; -use commonware_utils::{ordered::Set, union, NZUsize, NZU32, NZU64}; +use commonware_utils::{ordered::Set, union, NZUsize, NZU16, NZU32, NZU64}; use futures::{channel::mpsc, future::try_join_all}; use rand_core::CryptoRngCore; -use std::{marker::PhantomData, num::NonZero, time::Instant}; +use std::{ + marker::PhantomData, + num::{NonZero, NonZeroU16}, + time::Instant, +}; use tracing::{error, info, warn}; const MAILBOX_SIZE: usize = 10; @@ -45,7 +49,7 @@ const FREEZER_VALUE_TARGET_SIZE: u64 = 1024 * 1024 * 1024; // 1GB const FREEZER_VALUE_COMPRESSION: Option = Some(3); const REPLAY_BUFFER: NonZero = NZUsize!(8 * 1024 * 1024); // 8MB const WRITE_BUFFER: NonZero = NZUsize!(1024 * 1024); // 1MB -const BUFFER_POOL_PAGE_SIZE: NonZero = NZUsize!(4_096); // 4KB +const BUFFER_POOL_PAGE_SIZE: NonZeroU16 = NZU16!(4_096); // 4KB const BUFFER_POOL_CAPACITY: NonZero = NZUsize!(8_192); // 32MB const MAX_REPAIR: NonZero = NZUsize!(50); diff --git a/examples/reshare/src/orchestrator/actor.rs b/examples/reshare/src/orchestrator/actor.rs index 689a4edba4..3f4fe9a44b 100644 --- a/examples/reshare/src/orchestrator/actor.rs +++ b/examples/reshare/src/orchestrator/actor.rs @@ -23,7 +23,7 @@ use commonware_parallel::Strategy; use commonware_runtime::{ buffer::PoolRef, spawn_cell, Clock, ContextCell, Handle, Metrics, Network, Spawner, Storage, }; -use commonware_utils::{vec::NonEmptyVec, NZUsize}; +use commonware_utils::{vec::NonEmptyVec, NZUsize, NZU16}; use futures::{channel::mpsc, StreamExt}; use rand_core::CryptoRngCore; use std::{collections::BTreeMap, marker::PhantomData, time::Duration}; @@ -105,7 +105,7 @@ where config: Config, ) -> (Self, Mailbox) { let (sender, mailbox) = mpsc::channel(config.mailbox_size); - let pool_ref = PoolRef::new(NZUsize!(16_384), NZUsize!(10_000)); + let pool_ref = PoolRef::new(NZU16!(16_384), NZUsize!(10_000)); ( Self { diff --git a/examples/sync/src/databases/any.rs b/examples/sync/src/databases/any.rs index fc2bbc8b26..4bd206d74e 100644 --- a/examples/sync/src/databases/any.rs +++ b/examples/sync/src/databases/any.rs @@ -17,7 +17,7 @@ use commonware_storage::{ operation::Committable, }, }; -use commonware_utils::{NZUsize, NZU64}; +use commonware_utils::{NZUsize, NZU16, NZU64}; use std::{future::Future, num::NonZeroU64}; use tracing::error; @@ -39,7 +39,7 @@ pub fn create_config() -> Config { log_write_buffer: NZUsize!(1024), translator: Translator::default(), thread_pool: None, - buffer_pool: buffer::PoolRef::new(NZUsize!(1024), NZUsize!(10)), + buffer_pool: buffer::PoolRef::new(NZU16!(1024), NZUsize!(10)), } } diff --git a/examples/sync/src/databases/immutable.rs b/examples/sync/src/databases/immutable.rs index 245cccebb3..264dc0ff79 100644 --- a/examples/sync/src/databases/immutable.rs +++ b/examples/sync/src/databases/immutable.rs @@ -11,7 +11,7 @@ use commonware_storage::{ Durable, Merkleized, }, }; -use commonware_utils::{NZUsize, NZU64}; +use commonware_utils::{NZUsize, NZU16, NZU64}; use std::{future::Future, num::NonZeroU64}; use tracing::error; @@ -36,7 +36,7 @@ pub fn create_config() -> Config { log_write_buffer: NZUsize!(1024), translator: commonware_storage::translator::EightCap, thread_pool: None, - buffer_pool: commonware_runtime::buffer::PoolRef::new(NZUsize!(1024), NZUsize!(10)), + buffer_pool: commonware_runtime::buffer::PoolRef::new(NZU16!(1024), NZUsize!(10)), } } diff --git a/runtime/Cargo.toml b/runtime/Cargo.toml index 
0d07c2783a..8ec609c8b8 100644 --- a/runtime/Cargo.toml +++ b/runtime/Cargo.toml @@ -23,6 +23,7 @@ commonware-conformance = { workspace = true, optional = true } commonware-macros.workspace = true commonware-parallel = { workspace = true, features = ["std"] } commonware-utils = { workspace = true, features = ["std"] } +crc32fast.workspace = true futures.workspace = true governor.workspace = true io-uring = { workspace = true, optional = true } diff --git a/runtime/conformance.toml b/runtime/conformance.toml index a065ec07fb..3aa9ea75bf 100644 --- a/runtime/conformance.toml +++ b/runtime/conformance.toml @@ -1,3 +1,7 @@ ["commonware_runtime::storage::tests::conformance::CodecConformance
"] n_cases = 65536 hash = "541c356728d47b13f1d3ac800926ef3ae2396c82f5d4e043f5c7641c4c22b4b9" + +["commonware_runtime::utils::buffer::pool::tests::conformance::CodecConformance"] +n_cases = 65536 +hash = "2ca927141b521b7cccc541ec0df8614e418d317fc864ce11f428aefb330cf256" diff --git a/runtime/fuzz/Cargo.toml b/runtime/fuzz/Cargo.toml index 1632bdbb8d..92a133d280 100644 --- a/runtime/fuzz/Cargo.toml +++ b/runtime/fuzz/Cargo.toml @@ -19,3 +19,9 @@ name = "buffer" path = "fuzz_targets/buffer.rs" test = false doc = false + +[[bin]] +name = "blob_integrity" +path = "fuzz_targets/blob_integrity.rs" +test = false +doc = false diff --git a/runtime/fuzz/fuzz_targets/blob_integrity.rs b/runtime/fuzz/fuzz_targets/blob_integrity.rs new file mode 100644 index 0000000000..002a51c01c --- /dev/null +++ b/runtime/fuzz/fuzz_targets/blob_integrity.rs @@ -0,0 +1,249 @@ +//! Fuzz test for blob integrity verification. +//! +//! This test verifies that random bit corruptions in persisted blob data are appropriately +//! detected and gracefully handled by page-oriented blob wrappers. +//! +//! Strategy: +//! 1. Write several pages worth of data to an Append blob +//! 2. Flip a random bit in the underlying blob +//! 3. Attempt to read various ranges: +//! - Reads from uncorrupted pages should succeed with correct data +//! - Reads from corrupted pages should either fail OR return correct data +//! (if the bit flip was in padding/unused bytes) +//! 4. Test both Append.read_at() and as_blob_reader() + +#![no_main] + +use arbitrary::{Arbitrary, Unstructured}; +use commonware_runtime::{ + buffer::pool::{Append, PoolRef}, + deterministic, Blob, Runner, Storage, +}; +use commonware_utils::{NZUsize, NZU16}; +use libfuzzer_sys::fuzz_target; + +/// CRC record size. +const CRC_SIZE: u64 = 12; +/// Buffer capacity for the Append wrapper. +const BUFFER_CAPACITY: usize = 1024; +/// Buffer capacity for the blob reader. +const READER_BUFFER_CAPACITY: usize = 256; +/// Maximum number of read operations to perform. +const MAX_READS: usize = 20; + +#[derive(Debug)] +struct FuzzInput { + /// Seed for deterministic execution. + seed: u64, + /// Logical page size (1-255). + page_size: u8, + /// Pool page cache capacity (1-10). + pool_capacity: u8, + /// Number of pages to write (1-10). + num_pages: u8, + /// Byte offset within the blob to corrupt (will be modulo physical_size). + corrupt_byte_offset: u16, + /// Bit position within the byte to flip (0-7). + corrupt_bit: u8, + /// Read operations to perform after corruption. + reads: Vec, +} + +impl<'a> Arbitrary<'a> for FuzzInput { + fn arbitrary(u: &mut Unstructured<'a>) -> arbitrary::Result { + let num_reads = u.int_in_range(0..=MAX_READS)?; + let reads = (0..num_reads) + .map(|_| ReadOp::arbitrary(u)) + .collect::, _>>()?; + + Ok(FuzzInput { + seed: u.arbitrary()?, + page_size: u.int_in_range(1..=255)?, + pool_capacity: u.int_in_range(1..=10)?, + num_pages: u.int_in_range(1..=10)?, + corrupt_byte_offset: u.arbitrary()?, + corrupt_bit: u.int_in_range(0..=7)?, + reads, + }) + } +} + +#[derive(Debug)] +struct ReadOp { + /// Logical offset to read from. + offset: u16, + /// Number of bytes to read (1-256). + len: u16, + /// Whether to use the Read wrapper (true) or Append.read_at (false). 
+ use_reader: bool, +} + +impl<'a> Arbitrary<'a> for ReadOp { + fn arbitrary(u: &mut Unstructured<'a>) -> arbitrary::Result { + Ok(ReadOp { + offset: u.arbitrary()?, + len: u.int_in_range(1..=256)?, + use_reader: u.arbitrary()?, + }) + } +} + +fn fuzz(input: FuzzInput) { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let page_size = input.page_size as u64; + let physical_page_size = page_size + CRC_SIZE; + let pool_capacity = input.pool_capacity as usize; + let pool_ref = PoolRef::new(NZU16!(page_size as u16), NZUsize!(pool_capacity)); + + // Compute logical size from number of pages. + let logical_size = input.num_pages as u64 * page_size; + + // Generate deterministic data based on seed. + let expected_data: Vec = (0..logical_size) + .map(|i| ((input.seed.wrapping_add(i)) & 0xFF) as u8) + .collect(); + + // Step 1: Write data to the blob. + let (blob, _) = context + .open("test_partition", b"integrity_test") + .await + .expect("cannot open blob"); + + let append = Append::new(blob.clone(), 0, BUFFER_CAPACITY, pool_ref.clone()) + .await + .expect("cannot create append wrapper"); + + append + .append(&expected_data) + .await + .expect("cannot append data"); + append.sync().await.expect("cannot sync"); + drop(append); + + // Step 2: Corrupt a single bit in the blob. + // Calculate physical size: full pages + partial page (if any). + let full_pages = logical_size / page_size; + let partial_bytes = logical_size % page_size; + let physical_size = if partial_bytes > 0 { + (full_pages + 1) * physical_page_size + } else { + full_pages * physical_page_size + }; + let corrupt_offset = (input.corrupt_byte_offset as u64) % physical_size; + let corrupt_bit = input.corrupt_bit; + + // Read the byte, flip the bit, write it back. + let byte_buf = blob + .read_at(vec![0u8; 1], corrupt_offset) + .await + .expect("cannot read byte to corrupt"); + let corrupted_byte = byte_buf.as_ref()[0] ^ (1 << corrupt_bit); + blob.write_at(vec![corrupted_byte], corrupt_offset) + .await + .expect("cannot write corrupted byte"); + blob.sync().await.expect("cannot sync corruption"); + + // Determine which logical page was corrupted. + let corrupted_page = corrupt_offset / physical_page_size; + + // Step 3: Re-open and attempt reads. + let (blob, size) = context + .open("test_partition", b"integrity_test") + .await + .expect("cannot reopen blob"); + + // The append wrapper may truncate if the corruption affected the last page's CRC + // during initialization, so we handle both cases. + let append = match Append::new(blob, size, BUFFER_CAPACITY, pool_ref.clone()).await { + Ok(a) => a, + Err(_) => { + // Corruption was severe enough to fail initialization - this is acceptable. + return; + } + }; + + let reported_size = append.size().await; + + // Step 4: Perform read operations and verify results. + for read_op in &input.reads { + let offset = read_op.offset as u64; + let len = read_op.len as usize; + + // Skip reads that would be entirely out of bounds. + if offset >= reported_size { + continue; + } + + // Clamp length to not exceed reported size. + let len = len.min((reported_size - offset) as usize); + + // Determine which pages this read spans. + let start_page = offset / page_size; + let end_page = (offset + len as u64 - 1) / page_size; + let read_touches_corrupted_page = + start_page <= corrupted_page && corrupted_page <= end_page; + + if read_op.use_reader { + // Use as_blob_reader. 
+ // Note: The Read wrapper buffers multiple pages at once, so corruption on ANY + // page in the buffer can cause a read to fail - not just the page being accessed. + // We can only verify that successful reads return correct data. + let reader_result = append.as_blob_reader(NZUsize!(READER_BUFFER_CAPACITY)).await; + let mut reader = match reader_result { + Ok(r) => r, + Err(_) => continue, // Reader creation failed, skip. + }; + + // Seek to the read offset. + if reader.seek_to(offset).is_err() { + continue; + } + + let mut buf = vec![0u8; len]; + let read_result = reader.read_exact(&mut buf, len).await; + + if let Ok(()) = read_result { + // Read succeeded - data must match expected. + let expected_slice = &expected_data[offset as usize..offset as usize + len]; + assert_eq!( + &buf, expected_slice, + "Read via reader returned wrong data at offset {}, len {}", + offset, len + ); + } + // Read failures are acceptable due to buffering behavior. + } else { + // Use Append.read_at directly. + let buf = vec![0u8; len]; + let read_result = append.read_at(buf, offset).await; + + match read_result { + Ok(buf) => { + // Read succeeded - data must match expected. + let buf: Vec = buf.into(); + let expected_slice = &expected_data[offset as usize..offset as usize + len]; + assert_eq!( + &buf, expected_slice, + "Read via Append returned wrong data at offset {}, len {}", + offset, len + ); + } + Err(_) => { + // Read failed - this is only acceptable if the read touched + // the corrupted page. + assert!( + read_touches_corrupted_page, + "Read via Append failed at offset {}, len {} but didn't touch corrupted page {}", + offset, len, corrupted_page + ); + } + } + } + } + }); +} + +fuzz_target!(|input: FuzzInput| { + fuzz(input); +}); diff --git a/runtime/fuzz/fuzz_targets/buffer.rs b/runtime/fuzz/fuzz_targets/buffer.rs index de596ae15b..6acc67df84 100644 --- a/runtime/fuzz/fuzz_targets/buffer.rs +++ b/runtime/fuzz/fuzz_targets/buffer.rs @@ -2,10 +2,13 @@ use arbitrary::Arbitrary; use commonware_runtime::{ - buffer::{Append, PoolRef, Read, Write}, + buffer::{ + pool::{Append, PoolRef}, + Read, Write, + }, deterministic, Blob, Runner, Storage, }; -use commonware_utils::NZUsize; +use commonware_utils::{NZUsize, NZU16}; use libfuzzer_sys::fuzz_target; const MAX_SIZE: usize = 1024 * 1024; @@ -77,7 +80,9 @@ enum FuzzOperation { offset: u16, }, AppendSize, - AppendCloneBlob, + AppendAsReader { + buffer_size: u16, + }, AppendReadAt { data_size: u16, offset: u16, @@ -148,8 +153,7 @@ fn fuzz(input: FuzzInput) { pool_page_size, pool_capacity, } => { - let buffer_size = NZUsize!((buffer_size as usize).clamp(1, MAX_SIZE)); - let pool_page_size = NZUsize!((pool_page_size as usize).clamp(1, MAX_SIZE)); + let buffer_size = (buffer_size as usize).clamp(0, MAX_SIZE); let pool_capacity = NZUsize!((pool_capacity as usize).clamp(1, MAX_SIZE)); let (blob, _) = context @@ -157,8 +161,14 @@ fn fuzz(input: FuzzInput) { .await .expect("cannot open write blob"); - pool_ref = Some(PoolRef::new(pool_page_size, pool_capacity)); - pool_page_size_ref = Some(pool_page_size); + // Only create a new pool if one doesn't exist. Reusing the same blob with + // a different page size would corrupt reads since page size is embedded + // in the CRC records. 
+ if pool_ref.is_none() { + let pool_page_size = pool_page_size.clamp(1, u16::MAX); + pool_ref = Some(PoolRef::new(NZU16!(pool_page_size), pool_capacity)); + pool_page_size_ref = Some(pool_page_size); + } if let Some(ref pool) = pool_ref { append_buffer = @@ -236,7 +246,7 @@ fn fuzz(input: FuzzInput) { }; let current_size = append.size().await; if current_size.checked_add(data.len() as u64).is_some() { - let _ = append.append(data).await; + let _ = append.append(&data).await; } } } @@ -260,15 +270,13 @@ fn fuzz(input: FuzzInput) { } => { if let Some(ref pool) = pool_ref { let offset = offset as u64; - let data = if data.len() > MAX_SIZE { - &data[..MAX_SIZE] - } else { - &data[..] - }; - if let Some(pool_page_size) = pool_page_size_ref { - let aligned_offset = (offset / pool_page_size.get() as u64) - * pool_page_size.get() as u64; - let _ = pool.cache(blob_id as u64, data, aligned_offset).await; + if data.len() >= pool.page_size() as usize { + let data = &data[..pool.page_size() as usize]; + if let Some(pool_page_size) = pool_page_size_ref { + let aligned_offset = + (offset / pool_page_size as u64) * pool_page_size as u64; + let _ = pool.cache(blob_id as u64, data, aligned_offset).await; + } } } } @@ -320,9 +328,15 @@ fn fuzz(input: FuzzInput) { } } - FuzzOperation::AppendCloneBlob => { + FuzzOperation::AppendAsReader { buffer_size } => { if let Some(ref append) = append_buffer { - let _ = append.clone_blob().await; + let buffer_size = NZUsize!((buffer_size as usize).clamp(1, MAX_SIZE)); + // This fuzzer never corrupts data, so CRC validation in as_blob_reader + // should always succeed. A failure here indicates a bug. + let _ = append + .as_blob_reader(buffer_size) + .await + .expect("Failed to create blob reader"); } } diff --git a/runtime/src/lib.rs b/runtime/src/lib.rs index 115f9ad939..6422f05efb 100644 --- a/runtime/src/lib.rs +++ b/runtime/src/lib.rs @@ -111,8 +111,12 @@ pub enum Error { expected: std::ops::RangeInclusive, found: u16, }, + #[error("invalid or missing checksum")] + InvalidChecksum, #[error("offset overflow")] OffsetOverflow, + #[error("immutable blob")] + ImmutableBlob, #[error("io error: {0}")] Io(#[from] IoError), } diff --git a/runtime/src/utils/buffer/append.rs b/runtime/src/utils/buffer/append.rs deleted file mode 100644 index 27b35d7e17..0000000000 --- a/runtime/src/utils/buffer/append.rs +++ /dev/null @@ -1,545 +0,0 @@ -use crate::{ - buffer::{tip::Buffer, PoolRef}, - Blob, Error, RwLock, -}; -use commonware_utils::{NZUsize, StableBuf}; -use std::{num::NonZeroUsize, sync::Arc}; - -/// A [Blob] wrapper that supports appending new data that is both read and write cached, and -/// provides buffer-pool managed read caching of older data. -/// -/// # Concurrent Access -/// -/// This implementation allows readers to proceed while flush I/O is in progress, as long as they -/// are reading from the write buffer or the pool cache. Readers that need to access data from the -/// underlying blob (cache miss) will wait for any in-progress write to complete. -/// -/// The implementation involves two locks: one for the write buffer (and blob size metadata), and -/// one for the underlying blob itself. To avoid deadlocks, the buffer lock is always acquired -/// before the blob lock. -#[derive(Clone)] -pub struct Append { - /// The underlying blob being wrapped, protected by a lock for I/O coordination. - blob: Arc>, - - /// Unique id assigned by the buffer pool. - id: u64, - - /// Buffer pool to consult for caching. 
- pool_ref: PoolRef, - - /// The buffer containing the data yet to be appended to the tip of the underlying blob, as well - /// as up to the final page_size-1 bytes from the underlying blob (to ensure the buffer's offset - /// is always at a page boundary), paired with the actual size of the underlying blob on disk. - /// - /// # Invariants - /// - /// - The buffer's `offset` into the blob is always page aligned. - /// - The range of bytes in this buffer never overlaps with any page buffered by `pool`. (See - /// the warning in [Self::resize] for one uncommon exception.) - buffer: Arc>, -} - -impl Append { - /// Create a new [Append] of provided `size` using the provided `pool` for read caching, and a - /// write buffer with capacity `buffer_size`. - pub async fn new( - blob: B, - size: u64, - buffer_size: NonZeroUsize, - pool_ref: PoolRef, - ) -> Result { - // Set a floor on the write buffer size to make sure we always write at least 1 page of new - // data with each flush. We multiply page_size by two here since we could be storing up to - // page_size-1 bytes of already written data in the append buffer to maintain page - // alignment. - let mut buffer_size = buffer_size.get(); - buffer_size = buffer_size.max(pool_ref.page_size * 2); - - // Initialize the append buffer to contain the last non-full page of bytes from the blob to - // ensure its offset into the blob is always page aligned. - let leftover_size = size % pool_ref.page_size as u64; - let page_aligned_size = size - leftover_size; - let mut buffer = Buffer::new(page_aligned_size, NZUsize!(buffer_size)); - if leftover_size != 0 { - let page_buf = vec![0; leftover_size as usize]; - let buf = blob.read_at(page_buf, page_aligned_size).await?; - assert!(!buffer.append(buf.as_ref())); - } - - Ok(Self { - blob: Arc::new(RwLock::new(blob)), - id: pool_ref.next_id().await, - pool_ref, - buffer: Arc::new(RwLock::new((buffer, size))), - }) - } - - /// Append all bytes in `buf` to the tip of the blob. - pub async fn append(&self, buf: impl Into + Send) -> Result<(), Error> { - // Prepare `buf` to be written. - let buf = buf.into(); - - // Acquire a write lock on the buffer and blob_size. - let mut guard = self.buffer.write().await; - let (buffer, _) = &mut *guard; - - // Ensure the write doesn't overflow. - buffer - .size() - .checked_add(buf.len() as u64) - .ok_or(Error::OffsetOverflow)?; - - if buffer.append(buf.as_ref()) { - // Buffer is over capacity, flush it to the underlying blob. - return self.flush_internal(guard).await; - } - - Ok(()) - } - - /// Returns the current logical size of the blob including any buffered data. - /// - /// This represents the total size of data that would be present after flushing. - #[allow(clippy::len_without_is_empty)] - pub async fn size(&self) -> u64 { - self.buffer.read().await.0.size() - } - - /// Flush the append buffer to the underlying blob, caching each page worth of written data in - /// the buffer pool. - /// - /// This method acquires the blob write lock before releasing the buffer lock, ensuring readers - /// that need blob access will wait for the write to complete. - async fn flush_internal( - &self, - mut buf_guard: crate::RwLockWriteGuard<'_, (Buffer, u64)>, - ) -> Result<(), Error> { - let (buffer, blob_size) = &mut *buf_guard; - - // Prepare the data to be written. - let Some(buf) = self.prepare_flush_data(buffer, *blob_size).await else { - return Ok(()); - }; - - // Update blob_size *before* releasing the lock. 
We do this optimistically; if the write - // fails below, the program will return an error and likely abort/panic, so maintaining - // exact consistency on error isn't strictly required. - let write_offset = *blob_size; - *blob_size += buf.len() as u64; - - // Acquire blob write lock BEFORE releasing buffer lock. This ensures no reader can access - // the blob until the write completes. - let blob_guard = self.blob.write().await; - - // Release buffer lock, allowing concurrent buffered reads while the write is in progress. - // Any attempts to read from the blob will block until the write completes. - drop(buf_guard); - - // Perform the write while holding only blob lock. - blob_guard.write_at(buf, write_offset).await - } - - /// Prepares data from the buffer to be flushed to the blob. - /// - /// This method: - /// 1. Takes the data from the write buffer. - /// 2. Caches it in the buffer pool. - /// 3. Returns the data to be written and the offset to write it at (if any). - async fn prepare_flush_data(&self, buffer: &mut Buffer, blob_size: u64) -> Option> { - // Take the buffered data, if any. - let (mut buf, offset) = buffer.take()?; - - // Insert the flushed data into the buffer pool. - let remaining = self.pool_ref.cache(self.id, &buf, offset).await; - - // If there's any data left over that doesn't constitute an entire page, re-buffer it into - // the append buffer to maintain its page-boundary alignment. - if remaining != 0 { - buffer.offset -= remaining as u64; - buffer.data.extend_from_slice(&buf[buf.len() - remaining..]) - } - - // Calculate where new data starts in the buffer to skip already-written trailing bytes. - let new_data_start = blob_size.saturating_sub(offset) as usize; - - // Early exit if there's no new data to write. - if new_data_start >= buf.len() { - return None; - } - - if new_data_start > 0 { - buf.drain(0..new_data_start); - } - - // Return the data to write, and the offset where to write it within the blob. - Some(buf) - } - - /// Clones and returns the underlying blob. - pub async fn clone_blob(&self) -> B { - self.blob.read().await.clone() - } -} - -impl Blob for Append { - async fn read_at( - &self, - buf: impl Into + Send, - offset: u64, - ) -> Result { - // Prepare `buf` to capture the read data. - let mut buf = buf.into(); - - // Ensure the read doesn't overflow. - let end_offset = offset - .checked_add(buf.len() as u64) - .ok_or(Error::OffsetOverflow)?; - - // Acquire a read lock on the buffer. - let guard = self.buffer.read().await; - let (buffer, _) = &*guard; - - // If the data required is beyond the size of the blob, return an error. - if end_offset > buffer.size() { - return Err(Error::BlobInsufficientLength); - } - - // Extract any bytes from the buffer that overlap with the requested range. - let remaining = buffer.extract(buf.as_mut(), offset); - - // Release buffer lock before potential I/O. - drop(guard); - - if remaining == 0 { - return Ok(buf); - } - - // Fast path: try to read *only* from pool cache without acquiring blob lock. This allows - // concurrent reads even while a flush is in progress. - let cached = self - .pool_ref - .read_cached(self.id, &mut buf.as_mut()[..remaining], offset) - .await; - - if cached == remaining { - // All bytes found in cache. - return Ok(buf); - } - - // Slow path: cache miss (partial or full), acquire blob read lock to ensure any in-flight - // write completes before we read from the blob. 
- let blob_guard = self.blob.read().await; - - // Read remaining bytes that were not already obtained from the earlier cache read. - let uncached_offset = offset + cached as u64; - let uncached_len = remaining - cached; - self.pool_ref - .read( - &*blob_guard, - self.id, - &mut buf.as_mut()[cached..cached + uncached_len], - uncached_offset, - ) - .await?; - - Ok(buf) - } - - /// This [Blob] trait method is unimplemented by [Append] and unconditionally panics. - async fn write_at(&self, _buf: impl Into + Send, _offset: u64) -> Result<(), Error> { - // TODO(): Extend the buffer pool to - // support arbitrary writes. - unimplemented!("append-only blob type does not support write_at") - } - - async fn sync(&self) -> Result<(), Error> { - // Flush any buffered data. When flush_internal returns, the write_at has completed - // and data is in the OS buffer. - { - let guard = self.buffer.write().await; - self.flush_internal(guard).await?; - } - // Sync the OS buffer to disk. We need the blob read lock here since sync() requires - // access to the blob, but only a read lock since we're not modifying blob state. - self.blob.read().await.sync().await - } - - /// Resize the blob to the provided `size`. - /// - /// # Warning - /// - /// Concurrent readers which try to read past the new size during the resize may error. - async fn resize(&self, size: u64) -> Result<(), Error> { - // Implementation note: rewinding the blob across a page boundary potentially results in - // stale data remaining in the buffer pool's cache. We don't proactively purge the data - // within this function since it would be inaccessible anyway. Instead we ensure it is - // always updated should the blob grow back to the point where we have new data for the same - // page, if any old data hasn't expired naturally by then. - - // Acquire buffer lock first. - // NOTE: We MUST acquire the buffer lock before the blob lock to avoid deadlocks with - // `append`, which acquires buffer then blob (via `flush_internal`). - let mut buf_guard = self.buffer.write().await; - let (buffer, blob_size) = &mut *buf_guard; - - let flush_data = self.prepare_flush_data(buffer, *blob_size).await; - - // Acquire blob write lock to prevent concurrent reads throughout the resize. - let blob_guard = self.blob.write().await; - - // Flush any buffered bytes first, using the helper. - // We hold both locks here, so no concurrent operations can happen. - if let Some(buf) = flush_data { - // Write the data to the blob. - let len = buf.len() as u64; - blob_guard.write_at(buf, *blob_size).await?; - - // Update blob_size to reflect the flush. - *blob_size += len; - } - - // Resize the underlying blob. - blob_guard.resize(size).await?; - - // Update the blob size. - *blob_size = size; - - // Reset the append buffer to the new size, ensuring its page alignment. 
- let leftover_size = size % self.pool_ref.page_size as u64; - buffer.offset = size - leftover_size; // page aligned size - buffer.data.clear(); - if leftover_size != 0 { - let page_buf = vec![0; leftover_size as usize]; - let buf = blob_guard.read_at(page_buf, buffer.offset).await?; - assert!(!buffer.append(buf.as_ref())); - } - - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::{deterministic, Runner, Storage as _}; - use commonware_macros::test_traced; - use commonware_utils::NZUsize; - - const PAGE_SIZE: usize = 1024; - const BUFFER_SIZE: usize = PAGE_SIZE * 2; - - #[test_traced] - #[should_panic(expected = "not implemented")] - fn test_append_blob_write_panics() { - // Initialize the deterministic context - let executor = deterministic::Runner::default(); - // Start the test within the executor - executor.start(|context| async move { - let (blob, size) = context - .open("test", "blob".as_bytes()) - .await - .expect("Failed to open blob"); - let pool_ref = PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(10)); - let blob = Append::new(blob, size, NZUsize!(BUFFER_SIZE), pool_ref.clone()) - .await - .unwrap(); - assert_eq!(blob.size().await, 0); - blob.write_at(vec![0], 0).await.unwrap(); - }); - } - - #[test_traced] - fn test_append_blob_append() { - // Initialize the deterministic context - let executor = deterministic::Runner::default(); - // Start the test within the executor - executor.start(|context| async move { - let (blob, size) = context - .open("test", "blob".as_bytes()) - .await - .expect("Failed to open blob"); - assert_eq!(size, 0); - - // Wrap the blob, then append 11 consecutive pages of data. - let pool_ref = PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(10)); - let blob = Append::new(blob, size, NZUsize!(BUFFER_SIZE), pool_ref.clone()) - .await - .unwrap(); - for i in 0..11 { - let buf = vec![i as u8; PAGE_SIZE]; - blob.append(buf).await.unwrap(); - } - assert_eq!(blob.size().await, 11 * PAGE_SIZE as u64); - - blob.sync().await.expect("Failed to sync blob"); - - // Make sure blob has expected size when reopened. - let (blob, size) = context - .open("test", "blob".as_bytes()) - .await - .expect("Failed to open blob"); - assert_eq!(size, 11 * PAGE_SIZE as u64); - blob.sync().await.expect("Failed to sync blob"); - }); - } - - #[test_traced] - fn test_append_blob_read() { - // Initialize the deterministic context - let executor = deterministic::Runner::default(); - // Start the test within the executor - executor.start(|context| async move { - let (blob, size) = context - .open("test", "blob".as_bytes()) - .await - .expect("Failed to open blob"); - assert_eq!(size, 0); - - let pool_ref = PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(10)); - let blob = Append::new(blob, size, NZUsize!(BUFFER_SIZE), pool_ref.clone()) - .await - .unwrap(); - - // Append one byte & sync to ensure we have "trailing bytes". - blob.append(vec![42]).await.unwrap(); - blob.sync().await.unwrap(); - - // Append 11 consecutive pages of data. - for i in 0..11 { - let buf = vec![i as u8; PAGE_SIZE]; - blob.append(buf).await.unwrap(); - } - - // Read from the blob across a page boundary but well outside any write buffered data. - let mut buf = vec![0; 100]; - buf = blob - .read_at(buf, 1 + PAGE_SIZE as u64 - 50) - .await - .unwrap() - .into(); - let mut expected = vec![0; 50]; - expected.extend_from_slice(&[1; 50]); - assert_eq!(buf, expected); - - // Read from the blob across a page boundary but within the write buffered data. 
- let mut buf = vec![0; 100]; - buf = blob - .read_at(buf, 1 + (PAGE_SIZE as u64 * 10) - 50) - .await - .unwrap() - .into(); - let mut expected = vec![9; 50]; - expected.extend_from_slice(&[10; 50]); - assert_eq!(buf, expected); - - // Read across read-only and write-buffered section, all the way up to the very last - // byte. - let buf_size = PAGE_SIZE * 4; - let mut buf = vec![0; buf_size]; - buf = blob - .read_at(buf, blob.size().await - buf_size as u64) - .await - .unwrap() - .into(); - let mut expected = vec![7; PAGE_SIZE]; - expected.extend_from_slice(&[8; PAGE_SIZE]); - expected.extend_from_slice(&[9; PAGE_SIZE]); - expected.extend_from_slice(&[10; PAGE_SIZE]); - assert_eq!(buf, expected); - - // Exercise more boundary conditions by reading every possible 2-byte slice. - for i in 0..blob.size().await - 1 { - let mut buf = vec![0; 2]; - buf = blob.read_at(buf, i).await.unwrap().into(); - let page_num = (i / PAGE_SIZE as u64) as u8; - if i == 0 { - assert_eq!(buf, &[42, 0]); - } else if i % PAGE_SIZE as u64 == 0 { - assert_eq!(buf, &[page_num - 1, page_num], "i = {i}"); - } else { - assert_eq!(buf, &[page_num; 2], "i = {i}"); - } - } - - // Confirm all bytes are as expected after syncing the blob. - blob.sync().await.unwrap(); - buf = blob.read_at(vec![0], 0).await.unwrap().into(); - assert_eq!(buf, &[42]); - - for i in 0..11 { - let mut buf = vec![0; PAGE_SIZE]; - buf = blob - .read_at(buf, 1 + i * PAGE_SIZE as u64) - .await - .unwrap() - .into(); - assert_eq!(buf, &[i as u8; PAGE_SIZE]); - } - - blob.sync().await.expect("Failed to sync blob"); - }); - } - - #[test_traced] - fn test_append_blob_tracks_physical_size() { - let executor = deterministic::Runner::default(); - executor.start(|context| async move { - let (blob, size) = context - .open("test", "blob".as_bytes()) - .await - .expect("Failed to open blob"); - - let pool_ref = PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(10)); - let blob = Append::new(blob, size, NZUsize!(BUFFER_SIZE), pool_ref.clone()) - .await - .unwrap(); - - // Initially blob_size should be 0. - assert_eq!(blob.buffer.read().await.1, 0); - - // Write 100 bytes and sync. - blob.append(vec![1u8; 100]).await.unwrap(); - blob.sync().await.unwrap(); - assert_eq!(blob.buffer.read().await.1, 100); - - // Append more data but don't sync yet, blob_size shouldn't change. - blob.append(vec![2u8; 200]).await.unwrap(); - assert_eq!(blob.buffer.read().await.1, 100); - - // Force a flush by exceeding buffer. - blob.append(vec![3u8; BUFFER_SIZE]).await.unwrap(); - assert_eq!(blob.buffer.read().await.1, 100 + 200 + BUFFER_SIZE as u64); - - // Test resize down and up. - blob.resize(50).await.unwrap(); - assert_eq!(blob.buffer.read().await.1, 50); - - blob.resize(150).await.unwrap(); - assert_eq!(blob.buffer.read().await.1, 150); - - // Append after resize and sync. - blob.append(vec![4u8; 100]).await.unwrap(); - blob.sync().await.unwrap(); - assert_eq!(blob.buffer.read().await.1, 250); - - // Close and reopen. - let (blob, size) = context - .open("test", "blob".as_bytes()) - .await - .expect("Failed to reopen blob"); - - let blob = Append::new(blob, size, NZUsize!(BUFFER_SIZE), pool_ref.clone()) - .await - .unwrap(); - assert_eq!(blob.buffer.read().await.1, 250); - - // Verify data integrity after all operations. 
- let mut buf = vec![0u8; 250]; - buf = blob.read_at(buf, 0).await.unwrap().into(); - assert_eq!(&buf[0..50], &vec![1u8; 50][..]); - assert_eq!(&buf[50..150], &vec![0u8; 100][..]); // Zeros from resize up to 150 - assert_eq!(&buf[150..250], &vec![4u8; 100][..]); - }); - } -} diff --git a/runtime/src/utils/buffer/mod.rs b/runtime/src/utils/buffer/mod.rs index 87083648b8..1f35507ed7 100644 --- a/runtime/src/utils/buffer/mod.rs +++ b/runtime/src/utils/buffer/mod.rs @@ -1,13 +1,11 @@ //! Buffers for reading and writing to [crate::Blob]s. -mod append; pub mod pool; mod read; mod tip; mod write; -pub use append::Append; -pub use pool::{Pool, PoolRef}; +pub use pool::PoolRef; pub use read::Read; pub use write::Write; diff --git a/runtime/src/utils/buffer/pool/append.rs b/runtime/src/utils/buffer/pool/append.rs new file mode 100644 index 0000000000..ab1ec84687 --- /dev/null +++ b/runtime/src/utils/buffer/pool/append.rs @@ -0,0 +1,2086 @@ +//! The [Append] wrapper consists of a [Blob] and a write buffer, and provides a logical view over +//! the underlying blob which has a page-oriented structure that provides integrity guarantees. The +//! wrapper also provides read caching managed by a buffer pool. +//! +//! # Warning +//! +//! Writing new data to the blob can only be done through `append`. The `write` function is not +//! supported and will panic. +//! +//! # Immutability +//! +//! The wrapper can be created in (or converted to) an immutable state, which will prevent any +//! modifications while still supporting cached reads. This can be used to reduce its memory +//! footprint and/or to prevent unintended modifications. +//! +//! # Recovery +//! +//! On `sync`, this wrapper will durably write buffered data to the underlying blob in pages. All +//! pages have a [Checksum] at the end. If no CRC record existed before for the page being written, +//! then one of the checksums will be all zero. If a checksum already existed for the page being +//! written, then the write will overwrite only the checksum with the lesser length value. Should +//! this write fail, the previously committed page state can still be recovered. +//! +//! During non-immutable blob initialization, the wrapper will back up over any page that is not +//! accompanied by a valid CRC, treating it as the result of an incomplete write that may be +//! invalid. Immutable blob initialization will fail if any trailing data is detected that cannot be +//! validated by a CRC. + +use crate::{ + buffer::{ + pool::{Checksum, PoolRef, Read, CHECKSUM_SIZE}, + tip::Buffer, + }, + Blob, Error, RwLock, RwLockWriteGuard, +}; +use commonware_utils::StableBuf; +use std::{num::NonZeroUsize, sync::Arc}; +use tracing::warn; + +/// Indicates which CRC slot in a page record must not be overwritten. +#[derive(Clone, Copy)] +enum ProtectedCrc { + First, + Second, +} + +/// Describes the state of the underlying blob with respect to the buffer. +#[derive(Clone)] +struct BlobState { + blob: B, + + /// The page where the next appended byte will be written to. + current_page: u64, + + /// The state of the partial page in the blob. If it was written due to a sync call, then this + /// will contain its CRC record. + partial_page_state: Option, +} + +/// A [Blob] wrapper that supports write-cached appending of data, with checksums for data integrity +/// and buffer pool managed caching. +#[derive(Clone)] +pub struct Append { + /// The underlying blob being wrapped. + blob_state: Arc>>, + + /// Unique id assigned to this blob by the buffer pool. 
+ id: u64, + + /// A reference to the buffer pool that manages read caching for this blob. + pool_ref: PoolRef, + + /// The write buffer containing any logical bytes following the last full page boundary in the + /// underlying blob. + buffer: Arc>, +} + +/// Returns the capacity with a floor applied to ensure it can hold at least one full page of new +/// data even when caching a nearly-full page of already written data. +fn capacity_with_floor(capacity: usize, page_size: u64) -> usize { + let floor = page_size as usize * 2; + if capacity < floor { + warn!( + floor, + "requested buffer capacity is too low, increasing it to floor" + ); + floor + } else { + capacity + } +} + +impl Append { + /// Create a new [Append] wrapper of the provided `blob` that is known to have `blob_size` + /// underlying physical bytes, using the provided `pool` for read caching, and a write buffer + /// with capacity `capacity`. Rewinds the blob if necessary to ensure it only contains + /// checksum-validated data. + pub async fn new( + blob: B, + original_blob_size: u64, + capacity: usize, + pool_ref: PoolRef, + ) -> Result { + let (partial_page_state, pages, invalid_data_found) = + Self::read_last_valid_page(&blob, original_blob_size, pool_ref.page_size()).await?; + if invalid_data_found { + // Invalid data was detected, trim it from the blob. + let new_blob_size = pages * (pool_ref.page_size() + CHECKSUM_SIZE); + warn!( + original_blob_size, + new_blob_size, "truncating blob to remove invalid data" + ); + blob.resize(new_blob_size).await?; + blob.sync().await?; + } + + let capacity = capacity_with_floor(capacity, pool_ref.page_size()); + + let (blob_state, data) = match partial_page_state { + Some((mut partial_page, crc_record)) => { + // A partial page exists, make sure we buffer it. + partial_page.reserve(capacity - partial_page.len()); + ( + BlobState { + blob, + current_page: pages - 1, + partial_page_state: Some(crc_record), + }, + partial_page, + ) + } + None => ( + BlobState { + blob, + current_page: pages, + partial_page_state: None, + }, + Vec::with_capacity(capacity), + ), + }; + + let buffer = Buffer { + offset: blob_state.current_page * pool_ref.page_size(), + data, + capacity, + immutable: false, + }; + + Ok(Self { + blob_state: Arc::new(RwLock::new(blob_state)), + id: pool_ref.next_id().await, + pool_ref, + buffer: Arc::new(RwLock::new(buffer)), + }) + } + + /// Return a new [Append] wrapper of the provided `blob` that is known to have `blob_size` + /// underlying physical bytes, using the provided `pool` for read caching. The wrapper is for + /// read-only data, and any append attempts will return error. The provided `capacity` is used + /// only if the blob is later turned into a mutable one. Immutable blobs are assumed consistent + /// on disk, so any CRC verification failure results in an error without any recovery attempt. + pub async fn new_immutable( + blob: B, + blob_size: u64, + capacity: usize, + pool_ref: PoolRef, + ) -> Result { + let (partial_page_state, pages, invalid_data_found) = + Self::read_last_valid_page(&blob, blob_size, pool_ref.page_size()).await?; + if invalid_data_found { + // Invalid data was detected, so this blob is not consistent. + return Err(Error::InvalidChecksum); + } + + let capacity = capacity_with_floor(capacity, pool_ref.page_size()); + + let (blob_state, data) = match partial_page_state { + Some((mut partial_page, crc_record)) => { + // A partial page exists, so put it in the buffer. 
+ partial_page.shrink_to_fit(); + ( + BlobState { + blob, + current_page: pages - 1, + partial_page_state: Some(crc_record), + }, + partial_page, + ) + } + None => ( + BlobState { + blob, + current_page: pages, + partial_page_state: None, + }, + vec![], + ), + }; + let buffer = Buffer { + data, + capacity, + offset: blob_state.current_page * pool_ref.page_size(), + immutable: true, + }; + + Ok(Self { + blob_state: Arc::new(RwLock::new(blob_state)), + id: pool_ref.next_id().await, + pool_ref, + buffer: Arc::new(RwLock::new(buffer)), + }) + } + + /// Returns `true` if this blob is in the immutable state. + pub async fn is_immutable(&self) -> bool { + let buffer = self.buffer.read().await; + + buffer.immutable + } + + /// Convert this blob to the immutable state if it's not already in it. + /// + /// If there is unwritten data in the buffer, it will be flushed and synced before returning. + pub async fn to_immutable(&self) -> Result<(), Error> { + // Flush any buffered data. When flush_internal returns, write_at has completed and data + // has been written to the underlying blob. + let mut buf_guard = self.buffer.write().await; + if buf_guard.immutable { + return Ok(()); + } + buf_guard.immutable = true; + self.flush_internal(buf_guard, true).await?; + + // Shrink the buffer capacity to minimum since we won't be adding to it. This requires + // re-acquiring the write lock. + { + let mut buf_guard = self.buffer.write().await; + buf_guard.data.shrink_to_fit(); + } + + // Sync the underlying blob to ensure new_immutable on restart will succeed even in the + // event of a crash. + let blob_state = self.blob_state.read().await; + blob_state.blob.sync().await + } + + /// Convert this blob to the mutable state if it's not already in it. + pub async fn to_mutable(&self) { + let mut buffer = self.buffer.write().await; + if !buffer.immutable { + return; + } + buffer.immutable = false; + } + + /// Scans backwards from the end of the blob, stopping when it finds a valid page. + /// + /// # Returns + /// + /// A tuple of `(partial_page, page_count, invalid_data_found)`: + /// + /// - `partial_page`: If the last valid page is partial (contains fewer than `page_size` logical + /// bytes), returns `Some((data, crc_record))` containing the logical data and its CRC record. + /// Returns `None` if the last valid page is full or if no valid pages exist. + /// + /// - `page_count`: The number of pages in the blob up to and including the last valid page + /// found (whether or not it's partial). Note that it's possible earlier pages may be invalid + /// since this function stops scanning when it finds one valid page. + /// + /// - `invalid_data_found`: `true` if there are any bytes in the blob that follow the last valid + /// page. Typically the blob should be resized to eliminate them since their integrity cannot + /// be guaranteed. + async fn read_last_valid_page( + blob: &B, + blob_size: u64, + page_size: u64, + ) -> Result<(Option<(Vec, Checksum)>, u64, bool), Error> { + let physical_page_size = page_size + CHECKSUM_SIZE; + let partial_bytes = blob_size % physical_page_size; + let mut last_page_end = blob_size - partial_bytes; + + // If the last physical page in the blob is truncated, it can't have a valid CRC record and + // must be invalid. + let mut invalid_data_found = partial_bytes != 0; + + while last_page_end != 0 { + // Read the last page and parse its CRC record. 
+ let page_start = last_page_end - physical_page_size; + let buf = vec![0; physical_page_size as usize]; + let buf = blob.read_at(buf, page_start).await?; + + match Checksum::validate_page(buf.as_ref()) { + Some(crc_record) => { + // Found a valid page. + let (len, _) = crc_record.get_crc(); + let len = len as u64; + if len != page_size { + // The page is partial (logical data doesn't fill the page). + let buf: Vec = buf.into(); + let logical_bytes = buf[..(len as usize)].to_vec(); + return Ok(( + Some((logical_bytes, crc_record)), + last_page_end / physical_page_size, + invalid_data_found, + )); + } + // The page is full. + return Ok((None, last_page_end / physical_page_size, invalid_data_found)); + } + None => { + // The page is invalid. + last_page_end = page_start; + invalid_data_found = true; + } + } + } + + // No valid page exists in the blob. + Ok((None, 0, invalid_data_found)) + } + + /// Append all bytes in `buf` to the tip of the blob. + /// + /// # Errors + /// + /// * `Error::ImmutableBlob` - The blob is in the immutable state. + pub async fn append(&self, buf: &[u8]) -> Result<(), Error> { + let mut buffer = self.buffer.write().await; + if buffer.immutable { + return Err(Error::ImmutableBlob); + } + + if !buffer.append(buf) { + return Ok(()); + } + + // Buffer is over capacity, so we need to write data to the blob. + self.flush_internal(buffer, false).await + } + + /// Flush all full pages from the buffer to disk, resetting the buffer to contain only the bytes + /// in any final partial page. If `write_partial_page` is true, the partial page will be written + /// to the blob as well along with a CRC record. + async fn flush_internal( + &self, + mut buf_guard: RwLockWriteGuard<'_, Buffer>, + write_partial_page: bool, + ) -> Result<(), Error> { + let buffer = &mut *buf_guard; + + // Cache the pages we are writing in the buffer pool so they remain cached for concurrent + // reads while we flush the buffer. + let remaining_byte_count = self + .pool_ref + .cache(self.id, &buffer.data, buffer.offset) + .await; + + // Read the old partial page state before doing the heavy work of preparing physical pages. + // This is safe because partial_page_state is only modified by flush_internal, and we hold + // the buffer write lock which prevents concurrent flushes. + let old_partial_page_state = { + let blob_state = self.blob_state.read().await; + blob_state.partial_page_state.clone() + }; + + // Prepare the *physical* pages corresponding to the data in the buffer. + // Pass the old partial page state so the CRC record is constructed correctly. + let (physical_pages, partial_page_state) = self.to_physical_pages( + &*buffer, + write_partial_page, + old_partial_page_state.as_ref(), + ); + + // If there's nothing to write, return early. + if physical_pages.is_empty() { + return Ok(()); + } + + // Drain the provided buffer of the full pages that are now cached in the buffer pool and + // will be written to the blob. + let bytes_to_drain = buffer.data.len() - remaining_byte_count; + buffer.data.drain(0..bytes_to_drain); + buffer.offset += bytes_to_drain as u64; + let new_offset = buffer.offset; + + // Acquire a write lock on the blob state so nobody tries to read or modify the blob while + // we're writing to it. + let mut blob_state = self.blob_state.write().await; + + // Release the buffer lock to allow for concurrent reads & buffered writes while we write + // the physical pages. 
+ drop(buf_guard); + + let logical_page_size = self.pool_ref.page_size() as usize; + let physical_page_size = logical_page_size + CHECKSUM_SIZE as usize; + let write_at_offset = blob_state.current_page * physical_page_size as u64; + + // Count only FULL pages for advancing current_page. A partial page (if included) takes + // up a full physical page on disk, but it's not complete - the next byte still goes to + // that same logical page. + let total_pages_in_buffer = physical_pages.len() / physical_page_size; + let full_pages_written = if partial_page_state.is_some() { + total_pages_in_buffer.saturating_sub(1) + } else { + total_pages_in_buffer + }; + + // Identify protected regions based on the OLD partial page state + let protected_regions = Self::identify_protected_regions(old_partial_page_state.as_ref()); + + // Update state before writing. This may appear to risk data loss if writes fail, + // but write failures are fatal per this codebase's design - callers must not use + // the blob after any mutable method returns an error. + blob_state.current_page += full_pages_written as u64; + blob_state.partial_page_state = partial_page_state; + + // Make sure the buffer offset and underlying blob agree on the state of the tip. + assert_eq!( + blob_state.current_page * self.pool_ref.page_size(), + new_offset + ); + + // Write the physical pages to the blob. + // If there are protected regions in the first page, we need to write around them. + if let Some((prefix_len, protected_crc)) = protected_regions { + match protected_crc { + ProtectedCrc::First => { + // Protected CRC is first: [page_size..page_size+6] + // Write 1: New data in first page [prefix_len..page_size] + if prefix_len < logical_page_size { + blob_state + .blob + .write_at( + physical_pages[prefix_len..logical_page_size].to_vec(), + write_at_offset + prefix_len as u64, + ) + .await?; + } + // Write 2: Second CRC of first page + all remaining pages [page_size+6..end] + let second_crc_start = logical_page_size + 6; + blob_state + .blob + .write_at( + physical_pages[second_crc_start..].to_vec(), + write_at_offset + second_crc_start as u64, + ) + .await?; + } + ProtectedCrc::Second => { + // Protected CRC is second: [page_size+6..page_size+12] + // Write 1: New data + first CRC of first page [prefix_len..page_size+6] + let first_crc_end = logical_page_size + 6; + if prefix_len < first_crc_end { + blob_state + .blob + .write_at( + physical_pages[prefix_len..first_crc_end].to_vec(), + write_at_offset + prefix_len as u64, + ) + .await?; + } + // Write 2: All remaining pages (if any) [physical_page_size..end] + if physical_pages.len() > physical_page_size { + blob_state + .blob + .write_at( + physical_pages[physical_page_size..].to_vec(), + write_at_offset + physical_page_size as u64, + ) + .await?; + } + } + } + } else { + // No protected regions, write everything in one operation + blob_state + .blob + .write_at(physical_pages, write_at_offset) + .await?; + } + + Ok(()) + } + + /// Returns the logical size of the blob. This accounts for both written and buffered data. + pub async fn size(&self) -> u64 { + let buffer = self.buffer.read().await; + buffer.size() + } + + /// Reads up to `buf.len()` bytes starting at `logical_offset`, but only as many as are + /// available. + /// + /// This is useful for reading variable-length prefixes (like varints) where you want to read + /// up to a maximum number of bytes but the actual data might be shorter. + /// + /// Returns the number of bytes actually read into the buffer. 
Returns an error if no bytes + /// are available at the given offset. + pub async fn read_up_to( + &self, + buf: impl Into + Send, + logical_offset: u64, + ) -> Result<(StableBuf, usize), Error> { + let mut buf = buf.into(); + if buf.is_empty() { + return Ok((buf, 0)); + } + let blob_size = self.size().await; + let available = (blob_size.saturating_sub(logical_offset) as usize).min(buf.len()); + if available == 0 { + return Err(Error::BlobInsufficientLength); + } + if buf.len() > available { + buf.truncate(available); + } + self.read_into(buf.as_mut(), logical_offset).await?; + + Ok((buf, available)) + } + + /// Reads bytes starting at `logical_offset` into `buf`. + /// + /// This method allows reading directly into a mutable slice without taking ownership of the + /// buffer or requiring a specific buffer type. + pub async fn read_into(&self, buf: &mut [u8], logical_offset: u64) -> Result<(), Error> { + // Ensure the read doesn't overflow. + let end_offset = logical_offset + .checked_add(buf.len() as u64) + .ok_or(Error::OffsetOverflow)?; + + // Acquire a read lock on the buffer. + let buffer = self.buffer.read().await; + + // If the data required is beyond the size of the blob, return an error. + if end_offset > buffer.size() { + return Err(Error::BlobInsufficientLength); + } + + // Extract any bytes from the buffer that overlap with the requested range. + let remaining = buffer.extract(buf.as_mut(), logical_offset); + + // Release buffer lock before potential I/O. + drop(buffer); + + if remaining == 0 { + return Ok(()); + } + + // Fast path: try to read *only* from pool cache without acquiring blob lock. This allows + // concurrent reads even while a flush is in progress. + let cached = self + .pool_ref + .read_cached(self.id, &mut buf[..remaining], logical_offset) + .await; + + if cached == remaining { + // All bytes found in cache. + return Ok(()); + } + + // Slow path: cache miss (partial or full), acquire blob read lock to ensure any in-flight + // write completes before we read from the blob. + let blob_guard = self.blob_state.read().await; + + // Read remaining bytes that were not already obtained from the earlier cache read. + let uncached_offset = logical_offset + cached as u64; + let uncached_len = remaining - cached; + self.pool_ref + .read( + &blob_guard.blob, + self.id, + &mut buf[cached..cached + uncached_len], + uncached_offset, + ) + .await + } + + /// Returns the protected region info for a partial page, if any. + /// + /// # Returns + /// + /// `None` if there's no existing partial page. + /// + /// `Some((prefix_len, protected_crc))` where: + /// - `prefix_len`: bytes `[0..prefix_len]` were already written and can be substituted with + /// zeros (skip writing) + /// - `protected_crc`: which CRC slot must not be overwritten + fn identify_protected_regions( + partial_page_state: Option<&Checksum>, + ) -> Option<(usize, ProtectedCrc)> { + let crc_record = partial_page_state?; + let (old_len, _) = crc_record.get_crc(); + // The protected CRC is the one with the larger (authoritative) length. + let protected_crc = if crc_record.len1 >= crc_record.len2 { + ProtectedCrc::First + } else { + ProtectedCrc::Second + }; + Some((old_len as usize, protected_crc)) + } + + /// Prepare a buffer containing the result of converting each buffered logical page in the input + /// into a physical page (meaning each page has a CRC record). If the last page is not yet full, + /// it will be included only if `include_partial_page` is true. 
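+    ///
+    /// Illustrative shape (assumed sizes, not enforced here): with a 4-byte logical
+    /// page, a 10-byte buffer becomes two 16-byte physical pages plus, when
+    /// `include_partial_page` is true, a third padded 16-byte page carrying the final
+    /// 2 bytes and a CRC record whose authoritative length is 2.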
+ /// + /// # Arguments + /// + /// * `buffer` - The buffer containing logical page data + /// * `include_partial_page` - Whether to include a partial page if one exists + /// * `old_crc_record` - The CRC record from a previously committed partial page, if any. + /// When present, the first page's CRC record will preserve the old CRC in its original slot + /// and place the new CRC in the other slot. + fn to_physical_pages( + &self, + buffer: &Buffer, + include_partial_page: bool, + old_crc_record: Option<&Checksum>, + ) -> (Vec, Option) { + let logical_page_size = self.pool_ref.page_size() as usize; + let physical_page_size = logical_page_size + CHECKSUM_SIZE as usize; + let pages_to_write = buffer.data.len() / logical_page_size; + let mut write_buffer = Vec::with_capacity(pages_to_write * physical_page_size); + + // For each logical page, copy over the data and then write a crc record for it. + for page in 0..pages_to_write { + let start_read_idx = page * logical_page_size; + let end_read_idx = start_read_idx + logical_page_size; + let logical_page = &buffer.data[start_read_idx..end_read_idx]; + write_buffer.extend_from_slice(logical_page); + + let crc = crc32fast::hash(logical_page); + let logical_page_size_u16 = + u16::try_from(logical_page_size).expect("page size must fit in u16 for CRC record"); + + // For the first page, if there's an old partial page CRC, construct the record + // to preserve the old CRC in its original slot. + let crc_record = if let (0, Some(old_crc)) = (page, old_crc_record) { + Self::build_crc_record_preserving_old(logical_page_size_u16, crc, old_crc) + } else { + Checksum::new(logical_page_size_u16, crc) + }; + write_buffer.extend_from_slice(&crc_record.to_bytes()); + } + + if !include_partial_page { + return (write_buffer, None); + } + + let partial_page = &buffer.data[pages_to_write * logical_page_size..]; + if partial_page.is_empty() { + // No partial page data to write. + return (write_buffer, None); + } + + // If there are no full pages and the partial page length matches what was already + // written, there's nothing new to write. + if pages_to_write == 0 { + if let Some(old_crc) = old_crc_record { + let (old_len, _) = old_crc.get_crc(); + if partial_page.len() == old_len as usize { + return (write_buffer, None); + } + } + } + write_buffer.extend_from_slice(partial_page); + let partial_len = partial_page.len(); + let crc = crc32fast::hash(partial_page); + + // Pad with zeros to fill up to logical_page_size. + write_buffer.resize(write_buffer.len() + (logical_page_size - partial_len), 0); + + // For partial pages: if this is the first page and there's an old CRC, preserve it. + // Otherwise just use the new CRC in slot 0. + let crc_record = if let (0, Some(old_crc)) = (pages_to_write, old_crc_record) { + Self::build_crc_record_preserving_old(partial_len as u16, crc, old_crc) + } else { + Checksum::new(partial_len as u16, crc) + }; + + write_buffer.extend_from_slice(&crc_record.to_bytes()); + + // Return the CRC record that matches what we wrote to disk, so that future flushes + // correctly identify which slot is protected. + (write_buffer, Some(crc_record)) + } + + /// Build a CRC record that preserves the old CRC in its original slot and places + /// the new CRC in the other slot. + const fn build_crc_record_preserving_old( + new_len: u16, + new_crc: u32, + old_crc: &Checksum, + ) -> Checksum { + let (old_len, old_crc_val) = old_crc.get_crc(); + // The old CRC is in the slot with the larger length value (first slot wins ties). 
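+        // Worked example (values assumed for illustration): extending a partial page
+        // from 10 to 30 bytes with an old record of { len1: 10, len2: 0 } keeps slot 0
+        // intact and writes the new 30-byte CRC into slot 1; a later extension to 50
+        // bytes then flips back, preserving slot 1 (len 30) and rewriting slot 0.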
+ if old_crc.len1 >= old_crc.len2 { + // Old CRC is in slot 0, put new CRC in slot 1 + Checksum { + len1: old_len, + crc1: old_crc_val, + len2: new_len, + crc2: new_crc, + } + } else { + // Old CRC is in slot 1, put new CRC in slot 0 + Checksum { + len1: new_len, + crc1: new_crc, + len2: old_len, + crc2: old_crc_val, + } + } + } + + /// Flushes any buffered data, then returns a [Read] wrapper for the underlying blob. + /// + /// The returned reader can be used to sequentially read all data from the blob while ensuring + /// all data passes integrity verification. + pub async fn as_blob_reader(&self, capacity: NonZeroUsize) -> Result, Error> { + let logical_page_size = self.pool_ref.page_size(); + let logical_page_size_nz = + NonZeroUsize::new(logical_page_size as usize).expect("page_size is non-zero"); + + // Flush any buffered data (without fsync) so the Read wrapper sees all written data. + // We don't need fsync here since we just want to ensure data has been written to the + // underlying blob, not durably persisted. + { + let buf_guard = self.buffer.write().await; + if !buf_guard.immutable { + self.flush_internal(buf_guard, true).await?; + } + } + + let physical_page_size = logical_page_size + CHECKSUM_SIZE; + let blob_guard = self.blob_state.read().await; + + // Compute both physical and logical blob sizes. + let (physical_blob_size, logical_blob_size) = + blob_guard.partial_page_state.as_ref().map_or_else( + || { + // All pages are full. + let physical = physical_page_size * blob_guard.current_page; + let logical = logical_page_size * blob_guard.current_page; + (physical, logical) + }, + |crc_record| { + // There's a partial page with a checksum. + let (partial_len, _) = crc_record.get_crc(); + let partial_len = partial_len as u64; + // Physical: all pages including the partial one (which is padded to full size). + let physical = physical_page_size * (blob_guard.current_page + 1); + // Logical: full pages before this + partial page's actual data length. + let logical = logical_page_size * blob_guard.current_page + partial_len; + (physical, logical) + }, + ); + + Ok(Read::new( + blob_guard.blob.clone(), + physical_blob_size, + logical_blob_size, + capacity, + logical_page_size_nz, + )) + } +} + +impl Blob for Append { + async fn read_at( + &self, + buf: impl Into + Send, + logical_offset: u64, + ) -> Result { + let mut buf = buf.into(); + self.read_into(buf.as_mut(), logical_offset).await?; + Ok(buf) + } + + async fn sync(&self) -> Result<(), Error> { + // Flush any buffered data, including any partial page. When flush_internal returns, + // write_at has completed and data has been written to the underlying blob. + let buf_guard = self.buffer.write().await; + if buf_guard.immutable { + return Ok(()); + } + self.flush_internal(buf_guard, true).await?; + + // Sync the underlying blob. We need the blob read lock here since sync() requires access + // to the blob, but only a read lock since we're not modifying blob state. + let blob_state = self.blob_state.read().await; + blob_state.blob.sync().await + } + + /// This [Blob] trait method is unimplemented by [Append] and unconditionally panics. + async fn write_at(&self, _buf: impl Into + Send, _offset: u64) -> Result<(), Error> { + // TODO(): Extend the buffer pool to + // support arbitrary writes. + unimplemented!("append-only blob type does not support write_at") + } + + /// Resize the blob to the provided logical `size`. + /// + /// This truncates the blob to contain only `size` logical bytes. 
The physical blob size will + /// be adjusted to include the necessary CRC records for the remaining pages. + /// + /// # Warning + /// + /// - Concurrent mutable operations (append, resize) are not supported and will cause data loss. + /// - Concurrent readers which try to read past the new size during the resize may error. + /// - The resize is not guaranteed durable until the next sync. + async fn resize(&self, size: u64) -> Result<(), Error> { + let current_size = self.size().await; + + // Handle growing by appending zero bytes. + if size > current_size { + let zeros_needed = (size - current_size) as usize; + let zeros = vec![0u8; zeros_needed]; + self.append(&zeros).await?; + return Ok(()); + } + + // Implementation note: rewinding the blob across a page boundary potentially results in + // stale data remaining in the buffer pool's cache. We don't proactively purge the data + // within this function since it would be inaccessible anyway. Instead we ensure it is + // always updated should the blob grow back to the point where we have new data for the same + // page, if any old data hasn't expired naturally by then. + + let logical_page_size = self.pool_ref.page_size(); + let physical_page_size = logical_page_size + CHECKSUM_SIZE; + + // Flush any buffered data first to ensure we have a consistent state on disk. + self.sync().await?; + + // Acquire both locks to prevent concurrent operations. + let mut buf_guard = self.buffer.write().await; + if buf_guard.immutable { + return Err(Error::ImmutableBlob); + } + let mut blob_guard = self.blob_state.write().await; + + // Calculate the physical size needed for the new logical size. + let full_pages = size / logical_page_size; + let partial_bytes = size % logical_page_size; + let new_physical_size = if partial_bytes > 0 { + // We need full_pages + 1 physical pages to hold the partial data. + // The partial page will be padded to full physical page size. + (full_pages + 1) * physical_page_size + } else { + // No partial page needed. + full_pages * physical_page_size + }; + + // Resize the underlying blob. + blob_guard.blob.resize(new_physical_size).await?; + blob_guard.partial_page_state = None; + + // Update blob state and buffer based on the desired logical size. The partial page data is + // read with CRC validation; the validated length may exceed partial_bytes (reflecting the + // old data length), but we only load the prefix we need. The next sync will write the + // correct CRC for the new length. + // + // Note: This updates state before validation completes, which could leave state + // inconsistent if validation fails. This is acceptable because failures from mutable + // methods are fatal - callers must not use the blob after any error. + + blob_guard.current_page = full_pages; + buf_guard.offset = full_pages * logical_page_size; + + if partial_bytes > 0 { + // There's a partial page. Read its data from disk with CRC validation. + let page_data = + super::get_page_from_blob(&blob_guard.blob, full_pages, logical_page_size).await?; + + // Ensure the validated data covers what we need. + if (page_data.len() as u64) < partial_bytes { + return Err(Error::InvalidChecksum); + } + + buf_guard.data = page_data.as_ref()[..partial_bytes as usize].to_vec(); + } else { + // No partial page - all pages are full or blob is empty. 
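+            // e.g. (illustrative) resizing to 206 bytes with 103-byte logical pages
+            // gives full_pages = 2 and partial_bytes = 0, so the physical blob is
+            // 2 * 115 = 230 bytes and nothing needs to be re-buffered.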
+ buf_guard.data = vec![]; + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{deterministic, Runner as _, Storage as _}; + use commonware_codec::ReadExt; + use commonware_macros::test_traced; + use commonware_utils::{NZUsize, NZU16}; + use std::num::NonZeroU16; + + const PAGE_SIZE: NonZeroU16 = NZU16!(103); // janky size to ensure we test page alignment + const BUFFER_SIZE: usize = PAGE_SIZE.get() as usize * 2; + + #[test_traced("DEBUG")] + fn test_append_crc_empty() { + let executor = deterministic::Runner::default(); + executor.start(|context: deterministic::Context| async move { + // Open a new blob. + let (blob, blob_size) = context.open("test_partition", b"test_blob").await.unwrap(); + assert_eq!(blob_size, 0); + + // Create a buffer pool reference. + let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(BUFFER_SIZE)); + + // Create an Append wrapper. + let append = Append::new(blob, blob_size, BUFFER_SIZE, pool_ref.clone()) + .await + .unwrap(); + + // Verify initial size is 0. + assert_eq!(append.size().await, 0); + + // Close & re-open. + append.sync().await.unwrap(); + drop(append); + + let (blob, blob_size) = context.open("test_partition", b"test_blob").await.unwrap(); + assert_eq!(blob_size, 0); // There was no need to write a crc since there was no data. + + let append = Append::new(blob, blob_size, BUFFER_SIZE, pool_ref.clone()) + .await + .unwrap(); + + assert_eq!(append.size().await, 0); + }); + } + + #[test_traced("DEBUG")] + fn test_append_crc_basic() { + let executor = deterministic::Runner::default(); + executor.start(|context: deterministic::Context| async move { + // Open a new blob. + let (blob, blob_size) = context.open("test_partition", b"test_blob").await.unwrap(); + assert_eq!(blob_size, 0); + + // Create a buffer pool reference. + let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(BUFFER_SIZE)); + + // Create an Append wrapper. + let append = Append::new(blob, blob_size, BUFFER_SIZE, pool_ref.clone()) + .await + .unwrap(); + + // Verify initial size is 0. + assert_eq!(append.size().await, 0); + + // Append some bytes. + let data = vec![1, 2, 3, 4, 5]; + append.append(&data).await.unwrap(); + + // Verify size reflects appended data. + assert_eq!(append.size().await, 5); + + // Append more bytes. + let more_data = vec![6, 7, 8, 9, 10]; + append.append(&more_data).await.unwrap(); + + // Verify size is cumulative. + assert_eq!(append.size().await, 10); + + // Read back the first chunk and verify. + let read_buf = vec![0u8; 5]; + let read_buf = append.read_at(read_buf, 0).await.unwrap(); + assert_eq!(read_buf.as_ref(), &data[..]); + + // Read back the second chunk and verify. + let read_buf = vec![0u8; 5]; + let read_buf = append.read_at(read_buf, 5).await.unwrap(); + assert_eq!(read_buf.as_ref(), &more_data[..]); + + // Read all data at once and verify. + let read_buf = vec![0u8; 10]; + let read_buf = append.read_at(read_buf, 0).await.unwrap(); + assert_eq!(read_buf.as_ref(), &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); + + // Close and reopen the blob and make sure the data is still there and the trailing + // checksum is written & stripped as expected. 
+ append.sync().await.unwrap(); + drop(append); + + let (blob, blob_size) = context.open("test_partition", b"test_blob").await.unwrap(); + // Physical page = 103 logical + 12 Checksum = 115 bytes (padded partial page) + assert_eq!(blob_size, 115); + let append = Append::new(blob, blob_size, BUFFER_SIZE, pool_ref.clone()) + .await + .unwrap(); + assert_eq!(append.size().await, 10); // CRC should be stripped after verification + + // Append data that spans a page boundary. + // PAGE_SIZE=103 is the logical page size. We have 10 bytes, so writing + // 100 more bytes (total 110) will cross the page boundary at byte 103. + let spanning_data: Vec = (11..=110).collect(); + append.append(&spanning_data).await.unwrap(); + assert_eq!(append.size().await, 110); + + // Read back data that spans the page boundary. + let read_buf = vec![0u8; 100]; + let read_buf = append.read_at(read_buf, 10).await.unwrap(); + assert_eq!(read_buf.as_ref(), &spanning_data[..]); + + // Read all 110 bytes at once. + let read_buf = vec![0u8; 110]; + let read_buf = append.read_at(read_buf, 0).await.unwrap(); + let expected: Vec = (1..=110).collect(); + assert_eq!(read_buf.as_ref(), &expected[..]); + + // Drop and re-open and make sure bytes are still there. + append.sync().await.unwrap(); + drop(append); + + let (blob, blob_size) = context.open("test_partition", b"test_blob").await.unwrap(); + // 2 physical pages: 2 * 115 = 230 bytes + assert_eq!(blob_size, 230); + let append = Append::new(blob, blob_size, BUFFER_SIZE, pool_ref.clone()) + .await + .unwrap(); + assert_eq!(append.size().await, 110); + + // Append data to reach exactly a page boundary. + // Logical page size is 103. We have 110 bytes, next boundary is 206 (103 * 2). + // So we need 96 more bytes. + let boundary_data: Vec = (111..=206).collect(); + assert_eq!(boundary_data.len(), 96); + append.append(&boundary_data).await.unwrap(); + assert_eq!(append.size().await, 206); + + // Verify we can read it back. + let read_buf = vec![0u8; 206]; + let read_buf = append.read_at(read_buf, 0).await.unwrap(); + let expected: Vec = (1..=206).collect(); + assert_eq!(read_buf.as_ref(), &expected[..]); + + // Drop and re-open at the page boundary. + append.sync().await.unwrap(); + drop(append); + + let (blob, blob_size) = context.open("test_partition", b"test_blob").await.unwrap(); + // Physical size should be exactly 2 pages: 115 * 2 = 230 bytes + assert_eq!(blob_size, 230); + let append = Append::new(blob, blob_size, BUFFER_SIZE, pool_ref) + .await + .unwrap(); + assert_eq!(append.size().await, 206); + + // Verify data is still readable after reopen. + let read_buf = vec![0u8; 206]; + let read_buf = append.read_at(read_buf, 0).await.unwrap(); + assert_eq!(read_buf.as_ref(), &expected[..]); + }); + } + + /// Helper to read the CRC record from raw blob bytes at the end of a physical page. + fn read_crc_record_from_page(page_bytes: &[u8]) -> Checksum { + let crc_start = page_bytes.len() - CHECKSUM_SIZE as usize; + Checksum::read(&mut &page_bytes[crc_start..]).unwrap() + } + + /// Dummy marker bytes with len=0 so the mangled slot is never authoritative. + /// Format: [len_hi=0, len_lo=0, 0xDE, 0xAD, 0xBE, 0xEF] + const DUMMY_MARKER: [u8; 6] = [0x00, 0x00, 0xDE, 0xAD, 0xBE, 0xEF]; + + #[test] + fn test_identify_protected_regions_equal_lengths() { + // When lengths are equal, the first CRC should be protected (tie-breaking rule). 
+ let record = Checksum { + len1: 50, + crc1: 0xAAAAAAAA, + len2: 50, + crc2: 0xBBBBBBBB, + }; + + let result = + Append::::identify_protected_regions(Some(&record)); + assert!(result.is_some()); + let (prefix_len, protected_crc) = result.unwrap(); + assert_eq!(prefix_len, 50); + assert!( + matches!(protected_crc, ProtectedCrc::First), + "First CRC should be protected when lengths are equal" + ); + } + + #[test] + fn test_identify_protected_regions_len1_larger() { + // When len1 > len2, the first CRC should be protected. + let record = Checksum { + len1: 100, + crc1: 0xAAAAAAAA, + len2: 50, + crc2: 0xBBBBBBBB, + }; + + let result = + Append::::identify_protected_regions(Some(&record)); + assert!(result.is_some()); + let (prefix_len, protected_crc) = result.unwrap(); + assert_eq!(prefix_len, 100); + assert!( + matches!(protected_crc, ProtectedCrc::First), + "First CRC should be protected when len1 > len2" + ); + } + + #[test] + fn test_identify_protected_regions_len2_larger() { + // When len2 > len1, the second CRC should be protected. + let record = Checksum { + len1: 50, + crc1: 0xAAAAAAAA, + len2: 100, + crc2: 0xBBBBBBBB, + }; + + let result = + Append::::identify_protected_regions(Some(&record)); + assert!(result.is_some()); + let (prefix_len, protected_crc) = result.unwrap(); + assert_eq!(prefix_len, 100); + assert!( + matches!(protected_crc, ProtectedCrc::Second), + "Second CRC should be protected when len2 > len1" + ); + } + + /// Test that slot 1 is NOT overwritten when it's the protected slot. + /// + /// Strategy: After extending twice (so slot 1 becomes authoritative with larger len), + /// mangle the non-authoritative slot 0. Then extend again - slot 0 should be overwritten + /// with the new CRC, while slot 1 (protected) should remain untouched. 
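+    ///
+    /// Expected slot timeline (lengths only): (len1=10, len2=0) after step 1,
+    /// (len1=10, len2=30) after step 2, (len1=0, len2=30) once slot 0 is mangled, and
+    /// (len1=50, len2=30) after the final extension, with slot 1's bytes untouched.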
+ #[test_traced("DEBUG")] + fn test_crc_slot1_protected() { + let executor = deterministic::Runner::default(); + executor.start(|context: deterministic::Context| async move { + let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(BUFFER_SIZE)); + let physical_page_size = PAGE_SIZE.get() as usize + CHECKSUM_SIZE as usize; + let slot0_offset = PAGE_SIZE.get() as u64; + let slot1_offset = PAGE_SIZE.get() as u64 + 6; + + // === Step 1: Write 10 bytes → slot 0 authoritative (len=10) === + let (blob, _) = context.open("test_partition", b"slot1_prot").await.unwrap(); + let append = Append::new(blob, 0, BUFFER_SIZE, pool_ref.clone()) + .await + .unwrap(); + append.append(&(1..=10).collect::>()).await.unwrap(); + append.sync().await.unwrap(); + drop(append); + + // === Step 2: Extend to 30 bytes → slot 1 authoritative (len=30) === + let (blob, size) = context.open("test_partition", b"slot1_prot").await.unwrap(); + let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone()) + .await + .unwrap(); + append + .append(&(11..=30).collect::>()) + .await + .unwrap(); + append.sync().await.unwrap(); + drop(append); + + // Verify slot 1 is now authoritative + let (blob, size) = context.open("test_partition", b"slot1_prot").await.unwrap(); + let page = blob + .read_at(vec![0u8; physical_page_size], 0) + .await + .unwrap(); + let crc = read_crc_record_from_page(page.as_ref()); + assert!( + crc.len2 > crc.len1, + "Slot 1 should be authoritative (len2={} > len1={})", + crc.len2, + crc.len1 + ); + + // Capture slot 1 bytes before mangling slot 0 + let slot1_before: Vec = blob + .read_at(vec![0u8; 6], slot1_offset) + .await + .unwrap() + .into(); + + // === Step 3: Mangle slot 0 (non-authoritative) === + blob.write_at(DUMMY_MARKER.to_vec(), slot0_offset) + .await + .unwrap(); + blob.sync().await.unwrap(); + + // Verify mangle worked + let slot0_mangled: Vec = blob + .read_at(vec![0u8; 6], slot0_offset) + .await + .unwrap() + .into(); + assert_eq!(slot0_mangled, DUMMY_MARKER, "Mangle failed"); + + // === Step 4: Extend to 50 bytes → new CRC goes to slot 0, slot 1 protected === + let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone()) + .await + .unwrap(); + append + .append(&(31..=50).collect::>()) + .await + .unwrap(); + append.sync().await.unwrap(); + drop(append); + + // === Step 5: Verify slot 0 was overwritten, slot 1 unchanged === + let (blob, _) = context.open("test_partition", b"slot1_prot").await.unwrap(); + + // Slot 0 should have new CRC (not our dummy marker) + let slot0_after: Vec = blob + .read_at(vec![0u8; 6], slot0_offset) + .await + .unwrap() + .into(); + assert_ne!( + slot0_after, DUMMY_MARKER, + "Slot 0 should have been overwritten with new CRC" + ); + + // Slot 1 should be UNCHANGED (protected) + let slot1_after: Vec = blob + .read_at(vec![0u8; 6], slot1_offset) + .await + .unwrap() + .into(); + assert_eq!( + slot1_before, slot1_after, + "Slot 1 was modified! Protected region violated." + ); + + // Verify the new CRC in slot 0 has len=50 + let page = blob + .read_at(vec![0u8; physical_page_size], 0) + .await + .unwrap(); + let crc = read_crc_record_from_page(page.as_ref()); + assert_eq!(crc.len1, 50, "Slot 0 should have len=50"); + }); + } + + /// Test that slot 0 is NOT overwritten when it's the protected slot. + /// + /// Strategy: After extending three times (slot 0 becomes authoritative again with largest len), + /// mangle the non-authoritative slot 1. Then extend again - slot 1 should be overwritten + /// with the new CRC, while slot 0 (protected) should remain untouched. 
+ #[test_traced("DEBUG")] + fn test_crc_slot0_protected() { + let executor = deterministic::Runner::default(); + executor.start(|context: deterministic::Context| async move { + let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(BUFFER_SIZE)); + let physical_page_size = PAGE_SIZE.get() as usize + CHECKSUM_SIZE as usize; + let slot0_offset = PAGE_SIZE.get() as u64; + let slot1_offset = PAGE_SIZE.get() as u64 + 6; + + // === Step 1: Write 10 bytes → slot 0 authoritative (len=10) === + let (blob, _) = context.open("test_partition", b"slot0_prot").await.unwrap(); + let append = Append::new(blob, 0, BUFFER_SIZE, pool_ref.clone()) + .await + .unwrap(); + append.append(&(1..=10).collect::>()).await.unwrap(); + append.sync().await.unwrap(); + drop(append); + + // === Step 2: Extend to 30 bytes → slot 1 authoritative (len=30) === + let (blob, size) = context.open("test_partition", b"slot0_prot").await.unwrap(); + let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone()) + .await + .unwrap(); + append + .append(&(11..=30).collect::>()) + .await + .unwrap(); + append.sync().await.unwrap(); + drop(append); + + // === Step 3: Extend to 50 bytes → slot 0 authoritative (len=50) === + let (blob, size) = context.open("test_partition", b"slot0_prot").await.unwrap(); + let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone()) + .await + .unwrap(); + append + .append(&(31..=50).collect::>()) + .await + .unwrap(); + append.sync().await.unwrap(); + drop(append); + + // Verify slot 0 is now authoritative + let (blob, size) = context.open("test_partition", b"slot0_prot").await.unwrap(); + let page = blob + .read_at(vec![0u8; physical_page_size], 0) + .await + .unwrap(); + let crc = read_crc_record_from_page(page.as_ref()); + assert!( + crc.len1 > crc.len2, + "Slot 0 should be authoritative (len1={} > len2={})", + crc.len1, + crc.len2 + ); + + // Capture slot 0 bytes before mangling slot 1 + let slot0_before: Vec = blob + .read_at(vec![0u8; 6], slot0_offset) + .await + .unwrap() + .into(); + + // === Step 4: Mangle slot 1 (non-authoritative) === + blob.write_at(DUMMY_MARKER.to_vec(), slot1_offset) + .await + .unwrap(); + blob.sync().await.unwrap(); + + // Verify mangle worked + let slot1_mangled: Vec = blob + .read_at(vec![0u8; 6], slot1_offset) + .await + .unwrap() + .into(); + assert_eq!(slot1_mangled, DUMMY_MARKER, "Mangle failed"); + + // === Step 5: Extend to 70 bytes → new CRC goes to slot 1, slot 0 protected === + let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone()) + .await + .unwrap(); + append + .append(&(51..=70).collect::>()) + .await + .unwrap(); + append.sync().await.unwrap(); + drop(append); + + // === Step 6: Verify slot 1 was overwritten, slot 0 unchanged === + let (blob, _) = context.open("test_partition", b"slot0_prot").await.unwrap(); + + // Slot 1 should have new CRC (not our dummy marker) + let slot1_after: Vec = blob + .read_at(vec![0u8; 6], slot1_offset) + .await + .unwrap() + .into(); + assert_ne!( + slot1_after, DUMMY_MARKER, + "Slot 1 should have been overwritten with new CRC" + ); + + // Slot 0 should be UNCHANGED (protected) + let slot0_after: Vec = blob + .read_at(vec![0u8; 6], slot0_offset) + .await + .unwrap() + .into(); + assert_eq!( + slot0_before, slot0_after, + "Slot 0 was modified! Protected region violated." 
+ ); + + // Verify the new CRC in slot 1 has len=70 + let page = blob + .read_at(vec![0u8; physical_page_size], 0) + .await + .unwrap(); + let crc = read_crc_record_from_page(page.as_ref()); + assert_eq!(crc.len2, 70, "Slot 1 should have len=70"); + }); + } + + /// Test that the data prefix is NOT overwritten when extending a partial page. + /// + /// Strategy: Write data, then mangle the padding area (between data end and CRC start). + /// After extending, the original data should be unchanged but the mangled padding + /// should be overwritten with new data. + #[test_traced("DEBUG")] + fn test_data_prefix_not_overwritten() { + let executor = deterministic::Runner::default(); + executor.start(|context: deterministic::Context| async move { + let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(BUFFER_SIZE)); + let physical_page_size = PAGE_SIZE.get() as usize + CHECKSUM_SIZE as usize; + + // === Step 1: Write 20 bytes === + let (blob, _) = context + .open("test_partition", b"prefix_test") + .await + .unwrap(); + let append = Append::new(blob, 0, BUFFER_SIZE, pool_ref.clone()) + .await + .unwrap(); + let data1: Vec = (1..=20).collect(); + append.append(&data1).await.unwrap(); + append.sync().await.unwrap(); + drop(append); + + // === Step 2: Capture the first 20 bytes and mangle bytes 25-30 (in padding area) === + let (blob, size) = context + .open("test_partition", b"prefix_test") + .await + .unwrap(); + assert_eq!(size, physical_page_size as u64); + + let prefix_before: Vec = blob.read_at(vec![0u8; 20], 0).await.unwrap().into(); + + // Mangle bytes 25-30 (safely in the padding area, after our 20 bytes of data) + blob.write_at(DUMMY_MARKER.to_vec(), 25).await.unwrap(); + blob.sync().await.unwrap(); + + // === Step 3: Extend to 40 bytes === + let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone()) + .await + .unwrap(); + append + .append(&(21..=40).collect::>()) + .await + .unwrap(); + append.sync().await.unwrap(); + drop(append); + + // === Step 4: Verify prefix unchanged, mangled area overwritten === + let (blob, _) = context + .open("test_partition", b"prefix_test") + .await + .unwrap(); + + // Original 20 bytes should be unchanged + let prefix_after: Vec = blob.read_at(vec![0u8; 20], 0).await.unwrap().into(); + assert_eq!(prefix_before, prefix_after, "Data prefix was modified!"); + + // Bytes at offset 25-30: data (21..=40) starts at offset 20, so offset 25 has value 26 + let overwritten: Vec = blob.read_at(vec![0u8; 6], 25).await.unwrap().into(); + assert_eq!( + overwritten, + vec![26, 27, 28, 29, 30, 31], + "New data should overwrite padding area" + ); + }); + } + + /// Test CRC slot protection when extending past a page boundary. + /// + /// Strategy: Write partial page, mangle slot 0 (non-authoritative after we do first extend), + /// then extend past page boundary. Verify slot 0 gets new full-page CRC while + /// the mangled marker is overwritten, and second page is written correctly. 
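+    ///
+    /// Expected end state (illustrative): page 0 ends up full with a new CRC of
+    /// len=103 in slot 0 and the protected len=80 CRC untouched in slot 1, while the
+    /// remaining 17 logical bytes start page 1 as a fresh partial page.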
+ #[test_traced("DEBUG")] + fn test_crc_slot_protection_across_page_boundary() { + let executor = deterministic::Runner::default(); + executor.start(|context: deterministic::Context| async move { + let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(BUFFER_SIZE)); + let physical_page_size = PAGE_SIZE.get() as usize + CHECKSUM_SIZE as usize; + let slot0_offset = PAGE_SIZE.get() as u64; + let slot1_offset = PAGE_SIZE.get() as u64 + 6; + + // === Step 1: Write 50 bytes → slot 0 authoritative === + let (blob, _) = context.open("test_partition", b"boundary").await.unwrap(); + let append = Append::new(blob, 0, BUFFER_SIZE, pool_ref.clone()) + .await + .unwrap(); + append.append(&(1..=50).collect::>()).await.unwrap(); + append.sync().await.unwrap(); + drop(append); + + // === Step 2: Extend to 80 bytes → slot 1 authoritative === + let (blob, size) = context.open("test_partition", b"boundary").await.unwrap(); + let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone()) + .await + .unwrap(); + append + .append(&(51..=80).collect::>()) + .await + .unwrap(); + append.sync().await.unwrap(); + drop(append); + + // Verify slot 1 is authoritative + let (blob, size) = context.open("test_partition", b"boundary").await.unwrap(); + let page = blob + .read_at(vec![0u8; physical_page_size], 0) + .await + .unwrap(); + let crc = read_crc_record_from_page(page.as_ref()); + assert!(crc.len2 > crc.len1, "Slot 1 should be authoritative"); + + // Capture slot 1 before extending past page boundary + let slot1_before: Vec = blob + .read_at(vec![0u8; 6], slot1_offset) + .await + .unwrap() + .into(); + + // Mangle slot 0 (non-authoritative) + blob.write_at(DUMMY_MARKER.to_vec(), slot0_offset) + .await + .unwrap(); + blob.sync().await.unwrap(); + + // === Step 3: Extend past page boundary (80 + 40 = 120, PAGE_SIZE=103) === + let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone()) + .await + .unwrap(); + append + .append(&(81..=120).collect::>()) + .await + .unwrap(); + append.sync().await.unwrap(); + drop(append); + + // === Step 4: Verify results === + let (blob, size) = context.open("test_partition", b"boundary").await.unwrap(); + assert_eq!(size, (physical_page_size * 2) as u64, "Should have 2 pages"); + + // Slot 0 should have been overwritten with full-page CRC (not dummy marker) + let slot0_after: Vec = blob + .read_at(vec![0u8; 6], slot0_offset) + .await + .unwrap() + .into(); + assert_ne!( + slot0_after, DUMMY_MARKER, + "Slot 0 should have full-page CRC" + ); + + // Slot 1 should be UNCHANGED (protected during boundary crossing) + let slot1_after: Vec = blob + .read_at(vec![0u8; 6], slot1_offset) + .await + .unwrap() + .into(); + assert_eq!( + slot1_before, slot1_after, + "Slot 1 was modified during page boundary crossing!" + ); + + // Verify page 0 has correct CRC structure + let page0 = blob + .read_at(vec![0u8; physical_page_size], 0) + .await + .unwrap(); + let crc0 = read_crc_record_from_page(page0.as_ref()); + assert_eq!( + crc0.len1, + PAGE_SIZE.get(), + "Slot 0 should have full page length" + ); + + // Verify data integrity + let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone()) + .await + .unwrap(); + assert_eq!(append.size().await, 120); + let all_data: Vec = append.read_at(vec![0u8; 120], 0).await.unwrap().into(); + let expected: Vec = (1..=120).collect(); + assert_eq!(all_data, expected); + }); + } + + /// Test that corrupting the primary CRC (but not its length) causes fallback to the previous + /// partial page contents. + /// + /// Strategy: + /// 1. 
Write 10 bytes → slot 0 authoritative (len=10, valid crc) + /// 2. Extend to 30 bytes → slot 1 authoritative (len=30, valid crc) + /// 3. Corrupt ONLY the crc2 value in slot 1 (not the length) + /// 4. Re-open and verify we fall back to slot 0's 10 bytes + #[test_traced("DEBUG")] + fn test_crc_fallback_on_corrupted_primary() { + let executor = deterministic::Runner::default(); + executor.start(|context: deterministic::Context| async move { + let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(BUFFER_SIZE)); + let physical_page_size = PAGE_SIZE.get() as usize + CHECKSUM_SIZE as usize; + // crc2 is at offset: PAGE_SIZE + 6 (for len2) + 2 (skip len2 bytes) = PAGE_SIZE + 8 + let crc2_offset = PAGE_SIZE.get() as u64 + 8; + + // === Step 1: Write 10 bytes → slot 0 authoritative (len=10) === + let (blob, _) = context + .open("test_partition", b"crc_fallback") + .await + .unwrap(); + let append = Append::new(blob, 0, BUFFER_SIZE, pool_ref.clone()) + .await + .unwrap(); + let data1: Vec = (1..=10).collect(); + append.append(&data1).await.unwrap(); + append.sync().await.unwrap(); + drop(append); + + // === Step 2: Extend to 30 bytes → slot 1 authoritative (len=30) === + let (blob, size) = context + .open("test_partition", b"crc_fallback") + .await + .unwrap(); + let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone()) + .await + .unwrap(); + append + .append(&(11..=30).collect::>()) + .await + .unwrap(); + append.sync().await.unwrap(); + drop(append); + + // Verify slot 1 is now authoritative and data reads correctly + let (blob, size) = context + .open("test_partition", b"crc_fallback") + .await + .unwrap(); + assert_eq!(size, physical_page_size as u64); + + let page = blob + .read_at(vec![0u8; physical_page_size], 0) + .await + .unwrap(); + let crc = read_crc_record_from_page(page.as_ref()); + assert!( + crc.len2 > crc.len1, + "Slot 1 should be authoritative (len2={} > len1={})", + crc.len2, + crc.len1 + ); + assert_eq!(crc.len2, 30, "Slot 1 should have len=30"); + assert_eq!(crc.len1, 10, "Slot 0 should have len=10"); + + // Verify we can read all 30 bytes before corruption + let append = Append::new(blob.clone(), size, BUFFER_SIZE, pool_ref.clone()) + .await + .unwrap(); + assert_eq!(append.size().await, 30); + let all_data: Vec = append.read_at(vec![0u8; 30], 0).await.unwrap().into(); + let expected: Vec = (1..=30).collect(); + assert_eq!(all_data, expected); + drop(append); + + // === Step 3: Corrupt ONLY crc2 (not len2) === + // crc2 is 4 bytes at offset PAGE_SIZE + 8 + blob.write_at(vec![0xDE, 0xAD, 0xBE, 0xEF], crc2_offset) + .await + .unwrap(); + blob.sync().await.unwrap(); + + // Verify corruption: len2 should still be 30, but crc2 is now garbage + let page = blob + .read_at(vec![0u8; physical_page_size], 0) + .await + .unwrap(); + let crc = read_crc_record_from_page(page.as_ref()); + assert_eq!(crc.len2, 30, "len2 should still be 30 after corruption"); + assert_eq!(crc.crc2, 0xDEADBEEF, "crc2 should be our corrupted value"); + + // === Step 4: Re-open and verify fallback to slot 0's 10 bytes === + let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone()) + .await + .unwrap(); + + // Should fall back to 10 bytes (slot 0's length) + assert_eq!( + append.size().await, + 10, + "Should fall back to slot 0's 10 bytes after primary CRC corruption" + ); + + // Verify the data is the original 10 bytes + let fallback_data: Vec = append.read_at(vec![0u8; 10], 0).await.unwrap().into(); + assert_eq!( + fallback_data, data1, + "Fallback data should match original 10 bytes" + ); + + // 
Reading beyond 10 bytes should fail + let result = append.read_at(vec![0u8; 11], 0).await; + assert!(result.is_err(), "Reading beyond fallback size should fail"); + }); + } + + /// Test that corrupting a non-last page's primary CRC fails even if fallback is valid. + /// + /// Non-last pages must always be full. If the primary CRC is corrupted and the fallback + /// indicates a partial page, validation should fail entirely (not fall back to partial). + /// + /// Strategy: + /// 1. Write 10 bytes → slot 0 has len=10 (partial) + /// 2. Extend to full page (103 bytes) → slot 1 has len=103 (full, authoritative) + /// 3. Extend past page boundary (e.g., 110 bytes) → page 0 is now non-last + /// 4. Corrupt the primary CRC of page 0 (slot 1's crc, which has len=103) + /// 5. Re-open and verify that reading from page 0 fails (fallback has len=10, not full) + #[test_traced("DEBUG")] + fn test_non_last_page_rejects_partial_fallback() { + let executor = deterministic::Runner::default(); + executor.start(|context: deterministic::Context| async move { + let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(BUFFER_SIZE)); + let physical_page_size = PAGE_SIZE.get() as usize + CHECKSUM_SIZE as usize; + // crc2 for page 0 is at offset: PAGE_SIZE + 8 + let page0_crc2_offset = PAGE_SIZE.get() as u64 + 8; + + // === Step 1: Write 10 bytes → slot 0 has len=10 === + let (blob, _) = context + .open("test_partition", b"non_last_page") + .await + .unwrap(); + let append = Append::new(blob, 0, BUFFER_SIZE, pool_ref.clone()) + .await + .unwrap(); + append.append(&(1..=10).collect::>()).await.unwrap(); + append.sync().await.unwrap(); + drop(append); + + // === Step 2: Extend to exactly full page (103 bytes) → slot 1 has len=103 === + let (blob, size) = context + .open("test_partition", b"non_last_page") + .await + .unwrap(); + let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone()) + .await + .unwrap(); + // Add bytes 11 through 103 (93 more bytes) + append + .append(&(11..=PAGE_SIZE.get() as u8).collect::>()) + .await + .unwrap(); + append.sync().await.unwrap(); + drop(append); + + // Verify page 0 slot 1 is authoritative with len=103 (full page) + let (blob, size) = context + .open("test_partition", b"non_last_page") + .await + .unwrap(); + let page = blob + .read_at(vec![0u8; physical_page_size], 0) + .await + .unwrap(); + let crc = read_crc_record_from_page(page.as_ref()); + assert_eq!(crc.len1, 10, "Slot 0 should have len=10"); + assert_eq!( + crc.len2, + PAGE_SIZE.get(), + "Slot 1 should have len=103 (full page)" + ); + assert!(crc.len2 > crc.len1, "Slot 1 should be authoritative"); + + // === Step 3: Extend past page boundary (add 10 more bytes for total of 113) === + let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone()) + .await + .unwrap(); + // Add bytes 104 through 113 (10 more bytes, now on page 1) + append + .append(&(104..=113).collect::>()) + .await + .unwrap(); + append.sync().await.unwrap(); + drop(append); + + // Verify we now have 2 pages + let (blob, size) = context + .open("test_partition", b"non_last_page") + .await + .unwrap(); + assert_eq!( + size, + (physical_page_size * 2) as u64, + "Should have 2 physical pages" + ); + + // Verify data is readable before corruption + let append = Append::new(blob.clone(), size, BUFFER_SIZE, pool_ref.clone()) + .await + .unwrap(); + assert_eq!(append.size().await, 113); + let all_data: Vec = append.read_at(vec![0u8; 113], 0).await.unwrap().into(); + let expected: Vec = (1..=113).collect(); + assert_eq!(all_data, expected); + drop(append); 
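+            // For reference (assumed layout, PAGE_SIZE = 103): the page-0 CRC record
+            // spans bytes [103..115), with len1 at [103..105), crc1 at [105..109),
+            // len2 at [109..111), and crc2 at [111..115), hence
+            // page0_crc2_offset = PAGE_SIZE + 8.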
+ + // === Step 4: Corrupt page 0's primary CRC (slot 1's crc2) === + blob.write_at(vec![0xDE, 0xAD, 0xBE, 0xEF], page0_crc2_offset) + .await + .unwrap(); + blob.sync().await.unwrap(); + + // Verify corruption: page 0's slot 1 still has len=103 but bad CRC + let page = blob + .read_at(vec![0u8; physical_page_size], 0) + .await + .unwrap(); + let crc = read_crc_record_from_page(page.as_ref()); + assert_eq!(crc.len2, PAGE_SIZE.get(), "len2 should still be 103"); + assert_eq!(crc.crc2, 0xDEADBEEF, "crc2 should be corrupted"); + // Slot 0 fallback has len=10 (partial), which is invalid for non-last page + assert_eq!(crc.len1, 10, "Fallback slot 0 has partial length"); + + // === Step 5: Re-open and try to read from page 0 === + // The first page's primary CRC is bad, and fallback indicates partial (len=10). + // Since page 0 is not the last page, a partial fallback is invalid. + // Reading from page 0 should fail because the fallback CRC indicates a partial + // page, which is not allowed for non-last pages. + let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone()) + .await + .unwrap(); + + // The blob still reports 113 bytes because init only validates the last page. + // But reading from page 0 should fail because the CRC fallback is partial. + assert_eq!(append.size().await, 113); + + // Try to read from page 0 - this should fail with InvalidChecksum because + // the fallback CRC has len=10 (partial), which is invalid for a non-last page. + let result = append.read_at(vec![0u8; 10], 0).await; + assert!( + result.is_err(), + "Reading from corrupted non-last page via Append should fail, but got: {:?}", + result + ); + drop(append); + + // Also verify that reading via a Read wrapper fails the same way. + let (blob, size) = context + .open("test_partition", b"non_last_page") + .await + .unwrap(); + let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone()) + .await + .unwrap(); + let mut reader = append.as_blob_reader(NZUsize!(1024)).await.unwrap(); + + // Try to read from offset 0 (page 0) via the Read wrapper. + let result = reader.read_up_to(vec![0u8; 10]).await; + assert!( + result.is_err(), + "Reading from corrupted non-last page via Read wrapper should fail, but got: {:?}", + result + ); + }); + } + + #[test] + fn test_resize_shrink_validates_crc() { + // Verify that shrinking a blob to a partial page validates the CRC, rather than + // blindly reading raw bytes which could silently load corrupted data. + let executor = deterministic::Runner::default(); + + executor.start(|context| async move { + let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(BUFFER_SIZE)); + let physical_page_size = PAGE_SIZE.get() as usize + CHECKSUM_SIZE as usize; + + let (blob, size) = context + .open("test_partition", b"resize_crc_test") + .await + .unwrap(); + + let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone()) + .await + .unwrap(); + + // Write data across 3 pages: page 0 (full), page 1 (full), page 2 (partial). + // PAGE_SIZE = 103, so 250 bytes = 103 + 103 + 44. + let data: Vec = (0..=249).collect(); + append.append(&data).await.unwrap(); + append.sync().await.unwrap(); + assert_eq!(append.size().await, 250); + drop(append); + + // Corrupt the CRC record of page 1 (middle page). + let (blob, size) = context + .open("test_partition", b"resize_crc_test") + .await + .unwrap(); + assert_eq!(size as usize, physical_page_size * 3); + + // Page 1 CRC record is at the end of the second physical page. 
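+            // With the 103-byte pages used here that is (103 + 12) * 2 - 12 = 218.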
+ let page1_crc_offset = (physical_page_size * 2 - CHECKSUM_SIZE as usize) as u64; + blob.write_at(vec![0xFF; CHECKSUM_SIZE as usize], page1_crc_offset) + .await + .unwrap(); + blob.sync().await.unwrap(); + + // Open the blob - Append::new() validates the LAST page (page 2), which is still valid. + // So it should open successfully with size 250. + let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone()) + .await + .unwrap(); + assert_eq!(append.size().await, 250); + + // Try to shrink to 150 bytes, which ends in page 1 (the corrupted page). + // 150 bytes = page 0 (103 full) + page 1 (47 partial). + // This should fail because page 1's CRC is corrupted. + let result = append.resize(150).await; + assert!( + matches!(result, Err(crate::Error::InvalidChecksum)), + "Expected InvalidChecksum when shrinking to corrupted page, got: {:?}", + result + ); + }); + } + + #[test] + fn test_immutable_blob_rejects_append_and_resize() { + let executor = deterministic::Runner::default(); + + executor.start(|context| async move { + const PAGE_SIZE: NonZeroU16 = NZU16!(64); + const BUFFER_SIZE: usize = 256; + + let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(4)); + + let (blob, size) = context + .open("test_partition", b"immutable_test") + .await + .unwrap(); + + let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone()) + .await + .unwrap(); + + // Write some initial data. + append.append(&[1, 2, 3, 4, 5]).await.unwrap(); + append.sync().await.unwrap(); + assert_eq!(append.size().await, 5); + + // Convert to immutable. + append.to_immutable().await.unwrap(); + assert!(append.is_immutable().await); + + // Verify append() returns ImmutableBlob error. + let result = append.append(&[6, 7, 8]).await; + assert!( + matches!(result, Err(crate::Error::ImmutableBlob)), + "Expected ImmutableBlob error from append(), got: {:?}", + result + ); + + // Verify resize() returns ImmutableBlob error. + let result = append.resize(100).await; + assert!( + matches!(result, Err(crate::Error::ImmutableBlob)), + "Expected ImmutableBlob error from resize(), got: {:?}", + result + ); + + // Verify sync() returns Ok. + let result = append.sync().await; + assert!( + result.is_ok(), + "sync() on immutable blob should return Ok, got: {:?}", + result + ); + + // Verify data is still readable. 
+ let data: Vec = append.read_at(vec![0u8; 5], 0).await.unwrap().into(); + assert_eq!(data, vec![1, 2, 3, 4, 5]); + }); + } + + #[test] + fn test_corrupted_crc_len_too_large() { + let executor = deterministic::Runner::default(); + + executor.start(|context| async move { + let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(BUFFER_SIZE)); + let physical_page_size = PAGE_SIZE.get() as usize + CHECKSUM_SIZE as usize; + + // Step 1: Create blob with valid data + let (blob, size) = context + .open("test_partition", b"crc_len_test") + .await + .unwrap(); + + let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone()) + .await + .unwrap(); + + append.append(&[0x42; 50]).await.unwrap(); + append.sync().await.unwrap(); + drop(append); + + // Step 2: Corrupt the CRC record to have len > page_size + let (blob, size) = context + .open("test_partition", b"crc_len_test") + .await + .unwrap(); + assert_eq!(size as usize, physical_page_size); + + // CRC record is at the end of the physical page + let crc_offset = PAGE_SIZE.get() as u64; + + // Create a CRC record with len1 = 0xFFFF (65535), which is >> page_size (103) + // Format: [len1_hi, len1_lo, crc1 (4 bytes), len2_hi, len2_lo, crc2 (4 bytes)] + let bad_crc_record: [u8; 12] = [ + 0xFF, 0xFF, // len1 = 65535 (way too large) + 0xDE, 0xAD, 0xBE, 0xEF, // crc1 (garbage) + 0x00, 0x00, // len2 = 0 + 0x00, 0x00, 0x00, 0x00, // crc2 = 0 + ]; + blob.write_at(bad_crc_record.to_vec(), crc_offset) + .await + .unwrap(); + blob.sync().await.unwrap(); + + // Step 3: Try to open the blob - should NOT panic, should return error or handle gracefully + let result = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone()).await; + + // Either returns InvalidChecksum error OR truncates the corrupted data + // (both are acceptable behaviors - panicking is NOT acceptable) + match result { + Ok(append) => { + // If it opens successfully, the corrupted page should have been truncated + let recovered_size = append.size().await; + assert_eq!( + recovered_size, 0, + "Corrupted page should be truncated, size should be 0" + ); + } + Err(e) => { + // Error is also acceptable (for immutable blobs) + assert!( + matches!(e, crate::Error::InvalidChecksum), + "Expected InvalidChecksum error, got: {:?}", + e + ); + } + } + }); + } + + #[test] + fn test_corrupted_crc_both_slots_len_too_large() { + let executor = deterministic::Runner::default(); + + executor.start(|context| async move { + let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(BUFFER_SIZE)); + + // Step 1: Create blob with valid data + let (blob, size) = context + .open("test_partition", b"crc_both_bad") + .await + .unwrap(); + + let append = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone()) + .await + .unwrap(); + + append.append(&[0x42; 50]).await.unwrap(); + append.sync().await.unwrap(); + drop(append); + + // Step 2: Corrupt BOTH CRC slots to have len > page_size + let (blob, size) = context + .open("test_partition", b"crc_both_bad") + .await + .unwrap(); + + let crc_offset = PAGE_SIZE.get() as u64; + + // Both slots have len > page_size + let bad_crc_record: [u8; 12] = [ + 0x01, 0x00, // len1 = 256 (> 103) + 0xDE, 0xAD, 0xBE, 0xEF, // crc1 (garbage) + 0x02, 0x00, // len2 = 512 (> 103) + 0xCA, 0xFE, 0xBA, 0xBE, // crc2 (garbage) + ]; + blob.write_at(bad_crc_record.to_vec(), crc_offset) + .await + .unwrap(); + blob.sync().await.unwrap(); + + // Step 3: Try to open - should NOT panic + let result = Append::new(blob, size, BUFFER_SIZE, pool_ref.clone()).await; + + match result { + Ok(append) => { + // Corrupted page 
truncated + assert_eq!(append.size().await, 0); + } + Err(e) => { + assert!( + matches!(e, crate::Error::InvalidChecksum), + "Expected InvalidChecksum, got: {:?}", + e + ); + } + } + }); + } +} diff --git a/runtime/src/utils/buffer/pool/mod.rs b/runtime/src/utils/buffer/pool/mod.rs new file mode 100644 index 0000000000..c7094b8e79 --- /dev/null +++ b/runtime/src/utils/buffer/pool/mod.rs @@ -0,0 +1,505 @@ +//! Blob wrappers for reading and writing data with integrity guarantees, plus a buffer pool that +//! manages read caching over the data. +//! +//! # Page-oriented structure +//! +//! Blob data is stored in _pages_ having a logical `page_size` dictated by the managing buffer +//! pool. A _physical page_ consists of `page_size` bytes of data followed by a 12-byte _CRC +//! record_ containing: +//! +//! ```text +//! | len1 (2 bytes) | crc1 (4 bytes) | len2 (2 bytes) | crc2 (4 bytes) | +//! ``` +//! +//! Two checksums are stored so that partial pages can be re-written without overwriting a valid +//! checksum for its previously committed contents. A checksum over a page is computed over the +//! first [0,len) bytes in the page, with all other bytes in the page ignored. This implementation +//! always 0-pads the range [len, page_size). A checksum with length 0 is never considered +//! valid. If both checksums are valid for the page, the one with the larger `len` is considered +//! authoritative. +//! +//! A _full_ page is one whose crc stores a len equal to the logical page size. Otherwise the page +//! is called _partial_. All pages in a blob are full except for the very last page, which can be +//! full or partial. A partial page's logical bytes are immutable on commit, and if it's re-written, +//! it's only to add more bytes after the existing ones. + +use crate::{Blob, Error}; +use bytes::{Buf, BufMut}; +use commonware_codec::{EncodeFixed, FixedSize, Read as CodecRead, ReadExt, Write}; +use commonware_utils::StableBuf; + +mod append; +mod page_cache; +mod read; + +pub use append::Append; +pub use page_cache::PoolRef; +pub use read::Read; +use tracing::{debug, error}; + +// A checksum record contains two u16 lengths and two CRCs (each 4 bytes). +const CHECKSUM_SIZE: u64 = 12; + +/// Read the designated page from the underlying blob and return its logical bytes as a vector if it +/// passes the integrity check, returning error otherwise. Safely handles partial pages. Caller can +/// check the length of the returned vector to determine if the page was partial vs full. +async fn get_page_from_blob( + blob: &impl Blob, + page_num: u64, + logical_page_size: u64, +) -> Result { + let physical_page_size = logical_page_size + CHECKSUM_SIZE; + let physical_page_start = page_num * physical_page_size; + + let mut page = blob + .read_at(vec![0; physical_page_size as usize], physical_page_start) + .await?; + + let Some(record) = Checksum::validate_page(page.as_ref()) else { + return Err(Error::InvalidChecksum); + }; + let (len, _) = record.get_crc(); + + page.truncate(len as usize); + + Ok(page) +} + +/// Describes a CRC record stored at the end of a page. +/// +/// The CRC accompanied by the larger length is the one that should be treated as authoritative for +/// the page. Two checksums are stored so that partial pages can be written without overwriting a +/// valid checksum for a previously committed partial page. +#[derive(Clone)] +struct Checksum { + len1: u16, + crc1: u32, + len2: u16, + crc2: u32, +} + +impl Checksum { + /// Create a new CRC record with the given length and CRC. 
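+    /// For example, `Checksum::new(10, 0xABCD_EF01)` yields
+    /// `{ len1: 10, crc1: 0xABCD_EF01, len2: 0, crc2: 0 }`.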
+ /// The new CRC is stored in the first slot (len1/crc1), with the second slot zeroed. + const fn new(len: u16, crc: u32) -> Self { + Self { + len1: len, + crc1: crc, + len2: 0, + crc2: 0, + } + } + + /// Return the CRC record for the page if it is valid. The provided slice is assumed to be + /// exactly the size of a physical page. The record may not precisely reflect the bytes written + /// if what should have been the most recent CRC doesn't validate, in which case it will be + /// zeroed and the other CRC used as a fallback. + fn validate_page(buf: &[u8]) -> Option { + let page_size = buf.len() as u64; + if page_size < CHECKSUM_SIZE { + error!( + page_size, + required = CHECKSUM_SIZE, + "read page smaller than CRC record" + ); + return None; + } + + let crc_start_idx = (page_size - CHECKSUM_SIZE) as usize; + let mut crc_bytes = &buf[crc_start_idx..]; + let mut crc_record = Self::read(&mut crc_bytes).expect("CRC record read should not fail"); + let (len, crc) = crc_record.get_crc(); + + // Validate that len is in the valid range [1, logical_page_size]. + // A page with len=0 is invalid (e.g., all-zero pages from unwritten data). + let len_usize = len as usize; + if len_usize == 0 { + // Both CRCs have 0 length, so there is no fallback possible. + debug!("Invalid CRC: len==0"); + return None; + } + + if len_usize > crc_start_idx { + // len is too large so this CRC isn't valid. Fall back to the other CRC. + debug!("Invalid CRC: len too long. Using fallback CRC"); + if crc_record.validate_fallback(buf, crc_start_idx) { + return Some(crc_record); + } + return None; + } + + let computed_crc = crc32fast::hash(&buf[..len_usize]); + if computed_crc != crc { + debug!("Invalid CRC: doesn't match page contents. Using fallback CRC"); + if crc_record.validate_fallback(buf, crc_start_idx) { + return Some(crc_record); + } + return None; + } + + Some(crc_record) + } + + /// Attempts to validate a CRC record based on its fallback CRC because the primary CRC failed + /// validation. The primary CRC is zeroed in the process. Returns false if the fallback CRC + /// fails validation. + fn validate_fallback(&mut self, buf: &[u8], crc_start_idx: usize) -> bool { + let (len, crc) = self.get_fallback_crc(); + if len == 0 { + // No fallback available (only one CRC was ever written to this page). + debug!("Invalid fallback CRC: len==0"); + return false; + } + + let len_usize = len as usize; + + if len_usize > crc_start_idx { + // len is too large so this CRC isn't valid. + debug!("Invalid fallback CRC: len too long."); + return false; + } + + let computed_crc = crc32fast::hash(&buf[..len_usize]); + if computed_crc != crc { + debug!("Invalid fallback CRC: doesn't match page contents."); + return false; + } + + true + } + + /// Returns the CRC record with the longer (authoritative) length, without performing any + /// validation. If they both have the same length (which should only happen due to data + /// corruption) return the first. + const fn get_crc(&self) -> (u16, u32) { + if self.len1 >= self.len2 { + (self.len1, self.crc1) + } else { + (self.len2, self.crc2) + } + } + + /// Zeroes the primary CRC (because we assumed it failed validation) and returns the other. This + /// should only be called if the primary CRC failed validation. After this returns, get_crc will + /// no longer return the invalid primary CRC. + const fn get_fallback_crc(&mut self) -> (u16, u32) { + if self.len1 >= self.len2 { + // First CRC was primary, and must have been invalid. Zero it and return the second. 
+ self.len1 = 0; + self.crc1 = 0; + (self.len2, self.crc2) + } else { + // Second CRC was primary, and must have been invalid. Zero it and return the first. + self.len2 = 0; + self.crc2 = 0; + (self.len1, self.crc1) + } + } + + /// Returns the CRC record in its storage representation. + fn to_bytes(&self) -> [u8; CHECKSUM_SIZE as usize] { + self.encode_fixed() + } +} + +impl Write for Checksum { + fn write(&self, buf: &mut impl BufMut) { + self.len1.write(buf); + self.crc1.write(buf); + self.len2.write(buf); + self.crc2.write(buf); + } +} + +impl CodecRead for Checksum { + type Cfg = (); + + fn read_cfg(buf: &mut impl Buf, _: &Self::Cfg) -> Result { + Ok(Self { + len1: u16::read(buf)?, + crc1: u32::read(buf)?, + len2: u16::read(buf)?, + crc2: u32::read(buf)?, + }) + } +} + +impl FixedSize for Checksum { + const SIZE: usize = CHECKSUM_SIZE as usize; +} + +#[cfg(feature = "arbitrary")] +impl arbitrary::Arbitrary<'_> for Checksum { + fn arbitrary(u: &mut arbitrary::Unstructured<'_>) -> arbitrary::Result { + Ok(Self { + len1: u.arbitrary()?, + crc1: u.arbitrary()?, + len2: u.arbitrary()?, + crc2: u.arbitrary()?, + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + const CHECKSUM_SIZE_USIZE: usize = CHECKSUM_SIZE as usize; + + #[test] + fn test_crc_record_encode_read_roundtrip() { + let record = Checksum { + len1: 0x1234, + crc1: 0xAABBCCDD, + len2: 0x5678, + crc2: 0x11223344, + }; + + let bytes = record.to_bytes(); + let restored = Checksum::read(&mut &bytes[..]).unwrap(); + + assert_eq!(restored.len1, 0x1234); + assert_eq!(restored.crc1, 0xAABBCCDD); + assert_eq!(restored.len2, 0x5678); + assert_eq!(restored.crc2, 0x11223344); + } + + #[test] + fn test_crc_record_encoding() { + let record = Checksum { + len1: 0x0102, + crc1: 0x03040506, + len2: 0x0708, + crc2: 0x090A0B0C, + }; + + let bytes = record.to_bytes(); + // Verify big-endian encoding + assert_eq!( + bytes, + [0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C] + ); + } + + #[test] + fn test_crc_record_get_crc_len1_larger() { + let record = Checksum { + len1: 200, + crc1: 0xAAAAAAAA, + len2: 100, + crc2: 0xBBBBBBBB, + }; + + let (len, crc) = record.get_crc(); + assert_eq!(len, 200); + assert_eq!(crc, 0xAAAAAAAA); + } + + #[test] + fn test_crc_record_get_crc_len2_larger() { + let record = Checksum { + len1: 100, + crc1: 0xAAAAAAAA, + len2: 200, + crc2: 0xBBBBBBBB, + }; + + let (len, crc) = record.get_crc(); + assert_eq!(len, 200); + assert_eq!(crc, 0xBBBBBBBB); + } + + #[test] + fn test_crc_record_get_crc_equal_lengths() { + // When lengths are equal, len1/crc1 is returned (first slot wins ties). 
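+        // (Equal non-zero lengths should only arise from corruption; `get_crc` documents that
+        //  the first slot wins the tie in that case.)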
+ let record = Checksum { + len1: 100, + crc1: 0xAAAAAAAA, + len2: 100, + crc2: 0xBBBBBBBB, + }; + + let (len, crc) = record.get_crc(); + assert_eq!(len, 100); + assert_eq!(crc, 0xAAAAAAAA); + } + + #[test] + fn test_validate_page_valid() { + let logical_page_size = 64usize; + let physical_page_size = logical_page_size + CHECKSUM_SIZE_USIZE; + let mut page = vec![0u8; physical_page_size]; + + // Write some data + let data = b"hello world"; + page[..data.len()].copy_from_slice(data); + + // Compute CRC of the data portion + let crc = crc32fast::hash(&page[..data.len()]); + let record = Checksum::new(data.len() as u16, crc); + + // Write the CRC record at the end + let crc_start = physical_page_size - CHECKSUM_SIZE_USIZE; + page[crc_start..].copy_from_slice(&record.to_bytes()); + + // Validate - should return Some with the Checksum + let validated = Checksum::validate_page(&page); + assert!(validated.is_some()); + let (len, _) = validated.unwrap().get_crc(); + assert_eq!(len as usize, data.len()); + } + + #[test] + fn test_validate_page_invalid_crc() { + let logical_page_size = 64usize; + let physical_page_size = logical_page_size + CHECKSUM_SIZE_USIZE; + let mut page = vec![0u8; physical_page_size]; + + // Write some data + let data = b"hello world"; + page[..data.len()].copy_from_slice(data); + + // Write a record with wrong CRC + let wrong_crc = 0xBADBADBA; + let record = Checksum::new(data.len() as u16, wrong_crc); + + let crc_start = physical_page_size - CHECKSUM_SIZE_USIZE; + page[crc_start..].copy_from_slice(&record.to_bytes()); + + // Should fail validation (return None) + let validated = Checksum::validate_page(&page); + assert!(validated.is_none()); + } + + #[test] + fn test_validate_page_corrupted_data() { + let logical_page_size = 64usize; + let physical_page_size = logical_page_size + CHECKSUM_SIZE_USIZE; + let mut page = vec![0u8; physical_page_size]; + + // Write some data and compute correct CRC + let data = b"hello world"; + page[..data.len()].copy_from_slice(data); + let crc = crc32fast::hash(&page[..data.len()]); + let record = Checksum::new(data.len() as u16, crc); + + let crc_start = physical_page_size - CHECKSUM_SIZE_USIZE; + page[crc_start..].copy_from_slice(&record.to_bytes()); + + // Corrupt the data + page[0] = 0xFF; + + // Should fail validation (return None) + let validated = Checksum::validate_page(&page); + assert!(validated.is_none()); + } + + #[test] + fn test_validate_page_uses_larger_len() { + let logical_page_size = 64usize; + let physical_page_size = logical_page_size + CHECKSUM_SIZE_USIZE; + let mut page = vec![0u8; physical_page_size]; + + // Write data and compute CRC for the larger portion + let data = b"hello world, this is longer"; + page[..data.len()].copy_from_slice(data); + let crc = crc32fast::hash(&page[..data.len()]); + + // Create a record where len2 has the valid CRC for longer data + let record = Checksum { + len1: 5, + crc1: 0xDEADBEEF, // Invalid CRC for shorter data + len2: data.len() as u16, + crc2: crc, + }; + + let crc_start = physical_page_size - CHECKSUM_SIZE_USIZE; + page[crc_start..].copy_from_slice(&record.to_bytes()); + + // Should validate using len2/crc2 since len2 > len1 + let validated = Checksum::validate_page(&page); + assert!(validated.is_some()); + let (len, _) = validated.unwrap().get_crc(); + assert_eq!(len as usize, data.len()); + } + + #[test] + fn test_validate_page_uses_fallback() { + let logical_page_size = 64usize; + let physical_page_size = logical_page_size + CHECKSUM_SIZE_USIZE; + let mut page = vec![0u8; 
physical_page_size]; + + // Write data + let data = b"fallback data"; + page[..data.len()].copy_from_slice(data); + let valid_crc = crc32fast::hash(&page[..data.len()]); + let valid_len = data.len() as u16; + + // Create a record where: + // len1 is larger (primary) but INVALID + // len2 is smaller (fallback) but VALID + let record = Checksum { + len1: valid_len + 10, // Larger, so it's primary + crc1: 0xBAD1DEA, // Invalid CRC + len2: valid_len, // Smaller, so it's fallback + crc2: valid_crc, // Valid CRC + }; + + let crc_start = physical_page_size - CHECKSUM_SIZE_USIZE; + page[crc_start..].copy_from_slice(&record.to_bytes()); + + // Should validate using the fallback (len2) + let validated = Checksum::validate_page(&page); + + assert!(validated.is_some(), "Should have validated using fallback"); + let validated = validated.unwrap(); + let (len, crc) = validated.get_crc(); + assert_eq!(len, valid_len); + assert_eq!(crc, valid_crc); + + // Verify that the invalid primary was zeroed out + assert_eq!(validated.len1, 0); + assert_eq!(validated.crc1, 0); + } + + #[test] + fn test_validate_page_no_fallback_available() { + let logical_page_size = 64usize; + let physical_page_size = logical_page_size + CHECKSUM_SIZE_USIZE; + let mut page = vec![0u8; physical_page_size]; + + // Write some data + let data = b"some data"; + page[..data.len()].copy_from_slice(data); + + // Create a record where: + // len1 > 0 (primary) but with INVALID CRC + // len2 = 0 (no fallback available) + let record = Checksum { + len1: data.len() as u16, + crc1: 0xBAD1DEA, // Invalid CRC + len2: 0, // No fallback + crc2: 0, + }; + + let crc_start = physical_page_size - CHECKSUM_SIZE_USIZE; + page[crc_start..].copy_from_slice(&record.to_bytes()); + + // Should fail validation since primary is invalid and no fallback exists + let validated = Checksum::validate_page(&page); + assert!( + validated.is_none(), + "Should fail when primary is invalid and fallback has len=0" + ); + } + + #[cfg(feature = "arbitrary")] + mod conformance { + use super::*; + use commonware_codec::conformance::CodecConformance; + + commonware_conformance::conformance_tests! { + CodecConformance, + } + } +} diff --git a/runtime/src/utils/buffer/pool.rs b/runtime/src/utils/buffer/pool/page_cache.rs similarity index 58% rename from runtime/src/utils/buffer/pool.rs rename to runtime/src/utils/buffer/pool/page_cache.rs index 7ab0381b24..c7ddaf31ba 100644 --- a/runtime/src/utils/buffer/pool.rs +++ b/runtime/src/utils/buffer/pool/page_cache.rs @@ -1,25 +1,29 @@ +//! A buffer pool for caching _logical_ pages of [Blob] data in memory. The buffer pool is unaware +//! of the physical page format used by the blob, which is left to the blob implementation. + +use super::get_page_from_blob; use crate::{Blob, Error, RwLock}; use commonware_utils::StableBuf; use futures::{future::Shared, FutureExt}; use std::{ collections::{hash_map::Entry, HashMap}, future::Future, - num::NonZeroUsize, + num::{NonZeroU16, NonZeroUsize}, pin::Pin, sync::{ atomic::{AtomicBool, AtomicU64, Ordering}, Arc, }, }; -use tracing::{debug, trace}; +use tracing::{debug, error, trace}; // Type alias for the future we'll be storing for each in-flight page fetch. // // We wrap [Error] in an Arc so it will be cloneable, which is required for the future to be -// [Shared]. +// [Shared]. The StableBuf contains only the logical (validated) bytes of the page. type PageFetchFut = Shared>> + Send>>>; -/// A [Pool] caches pages of [Blob] data in memory. 
+/// A [Pool] caches pages of [Blob] data in memory after verifying the integrity of each. /// /// A single buffer pool can be used to cache data from multiple blobs by assigning a unique id to /// each. @@ -63,17 +67,23 @@ struct CacheEntry { /// A bit indicating whether this page was recently referenced. referenced: AtomicBool, - /// The cached page itself. + /// The cached page itself. Only logical bytes are cached, so the vector will be 12 bytes shorter + /// than the physical page size. data: Vec, } -/// A reference to a [Pool] that can be shared across threads via cloning, along with the page size -/// that will be used with it. Provides the API for interacting with the buffer pool in a +/// A reference to a page cache that can be shared across threads via cloning, along with the page +/// size that will be used with it. Provides the API for interacting with the buffer pool in a /// thread-safe manner. #[derive(Clone)] pub struct PoolRef { - /// The size of each page in the buffer pool. - pub(super) page_size: usize, + /// The size of each page in the underlying blobs managed by this buffer pool. + /// + /// # Warning + /// + /// You cannot change the page size once data has been written without invalidating it. (Reads + /// on blobs that were written with a different page size will fail their integrity check.) + page_size: u64, /// The next id to assign to a blob that will be managed by this pool. next_id: Arc, @@ -83,49 +93,52 @@ pub struct PoolRef { } impl PoolRef { - /// Returns a new [PoolRef] with the given `page_size` and `capacity`. - pub fn new(page_size: NonZeroUsize, capacity: NonZeroUsize) -> Self { + /// Returns a new [PoolRef] that will buffer up to `capacity` pages with the + /// given `page_size`. + pub fn new(page_size: NonZeroU16, capacity: NonZeroUsize) -> Self { + let page_size = page_size.get() as u64; + Self { - page_size: page_size.get(), + page_size, next_id: Arc::new(AtomicU64::new(0)), pool: Arc::new(RwLock::new(Pool::new(capacity.get()))), } } + /// The page size used by this buffer pool. + #[inline] + pub const fn page_size(&self) -> u64 { + self.page_size + } + /// Returns a unique id for the next blob that will use this buffer pool. pub async fn next_id(&self) -> u64 { self.next_id.fetch_add(1, Ordering::Relaxed) } - /// Convert an offset into the number of the page it belongs to and the offset within that page. - pub const fn offset_to_page(&self, offset: u64) -> (u64, usize) { + /// Convert a logical offset into the number of the page it belongs to and the offset within + /// that page. + pub const fn offset_to_page(&self, offset: u64) -> (u64, u64) { Pool::offset_to_page(self.page_size, offset) } /// Try to read the specified bytes from the buffer pool cache only. Returns the number of /// bytes successfully read from cache and copied to `buf` before a page fault, if any. - /// - /// This method never reads from the underlying blob - it only checks the cache. - /// - /// # Warning - /// - /// Attempts to read any of the last (blob_size % page_size) "trailing bytes" of the blob will - /// always return 0 since the buffer pool only deals with page sized chunks. 
pub(super) async fn read_cached( &self, blob_id: u64, mut buf: &mut [u8], - mut offset: u64, + mut logical_offset: u64, ) -> usize { let original_len = buf.len(); let buffer_pool = self.pool.read().await; while !buf.is_empty() { - let count = buffer_pool.read_at(self.page_size, blob_id, buf, offset); + let count = buffer_pool.read_at(self.page_size, blob_id, buf, logical_offset); if count == 0 { // Cache miss - return how many bytes we successfully read break; } - offset += count as u64; + logical_offset += count as u64; buf = &mut buf[count..]; } original_len - buf.len() @@ -133,13 +146,6 @@ impl PoolRef { /// Read the specified bytes, preferentially from the buffer pool cache. Bytes not found in the /// buffer pool will be read from the provided `blob` and cached for future reads. - /// - /// # Warning - /// - /// Attempts to read any of the last (blob_size % page_size) "trailing bytes" of the blob will - /// result in a ReadFailed error since the buffer pool only deals with page sized chunks. - /// Trailing bytes need to be dealt with outside of the buffer pool. For example, - /// [crate::buffer::Append] uses a [crate::buffer::tip::Buffer] to buffer them. pub(super) async fn read( &self, blob: &B, @@ -172,10 +178,10 @@ impl PoolRef { Ok(()) } - /// Fetch the specified page after encountering a page fault, which may involve retrieving it + /// Fetch the requested page after encountering a page fault, which may involve retrieving it /// from `blob` & caching the result in `pool`. Returns the number of bytes read, which should /// always be non-zero. - async fn read_after_page_fault( + pub(super) async fn read_after_page_fault( &self, blob: &B, blob_id: u64, @@ -185,7 +191,7 @@ impl PoolRef { assert!(!buf.is_empty()); let (page_num, offset_in_page) = Pool::offset_to_page(self.page_size, offset); - let page_size = self.page_size; + let offset_in_page = offset_in_page as usize; trace!(page_num, blob_id, "page fault"); // Create or clone a future that retrieves the desired page from the underlying blob. This @@ -196,7 +202,7 @@ impl PoolRef { // There's a (small) chance the page was fetched & buffered by another task before we // were able to acquire the write lock, so check the cache before doing anything else. - let count = pool.read_at(page_size, blob_id, buf, offset); + let count = pool.read_at(self.page_size, blob_id, buf, offset); if count != 0 { return Ok(count); } @@ -208,12 +214,27 @@ impl PoolRef { (o.get().clone(), false) } Entry::Vacant(v) => { - // Nobody is currently fetching this page, so create a future that will do the work. + // Nobody is currently fetching this page, so create a future that will do the + // work. get_page_from_blob handles CRC validation and returns only logical bytes. let blob = blob.clone(); + let page_size = self.page_size; let future = async move { - blob.read_at(vec![0; page_size], page_num * page_size as u64) + let page = get_page_from_blob(&blob, page_num, page_size) .await - .map_err(Arc::new) + .map_err(Arc::new)?; + // We should never be fetching partial pages through the buffer pool. This can happen + // if a non-last page is corrupted and falls back to a partial CRC. + let len = page.as_ref().len(); + if len != page_size as usize { + error!( + page_num, + expected = page_size, + actual = len, + "attempted to fetch partial page from blob" + ); + return Err(Arc::new(Error::InvalidChecksum)); + } + Ok(page) }; // Make the future shareable and insert it into the map. 
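A minimal standalone sketch of the logical-to-physical offset arithmetic this fetch path relies on (illustration only, not part of the patch; the 1024-byte page size and the 2500-byte offset are arbitrary example values):

```rust
// Each physical page stores `PAGE_SIZE` logical bytes followed by a 12-byte CRC record.
const CHECKSUM_SIZE: u64 = 12;
const PAGE_SIZE: u64 = 1024; // example logical page size

/// Map a logical offset to (page number, offset within that page).
fn offset_to_page(logical_offset: u64) -> (u64, u64) {
    (logical_offset / PAGE_SIZE, logical_offset % PAGE_SIZE)
}

/// Physical byte at which a page's logical bytes begin in the underlying blob.
fn physical_page_start(page_num: u64) -> u64 {
    page_num * (PAGE_SIZE + CHECKSUM_SIZE)
}

fn main() {
    let (page, in_page) = offset_to_page(2500);
    assert_eq!((page, in_page), (2, 452));
    // Page 2 starts at 2 * (1024 + 12) = 2072 in the physical blob.
    assert_eq!(physical_page_start(page), 2072);
}
```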
@@ -231,10 +252,11 @@ impl PoolRef { let fetch_result = fetch_future.await; if !is_first_fetcher { // Copy the requested portion of the page into the buffer and return immediately. - let page_buf: Vec = fetch_result.map_err(|_| Error::ReadFailed)?.into(); - let bytes_to_copy = std::cmp::min(buf.len(), page_size - offset_in_page); - buf[..bytes_to_copy] - .copy_from_slice(&page_buf[offset_in_page..offset_in_page + bytes_to_copy]); + let page_buf = fetch_result.map_err(|_| Error::ReadFailed)?; + let bytes_to_copy = std::cmp::min(buf.len(), page_buf.as_ref().len() - offset_in_page); + buf[..bytes_to_copy].copy_from_slice( + &page_buf.as_ref()[offset_in_page..offset_in_page + bytes_to_copy], + ); return Ok(bytes_to_copy); } @@ -247,43 +269,42 @@ impl PoolRef { // Remove the entry from `page_fetches`. let _ = pool.page_fetches.remove(&(blob_id, page_num)); - // Cache the result in the buffer pool. - let Ok(page_buf) = fetch_result else { - return Err(Error::ReadFailed); + // Cache the result in the buffer pool. get_page_from_blob already validated the CRC. + let page_buf = match fetch_result { + Ok(page_buf) => page_buf, + Err(err) => { + error!(page_num, ?err, "Page fetch failed"); + return Err(Error::ReadFailed); + } }; - pool.cache(page_size, blob_id, page_buf.as_ref(), page_num); + + pool.cache(self.page_size, blob_id, page_buf.as_ref(), page_num); // Copy the requested portion of the page into the buffer. - let page_buf: Vec = page_buf.into(); - let bytes_to_copy = std::cmp::min(buf.len(), page_size - offset_in_page); + let bytes_to_copy = std::cmp::min(buf.len(), page_buf.as_ref().len() - offset_in_page); buf[..bytes_to_copy] - .copy_from_slice(&page_buf[offset_in_page..offset_in_page + bytes_to_copy]); + .copy_from_slice(&page_buf.as_ref()[offset_in_page..offset_in_page + bytes_to_copy]); Ok(bytes_to_copy) } - /// Cache the provided slice of data in the buffer pool, returning the remaining bytes that + /// Cache the provided pages of data in the buffer pool, returning the remaining bytes that /// didn't fill a whole page. `offset` must be page aligned. /// - /// If the next page index would overflow `u64`, caching stops and the uncached bytes are - /// returned. This can only occur with 1-byte pages on 64-bit architectures. On 32-bit - /// architectures it cannot occur because the buffer length is bounded by `usize::MAX` (2^32-1), - /// so even starting at page `u64::MAX` with 1-byte pages, at most 2^32-1 pages can be cached. - /// On 64-bit architectures with page_size >= 2, the maximum starting page (`u64::MAX / 2`) - /// plus maximum cacheable pages (`usize::MAX / 2`) equals `u64::MAX - 1`. - /// /// # Panics /// - /// Panics if `offset` is not page aligned. + /// - Panics if `offset` is not page aligned. + /// - If the buffer is not the size of a page. pub async fn cache(&self, blob_id: u64, mut buf: &[u8], offset: u64) -> usize { let (mut page_num, offset_in_page) = self.offset_to_page(offset); assert_eq!(offset_in_page, 0); { // Write lock the buffer pool. 
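+            // Each full logical page in `buf` is cached below; any trailing bytes that do not
+            // fill a whole page are returned to the caller as the remainder.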
+ let page_size = self.page_size as usize; let mut buffer_pool = self.pool.write().await; - while buf.len() >= self.page_size { - buffer_pool.cache(self.page_size, blob_id, &buf[..self.page_size], page_num); - buf = &buf[self.page_size..]; + while buf.len() >= page_size { + buffer_pool.cache(self.page_size, blob_id, &buf[..page_size], page_num); + buf = &buf[page_size..]; page_num = match page_num.checked_add(1) { Some(next) => next, None => break, @@ -314,11 +335,8 @@ impl Pool { } /// Convert an offset into the number of the page it belongs to and the offset within that page. - const fn offset_to_page(page_size: usize, offset: u64) -> (u64, usize) { - ( - offset / page_size as u64, - (offset % page_size as u64) as usize, - ) + const fn offset_to_page(page_size: u64, offset: u64) -> (u64, u64) { + (offset / page_size, offset % page_size) } /// Attempt to fetch blob data starting at `offset` from the buffer pool. Returns the number of @@ -326,8 +344,8 @@ impl Pool { /// never more than `self.page_size` or the length of `buf`. The returned bytes won't cross a /// page boundary, so multiple reads may be required even if all data in the desired range is /// buffered. - fn read_at(&self, page_size: usize, blob_id: u64, buf: &mut [u8], offset: u64) -> usize { - let (page_num, offset_in_page) = Self::offset_to_page(page_size, offset); + fn read_at(&self, page_size: u64, blob_id: u64, buf: &mut [u8], logical_offset: u64) -> usize { + let (page_num, offset_in_page) = Self::offset_to_page(page_size, logical_offset); let page_index = self.index.get(&(blob_id, page_num)); let Some(&page_index) = page_index else { return 0; @@ -337,20 +355,18 @@ impl Pool { page.referenced.store(true, Ordering::Relaxed); let page = &page.data; - let bytes_to_copy = std::cmp::min(buf.len(), page_size - offset_in_page); - buf[..bytes_to_copy].copy_from_slice(&page[offset_in_page..offset_in_page + bytes_to_copy]); + let logical_page_size = page_size as usize; + let bytes_to_copy = std::cmp::min(buf.len(), logical_page_size - offset_in_page as usize); + buf[..bytes_to_copy].copy_from_slice( + &page[offset_in_page as usize..offset_in_page as usize + bytes_to_copy], + ); bytes_to_copy } /// Put the given `page` into the buffer pool. - /// - /// # Panics - /// - /// Panics if the provided page is not exactly PAGE_SIZE bytes long. - fn cache(&mut self, page_size: usize, blob_id: u64, page: &[u8], page_num: u64) { - assert_eq!(page.len(), page_size); - + fn cache(&mut self, page_size: u64, blob_id: u64, page: &[u8], page_num: u64) { + assert_eq!(page.len(), page_size as usize); let key = (blob_id, page_num); let index_entry = self.index.entry(key); if let Entry::Occupied(index_entry) = index_entry { @@ -400,52 +416,59 @@ impl Pool { #[cfg(test)] mod tests { - use super::*; - use crate::{deterministic, Runner as _, Storage as _}; + use super::{super::Checksum, *}; + use crate::{buffer::pool::CHECKSUM_SIZE, deterministic, Runner as _, Storage as _}; use commonware_macros::test_traced; - use commonware_utils::NZUsize; + use commonware_utils::{NZUsize, NZU16}; + use std::num::NonZeroU16; - const PAGE_SIZE: usize = 1024; + // Logical page size (what PoolRef uses and what gets cached). + const PAGE_SIZE: NonZeroU16 = NZU16!(1024); + const PAGE_SIZE_U64: u64 = PAGE_SIZE.get() as u64; #[test_traced] fn test_pool_basic() { let mut pool: Pool = Pool::new(10); - let mut buf = vec![0; PAGE_SIZE]; - let bytes_read = pool.read_at(PAGE_SIZE, 0, &mut buf, 0); + // Cache stores logical-sized pages. 
+ let mut buf = vec![0; PAGE_SIZE.get() as usize]; + let bytes_read = pool.read_at(PAGE_SIZE_U64, 0, &mut buf, 0); assert_eq!(bytes_read, 0); - pool.cache(PAGE_SIZE, 0, &[1; PAGE_SIZE], 0); - let bytes_read = pool.read_at(PAGE_SIZE, 0, &mut buf, 0); - assert_eq!(bytes_read, PAGE_SIZE); - assert_eq!(buf, [1; PAGE_SIZE]); + pool.cache(PAGE_SIZE_U64, 0, &[1; PAGE_SIZE.get() as usize], 0); + let bytes_read = pool.read_at(PAGE_SIZE_U64, 0, &mut buf, 0); + assert_eq!(bytes_read, PAGE_SIZE.get() as usize); + assert_eq!(buf, [1; PAGE_SIZE.get() as usize]); // Test replacement -- should log a duplicate page warning but still work. - pool.cache(PAGE_SIZE, 0, &[2; PAGE_SIZE], 0); - let bytes_read = pool.read_at(PAGE_SIZE, 0, &mut buf, 0); - assert_eq!(bytes_read, PAGE_SIZE); - assert_eq!(buf, [2; PAGE_SIZE]); + pool.cache(PAGE_SIZE_U64, 0, &[2; PAGE_SIZE.get() as usize], 0); + let bytes_read = pool.read_at(PAGE_SIZE_U64, 0, &mut buf, 0); + assert_eq!(bytes_read, PAGE_SIZE.get() as usize); + assert_eq!(buf, [2; PAGE_SIZE.get() as usize]); // Test exceeding the cache capacity. for i in 0u64..11 { - pool.cache(PAGE_SIZE, 0, &[i as u8; PAGE_SIZE], i); + pool.cache(PAGE_SIZE_U64, 0, &[i as u8; PAGE_SIZE.get() as usize], i); } // Page 0 should have been evicted. - let bytes_read = pool.read_at(PAGE_SIZE, 0, &mut buf, 0); + let bytes_read = pool.read_at(PAGE_SIZE_U64, 0, &mut buf, 0); assert_eq!(bytes_read, 0); // Page 1-10 should be in the cache. for i in 1u64..11 { - let bytes_read = pool.read_at(PAGE_SIZE, 0, &mut buf, i * PAGE_SIZE as u64); - assert_eq!(bytes_read, PAGE_SIZE); - assert_eq!(buf, [i as u8; PAGE_SIZE]); + let bytes_read = pool.read_at(PAGE_SIZE_U64, 0, &mut buf, i * PAGE_SIZE_U64); + assert_eq!(bytes_read, PAGE_SIZE.get() as usize); + assert_eq!(buf, [i as u8; PAGE_SIZE.get() as usize]); } // Test reading from an unaligned offset by adding 2 to an aligned offset. The read - // should be 2 bytes short of a full page. - let mut buf = vec![0; PAGE_SIZE]; - let bytes_read = pool.read_at(PAGE_SIZE, 0, &mut buf, PAGE_SIZE as u64 + 2); - assert_eq!(bytes_read, PAGE_SIZE - 2); - assert_eq!(&buf[..PAGE_SIZE - 2], [1; PAGE_SIZE - 2]); + // should be 2 bytes short of a full logical page. + let mut buf = vec![0; PAGE_SIZE.get() as usize]; + let bytes_read = pool.read_at(PAGE_SIZE_U64, 0, &mut buf, PAGE_SIZE_U64 + 2); + assert_eq!(bytes_read, PAGE_SIZE.get() as usize - 2); + assert_eq!( + &buf[..PAGE_SIZE.get() as usize - 2], + [1; PAGE_SIZE.get() as usize - 2] + ); } #[test_traced] @@ -454,39 +477,50 @@ mod tests { let executor = deterministic::Runner::default(); // Start the test within the executor executor.start(|context| async move { - // Populate a blob with 11 consecutive pages of data. + // Physical page size = logical + CRC record. + let physical_page_size = PAGE_SIZE_U64 + CHECKSUM_SIZE; + + // Populate a blob with 11 consecutive pages of CRC-protected data. let (blob, size) = context .open("test", "blob".as_bytes()) .await .expect("Failed to open blob"); assert_eq!(size, 0); for i in 0..11 { - let buf = vec![i as u8; PAGE_SIZE]; - blob.write_at(buf, i * PAGE_SIZE as u64).await.unwrap(); + // Write logical data followed by Checksum. 
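+                // Physical layout per page: PAGE_SIZE logical bytes followed by the 12-byte CRC
+                // record, so page i begins at i * physical_page_size in the blob.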
+ let logical_data = vec![i as u8; PAGE_SIZE.get() as usize]; + let crc = crc32fast::hash(&logical_data); + let record = Checksum::new(PAGE_SIZE.get(), crc); + let mut page_data = logical_data; + page_data.extend_from_slice(&record.to_bytes()); + blob.write_at(page_data, i * physical_page_size) + .await + .unwrap(); } - // Fill the buffer pool with the blob's data. - let pool_ref = PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(10)); + // Fill the buffer pool with the blob's data via PoolRef::read. + let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(10)); assert_eq!(pool_ref.next_id().await, 0); assert_eq!(pool_ref.next_id().await, 1); for i in 0..11 { - let mut buf = vec![0; PAGE_SIZE]; + // Read expects logical bytes only (CRCs are stripped). + let mut buf = vec![0; PAGE_SIZE.get() as usize]; pool_ref - .read(&blob, 0, &mut buf, i * PAGE_SIZE as u64) + .read(&blob, 0, &mut buf, i * PAGE_SIZE_U64) .await .unwrap(); - assert_eq!(buf, [i as u8; PAGE_SIZE]); + assert_eq!(buf, [i as u8; PAGE_SIZE.get() as usize]); } // Repeat the read to exercise reading from the buffer pool. Must start at 1 because // page 0 should be evicted. for i in 1..11 { - let mut buf = vec![0; PAGE_SIZE]; + let mut buf = vec![0; PAGE_SIZE.get() as usize]; pool_ref - .read(&blob, 0, &mut buf, i * PAGE_SIZE as u64) + .read(&blob, 0, &mut buf, i * PAGE_SIZE_U64) .await .unwrap(); - assert_eq!(buf, [i as u8; PAGE_SIZE]); + assert_eq!(buf, [i as u8; PAGE_SIZE.get() as usize]); } // Cleanup. @@ -498,41 +532,63 @@ mod tests { fn test_pool_cache_max_page() { let executor = deterministic::Runner::default(); executor.start(|_context| async move { - let pool_ref = PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(2)); + let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(2)); // Use the largest page-aligned offset representable for the configured PAGE_SIZE. - let aligned_max_offset = u64::MAX - (u64::MAX % PAGE_SIZE as u64); + let aligned_max_offset = u64::MAX - (u64::MAX % PAGE_SIZE_U64); + + // PoolRef::cache expects only logical bytes (no CRC). + let logical_data = vec![42u8; PAGE_SIZE.get() as usize]; // Caching exactly one page at the maximum offset should succeed. let remaining = pool_ref - .cache(0, vec![42; PAGE_SIZE].as_slice(), aligned_max_offset) + .cache(0, logical_data.as_slice(), aligned_max_offset) .await; assert_eq!(remaining, 0); - let mut buf = vec![0u8; PAGE_SIZE]; + // Reading from the pool should return the logical bytes. + let mut buf = vec![0u8; PAGE_SIZE.get() as usize]; let pool = pool_ref.pool.read().await; - let bytes_read = pool.read_at(PAGE_SIZE, 0, &mut buf, aligned_max_offset); - assert_eq!(bytes_read, PAGE_SIZE); + let bytes_read = pool.read_at(PAGE_SIZE_U64, 0, &mut buf, aligned_max_offset); + assert_eq!(bytes_read, PAGE_SIZE.get() as usize); assert!(buf.iter().all(|b| *b == 42)); }); } #[test_traced] - fn test_pool_cache_page_overflow_partial() { + fn test_pool_cache_at_high_offset() { let executor = deterministic::Runner::default(); executor.start(|_context| async move { - // Use the minimum page size to force the page index to reach u64::MAX and trigger the - // overflow guard. - let pool_ref = PoolRef::new(NZUsize!(1), NZUsize!(2)); - - // Caching across the maximum page should stop before overflow and report the remainder. - let remaining = pool_ref.cache(0, &[1, 2], u64::MAX).await; - assert_eq!(remaining, 1); + // Use the minimum page size (CHECKSUM_SIZE + 1 = 13) with high offset. 
+ const MIN_PAGE_SIZE: u64 = CHECKSUM_SIZE + 1; + let pool_ref = PoolRef::new(NZU16!(MIN_PAGE_SIZE as u16), NZUsize!(2)); + + // Create two pages worth of logical data (no CRCs - PoolRef::cache expects logical only). + let data = vec![1u8; MIN_PAGE_SIZE as usize * 2]; + + // Cache pages at a high (but not max) aligned offset so we can verify both pages. + // Use an offset that's a few pages below max to avoid overflow when verifying. + let aligned_max_offset = u64::MAX - (u64::MAX % MIN_PAGE_SIZE); + let high_offset = aligned_max_offset - (MIN_PAGE_SIZE * 2); + let remaining = pool_ref.cache(0, &data, high_offset).await; + // Both pages should be cached. + assert_eq!(remaining, 0); - let mut buf = [0u8; 1]; + // Verify the first page was cached correctly. + let mut buf = vec![0u8; MIN_PAGE_SIZE as usize]; let pool = pool_ref.pool.read().await; - assert_eq!(pool.read_at(1, 0, &mut buf, u64::MAX), 1); - assert_eq!(buf, [1]); + assert_eq!( + pool.read_at(MIN_PAGE_SIZE, 0, &mut buf, high_offset), + MIN_PAGE_SIZE as usize + ); + assert!(buf.iter().all(|b| *b == 1)); + + // Verify the second page was cached correctly. + assert_eq!( + pool.read_at(MIN_PAGE_SIZE, 0, &mut buf, high_offset + MIN_PAGE_SIZE), + MIN_PAGE_SIZE as usize + ); + assert!(buf.iter().all(|b| *b == 1)); }); } } diff --git a/runtime/src/utils/buffer/pool/read.rs b/runtime/src/utils/buffer/pool/read.rs new file mode 100644 index 0000000000..95a13d1647 --- /dev/null +++ b/runtime/src/utils/buffer/pool/read.rs @@ -0,0 +1,470 @@ +use super::{Checksum, CHECKSUM_SIZE}; +use crate::{Blob, Error}; +use commonware_utils::StableBuf; +use std::num::NonZeroUsize; +use tracing::{debug, error}; + +const CHECKSUM_SIZE_USIZE: usize = CHECKSUM_SIZE as usize; + +/// A reader that buffers content from a [Blob] with page-level CRCs to optimize the performance of +/// a full scan of contents. +pub struct Read { + /// The underlying blob to read from. + blob: B, + /// The physical size of the blob (always a multiple of physical page size). + physical_blob_size: u64, + /// The logical size of the blob (actual data bytes, not including CRCs or padding). + logical_blob_size: u64, + /// The buffer storing the data read from the blob. The buffer stores logical bytes only. + buffer: Vec, + /// The current page in the blob from where the buffer was filled (the buffer always starts at a + /// page boundary). + blob_page: u64, + /// The current position within the buffer containing the next byte to be read. + buffer_position: usize, + /// The capacity of the buffer. We always fully fill the buffer, unless we are at the end of + /// the blob. The buffer capacity must be a multiple of the page size. + buffer_capacity: usize, + /// The physical page size of each full page in the blob, including its 12-byte Checksum. + page_size: usize, +} + +impl Read { + /// Creates a new `Read` that reads from the given blob with the specified buffer size. The + /// `logical_page_size` is the size of the logical data portion of each page (not including the + /// Checksum). If the buffer capacity is not a multiple of the physical page size, it will be + /// rounded up to the nearest. + /// + /// The `physical_blob_size` is the size of the underlying blob on disk (must be a multiple of + /// the physical page size). The `logical_blob_size` is the actual data size (not including + /// CRCs or padding in partial pages). 
+ pub fn new( + blob: B, + physical_blob_size: u64, + logical_blob_size: u64, + capacity: NonZeroUsize, + logical_page_size: NonZeroUsize, + ) -> Self { + let page_size = logical_page_size.get() + CHECKSUM_SIZE_USIZE; + let mut capacity = capacity.get(); + if !capacity.is_multiple_of(page_size) { + capacity += page_size - capacity % page_size; + debug!( + capacity, + "rounded buffer capacity up to nearest multiple of page_size" + ); + } + + Self { + blob, + physical_blob_size, + logical_blob_size, + buffer: Vec::with_capacity(capacity), + blob_page: 0, + buffer_position: 0, + buffer_capacity: capacity, + page_size, + } + } + + /// Returns the logical size of the blob in bytes. + pub const fn blob_size(&self) -> u64 { + self.logical_blob_size + } + + /// Returns the current logical position in the blob. + pub const fn position(&self) -> u64 { + let logical_page_size = (self.page_size - CHECKSUM_SIZE_USIZE) as u64; + self.blob_page * logical_page_size + self.buffer_position as u64 + } + + /// Reads up to `buf.len()` bytes from the current position, but only as many as are available. + /// + /// This is useful for reading variable-length prefixes (like varints) where you want to read up + /// to a maximum number of bytes but the actual remaining bytes in the blob might be less. + /// + /// Returns the number of bytes actually read into the buffer, which will be [0, buf.len()). + pub async fn read_up_to( + &mut self, + buf: impl Into + Send, + ) -> Result<(StableBuf, usize), Error> { + let mut buf = buf.into(); + if buf.is_empty() { + return Ok((buf, 0)); + } + let current_pos = self.position(); + let blob_size = self.blob_size(); + let available = (blob_size.saturating_sub(current_pos) as usize).min(buf.len()); + if available == 0 { + return Err(Error::BlobInsufficientLength); + } + self.read_exact(buf.as_mut(), available).await?; + + Ok((buf, available)) + } + + /// Reads exactly `size` bytes into the provided buffer. Returns [Error::BlobInsufficientLength] + /// if not enough bytes are available. + /// + /// # Panics + /// + /// Panics if `size` is greater than the length of `buf`. + pub async fn read_exact(&mut self, buf: &mut [u8], size: usize) -> Result<(), Error> { + assert!(size <= buf.len()); + + let mut bytes_copied = 0; + while bytes_copied < size { + // Refill buffer if exhausted + if self.buffer_position >= self.buffer.len() { + self.fill_buffer().await?; + } + + // Copy logical bytes + let available = self.buffer.len() - self.buffer_position; + // The buffer might be empty if we're at the end of the blob. + if available == 0 { + return Err(Error::BlobInsufficientLength); + } + + let bytes_to_copy = (size - bytes_copied).min(available); + buf[bytes_copied..bytes_copied + bytes_to_copy].copy_from_slice( + &self.buffer[self.buffer_position..self.buffer_position + bytes_to_copy], + ); + + bytes_copied += bytes_to_copy; + self.buffer_position += bytes_to_copy; + } + + Ok(()) + } + + /// Fills the buffer from the blob starting at the current physical position and verifies the + /// CRC of each page (including any trailing partial page). + async fn fill_buffer(&mut self) -> Result<(), Error> { + let logical_page_size = self.page_size - CHECKSUM_SIZE_USIZE; + + // Advance blob_page based on how much of the buffer we've consumed. We use ceiling division + // because even a partial page counts as a "page" read from the blob. 
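+        // e.g. with a logical page size of 103 bytes, a 150-byte buffer spans
+        // 150.div_ceil(103) = 2 pages.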
+ let pages_consumed = self.buffer.len().div_ceil(logical_page_size); + self.blob_page += pages_consumed as u64; + + // Reset position to the offset within the new page. If the buffer was not empty, we are + // continuing a sequential read, so we start at the beginning of the next page. If the + // buffer was empty (e.g. after a seek), we preserve the offset set by seek_to. + if !self.buffer.is_empty() { + self.buffer_position = 0; + } + + // Calculate physical read parameters + let start_offset = match self.blob_page.checked_mul(self.page_size as u64) { + Some(o) => o, + None => return Err(Error::OffsetOverflow), + }; + + if start_offset >= self.physical_blob_size { + return Err(Error::BlobInsufficientLength); + } + + let bytes_to_read = + ((self.physical_blob_size - start_offset) as usize).min(self.buffer_capacity); + if bytes_to_read == 0 { + return Err(Error::BlobInsufficientLength); + } + + // Read physical data directly into the main buffer, then validate CRCs and compact in-place. + // This avoids allocating a separate staging buffer. + self.buffer.clear(); + self.buffer.resize(bytes_to_read, 0); + let buf = std::mem::take(&mut self.buffer); + let buf = self.blob.read_at(buf, start_offset).await?; + self.buffer = buf.into(); + + // Validate CRCs and compact by removing CRC records in-place. + let mut read_offset = 0; + let mut write_offset = 0; + let physical_len = self.buffer.len(); + + while read_offset < physical_len { + let remaining = physical_len - read_offset; + + // Check if full page or partial + if remaining >= self.page_size { + let page_slice = &self.buffer[read_offset..read_offset + self.page_size]; + let Some(record) = Checksum::validate_page(page_slice) else { + error!( + page = self.blob_page + (read_offset / self.page_size) as u64, + "CRC mismatch" + ); + return Err(Error::InvalidChecksum); + }; + // For non-last pages, the validated length must equal logical_page_size. + let (len, _) = record.get_crc(); + let len = len as usize; + let is_last_page = start_offset + read_offset as u64 + self.page_size as u64 + >= self.physical_blob_size; + if !is_last_page && len != logical_page_size { + error!( + page = self.blob_page + (read_offset / self.page_size) as u64, + expected = logical_page_size, + actual = len, + "non-last page has partial length" + ); + return Err(Error::InvalidChecksum); + } + // Compact: move logical data to remove CRC record gap + if write_offset != read_offset { + self.buffer + .copy_within(read_offset..read_offset + len, write_offset); + } + write_offset += len; + read_offset += self.page_size; + continue; + } + + // Partial page - must have at least CHECKSUM_SIZE bytes + if remaining < CHECKSUM_SIZE_USIZE { + error!( + page = self.blob_page + (read_offset / self.page_size) as u64, + "short page" + ); + return Err(Error::InvalidChecksum); + } + let page_slice = &self.buffer[read_offset..]; + let Some(record) = Checksum::validate_page(page_slice) else { + error!( + page = self.blob_page + (read_offset / self.page_size) as u64, + "CRC mismatch" + ); + return Err(Error::InvalidChecksum); + }; + let (len, _) = record.get_crc(); + let logical_len = len as usize; + // Compact: move logical data + if write_offset != read_offset { + self.buffer + .copy_within(read_offset..read_offset + logical_len, write_offset); + } + write_offset += logical_len; + break; + } + + // Truncate buffer to only contain logical data + self.buffer.truncate(write_offset); + + // If we sought to a position that is beyond the end of what we just read, error. 
+ if self.buffer_position >= self.buffer.len() { + return Err(Error::BlobInsufficientLength); + } + + Ok(()) + } + + /// Repositions the buffer to read from the specified logical position in the blob. + pub fn seek_to(&mut self, position: u64) -> Result<(), Error> { + let logical_page_size = (self.page_size - CHECKSUM_SIZE_USIZE) as u64; + + // Check if the position is within the current buffer. + let buffer_start = self.blob_page * logical_page_size; + let buffer_end = buffer_start + self.buffer.len() as u64; + if position >= buffer_start && position < buffer_end { + self.buffer_position = (position - buffer_start) as usize; + return Ok(()); + } + + self.blob_page = position / logical_page_size; + self.buffer_position = (position % logical_page_size) as usize; + self.buffer.clear(); // Invalidate buffer, will be refilled on next read + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::super::{append::Append, PoolRef}; + use crate::{deterministic, Blob, Error, Runner as _, Storage as _}; + use commonware_macros::test_traced; + use commonware_utils::{NZUsize, NZU16}; + use std::num::NonZeroU16; + + const PAGE_SIZE: NonZeroU16 = NZU16!(103); // Logical page size (intentionally odd to test alignment) + const BUFFER_SIZE: usize = PAGE_SIZE.get() as usize * 2; + + #[test_traced("DEBUG")] + fn test_read_after_append() { + let executor = deterministic::Runner::default(); + executor.start(|context: deterministic::Context| async move { + // Create a blob and write data using Append + let (blob, blob_size) = context.open("test_partition", b"test_blob").await.unwrap(); + assert_eq!(blob_size, 0); + + let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(BUFFER_SIZE)); + let append = Append::new(blob.clone(), blob_size, BUFFER_SIZE, pool_ref) + .await + .unwrap(); + + // Write data that spans multiple pages + let data: Vec = (0u8..=255).cycle().take(300).collect(); + append.append(&data).await.unwrap(); + + // Create a Read to read the data back + let mut reader = append.as_blob_reader(NZUsize!(BUFFER_SIZE)).await.unwrap(); + + // Verify initial position + assert_eq!(reader.position(), 0); + + // Read all data back + let mut read_buf = vec![0u8; 300]; + reader.read_exact(&mut read_buf, 300).await.unwrap(); + assert_eq!(read_buf, data); + + // Verify position after read + assert_eq!(reader.position(), 300); + }); + } + + #[test_traced("DEBUG")] + fn test_read_with_seek() { + let executor = deterministic::Runner::default(); + executor.start(|context: deterministic::Context| async move { + // Create a blob and write data using Append + let (blob, blob_size) = context.open("test_partition", b"test_blob").await.unwrap(); + + let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(BUFFER_SIZE)); + let append = Append::new(blob.clone(), blob_size, BUFFER_SIZE, pool_ref) + .await + .unwrap(); + + // Write data that spans multiple pages (300 bytes = ~3 logical pages) + let data: Vec = (0u8..=255).cycle().take(300).collect(); + append.append(&data).await.unwrap(); + + let mut reader = append.as_blob_reader(NZUsize!(BUFFER_SIZE)).await.unwrap(); + + // Read first 50 bytes + let mut buf = vec![0u8; 50]; + reader.read_exact(&mut buf, 50).await.unwrap(); + assert_eq!(buf, &data[0..50]); + assert_eq!(reader.position(), 50); + + // Seek to middle of second page (position 150) + reader.seek_to(150).unwrap(); + assert_eq!(reader.position(), 150); + + // Read 50 bytes from position 150 + reader.read_exact(&mut buf, 50).await.unwrap(); + assert_eq!(buf, &data[150..200]); + assert_eq!(reader.position(), 200); + + // Seek back 
to beginning + reader.seek_to(0).unwrap(); + assert_eq!(reader.position(), 0); + + // Read all data to verify seek worked + let mut full_buf = vec![0u8; 300]; + reader.read_exact(&mut full_buf, 300).await.unwrap(); + assert_eq!(full_buf, data); + }); + } + + #[test_traced("DEBUG")] + fn test_read_partial_page() { + let executor = deterministic::Runner::default(); + executor.start(|context: deterministic::Context| async move { + // Create a blob and write data that doesn't fill the last page + let (blob, blob_size) = context.open("test_partition", b"test_blob").await.unwrap(); + + let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(BUFFER_SIZE)); + let append = Append::new(blob.clone(), blob_size, BUFFER_SIZE, pool_ref) + .await + .unwrap(); + + // Write exactly one full logical page plus 10 more bytes + let data: Vec = (1u8..=(PAGE_SIZE.get() + 10) as u8).collect(); + assert_eq!(data.len(), PAGE_SIZE.get() as usize + 10); + append.append(&data).await.unwrap(); + + let mut reader = append.as_blob_reader(NZUsize!(BUFFER_SIZE)).await.unwrap(); + + // Read all data back + let mut read_buf = vec![0u8; data.len()]; + reader.read_exact(&mut read_buf, data.len()).await.unwrap(); + assert_eq!(read_buf, data); + + // Verify we can seek to partial page and read + reader.seek_to(PAGE_SIZE.get() as u64).unwrap(); + let mut partial_buf = vec![0u8; 10]; + reader.read_exact(&mut partial_buf, 10).await.unwrap(); + assert_eq!(partial_buf, &data[PAGE_SIZE.get() as usize..]); + }); + } + + #[test_traced("DEBUG")] + fn test_read_across_page_boundary() { + let executor = deterministic::Runner::default(); + executor.start(|context: deterministic::Context| async move { + let (blob, blob_size) = context.open("test_partition", b"test_blob").await.unwrap(); + + let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(BUFFER_SIZE)); + let append = Append::new(blob.clone(), blob_size, BUFFER_SIZE, pool_ref) + .await + .unwrap(); + + // Write 200 bytes spanning multiple pages + let data: Vec = (0u8..200).collect(); + append.append(&data).await.unwrap(); + + let mut reader = append.as_blob_reader(NZUsize!(BUFFER_SIZE)).await.unwrap(); + + // Seek to position 90 (13 bytes before first page boundary at 103) + reader.seek_to(90).unwrap(); + + // Read 20 bytes across the page boundary + let mut buf = vec![0u8; 20]; + reader.read_exact(&mut buf, 20).await.unwrap(); + assert_eq!(buf, &data[90..110]); + }); + } + + #[test_traced("DEBUG")] + fn test_read_rejects_partial_crc_on_non_last_page() { + let executor = deterministic::Runner::default(); + executor.start(|context: deterministic::Context| async move { + let (blob, blob_size) = context.open("test_partition", b"test_blob").await.unwrap(); + + let pool_ref = PoolRef::new(PAGE_SIZE, NZUsize!(BUFFER_SIZE)); + let append = Append::new(blob.clone(), blob_size, BUFFER_SIZE, pool_ref) + .await + .unwrap(); + + // Two full pages. + let data: Vec = (0u8..=255) + .cycle() + .take(PAGE_SIZE.get() as usize * 2) + .collect(); + append.append(&data).await.unwrap(); + append.sync().await.unwrap(); + + // Corrupt page 0 to claim a shorter (partial) length with a valid CRC. 
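+            // Only the CRC record (which begins at offset PAGE_SIZE within page 0's physical
+            // bytes) is overwritten below; the logical bytes themselves are left untouched.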
+ let page_size = PAGE_SIZE.get() as u64; + let short_len = page_size / 2; + let crc = crc32fast::hash(&data[..short_len as usize]); + let record = super::Checksum::new(short_len as u16, crc); + let crc_offset = page_size; // CRC record starts after logical page bytes + blob.write_at(record.to_bytes().to_vec(), crc_offset) + .await + .unwrap(); + blob.sync().await.unwrap(); + + // Capacity of one page => bug reproduces if last-page check is buffer-based. + let mut reader = append + .as_blob_reader(NZUsize!(page_size as usize)) + .await + .unwrap(); + let mut buf = vec![0u8; page_size as usize]; + let result = reader.read_exact(&mut buf, page_size as usize).await; + + assert!(matches!(result, Err(Error::InvalidChecksum))); + }); + } +} diff --git a/runtime/src/utils/buffer/tip.rs b/runtime/src/utils/buffer/tip.rs index 7061e69dfd..ac68e77945 100644 --- a/runtime/src/utils/buffer/tip.rs +++ b/runtime/src/utils/buffer/tip.rs @@ -1,5 +1,3 @@ -use std::num::NonZeroUsize; - /// A buffer for caching data written to the tip of a blob. /// /// The buffer always represents data at the "tip" of the logical blob, starting at `offset` and @@ -16,15 +14,20 @@ pub(super) struct Buffer { /// The maximum size of the buffer. pub(super) capacity: usize, + + /// Whether this buffer should allow new data. + // TODO(#2371): Use a distinct state-type for immutable vs immutable. + pub(super) immutable: bool, } impl Buffer { - /// Creates a new buffer with the provided `size` and `capacity`. - pub(super) fn new(size: u64, capacity: NonZeroUsize) -> Self { + /// Creates a new buffer with the provided `offset` and `capacity`. + pub(super) fn new(offset: u64, capacity: usize) -> Self { Self { - data: Vec::with_capacity(capacity.get()), - offset: size, - capacity: capacity.get(), + data: Vec::with_capacity(capacity), + offset, + capacity, + immutable: false, } } @@ -75,11 +78,11 @@ impl Buffer { } } - /// Returns the buffered data and its blob offset, or returns `None` if the buffer is - /// already empty. + /// Returns the buffered data and its blob offset, or returns `None` if the buffer is already + /// empty. /// - /// The buffer is reset to the empty state with an updated offset positioned at - /// the end of the logical blob. + /// The buffer is reset to the empty state with an updated offset positioned at the end of the + /// logical blob. pub(super) fn take(&mut self) -> Option<(Vec, u64)> { if self.is_empty() { return None; @@ -153,11 +156,19 @@ impl Buffer { true } - /// Appends the provided `data` to the buffer, and returns `true` if the buffer is now above - /// capacity. If above capacity, the caller is responsible for using `take` to bring it back - /// under. + /// Appends the provided `data` to the buffer, and returns `true` if the buffer is over capacity + /// after the append. + /// + /// If the buffer is above capacity, the caller is responsible for using `take` to bring it back + /// under. Further appends are safe, but will continue growing the buffer beyond its capacity. pub(super) fn append(&mut self, data: &[u8]) -> bool { self.data.extend_from_slice(data); + + self.over_capacity() + } + + /// Whether the buffer is over capacity and should be taken & flushed to the underlying blob. 
+ const fn over_capacity(&self) -> bool { self.data.len() > self.capacity } } @@ -165,11 +176,10 @@ impl Buffer { #[cfg(test)] mod tests { use super::*; - use commonware_utils::NZUsize; #[test] fn test_tip_append() { - let mut buffer = Buffer::new(50, NZUsize!(100)); + let mut buffer = Buffer::new(50, 100); assert_eq!(buffer.size(), 50); assert!(buffer.is_empty()); assert_eq!(buffer.take(), None); @@ -198,7 +208,7 @@ mod tests { #[test] fn test_tip_resize() { - let mut buffer = Buffer::new(50, NZUsize!(100)); + let mut buffer = Buffer::new(50, 100); buffer.append(&[1, 2, 3]); assert_eq!(buffer.size(), 53); diff --git a/runtime/src/utils/buffer/write.rs b/runtime/src/utils/buffer/write.rs index e18189f3c2..b923995717 100644 --- a/runtime/src/utils/buffer/write.rs +++ b/runtime/src/utils/buffer/write.rs @@ -2,8 +2,8 @@ use crate::{buffer::tip::Buffer, Blob, Error, RwLock}; use commonware_utils::StableBuf; use std::{num::NonZeroUsize, sync::Arc}; -/// A writer that buffers content to a [Blob] to optimize the performance -/// of appending or updating data. +/// A writer that buffers the raw content of a [Blob] to optimize the performance of appending or +/// updating data. /// /// # Example /// @@ -54,7 +54,7 @@ impl Write { pub fn new(blob: B, size: u64, capacity: NonZeroUsize) -> Self { Self { blob, - buffer: Arc::new(RwLock::new(Buffer::new(size, capacity))), + buffer: Arc::new(RwLock::new(Buffer::new(size, capacity.get()))), } } diff --git a/storage/conformance.toml b/storage/conformance.toml index 02770d0057..ab29dff0ac 100644 --- a/storage/conformance.toml +++ b/storage/conformance.toml @@ -1,10 +1,10 @@ ["commonware_storage::archive::conformance::ArchiveImmutable"] n_cases = 128 -hash = "8e578ed38733486716d072e565e62fe5d9ba7185ffb6e26ec7db8611c69b90b8" +hash = "6acfa1bc0c17920b5c0e0437af106e09ee57dcd37459091402192f2c146afdb5" ["commonware_storage::archive::conformance::ArchivePrunable"] n_cases = 128 -hash = "674e81c769c06a3965dc691b1f8c0327374f427e8a4bf67895c6ad4e566fed20" +hash = "cb063a05c6a75902893f790e9802b4906be506f2f7e5d10b46dff90a92e40819" ["commonware_storage::archive::immutable::storage::conformance::CodecConformance"] n_cases = 65536 @@ -48,15 +48,15 @@ hash = "13b3e99a8c74b50dc18150194a92306de670b94e6642758feb6d9b6e9881f827" ["commonware_storage::journal::conformance::ContiguousFixed"] n_cases = 512 -hash = "134bb8b838241c2dedf98d96130f014bea19f1bc7580307c9798540466eb81c6" +hash = "4c786b6b7f91b9924a62a7b9a1c32a8d47398f1c8a3d5bf06fe1a90998e86aab" ["commonware_storage::journal::conformance::ContiguousVariable"] n_cases = 512 -hash = "29d37f2309943dd27d4344710a900bb3b992c0a1089ff9734cddbfa78c039200" +hash = "973ebd77804d2ea346574d377f39bd29350063098f2e6fab9596783ba43664e5" ["commonware_storage::journal::conformance::SegmentedFixed"] n_cases = 512 -hash = "505611ba11d6380254c159eb6234f87cc19a62b0919bc96d59e83de498b458fa" +hash = "e077ce8c6d9a79c87cf9b48866c65f387ffbec9d8e8c65dd40c46b4296cfc050" ["commonware_storage::journal::conformance::SegmentedGlob"] n_cases = 512 @@ -64,11 +64,11 @@ hash = "adb1efeef12c203c05879ce4d1d03ef443c767737a6c6b57433189100eec9197" ["commonware_storage::journal::conformance::SegmentedOversized"] n_cases = 512 -hash = "b98d56d2eb039657b3452135666795eeeefdc83e9d6f3cb070e7ca114b4621a1" +hash = "b815138329a06cbe235cf547ed62774165bd2108e68c65bf15ae152bedf84b3a" ["commonware_storage::journal::conformance::SegmentedVariable"] n_cases = 512 -hash = "cd79e09ca53917f78c290e67efe08bf17b3ec0d0faf1b5f6507d4665749574b1" +hash = 
"418dafd67008cb74d34fe58b9be8747cfaf86e345a71eb4c35ae0e43a4c077ef" ["commonware_storage::mmr::proof::tests::conformance::CodecConformance>"] n_cases = 65536 diff --git a/storage/fuzz/fuzz_targets/archive_operations.rs b/storage/fuzz/fuzz_targets/archive_operations.rs index 1a2ffe1fa6..70761c41fc 100644 --- a/storage/fuzz/fuzz_targets/archive_operations.rs +++ b/storage/fuzz/fuzz_targets/archive_operations.rs @@ -9,9 +9,9 @@ use commonware_storage::{ }, translator::EightCap, }; -use commonware_utils::{sequence::FixedBytes, NZUsize, NZU64}; +use commonware_utils::{sequence::FixedBytes, NZUsize, NZU16, NZU64}; use libfuzzer_sys::fuzz_target; -use std::num::NonZeroUsize; +use std::num::{NonZeroU16, NonZeroUsize}; type Key = FixedBytes<16>; type Value = FixedBytes<32>; @@ -40,7 +40,7 @@ struct FuzzInput { operations: Vec, } -const PAGE_SIZE: NonZeroUsize = NZUsize!(555); +const PAGE_SIZE: NonZeroU16 = NZU16!(456); const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(100); fn fuzz(data: FuzzInput) { diff --git a/storage/fuzz/fuzz_targets/cache_operations.rs b/storage/fuzz/fuzz_targets/cache_operations.rs index d780f426a0..bfa3c4b894 100644 --- a/storage/fuzz/fuzz_targets/cache_operations.rs +++ b/storage/fuzz/fuzz_targets/cache_operations.rs @@ -8,7 +8,7 @@ use libfuzzer_sys::{ fuzz_target, }; use rand::{rngs::StdRng, SeedableRng}; -use std::collections::BTreeMap; +use std::{collections::BTreeMap, num::NonZeroU16}; const MAX_OPERATIONS: usize = 50; const MAX_INDEX: u64 = 10000; @@ -21,8 +21,8 @@ const MIN_REPLAY_BUFFER: usize = 256; const MAX_REPLAY_BUFFER: usize = 2 * 8192; const MIN_COMPRESSION_LEVEL: u8 = 1; const MAX_COMPRESSION_LEVEL: u8 = 21; -const MIN_BUFFER_POOL_PAGE_SIZE: usize = 512; -const MAX_BUFFER_POOL_PAGE_SIZE: usize = 4096; +const MIN_BUFFER_POOL_PAGE_SIZE: u16 = 511; +const MAX_BUFFER_POOL_PAGE_SIZE: u16 = 4097; const MIN_BUFFER_POOL_CAPACITY: usize = 10; const MAX_BUFFER_POOL_CAPACITY: usize = 64; @@ -45,7 +45,7 @@ struct CacheConfig { write_buffer: usize, replay_buffer: usize, compression: Option, - buffer_pool_pages_size: usize, + buffer_pool_pages_size: NonZeroU16, buffer_pool_capacity: usize, } @@ -71,7 +71,8 @@ impl<'a> Arbitrary<'a> for FuzzInput { None }; let buffer_pool_pages_size = - u.int_in_range(MIN_BUFFER_POOL_PAGE_SIZE..=MAX_BUFFER_POOL_PAGE_SIZE)?; + NonZeroU16::new(u.int_in_range(MIN_BUFFER_POOL_PAGE_SIZE..=MAX_BUFFER_POOL_PAGE_SIZE)?) 
+ .unwrap(); let buffer_pool_capacity = u.int_in_range(MIN_BUFFER_POOL_CAPACITY..=MAX_BUFFER_POOL_CAPACITY)?; @@ -137,7 +138,7 @@ fn fuzz(input: FuzzInput) { replay_buffer: NZUsize!(input.config.replay_buffer), items_per_blob: NZU64!(input.config.items_per_blob), buffer_pool: PoolRef::new( - NZUsize!(input.config.buffer_pool_pages_size), + input.config.buffer_pool_pages_size, NZUsize!(input.config.buffer_pool_capacity), ), }; diff --git a/storage/fuzz/fuzz_targets/current_ordered_operations.rs b/storage/fuzz/fuzz_targets/current_ordered_operations.rs index 81bda24787..0b3a6b2a27 100644 --- a/storage/fuzz/fuzz_targets/current_ordered_operations.rs +++ b/storage/fuzz/fuzz_targets/current_ordered_operations.rs @@ -11,9 +11,12 @@ use commonware_storage::{ }, translator::TwoCap, }; -use commonware_utils::{sequence::FixedBytes, NZUsize, NZU64}; +use commonware_utils::{sequence::FixedBytes, NZUsize, NZU16, NZU64}; use libfuzzer_sys::fuzz_target; -use std::{collections::HashMap, num::NonZeroU64}; +use std::{ + collections::HashMap, + num::{NonZeroU16, NonZeroU64}, +}; type Key = FixedBytes<32>; type Value = FixedBytes<32>; @@ -74,7 +77,7 @@ impl<'a> Arbitrary<'a> for FuzzInput { } } -const PAGE_SIZE: usize = 88; +const PAGE_SIZE: NonZeroU16 = NZU16!(91); const PAGE_CACHE_SIZE: usize = 8; const MMR_ITEMS_PER_BLOB: u64 = 11; const LOG_ITEMS_PER_BLOB: u64 = 7; @@ -95,7 +98,7 @@ fn fuzz(data: FuzzInput) { log_write_buffer: NZUsize!(WRITE_BUFFER_SIZE), bitmap_metadata_partition: "fuzz_current_bitmap_metadata".into(), translator: TwoCap, - buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)), + buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(PAGE_CACHE_SIZE)), thread_pool: None, }; diff --git a/storage/fuzz/fuzz_targets/current_unordered_operations.rs b/storage/fuzz/fuzz_targets/current_unordered_operations.rs index f8220562e7..465e76f61d 100644 --- a/storage/fuzz/fuzz_targets/current_unordered_operations.rs +++ b/storage/fuzz/fuzz_targets/current_unordered_operations.rs @@ -11,9 +11,12 @@ use commonware_storage::{ }, translator::TwoCap, }; -use commonware_utils::{sequence::FixedBytes, NZUsize, NZU64}; +use commonware_utils::{sequence::FixedBytes, NZUsize, NZU16, NZU64}; use libfuzzer_sys::fuzz_target; -use std::{collections::HashMap, num::NonZeroU64}; +use std::{ + collections::HashMap, + num::{NonZeroU16, NonZeroU64}, +}; type Key = FixedBytes<32>; type Value = FixedBytes<32>; @@ -68,7 +71,7 @@ impl<'a> Arbitrary<'a> for FuzzInput { } } -const PAGE_SIZE: usize = 88; +const PAGE_SIZE: NonZeroU16 = NZU16!(88); const PAGE_CACHE_SIZE: usize = 8; const MMR_ITEMS_PER_BLOB: u64 = 11; const LOG_ITEMS_PER_BLOB: u64 = 7; @@ -89,7 +92,7 @@ fn fuzz(data: FuzzInput) { log_write_buffer: NZUsize!(WRITE_BUFFER_SIZE), bitmap_metadata_partition: "fuzz_current_bitmap_metadata".into(), translator: TwoCap, - buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)), + buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(PAGE_CACHE_SIZE)), thread_pool: None, }; diff --git a/storage/fuzz/fuzz_targets/fixed_journal_operations.rs b/storage/fuzz/fuzz_targets/fixed_journal_operations.rs index 65c2f27a6a..486c966730 100644 --- a/storage/fuzz/fuzz_targets/fixed_journal_operations.rs +++ b/storage/fuzz/fuzz_targets/fixed_journal_operations.rs @@ -4,9 +4,10 @@ use arbitrary::{Arbitrary, Result, Unstructured}; use commonware_cryptography::{Hasher as _, Sha256}; use commonware_runtime::{buffer::PoolRef, deterministic, Runner}; use commonware_storage::journal::contiguous::fixed::{Config as JournalConfig, 
Journal}; -use commonware_utils::{NZUsize, NZU64}; +use commonware_utils::{NZUsize, NZU16, NZU64}; use futures::{pin_mut, StreamExt}; use libfuzzer_sys::fuzz_target; +use std::num::NonZeroU16; const MAX_REPLAY_BUF: usize = 2048; const MAX_WRITE_BUF: usize = 2048; @@ -51,7 +52,7 @@ struct FuzzInput { operations: Vec, } -const PAGE_SIZE: usize = 128; +const PAGE_SIZE: NonZeroU16 = NZU16!(57); const PAGE_CACHE_SIZE: usize = 1; fn fuzz(input: FuzzInput) { @@ -62,7 +63,7 @@ fn fuzz(input: FuzzInput) { partition: "fixed_journal_operations_fuzz_test".to_string(), items_per_blob: NZU64!(3), write_buffer: NZUsize!(MAX_WRITE_BUF), - buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)), + buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(PAGE_CACHE_SIZE)), }; let mut journal = Journal::init(context.clone(), cfg.clone()).await.unwrap(); diff --git a/storage/fuzz/fuzz_targets/freezer_operations.rs b/storage/fuzz/fuzz_targets/freezer_operations.rs index 44a4ae4e89..9764bafa64 100644 --- a/storage/fuzz/fuzz_targets/freezer_operations.rs +++ b/storage/fuzz/fuzz_targets/freezer_operations.rs @@ -3,9 +3,12 @@ use arbitrary::Arbitrary; use commonware_runtime::{buffer::PoolRef, deterministic, Runner}; use commonware_storage::freezer::{Config, Freezer, Identifier}; -use commonware_utils::{sequence::FixedBytes, NZUsize}; +use commonware_utils::{sequence::FixedBytes, NZUsize, NZU16}; use libfuzzer_sys::fuzz_target; -use std::{collections::HashMap, num::NonZeroUsize}; +use std::{ + collections::HashMap, + num::{NonZeroU16, NonZeroUsize}, +}; #[derive(Arbitrary, Debug)] enum Op { @@ -40,7 +43,7 @@ fn vec_to_key(v: &[u8]) -> FixedBytes<32> { FixedBytes::<32>::new(buf) } -const PAGE_SIZE: NonZeroUsize = NZUsize!(555); +const PAGE_SIZE: NonZeroU16 = NZU16!(393); const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(100); fn fuzz(input: FuzzInput) { diff --git a/storage/fuzz/fuzz_targets/mmr_journaled.rs b/storage/fuzz/fuzz_targets/mmr_journaled.rs index 07c5cbbe04..4906ef58e8 100644 --- a/storage/fuzz/fuzz_targets/mmr_journaled.rs +++ b/storage/fuzz/fuzz_targets/mmr_journaled.rs @@ -8,12 +8,13 @@ use commonware_storage::mmr::{ location::{Location, LocationRangeExt}, Position, StandardHasher as Standard, }; -use commonware_utils::{NZUsize, NZU64}; +use commonware_utils::{NZUsize, NZU16, NZU64}; use libfuzzer_sys::fuzz_target; +use std::num::NonZeroU16; const MAX_OPERATIONS: usize = 200; const MAX_DATA_SIZE: usize = 64; -const PAGE_SIZE: usize = 111; +const PAGE_SIZE: NonZeroU16 = NZU16!(111); const PAGE_CACHE_SIZE: usize = 5; const ITEMS_PER_BLOB: u64 = 7; @@ -88,7 +89,7 @@ fn test_config(partition_suffix: &str) -> Config { items_per_blob: NZU64!(ITEMS_PER_BLOB), write_buffer: NZUsize!(1024), thread_pool: None, - buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)), + buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(PAGE_CACHE_SIZE)), } } diff --git a/storage/fuzz/fuzz_targets/oversized_recovery.rs b/storage/fuzz/fuzz_targets/oversized_recovery.rs index bb52d6348f..549a4caaa7 100644 --- a/storage/fuzz/fuzz_targets/oversized_recovery.rs +++ b/storage/fuzz/fuzz_targets/oversized_recovery.rs @@ -10,9 +10,9 @@ use bytes::{Buf, BufMut}; use commonware_codec::{FixedSize, Read, ReadExt, Write}; use commonware_runtime::{buffer::PoolRef, deterministic, Blob as _, Runner, Storage as _}; use commonware_storage::journal::segmented::oversized::{Config, Oversized, Record}; -use commonware_utils::NZUsize; +use commonware_utils::{NZUsize, NZU16}; use libfuzzer_sys::fuzz_target; -use std::num::NonZeroUsize; 
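The same mechanical change repeats across these fuzz targets and the storage tests: the buffer-pool page size becomes a `NonZeroU16` built with `NZU16!`, and it is passed to `PoolRef::new` directly instead of being wrapped in `NZUsize!` at each call site. A minimal sketch of the pattern, assuming the `commonware_utils` macros and the two-argument `PoolRef::new(page_size, page_count)` signature used throughout this diff, and treating `PoolRef` as the plain handle type it appears to be here:

```rust
use commonware_runtime::buffer::PoolRef;
use commonware_utils::{NZUsize, NZU16};
use std::num::{NonZeroU16, NonZeroUsize};

// Page sizes are now u16-bounded, i.e. at most 65_535 bytes per page.
const PAGE_SIZE: NonZeroU16 = NZU16!(128);
// The number of cached pages remains a NonZeroUsize.
const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(8);

fn buffer_pool() -> PoolRef {
    // The NonZeroU16 page size is handed over as-is; only usize-typed
    // parameters still go through NZUsize!.
    PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE)
}
```

Targets that fuzz the page size itself (such as `cache_operations` above) instead draw a `u16` in range and wrap it with `NonZeroU16::new(...)` before handing it to the pool.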
+use std::num::{NonZeroU16, NonZeroUsize}; /// Test index entry that stores a u64 id and references a value. #[derive(Debug, Clone, PartialEq)] @@ -154,7 +154,7 @@ struct FuzzInput { sync_before_corrupt: bool, } -const PAGE_SIZE: NonZeroUsize = NZUsize!(128); +const PAGE_SIZE: NonZeroU16 = NZU16!(128); const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(4); const INDEX_PARTITION: &str = "fuzz_index"; const VALUE_PARTITION: &str = "fuzz_values"; diff --git a/storage/fuzz/fuzz_targets/qmdb_any_fixed_sync.rs b/storage/fuzz/fuzz_targets/qmdb_any_fixed_sync.rs index 156801031d..f752165cbc 100644 --- a/storage/fuzz/fuzz_targets/qmdb_any_fixed_sync.rs +++ b/storage/fuzz/fuzz_targets/qmdb_any_fixed_sync.rs @@ -13,9 +13,9 @@ use commonware_storage::{ }, translator::TwoCap, }; -use commonware_utils::{sequence::FixedBytes, NZUsize, NZU64}; +use commonware_utils::{sequence::FixedBytes, NZUsize, NZU16, NZU64}; use libfuzzer_sys::fuzz_target; -use std::sync::Arc; +use std::{num::NonZeroU16, sync::Arc}; type Key = FixedBytes<32>; type Value = FixedBytes<32>; @@ -86,7 +86,7 @@ impl<'a> Arbitrary<'a> for FuzzInput { } } -const PAGE_SIZE: usize = 128; +const PAGE_SIZE: NonZeroU16 = NZU16!(129); fn test_config(test_name: &str) -> Config { Config { @@ -99,7 +99,7 @@ fn test_config(test_name: &str) -> Config { log_write_buffer: NZUsize!(1024), translator: TwoCap, thread_pool: None, - buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(1)), + buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(1)), } } diff --git a/storage/fuzz/fuzz_targets/qmdb_any_variable_sync.rs b/storage/fuzz/fuzz_targets/qmdb_any_variable_sync.rs index 13f4bf1e83..5e767d80ef 100644 --- a/storage/fuzz/fuzz_targets/qmdb_any_variable_sync.rs +++ b/storage/fuzz/fuzz_targets/qmdb_any_variable_sync.rs @@ -11,10 +11,13 @@ use commonware_storage::{ }, translator::TwoCap, }; -use commonware_utils::{sequence::FixedBytes, NZUsize, NZU64}; +use commonware_utils::{sequence::FixedBytes, NZUsize, NZU16, NZU64}; use libfuzzer_sys::fuzz_target; use mmr::location::Location; -use std::{collections::HashMap, num::NonZeroU64}; +use std::{ + collections::HashMap, + num::{NonZeroU16, NonZeroU64}, +}; const MAX_OPERATIONS: usize = 50; @@ -129,7 +132,7 @@ impl<'a> Arbitrary<'a> for FuzzInput { } } -const PAGE_SIZE: usize = 128; +const PAGE_SIZE: NonZeroU16 = NZU16!(128); fn test_config(test_name: &str) -> Config, ())> { Config { @@ -144,7 +147,7 @@ fn test_config(test_name: &str) -> Config Config, ())> { log_write_buffer: NZUsize!(1024), translator: TwoCap, thread_pool: None, - buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)), + buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(PAGE_CACHE_SIZE)), } } diff --git a/storage/fuzz/fuzz_targets/qmdb_keyless.rs b/storage/fuzz/fuzz_targets/qmdb_keyless.rs index 66db54d449..1390ae3326 100644 --- a/storage/fuzz/fuzz_targets/qmdb_keyless.rs +++ b/storage/fuzz/fuzz_targets/qmdb_keyless.rs @@ -10,8 +10,9 @@ use commonware_storage::{ verify_proof, }, }; -use commonware_utils::{NZUsize, NZU64}; +use commonware_utils::{NZUsize, NZU16, NZU64}; use libfuzzer_sys::fuzz_target; +use std::num::NonZeroU16; const MAX_OPERATIONS: usize = 50; const MAX_PROOF_OPS: u64 = 100; @@ -117,7 +118,7 @@ impl<'a> Arbitrary<'a> for FuzzInput { } } -const PAGE_SIZE: usize = 128; +const PAGE_SIZE: NonZeroU16 = NZU16!(127); const PAGE_CACHE_SIZE: usize = 8; type CleanDb = Keyless, Sha256>; @@ -134,7 +135,7 @@ fn test_config(test_name: &str) -> Config<(commonware_codec::RangeCfg, () log_codec_config: ((0..=10000).into(), ()), 
log_items_per_section: NZU64!(7), thread_pool: None, - buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)), + buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(PAGE_CACHE_SIZE)), } } diff --git a/storage/fuzz/fuzz_targets/qmdb_ordered_batching.rs b/storage/fuzz/fuzz_targets/qmdb_ordered_batching.rs index b103ca9a12..d799aeaece 100644 --- a/storage/fuzz/fuzz_targets/qmdb_ordered_batching.rs +++ b/storage/fuzz/fuzz_targets/qmdb_ordered_batching.rs @@ -8,10 +8,11 @@ use commonware_storage::{ qmdb::any::{ordered::fixed::Db, FixedConfig as Config}, translator::EightCap, }; -use commonware_utils::{sequence::FixedBytes, NZUsize, NZU64}; +use commonware_utils::{sequence::FixedBytes, NZUsize, NZU16, NZU64}; use libfuzzer_sys::fuzz_target; use std::{ collections::{BTreeMap, HashSet}, + num::NonZeroU16, ops::Bound::{Excluded, Unbounded}, }; @@ -36,7 +37,7 @@ struct FuzzInput { operations: Vec, } -const PAGE_SIZE: usize = 555; +const PAGE_SIZE: NonZeroU16 = NZU16!(111); const PAGE_CACHE_SIZE: usize = 100; fn fuzz(data: FuzzInput) { @@ -53,7 +54,7 @@ fn fuzz(data: FuzzInput) { log_write_buffer: NZUsize!(1024), translator: EightCap, thread_pool: None, - buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)), + buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(PAGE_CACHE_SIZE)), }; let mut db = OrderedDb::init(context.clone(), cfg.clone()) diff --git a/storage/fuzz/fuzz_targets/qmdb_ordered_operations.rs b/storage/fuzz/fuzz_targets/qmdb_ordered_operations.rs index 02e0125c06..bd7fc96db7 100644 --- a/storage/fuzz/fuzz_targets/qmdb_ordered_operations.rs +++ b/storage/fuzz/fuzz_targets/qmdb_ordered_operations.rs @@ -11,11 +11,11 @@ use commonware_storage::{ }, translator::EightCap, }; -use commonware_utils::{sequence::FixedBytes, NZUsize, NZU64}; +use commonware_utils::{sequence::FixedBytes, NZUsize, NZU16, NZU64}; use libfuzzer_sys::fuzz_target; use std::{ collections::{HashMap, HashSet}, - num::NonZeroU64, + num::{NonZeroU16, NonZeroU64}, }; type Key = FixedBytes<32>; @@ -60,7 +60,7 @@ struct FuzzInput { operations: Vec, } -const PAGE_SIZE: usize = 555; +const PAGE_SIZE: NonZeroU16 = NZU16!(555); const PAGE_CACHE_SIZE: usize = 100; fn fuzz(data: FuzzInput) { @@ -78,7 +78,7 @@ fn fuzz(data: FuzzInput) { log_write_buffer: NZUsize!(1024), translator: EightCap, thread_pool: None, - buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)), + buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(PAGE_CACHE_SIZE)), }; let mut db = Db::<_, Key, Value, Sha256, EightCap>::init(context.clone(), cfg.clone()) diff --git a/storage/fuzz/fuzz_targets/qmdb_unordered_operations.rs b/storage/fuzz/fuzz_targets/qmdb_unordered_operations.rs index 98ed0bf025..8eed64abf8 100644 --- a/storage/fuzz/fuzz_targets/qmdb_unordered_operations.rs +++ b/storage/fuzz/fuzz_targets/qmdb_unordered_operations.rs @@ -11,9 +11,12 @@ use commonware_storage::{ }, translator::EightCap, }; -use commonware_utils::{sequence::FixedBytes, NZUsize, NZU64}; +use commonware_utils::{sequence::FixedBytes, NZUsize, NZU16, NZU64}; use libfuzzer_sys::fuzz_target; -use std::collections::{HashMap, HashSet}; +use std::{ + collections::{HashMap, HashSet}, + num::NonZeroU16, +}; type Key = FixedBytes<32>; type Value = FixedBytes<64>; @@ -36,7 +39,7 @@ struct FuzzInput { operations: Vec, } -const PAGE_SIZE: usize = 555; +const PAGE_SIZE: NonZeroU16 = NZU16!(223); const PAGE_CACHE_SIZE: usize = 100; fn fuzz(data: FuzzInput) { @@ -54,7 +57,7 @@ fn fuzz(data: FuzzInput) { log_write_buffer: NZUsize!(1024), translator: EightCap, 
thread_pool: None, - buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)), + buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(PAGE_CACHE_SIZE)), }; let mut db = Db::<_, Key, Value, Sha256, EightCap>::init(context.clone(), cfg.clone()) diff --git a/storage/fuzz/fuzz_targets/store_operations.rs b/storage/fuzz/fuzz_targets/store_operations.rs index 6a1ac79507..a7dd7c633b 100644 --- a/storage/fuzz/fuzz_targets/store_operations.rs +++ b/storage/fuzz/fuzz_targets/store_operations.rs @@ -7,8 +7,9 @@ use commonware_storage::{ qmdb::store::db::{Config, Db}, translator::TwoCap, }; -use commonware_utils::{NZUsize, NZU64}; +use commonware_utils::{NZUsize, NZU16, NZU64}; use libfuzzer_sys::fuzz_target; +use std::num::NonZeroU16; const MAX_OPERATIONS: usize = 50; @@ -86,7 +87,7 @@ impl<'a> Arbitrary<'a> for FuzzInput { } } -const PAGE_SIZE: usize = 128; +const PAGE_SIZE: NonZeroU16 = NZU16!(125); const PAGE_CACHE_SIZE: usize = 8; fn test_config(test_name: &str) -> Config, ())> { @@ -97,7 +98,7 @@ fn test_config(test_name: &str) -> Config FixedBytes<64> { diff --git a/storage/src/archive/prunable/mod.rs b/storage/src/archive/prunable/mod.rs index a6a7c2f550..a244077fb1 100644 --- a/storage/src/archive/prunable/mod.rs +++ b/storage/src/archive/prunable/mod.rs @@ -120,7 +120,7 @@ //! prunable::{Archive, Config}, //! }, //! }; -//! use commonware_utils::{NZUsize, NZU64}; +//! use commonware_utils::{NZUsize, NZU16, NZU64}; //! //! let executor = deterministic::Runner::default(); //! executor.start(|context| async move { @@ -128,7 +128,7 @@ //! let cfg = Config { //! translator: FourCap, //! key_partition: "demo_index".into(), -//! key_buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)), +//! key_buffer_pool: PoolRef::new(NZU16!(1024), NZUsize!(10)), //! value_partition: "demo_value".into(), //! compression: Some(3), //! 
codec_config: (), @@ -203,15 +203,15 @@ mod tests { }; use commonware_codec::{DecodeExt, Error as CodecError}; use commonware_macros::{test_group, test_traced}; - use commonware_runtime::{deterministic, Blob, Metrics, Runner, Storage}; - use commonware_utils::{sequence::FixedBytes, NZUsize, NZU64}; + use commonware_runtime::{deterministic, Metrics, Runner}; + use commonware_utils::{sequence::FixedBytes, NZUsize, NZU16, NZU64}; use rand::Rng; - use std::collections::BTreeMap; + use std::{collections::BTreeMap, num::NonZeroU16}; const DEFAULT_ITEMS_PER_SECTION: u64 = 65536; const DEFAULT_WRITE_BUFFER: usize = 1024; const DEFAULT_REPLAY_BUFFER: usize = 4096; - const PAGE_SIZE: NonZeroUsize = NZUsize!(1024); + const PAGE_SIZE: NonZeroU16 = NZU16!(1024); const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(10); fn test_key(key: &str) -> FixedBytes<64> { @@ -288,79 +288,6 @@ mod tests { }); } - #[test_traced] - fn test_archive_record_corruption() { - // Initialize the deterministic context - let executor = deterministic::Runner::default(); - executor.start(|context| async move { - // Initialize the archive - let cfg = Config { - translator: FourCap, - key_partition: "test_index".into(), - key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), - value_partition: "test_value".into(), - codec_config: (), - compression: None, - key_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), - value_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), - replay_buffer: NZUsize!(DEFAULT_REPLAY_BUFFER), - items_per_section: NZU64!(DEFAULT_ITEMS_PER_SECTION), - }; - let mut archive = Archive::init(context.clone(), cfg.clone()) - .await - .expect("Failed to initialize archive"); - - let index = 1u64; - let key = test_key("testkey"); - let data = 1; - - // Put the key-data pair - archive - .put(index, key.clone(), data) - .await - .expect("Failed to put data"); - - // Sync and drop the archive - archive.sync().await.expect("Failed to sync archive"); - drop(archive); - - // Corrupt the index journal - let section = (index / DEFAULT_ITEMS_PER_SECTION) * DEFAULT_ITEMS_PER_SECTION; - let (blob, _) = context - .open("test_index", §ion.to_be_bytes()) - .await - .unwrap(); - blob.write_at(b"corrupt!".to_vec(), 8).await.unwrap(); - blob.sync().await.unwrap(); - - // Initialize the archive again - let archive = Archive::<_, _, FixedBytes<64>, i32>::init( - context, - Config { - translator: FourCap, - key_partition: "test_index".into(), - key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), - value_partition: "test_value".into(), - codec_config: (), - compression: None, - key_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), - value_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), - replay_buffer: NZUsize!(DEFAULT_REPLAY_BUFFER), - items_per_section: NZU64!(DEFAULT_ITEMS_PER_SECTION), - }, - ) - .await - .expect("Failed to initialize archive"); - - // Check that the archive is empty - let retrieved: Option = archive - .get(Identifier::Index(index)) - .await - .expect("Failed to get data"); - assert!(retrieved.is_none()); - }); - } - #[test_traced] fn test_archive_overlapping_key_basic() { // Initialize the deterministic context diff --git a/storage/src/cache/mod.rs b/storage/src/cache/mod.rs index 0c4f170707..5f4dfd1061 100644 --- a/storage/src/cache/mod.rs +++ b/storage/src/cache/mod.rs @@ -42,7 +42,7 @@ //! ```rust //! use commonware_runtime::{Spawner, Runner, deterministic, buffer::PoolRef}; //! use commonware_storage::cache::{Cache, Config}; -//! use commonware_utils::{NZUsize, NZU64}; +//! 
use commonware_utils::{NZUsize, NZU16, NZU64}; //! //! let executor = deterministic::Runner::default(); //! executor.start(|context| async move { @@ -54,7 +54,7 @@ //! items_per_blob: NZU64!(1024), //! write_buffer: NZUsize!(1024 * 1024), //! replay_buffer: NZUsize!(4096), -//! buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)), +//! buffer_pool: PoolRef::new(NZU16!(1024), NZUsize!(10)), //! }; //! let mut cache = Cache::init(context, cfg).await.unwrap(); //! @@ -126,17 +126,16 @@ pub struct Config { mod tests { use super::*; use crate::journal::Error as JournalError; - use commonware_codec::{varint::UInt, EncodeSize}; use commonware_macros::{test_group, test_traced}; - use commonware_runtime::{deterministic, Blob, Metrics, Runner, Storage}; - use commonware_utils::{NZUsize, NZU64}; + use commonware_runtime::{deterministic, Metrics, Runner}; + use commonware_utils::{NZUsize, NZU16, NZU64}; use rand::Rng; - use std::collections::BTreeMap; + use std::{collections::BTreeMap, num::NonZeroU16}; const DEFAULT_ITEMS_PER_BLOB: u64 = 65536; const DEFAULT_WRITE_BUFFER: usize = 1024; const DEFAULT_REPLAY_BUFFER: usize = 4096; - const PAGE_SIZE: NonZeroUsize = NZUsize!(1024); + const PAGE_SIZE: NonZeroU16 = NZU16!(1024); const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(10); #[test_traced] @@ -185,72 +184,6 @@ mod tests { }); } - #[test_traced] - fn test_cache_record_corruption() { - // Initialize the deterministic context - let executor = deterministic::Runner::default(); - executor.start(|context| async move { - // Initialize the cache - let cfg = Config { - partition: "test_partition".into(), - codec_config: (), - compression: None, - write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), - replay_buffer: NZUsize!(DEFAULT_REPLAY_BUFFER), - items_per_blob: NZU64!(DEFAULT_ITEMS_PER_BLOB), - buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), - }; - let mut cache = Cache::init(context.clone(), cfg.clone()) - .await - .expect("Failed to initialize cache"); - - let index = 1u64; - let data = 1; - - // Put the data - cache - .put(index, data) - .await - .expect("Failed to put data"); - - // Sync and drop the cache - cache.sync().await.expect("Failed to sync cache"); - drop(cache); - - // Corrupt the value - let section = (index / DEFAULT_ITEMS_PER_BLOB) * DEFAULT_ITEMS_PER_BLOB; - let (blob, _) = context - .open("test_partition", §ion.to_be_bytes()) - .await - .unwrap(); - let value_location = 4 /* journal size */ + UInt(1u64).encode_size() as u64 /* index */ + 4 /* value length */; - blob.write_at(b"testdaty".to_vec(), value_location).await.unwrap(); - blob.sync().await.unwrap(); - - // Initialize the cache again - let cache = Cache::<_, i32>::init( - context, - Config { - partition: "test_partition".into(), - codec_config: (), - compression: None, - write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), - replay_buffer: NZUsize!(DEFAULT_REPLAY_BUFFER), - items_per_blob: NZU64!(DEFAULT_ITEMS_PER_BLOB), - buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), - }, - ) - .await.expect("Failed to initialize cache"); - - // Check that the cache is empty - let retrieved: Option = cache - .get(index) - .await - .expect("Failed to get data"); - assert!(retrieved.is_none()); - }); - } - #[test_traced] fn test_cache_prune() { // Initialize the deterministic context diff --git a/storage/src/freezer/benches/utils.rs b/storage/src/freezer/benches/utils.rs index 87ebc0d555..fcdbc64144 100644 --- a/storage/src/freezer/benches/utils.rs +++ b/storage/src/freezer/benches/utils.rs @@ -2,9 +2,9 @@ use 
commonware_runtime::{buffer::PoolRef, tokio::Context}; use commonware_storage::freezer::{Config, Freezer}; -use commonware_utils::{sequence::FixedBytes, NZUsize}; +use commonware_utils::{sequence::FixedBytes, NZUsize, NZU16}; use rand::{rngs::StdRng, RngCore, SeedableRng}; -use std::num::NonZeroUsize; +use std::num::{NonZeroU16, NonZeroUsize}; /// Number of bytes that can be buffered before being written to disk. const WRITE_BUFFER: usize = 1024 * 1024; // 1MB @@ -34,7 +34,7 @@ pub const VALUE_PARTITION: &str = "freezer_bench_value"; pub const TABLE_PARTITION: &str = "freezer_bench_table"; /// Use a "prod sized" page size to test the performance of the journal. -const PAGE_SIZE: NonZeroUsize = NZUsize!(16_384); +const PAGE_SIZE: NonZeroU16 = NZU16!(16_384); /// The number of pages to cache in the buffer pool. const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(10_000); diff --git a/storage/src/freezer/mod.rs b/storage/src/freezer/mod.rs index f30492cedc..4ebbeb848b 100644 --- a/storage/src/freezer/mod.rs +++ b/storage/src/freezer/mod.rs @@ -164,7 +164,7 @@ //! ```rust //! use commonware_runtime::{Spawner, Runner, deterministic, buffer::PoolRef}; //! use commonware_storage::freezer::{Freezer, Config, Identifier}; -//! use commonware_utils::{sequence::FixedBytes, NZUsize}; +//! use commonware_utils::{sequence::FixedBytes, NZUsize, NZU16}; //! //! let executor = deterministic::Runner::default(); //! executor.start(|context| async move { @@ -172,7 +172,7 @@ //! let cfg = Config { //! key_partition: "freezer_key_index".into(), //! key_write_buffer: NZUsize!(1024 * 1024), // 1MB -//! key_buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)), +//! key_buffer_pool: PoolRef::new(NZU16!(1024), NZUsize!(10)), //! value_partition: "freezer_value_journal".into(), //! value_compression: Some(3), //! value_write_buffer: NZUsize!(1024 * 1024), // 1MB @@ -276,8 +276,9 @@ mod tests { use commonware_codec::DecodeExt; use commonware_macros::{test_group, test_traced}; use commonware_runtime::{deterministic, Blob, Metrics, Runner, Storage}; - use commonware_utils::{hex, sequence::FixedBytes, NZUsize}; + use commonware_utils::{hex, sequence::FixedBytes, NZUsize, NZU16}; use rand::{Rng, RngCore}; + use std::num::NonZeroU16; const DEFAULT_WRITE_BUFFER: usize = 1024; const DEFAULT_VALUE_TARGET_SIZE: u64 = 10 * 1024 * 1024; @@ -285,7 +286,7 @@ mod tests { const DEFAULT_TABLE_RESIZE_FREQUENCY: u8 = 4; const DEFAULT_TABLE_RESIZE_CHUNK_SIZE: u32 = 128; // force multiple chunks const DEFAULT_TABLE_REPLAY_BUFFER: usize = 64 * 1024; // 64KB - const PAGE_SIZE: NonZeroUsize = NZUsize!(1024); + const PAGE_SIZE: NonZeroU16 = NZU16!(1024); const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(10); fn test_key(key: &str) -> FixedBytes<64> { diff --git a/storage/src/journal/authenticated.rs b/storage/src/journal/authenticated.rs index 174c683c5d..a5a516011f 100644 --- a/storage/src/journal/authenticated.rs +++ b/storage/src/journal/authenticated.rs @@ -637,11 +637,12 @@ mod tests { deterministic::{self, Context}, Runner as _, }; - use commonware_utils::{NZUsize, NZU64}; + use commonware_utils::{NZUsize, NZU16, NZU64}; use futures::StreamExt as _; + use std::num::NonZeroU16; - const PAGE_SIZE: usize = 101; - const PAGE_CACHE_SIZE: usize = 11; + const PAGE_SIZE: NonZeroU16 = NZU16!(101); + const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(11); /// Create MMR configuration for tests. 
fn mmr_config(suffix: &str) -> MmrConfig { @@ -651,7 +652,7 @@ mod tests { items_per_blob: NZU64!(11), write_buffer: NZUsize!(1024), thread_pool: None, - buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)), + buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), } } @@ -661,7 +662,7 @@ mod tests { partition: format!("journal_{suffix}"), items_per_blob: NZU64!(7), write_buffer: NZUsize!(1024), - buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)), + buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), } } diff --git a/storage/src/journal/benches/bench.rs b/storage/src/journal/benches/bench.rs index a9f19c499a..d8e6181cfb 100644 --- a/storage/src/journal/benches/bench.rs +++ b/storage/src/journal/benches/bench.rs @@ -1,9 +1,9 @@ use commonware_runtime::{buffer::PoolRef, tokio::Context}; use commonware_storage::journal::contiguous::fixed::{Config as JConfig, Journal}; -use commonware_utils::{sequence::FixedBytes, NZUsize}; +use commonware_utils::{sequence::FixedBytes, NZUsize, NZU16}; use criterion::criterion_main; use rand::{rngs::StdRng, RngCore, SeedableRng}; -use std::num::{NonZeroU64, NonZeroUsize}; +use std::num::{NonZeroU16, NonZeroU64, NonZeroUsize}; mod fixed_append; mod fixed_read_random; @@ -21,7 +21,7 @@ criterion_main!( const WRITE_BUFFER: NonZeroUsize = NZUsize!(1_024 * 1024); // 1MB /// Use a "prod sized" page size to test the performance of the journal. -const PAGE_SIZE: NonZeroUsize = NZUsize!(16_384); +const PAGE_SIZE: NonZeroU16 = NZU16!(16_384); /// The number of pages to cache in the buffer pool. Make it big enough to be /// fast, but not so big we avoid any page faults for the larger benchmarks. diff --git a/storage/src/journal/benches/fixed_replay.rs b/storage/src/journal/benches/fixed_replay.rs index 4c4b93cab3..c4d01ebd0a 100644 --- a/storage/src/journal/benches/fixed_replay.rs +++ b/storage/src/journal/benches/fixed_replay.rs @@ -59,7 +59,7 @@ fn bench_fixed_replay(c: &mut Criterion) { // Run the benchmarks let runner = tokio::Runner::new(cfg.clone()); - for buffer in [128, 16_384, 65_536, 1_048_576] { + for buffer in [16_384, 65_536, 1_048_576] { c.bench_function( &format!( "{}/items={} buffer={} size={}", diff --git a/storage/src/journal/conformance.rs b/storage/src/journal/conformance.rs index 48936ef478..4df130368c 100644 --- a/storage/src/journal/conformance.rs +++ b/storage/src/journal/conformance.rs @@ -8,14 +8,14 @@ use bytes::{Buf, BufMut}; use commonware_codec::{FixedSize, RangeCfg, Read, ReadExt, Write}; use commonware_conformance::{conformance_tests, Conformance}; use commonware_runtime::{buffer::PoolRef, deterministic, Metrics, Runner}; -use commonware_utils::{NZUsize, NZU64}; -use core::num::{NonZeroU64, NonZeroUsize}; +use commonware_utils::{NZUsize, NZU16, NZU64}; +use core::num::{NonZeroU16, NonZeroU64, NonZeroUsize}; use oversized::Record; use rand::Rng; const WRITE_BUFFER: NonZeroUsize = NZUsize!(1024); const ITEMS_PER_BLOB: NonZeroU64 = NZU64!(4096); -const PAGE_SIZE: NonZeroUsize = NZUsize!(1024); +const PAGE_SIZE: NonZeroU16 = NZU16!(1024); const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(10); struct ContiguousFixed; diff --git a/storage/src/journal/contiguous/fixed.rs b/storage/src/journal/contiguous/fixed.rs index cf88ea435d..8501e7593b 100644 --- a/storage/src/journal/contiguous/fixed.rs +++ b/storage/src/journal/contiguous/fixed.rs @@ -9,22 +9,22 @@ //! # Format //! //! Data stored in a `fixed::Journal` is persisted in one of many Blobs within a caller-provided -//! `partition`. 
Each `Blob` contains a configurable maximum of `items_per_blob`, with each item -//! followed by its checksum (CRC32): +//! `partition`. Each `Blob` contains a configurable maximum of `items_per_blob`, with page-level +//! data integrity provided by a buffer pool. //! //! ```text -//! +--------+-----------+--------+-----------+--------+----------+-------------+ -//! | item_0 | C(Item_0) | item_1 | C(Item_1) | ... | item_n-1 | C(Item_n-1) | -//! +--------+-----------+--------+----0------+--------+----------+-------------+ +//! +--------+----- --+--- -+----------+ +//! | item_0 | item_1 | ... | item_n-1 | +//! +--------+-----------+--------+----0 //! -//! n = config.items_per_blob, C = CRC32 +//! n = config.items_per_blob //! ``` //! //! The most recent blob may not necessarily be full, in which case it will contain fewer than the //! maximum number of items. //! -//! A fetched or replayed item's checksum is always computed and checked against the stored value -//! before it is returned. If the checksums do not match, an error is returned instead. +//! Data fetched from disk is always checked for integrity before being returned. If the data is +//! found to be invalid, an error is returned instead. //! //! # Open Blobs //! @@ -44,12 +44,11 @@ //! //! # State Sync //! -//! `Journal::init_sync` allows for initializing a journal for use in state sync. -//! When opened in this mode, we attempt to populate the journal within the given range -//! with persisted data. -//! If the journal is empty, we create a fresh journal at the specified position. -//! If the journal is not empty, we prune the journal to the specified lower bound and rewind to -//! the specified upper bound. +//! `Journal::init_sync` allows for initializing a journal for use in state sync. When opened in +//! this mode, we attempt to populate the journal within the given range with persisted data. If the +//! journal is empty, we create a fresh journal at the specified position. If the journal is not +//! empty, we prune the journal to the specified lower bound and rewind to the specified upper +//! bound. //! //! # Replay //! @@ -59,10 +58,9 @@ use crate::{ journal::{contiguous::MutableContiguous, Error}, Persistable, }; -use bytes::BufMut; -use commonware_codec::{CodecFixed, DecodeExt as _, FixedSize}; +use commonware_codec::{CodecFixed, DecodeExt as _}; use commonware_runtime::{ - buffer::{Append, PoolRef, Read}, + buffer::pool::{Append, PoolRef}, telemetry::metrics::status::GaugeExt, Blob, Error as RError, Metrics, Storage, }; @@ -137,7 +135,7 @@ pub struct Journal { } impl> Journal { - const CHUNK_SIZE: usize = u32::SIZE + A::SIZE; + pub(crate) const CHUNK_SIZE: usize = A::SIZE; pub(crate) const CHUNK_SIZE_U64: u64 = Self::CHUNK_SIZE as u64; /// Initialize a new `Journal` instance. @@ -147,10 +145,12 @@ impl> Journal { /// /// # Repair /// - /// Like [sqlite](https://github.com/sqlite/sqlite/blob/8658a8df59f00ec8fcfea336a2a6a4b5ef79d2ee/src/wal.c#L1504-L1505) - /// and [rocksdb](https://github.com/facebook/rocksdb/blob/0c533e61bc6d89fdf1295e8e0bcee4edb3aef401/include/rocksdb/options.h#L441-L445), - /// the first invalid data read will be considered the new end of the journal (and the underlying [Blob] will be truncated to the last - /// valid item). 
+ /// Like + /// [sqlite](https://github.com/sqlite/sqlite/blob/8658a8df59f00ec8fcfea336a2a6a4b5ef79d2ee/src/wal.c#L1504-L1505) + /// and + /// [rocksdb](https://github.com/facebook/rocksdb/blob/0c533e61bc6d89fdf1295e8e0bcee4edb3aef401/include/rocksdb/options.h#L441-L445), + /// the first invalid data read will be considered the new end of the journal (and the + /// underlying [Blob] will be truncated to the last valid item). pub async fn init(context: E, cfg: Config) -> Result { // Iterate over blobs in partition let mut blobs = BTreeMap::new(); @@ -172,21 +172,16 @@ impl> Journal { blobs.insert(index, (blob, size)); } - // Check that there are no gaps in the historical blobs and that they are all full. + // Check that there are no gaps in the historical blobs. let full_size = cfg.items_per_blob.get() * Self::CHUNK_SIZE_U64; if !blobs.is_empty() { let mut it = blobs.keys().rev(); let mut prev_index = *it.next().unwrap(); for index in it { - let (_, size) = blobs.get(index).unwrap(); if *index != prev_index - 1 { return Err(Error::MissingBlob(prev_index - 1)); } prev_index = *index; - if *size != full_size { - // Non-final blobs that have invalid sizes are not recoverable. - return Err(Error::InvalidBlobSize(*index, *size)); - } } } else { debug!("no blobs found"); @@ -204,15 +199,43 @@ impl> Journal { context.register("pruned", "Number of blobs pruned", pruned.clone()); let _ = tracked.try_set(blobs.len()); - // Initialize the tail blob. - let (mut tail_index, (mut tail, mut tail_size)) = blobs.pop_last().unwrap(); + // Wrap all blobs with Append wrappers, starting with the tail. + let (mut tail_index, (blob, blob_size)) = blobs.pop_last().unwrap(); + let mut tail = Append::new( + blob, + blob_size, + cfg.write_buffer.get(), + cfg.buffer_pool.clone(), + ) + .await?; + let mut tail_size = tail.size().await; - // Trim invalid items from the tail blob. - tail_size = Self::trim_tail(&tail, tail_size, tail_index).await?; - if tail_size > full_size { - return Err(Error::InvalidBlobSize(tail_index, tail_size)); + // Trim the tail blob if necessary. + if !tail_size.is_multiple_of(Self::CHUNK_SIZE_U64) { + warn!( + blob = tail_index, + invalid_size = tail_size, + "last blob size is not a multiple of item size, truncating" + ); + tail_size -= tail_size % Self::CHUNK_SIZE_U64; + tail.resize(tail_size).await?; } + // Non-tail blobs can be immutable. + let mut blobs = try_join_all(blobs.into_iter().map(|(index, (blob, size))| { + let pool = cfg.buffer_pool.clone(); + async move { + let blob = Append::new_immutable(blob, size, cfg.write_buffer.get(), pool).await?; + let logical_size = blob.size().await; + // Verify the non-tail blobs are full as expected. + if logical_size != full_size { + return Err(Error::InvalidBlobSize(logical_size, full_size)); + } + Ok::<_, Error>((index, (blob, logical_size))) + } + })) + .await?; + // If the tail blob is full we need to start a new one to maintain its invariant that there // is always room for another item. 
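// Illustrative sizing example (assumes the 32-byte Digest items and
// items_per_blob = 100 used by the recovery test in this file): with the
// per-item checksum gone, CHUNK_SIZE = A::SIZE = 32, so full_size = 100 * 32 =
// 3_200 bytes. A tail blob that reaches 3_200 bytes is sealed with
// to_immutable() and a fresh, empty tail is opened in its place; any non-tail
// blob whose logical size is not exactly 3_200 bytes fails the InvalidBlobSize
// check above.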
if tail_size == full_size { @@ -220,33 +243,30 @@ impl> Journal { blob = tail_index, "tail blob is full, creating a new empty one" ); - blobs.insert(tail_index, (tail, tail_size)); + tail.to_immutable().await?; + blobs.push((tail_index, (tail, tail_size))); tail_index += 1; - (tail, tail_size) = context + let (blob, blob_size) = context .open(&cfg.partition, &tail_index.to_be_bytes()) .await?; - assert_eq!(tail_size, 0); + assert_eq!(blob_size, 0); + tail = Append::new( + blob, + blob_size, + cfg.write_buffer.get(), + cfg.buffer_pool.clone(), + ) + .await?; + tail_size = 0; tracked.inc(); } - // Wrap all blobs with Append wrappers. - // TODO(https://github.com/commonwarexyz/monorepo/issues/1219): Consider creating an - // Immutable wrapper which doesn't allocate a write buffer for these. - let blobs = try_join_all(blobs.into_iter().map(|(index, (blob, size))| { - let pool = cfg.buffer_pool.clone(); - async move { - let blob = Append::new(blob, size, cfg.write_buffer, pool).await?; - Ok::<_, Error>((index, (blob, size))) - } - })) - .await?; - let tail = Append::new(tail, tail_size, cfg.write_buffer, cfg.buffer_pool.clone()).await?; - let size = tail_index * cfg.items_per_blob.get() + (tail_size / Self::CHUNK_SIZE_U64); let pruning_boundary = if blobs.is_empty() { tail_index * cfg.items_per_blob.get() } else { blobs[0].0 * cfg.items_per_blob.get() }; + let size = tail_index * cfg.items_per_blob.get() + (tail_size / Self::CHUNK_SIZE_U64); assert!(size >= pruning_boundary); Ok(Self { @@ -267,51 +287,6 @@ impl> Journal { }) } - /// Trim any invalid data found at the end of the tail blob and return the new size. The new - /// size will be less than or equal to the originally provided size, and a multiple of the item - /// size. - async fn trim_tail( - tail: &::Blob, - mut tail_size: u64, - tail_index: u64, - ) -> Result { - let mut truncated = false; - if !tail_size.is_multiple_of(Self::CHUNK_SIZE_U64) { - warn!( - blob = tail_index, - invalid_size = tail_size, - "last blob size is not a multiple of item size, truncating" - ); - tail_size -= tail_size % Self::CHUNK_SIZE_U64; - tail.resize(tail_size).await?; - truncated = true; - } - - // Truncate any records with failing checksums. This can happen if the file system allocated - // extra space for a blob but there was a crash before any data was written to that space. - while tail_size > 0 { - let offset = tail_size - Self::CHUNK_SIZE_U64; - let read = tail.read_at(vec![0u8; Self::CHUNK_SIZE], offset).await?; - match Self::verify_integrity(read.as_ref()) { - Ok(_) => break, // Valid item found, we can stop truncating. - Err(Error::ChecksumMismatch(_, _)) => { - warn!(blob = tail_index, offset, "checksum mismatch: truncating",); - tail_size -= Self::CHUNK_SIZE_U64; - tail.resize(tail_size).await?; - truncated = true; - } - Err(err) => return Err(err), - } - } - - // If we truncated the blob, make sure to sync it. - if truncated { - tail.sync().await?; - } - - Ok(tail_size) - } - /// Sync any pending updates to disk. 
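///
/// A minimal usage sketch (illustrative only, based on the append/sync/read
/// calls exercised by the tests in this module):
///
/// ```ignore
/// journal.append(item).await?; // staged in the tail blob's write buffer
/// journal.sync().await?;       // make the appended items durable
/// let first = journal.read(0).await?;
/// ```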
pub async fn sync(&mut self) -> Result<(), Error> { self.synced.inc(); @@ -332,17 +307,12 @@ impl> Journal { let mut size = self.tail.size().await; assert!(size < self.cfg.items_per_blob.get() * Self::CHUNK_SIZE_U64); assert_eq!(size % Self::CHUNK_SIZE_U64, 0); - - // Pre-allocate exact size and write directly to avoid copying - let mut buf: Vec = Vec::with_capacity(Self::CHUNK_SIZE); - item.write(&mut buf); - let checksum = crc32fast::hash(&buf); - buf.put_u32(checksum); + let item = item.encode_mut(); // Write the item to the blob let item_pos = (size / Self::CHUNK_SIZE_U64) + self.cfg.items_per_blob.get() * self.tail_index; - self.tail.append(buf).await?; + self.tail.append(&item).await?; trace!(blob = self.tail_index, pos = item_pos, "appended item"); size += Self::CHUNK_SIZE_U64; @@ -351,7 +321,7 @@ impl> Journal { if size == self.cfg.items_per_blob.get() * Self::CHUNK_SIZE_U64 { // Sync the tail blob before creating a new one so if we crash we don't end up with a // non-full historical blob. - self.tail.sync().await?; + self.tail.to_immutable().await?; // Create a new empty blob. let next_blob_index = self.tail_index + 1; @@ -364,7 +334,7 @@ impl> Journal { let next_blob = Append::new( next_blob, size, - self.cfg.write_buffer, + self.cfg.write_buffer.get(), self.cfg.buffer_pool.clone(), ) .await?; @@ -406,6 +376,7 @@ impl> Journal { let (blob_index, mut new_tail) = self.blobs.pop_last().unwrap(); assert_eq!(blob_index, self.tail_index - 1); std::mem::swap(&mut self.tail, &mut new_tail); + self.tail.to_mutable().await; self.remove_blob(self.tail_index, new_tail).await?; self.tail_index -= 1; } @@ -459,21 +430,14 @@ impl> Journal { }; let read = blob.read_at(vec![0u8; Self::CHUNK_SIZE], offset).await?; - Self::verify_integrity(read.as_ref()) + Self::decode_buf(read.as_ref()) } - /// Verify the integrity of the Array + checksum in `buf`, returning: - /// - The array if it is valid, - /// - Error::ChecksumMismatch if the checksum is invalid, or - /// - Error::Codec if the array could not be decoded after passing the checksum check. + /// Decode the array from `buf`, returning: + /// - Error::Codec if the array could not be decoded. /// /// Error::Codec likely indicates a logic error rather than a corruption issue. - fn verify_integrity(buf: &[u8]) -> Result { - let stored_checksum = u32::from_be_bytes(buf[A::SIZE..].try_into().unwrap()); - let checksum = crc32fast::hash(&buf[..A::SIZE]); - if checksum != stored_checksum { - return Err(Error::ChecksumMismatch(stored_checksum, checksum)); - } + fn decode_buf(buf: &[u8]) -> Result { A::decode(&buf[..A::SIZE]).map_err(Error::Codec) } @@ -498,24 +462,20 @@ impl> Journal { let start_blob = start_pos / items_per_blob; assert!(start_blob <= self.tail_index); let blobs = self.blobs.range(start_blob..).collect::>(); - let full_size = items_per_blob * Self::CHUNK_SIZE_U64; - let mut blob_plus = Vec::with_capacity(blobs.len() + 1); + let mut readers = Vec::with_capacity(blobs.len() + 1); for (blob_index, blob) in blobs { - blob_plus.push((*blob_index, blob.clone_blob().await, full_size)); + let reader = blob.as_blob_reader(buffer).await?; + readers.push((*blob_index, reader)); } // Include the tail blob. 
- self.tail.sync().await?; // make sure no data is buffered - let tail_size = self.tail.size().await; - blob_plus.push((self.tail_index, self.tail.clone_blob().await, tail_size)); + let tail_reader = self.tail.as_blob_reader(buffer).await?; + readers.push((self.tail_index, tail_reader)); let start_offset = (start_pos % items_per_blob) * Self::CHUNK_SIZE_U64; // Replay all blobs in order and stream items as they are read (to avoid occupying too much // memory with buffered data). - let stream = stream::iter(blob_plus).flat_map(move |(blob_index, blob, size)| { - // Create a new reader and buffer for each blob. Preallocating the buffer here to avoid - // a per-iteration allocation improves performance by ~20%. - let mut reader = Read::new(blob, size, buffer); + let stream = stream::iter(readers).flat_map(move |(blob_index, mut reader)| { let buf = vec![0u8; Self::CHUNK_SIZE]; let initial_offset = if blob_index == start_blob { // If this is the very first blob then we need to seek to the starting position. @@ -538,7 +498,7 @@ impl> Journal { match reader.read_exact(&mut buf, Self::CHUNK_SIZE).await { Ok(()) => { let next_offset = offset + Self::CHUNK_SIZE_U64; - let result = Self::verify_integrity(&buf).map(|item| (item_pos, item)); + let result = Self::decode_buf(&buf).map(|item| (item_pos, item)); if result.is_err() { warn!("corrupted item at {item_pos}"); } @@ -550,7 +510,8 @@ impl> Journal { err = err.to_string(), "error reading item during replay" ); - Some((Err(Error::Runtime(err)), (buf, reader, size))) + let blob_size = reader.blob_size(); + Some((Err(Error::Runtime(err)), (buf, reader, blob_size))) } } }, @@ -698,11 +659,15 @@ mod tests { use super::*; use commonware_cryptography::{sha256::Digest, Hasher as _, Sha256}; use commonware_macros::test_traced; - use commonware_runtime::{deterministic, Blob, Runner, Storage}; - use commonware_utils::{NZUsize, NZU64}; + use commonware_runtime::{ + deterministic::{self, Context}, + Blob, Runner, Storage, + }; + use commonware_utils::{NZUsize, NZU16, NZU64}; use futures::{pin_mut, StreamExt}; + use std::num::NonZeroU16; - const PAGE_SIZE: NonZeroUsize = NZUsize!(44); + const PAGE_SIZE: NonZeroU16 = NZU16!(44); const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(3); /// Generate a SHA-256 digest for the given value. @@ -966,19 +931,16 @@ mod tests { journal.sync().await.expect("Failed to sync journal"); drop(journal); - // Corrupt one of the checksums and make sure it's detected. - let checksum_offset = Digest::SIZE as u64 - + (ITEMS_PER_BLOB.get() / 2) * (Digest::SIZE + u32::SIZE) as u64; + // Corrupt one of the bytes and make sure it's detected. let (blob, _) = context .open(&cfg.partition, &40u64.to_be_bytes()) .await .expect("Failed to open blob"); - // Write incorrect checksum - let bad_checksum = 123456789u32; - blob.write_at(bad_checksum.to_be_bytes().to_vec(), checksum_offset) + // Write junk bytes. + let bad_bytes = 123456789u32; + blob.write_at(bad_bytes.to_be_bytes().to_vec(), 1) .await - .expect("Failed to write incorrect checksum"); - let corrupted_item_pos = 40 * ITEMS_PER_BLOB.get() + ITEMS_PER_BLOB.get() / 2; + .expect("Failed to write bad bytes"); blob.sync().await.expect("Failed to sync blob"); // Re-initialize the journal to simulate a restart @@ -986,19 +948,22 @@ mod tests { .await .expect("Failed to re-initialize journal"); - // Make sure reading the corrupted item fails with appropriate error. 
- let err = journal.read(corrupted_item_pos).await.unwrap_err(); - assert!(matches!(err, Error::ChecksumMismatch(x, _) if x == bad_checksum)); + // Make sure reading an item that resides in the corrupted page fails. + let err = journal + .read(40 * ITEMS_PER_BLOB.get() + 1) + .await + .unwrap_err(); + assert!(matches!(err, Error::Runtime(_))); - // Replay all items, making sure the checksum mismatch error is handled correctly. + // Replay all items. { + let mut error_found = false; let stream = journal .replay(NZUsize!(1024), 0) .await .expect("failed to replay journal"); let mut items = Vec::new(); pin_mut!(stream); - let mut error_count = 0; while let Some(result) = stream.next().await { match result { Ok((pos, item)) => { @@ -1006,17 +971,13 @@ mod tests { items.push(pos); } Err(err) => { - error_count += 1; - assert!(matches!(err, Error::ChecksumMismatch(_, _))); + error_found = true; + assert!(matches!(err, Error::Runtime(_))); + break; } } } - assert_eq!(error_count, 1); - // Result will be missing only the one corrupted value. - assert_eq!( - items.len(), - ITEMS_PER_BLOB.get() as usize * 100 + ITEMS_PER_BLOB.get() as usize / 2 - 1 - ); + assert!(error_found); // error should abort replay } }); } @@ -1056,10 +1017,7 @@ mod tests { blob.resize(size - 1).await.expect("Failed to corrupt blob"); blob.sync().await.expect("Failed to sync blob"); let result = Journal::<_, Digest>::init(context.clone(), cfg.clone()).await; - assert!(matches!( - result.err().unwrap(), - Error::InvalidBlobSize(_, _) - )); + assert!(matches!(result.err().unwrap(), Error::Runtime(_))); // Delete a blob and make sure the gap is detected during initialization. context @@ -1096,32 +1054,9 @@ mod tests { journal.sync().await.expect("Failed to sync journal"); drop(journal); - // Truncate the tail blob by one byte, which should result in the 3rd item being - // trimmed. - let (blob, size) = context - .open(&cfg.partition, &1u64.to_be_bytes()) - .await - .expect("Failed to open blob"); - blob.resize(size - 1).await.expect("Failed to corrupt blob"); - - // Write incorrect checksum into the second item in the blob, which should result in the - // second item being trimmed. - let checksum_offset = Digest::SIZE + u32::SIZE + Digest::SIZE; - - let bad_checksum = 123456789u32; - blob.write_at(bad_checksum.to_be_bytes().to_vec(), checksum_offset as u64) - .await - .expect("Failed to write incorrect checksum"); - blob.sync().await.expect("Failed to sync blob"); - - let journal = Journal::<_, Digest>::init(context.clone(), cfg.clone()) - .await - .unwrap(); - - // Confirm 2 items were trimmed. - assert_eq!(journal.size(), item_count - 2); - - // Corrupt the last item, ensuring last blob is trimmed to empty state. + // Truncate the tail blob by one byte, which should result in the last page worth of + // data being discarded due to an invalid checksum. This will result in one item being + // lost. let (blob, size) = context .open(&cfg.partition, &1u64.to_be_bytes()) .await @@ -1133,8 +1068,8 @@ mod tests { .await .unwrap(); - // Confirm last item in blob was trimmed. - assert_eq!(journal.size(), item_count - 3); + // Confirm 1 item was lost. + assert_eq!(journal.size(), item_count - 1); // Cleanup. 
journal.destroy().await.expect("Failed to destroy journal"); @@ -1339,14 +1274,13 @@ mod tests { journal.sync().await.expect("Failed to sync journal"); drop(journal); - // Manually extend the blob by an amount at least some multiple of the chunk size to - // simulate a failure where the file was extended, but no bytes were written due to - // failure. + // Manually extend the blob to simulate a failure where the file was extended, but no + // bytes were written due to failure. let (blob, size) = context .open(&cfg.partition, &0u64.to_be_bytes()) .await .expect("Failed to open blob"); - blob.write_at(vec![0u8; Digest::SIZE * 3 - 1], size) + blob.write_at(vec![0u8; PAGE_SIZE.get() as usize * 3], size) .await .expect("Failed to extend blob"); blob.sync().await.expect("Failed to sync blob"); @@ -1356,7 +1290,7 @@ mod tests { .await .expect("Failed to re-initialize journal"); - // Ensure we've recovered to the state of a single item. + // No items should be lost since we called sync. assert_eq!(journal.size(), 1); assert_eq!(journal.oldest_retained_pos(), Some(0)); @@ -1367,10 +1301,6 @@ mod tests { .expect("failed to append data"); assert_eq!(journal.size(), 2); - // Get the value of the first item - let item = journal.read(0).await.unwrap(); - assert_eq!(item, test_digest(0)); - // Get the value of new item let item = journal.read(1).await.unwrap(); assert_eq!(item, test_digest(1)); @@ -1487,4 +1417,87 @@ mod tests { journal.destroy().await.unwrap(); }); } + + /// Test recovery when blob is truncated to a page boundary with item size not dividing page size. + /// + /// This tests the scenario where: + /// 1. Items (32 bytes) don't divide evenly into page size (44 bytes) + /// 2. Data spans multiple pages + /// 3. Blob is truncated to a page boundary (simulating crash before last page was written) + /// 4. Journal should recover correctly on reopen + #[test_traced] + fn test_fixed_journal_recover_from_page_boundary_truncation() { + let executor = deterministic::Runner::default(); + executor.start(|context: Context| async move { + // Use a small items_per_blob to keep the test focused on a single blob + let cfg = test_cfg(NZU64!(100)); + let mut journal = Journal::init(context.clone(), cfg.clone()) + .await + .expect("failed to initialize journal"); + + // Item size is 32 bytes (Digest), page size is 44 bytes. + // 32 doesn't divide 44, so items will cross page boundaries. + // Physical page size = 44 + 12 (CRC) = 56 bytes. + // + // Write enough items to span multiple pages: + // - 10 items = 320 logical bytes + // - This spans ceil(320/44) = 8 logical pages + for i in 0u64..10 { + journal + .append(test_digest(i)) + .await + .expect("failed to append data"); + } + assert_eq!(journal.size(), 10); + journal.sync().await.expect("Failed to sync journal"); + drop(journal); + + // Open the blob directly and truncate to a page boundary. + // Physical page size = PAGE_SIZE + CHECKSUM_SIZE = 44 + 12 = 56 + let physical_page_size = PAGE_SIZE.get() as u64 + 12; + let (blob, size) = context + .open(&cfg.partition, &0u64.to_be_bytes()) + .await + .expect("Failed to open blob"); + + // Calculate how many full physical pages we have and truncate to lose the last one. 
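// For example, with purely illustrative numbers: if the blob holds 8 full
// 56-byte physical pages (448 bytes), truncating to 7 pages leaves
// 7 * 44 = 308 logical bytes, i.e. 308 / 32 = 9 complete items surviving
// recovery.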
+ let full_pages = size / physical_page_size; + assert!(full_pages >= 2, "need at least 2 pages for this test"); + let truncate_to = (full_pages - 1) * physical_page_size; + + blob.resize(truncate_to) + .await + .expect("Failed to truncate blob"); + blob.sync().await.expect("Failed to sync blob"); + + // Re-initialize the journal - it should recover by truncating to valid data + let journal = Journal::<_, Digest>::init(context.clone(), cfg.clone()) + .await + .expect("Failed to re-initialize journal after page truncation"); + + // The journal should have fewer items now (those that fit in the remaining pages). + // With logical page size 44 and item size 32: + // - After truncating to (full_pages-1) physical pages, we have (full_pages-1)*44 logical bytes + // - Number of complete items = floor(logical_bytes / 32) + let remaining_logical_bytes = (full_pages - 1) * PAGE_SIZE.get() as u64; + let expected_items = remaining_logical_bytes / 32; // 32 = Digest::SIZE + assert_eq!( + journal.size(), + expected_items, + "Journal should recover to {} items after truncation", + expected_items + ); + + // Verify we can still read the remaining items + for i in 0..expected_items { + let item = journal + .read(i) + .await + .expect("failed to read recovered item"); + assert_eq!(item, test_digest(i), "item {} mismatch after recovery", i); + } + + journal.destroy().await.expect("Failed to destroy journal"); + }); + } } diff --git a/storage/src/journal/contiguous/variable.rs b/storage/src/journal/contiguous/variable.rs index 990cf32f7f..fddddaecf6 100644 --- a/storage/src/journal/contiguous/variable.rs +++ b/storage/src/journal/contiguous/variable.rs @@ -856,13 +856,17 @@ mod tests { use super::*; use crate::journal::contiguous::tests::run_contiguous_tests; use commonware_macros::test_traced; - use commonware_runtime::{buffer::PoolRef, deterministic, Runner}; - use commonware_utils::{NZUsize, NZU64}; + use commonware_runtime::{buffer::pool::PoolRef, deterministic, Runner}; + use commonware_utils::{NZUsize, NZU16, NZU64}; use futures::FutureExt as _; + use std::num::NonZeroU16; // Use some jank sizes to exercise boundary conditions. - const PAGE_SIZE: usize = 101; + const PAGE_SIZE: NonZeroU16 = NZU16!(101); const PAGE_CACHE_SIZE: usize = 2; + // Larger page sizes for tests that need more buffer space. + const LARGE_PAGE_SIZE: NonZeroU16 = NZU16!(1024); + const SMALL_PAGE_SIZE: NonZeroU16 = NZU16!(512); /// Test that complete offsets partition loss after pruning is detected as unrecoverable. 
/// @@ -878,7 +882,7 @@ mod tests { items_per_section: NZU64!(10), compression: None, codec_config: (), - buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)), + buffer_pool: PoolRef::new(LARGE_PAGE_SIZE, NZUsize!(10)), write_buffer: NZUsize!(1024), }; @@ -928,7 +932,7 @@ mod tests { items_per_section: NZU64!(10), compression: None, codec_config: (), - buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)), + buffer_pool: PoolRef::new(LARGE_PAGE_SIZE, NZUsize!(10)), write_buffer: NZUsize!(1024), }; @@ -993,7 +997,7 @@ mod tests { items_per_section: NZU64!(10), compression: None, codec_config: (), - buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)), + buffer_pool: PoolRef::new(LARGE_PAGE_SIZE, NZUsize!(10)), write_buffer: NZUsize!(1024), }, ) @@ -1015,7 +1019,7 @@ mod tests { items_per_section: NZU64!(10), compression: None, codec_config: (), - buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)), + buffer_pool: PoolRef::new(LARGE_PAGE_SIZE, NZUsize!(10)), write_buffer: NZUsize!(1024), }; @@ -1103,7 +1107,7 @@ mod tests { items_per_section: NZU64!(10), compression: None, codec_config: (), - buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)), + buffer_pool: PoolRef::new(LARGE_PAGE_SIZE, NZUsize!(10)), write_buffer: NZUsize!(1024), }; @@ -1186,7 +1190,7 @@ mod tests { items_per_section: NZU64!(10), compression: None, codec_config: (), - buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)), + buffer_pool: PoolRef::new(LARGE_PAGE_SIZE, NZUsize!(10)), write_buffer: NZUsize!(1024), }; @@ -1248,7 +1252,7 @@ mod tests { items_per_section: NZU64!(10), compression: None, codec_config: (), - buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)), + buffer_pool: PoolRef::new(LARGE_PAGE_SIZE, NZUsize!(10)), write_buffer: NZUsize!(1024), }; @@ -1285,7 +1289,7 @@ mod tests { items_per_section: NZU64!(10), compression: None, codec_config: (), - buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)), + buffer_pool: PoolRef::new(LARGE_PAGE_SIZE, NZUsize!(10)), write_buffer: NZUsize!(1024), }; @@ -1341,7 +1345,7 @@ mod tests { items_per_section: NZU64!(10), compression: None, codec_config: (), - buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)), + buffer_pool: PoolRef::new(LARGE_PAGE_SIZE, NZUsize!(10)), write_buffer: NZUsize!(1024), }; @@ -1409,7 +1413,7 @@ mod tests { items_per_section: NZU64!(10), compression: None, codec_config: (), - buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)), + buffer_pool: PoolRef::new(LARGE_PAGE_SIZE, NZUsize!(10)), write_buffer: NZUsize!(1024), }; @@ -1469,7 +1473,7 @@ mod tests { items_per_section: NZU64!(10), compression: None, codec_config: (), - buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)), + buffer_pool: PoolRef::new(LARGE_PAGE_SIZE, NZUsize!(10)), write_buffer: NZUsize!(1024), }; @@ -1536,7 +1540,7 @@ mod tests { items_per_section: NZU64!(10), compression: None, codec_config: (), - buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)), + buffer_pool: PoolRef::new(LARGE_PAGE_SIZE, NZUsize!(10)), write_buffer: NZUsize!(1024), }; @@ -1578,7 +1582,7 @@ mod tests { items_per_section: NZU64!(5), compression: None, codec_config: (), - buffer_pool: PoolRef::new(NZUsize!(512), NZUsize!(2)), + buffer_pool: PoolRef::new(SMALL_PAGE_SIZE, NZUsize!(2)), write_buffer: NZUsize!(1024), }; @@ -1611,7 +1615,7 @@ mod tests { items_per_section: NZU64!(5), compression: None, codec_config: (), - buffer_pool: PoolRef::new(NZUsize!(512), NZUsize!(2)), + buffer_pool: PoolRef::new(SMALL_PAGE_SIZE, NZUsize!(2)), write_buffer: NZUsize!(1024), }; @@ 
-1650,7 +1654,7 @@ mod tests { items_per_section: NZU64!(5), compression: None, codec_config: (), - buffer_pool: PoolRef::new(NZUsize!(512), NZUsize!(2)), + buffer_pool: PoolRef::new(SMALL_PAGE_SIZE, NZUsize!(2)), write_buffer: NZUsize!(1024), }; @@ -1684,7 +1688,7 @@ mod tests { items_per_section: NZU64!(5), compression: None, codec_config: (), - buffer_pool: PoolRef::new(NZUsize!(512), NZUsize!(2)), + buffer_pool: PoolRef::new(SMALL_PAGE_SIZE, NZUsize!(2)), write_buffer: NZUsize!(1024), }; @@ -1736,7 +1740,7 @@ mod tests { items_per_section: NZU64!(5), compression: None, codec_config: (), - buffer_pool: PoolRef::new(NZUsize!(512), NZUsize!(2)), + buffer_pool: PoolRef::new(SMALL_PAGE_SIZE, NZUsize!(2)), write_buffer: NZUsize!(1024), }; @@ -1767,7 +1771,7 @@ mod tests { items_per_section: NZU64!(5), compression: None, codec_config: (), - buffer_pool: PoolRef::new(NZUsize!(512), NZUsize!(2)), + buffer_pool: PoolRef::new(SMALL_PAGE_SIZE, NZUsize!(2)), write_buffer: NZUsize!(1024), }; @@ -1813,7 +1817,7 @@ mod tests { compression: None, codec_config: (), write_buffer: NZUsize!(1024), - buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)), + buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(PAGE_CACHE_SIZE)), }; // Initialize journal with sync boundaries when no existing data exists @@ -1851,7 +1855,7 @@ mod tests { compression: None, codec_config: (), write_buffer: NZUsize!(1024), - buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)), + buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(PAGE_CACHE_SIZE)), }; // Create initial journal with data in multiple sections @@ -1921,7 +1925,7 @@ mod tests { compression: None, codec_config: (), write_buffer: NZUsize!(1024), - buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)), + buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(PAGE_CACHE_SIZE)), }; #[allow(clippy::reversed_empty_ranges)] @@ -1946,7 +1950,7 @@ mod tests { compression: None, codec_config: (), write_buffer: NZUsize!(1024), - buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)), + buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(PAGE_CACHE_SIZE)), }; // Create initial journal with data exactly matching sync range @@ -2016,7 +2020,7 @@ mod tests { compression: None, codec_config: (), write_buffer: NZUsize!(1024), - buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)), + buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(PAGE_CACHE_SIZE)), }; // Create initial journal with data beyond sync range @@ -2060,7 +2064,7 @@ mod tests { compression: None, codec_config: (), write_buffer: NZUsize!(1024), - buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)), + buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(PAGE_CACHE_SIZE)), }; // Create initial journal with stale data @@ -2113,7 +2117,7 @@ mod tests { compression: None, codec_config: (), write_buffer: NZUsize!(1024), - buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)), + buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(PAGE_CACHE_SIZE)), }; // Create journal with data at section boundaries @@ -2182,7 +2186,7 @@ mod tests { compression: None, codec_config: (), write_buffer: NZUsize!(1024), - buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)), + buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(PAGE_CACHE_SIZE)), }; // Create journal with data in multiple sections diff --git a/storage/src/journal/mod.rs b/storage/src/journal/mod.rs index 9f57cce711..1f72e813a0 100644 --- a/storage/src/journal/mod.rs +++ 
b/storage/src/journal/mod.rs @@ -29,8 +29,6 @@ pub enum Error { InvalidBlobName(String), #[error("invalid blob size: index={0} size={1}")] InvalidBlobSize(u64, u64), - #[error("checksum mismatch: expected={0} actual={1}")] - ChecksumMismatch(u32, u32), #[error("item too large: size={0}")] ItemTooLarge(usize), #[error("already pruned to section: {0}")] @@ -61,4 +59,6 @@ pub enum Error { Corruption(String), #[error("invalid configuration: {0}")] InvalidConfiguration(String), + #[error("checksum mismatch: expected={0}, found={1}")] + ChecksumMismatch(u32, u32), } diff --git a/storage/src/journal/segmented/fixed.rs b/storage/src/journal/segmented/fixed.rs index 05cc2e0a68..a38d666a76 100644 --- a/storage/src/journal/segmented/fixed.rs +++ b/storage/src/journal/segmented/fixed.rs @@ -2,15 +2,12 @@ //! //! # Format //! -//! Data is stored in one blob per section. Within each blob, items are stored with -//! their checksum (CRC32): +//! Data is stored in one blob per section. Items are stored sequentially: //! //! ```text -//! +--------+-----------+--------+-----------+--------+----------+-------------+ -//! | item_0 | C(Item_0) | item_1 | C(Item_1) | ... | item_n-1 | C(Item_n-1) | -//! +--------+-----------+--------+-----------+--------+----------+-------------+ -//! -//! C = CRC32 +//! +--------+--------+--------+----------+ +//! | item_0 | item_1 | ... | item_n-1 | +//! +--------+--------+--------+----------+ //! ``` //! //! # Sync @@ -25,12 +22,8 @@ use super::manager::{AppendFactory, Config as ManagerConfig, Manager}; use crate::journal::Error; -use bytes::BufMut; -use commonware_codec::{CodecFixed, DecodeExt as _, FixedSize}; -use commonware_runtime::{ - buffer::{PoolRef, Read}, - Blob, Error as RError, Metrics, Storage, -}; +use commonware_codec::{CodecFixed, DecodeExt as _}; +use commonware_runtime::{buffer::PoolRef, Blob, Error as RError, Metrics, Storage}; use futures::{ stream::{self, Stream}, StreamExt, @@ -54,15 +47,15 @@ pub struct Config { /// A segmented journal with fixed-size entries. /// /// Each section is stored in a separate blob. Within each blob, items are -/// fixed-size with a CRC32 checksum appended. +/// fixed-size. pub struct Journal { manager: Manager, _array: PhantomData, } impl> Journal { - /// Size of each entry: item + CRC32 checksum. - pub const CHUNK_SIZE: usize = A::SIZE + u32::SIZE; + /// Size of each entry. + pub const CHUNK_SIZE: usize = A::SIZE; const CHUNK_SIZE_U64: u64 = Self::CHUNK_SIZE as u64; /// Initialize a new `Journal` instance. @@ -100,13 +93,9 @@ impl> Journal { } let position = size / Self::CHUNK_SIZE_U64; - // Pre-allocate exact size and write directly to avoid copying - let mut buf: Vec = Vec::with_capacity(Self::CHUNK_SIZE); - item.write(&mut buf); - let checksum = crc32fast::hash(&buf); - buf.put_u32(checksum); - - blob.append(buf).await?; + // Encode the item + let buf = item.encode_mut(); + blob.append(&buf).await?; trace!(section, position, "appended item"); Ok(position) @@ -136,7 +125,7 @@ impl> Journal { } let buf = blob.read_at(vec![0u8; Self::CHUNK_SIZE], offset).await?; - Self::verify_integrity(buf.as_ref()) + A::decode(buf.as_ref()).map_err(Error::Codec) } /// Read the last item in a section, if any. @@ -154,18 +143,7 @@ impl> Journal { let last_position = (size / Self::CHUNK_SIZE_U64) - 1; let offset = last_position * Self::CHUNK_SIZE_U64; let buf = blob.read_at(vec![0u8; Self::CHUNK_SIZE], offset).await?; - Self::verify_integrity(buf.as_ref()).map(Some) - } - - /// Verify the integrity of the item + checksum in `buf`. 
- fn verify_integrity(buf: &[u8]) -> Result { - let stored_checksum = - u32::from_be_bytes(buf[A::SIZE..].try_into().expect("checksum is 4 bytes")); - let checksum = crc32fast::hash(&buf[..A::SIZE]); - if checksum != stored_checksum { - return Err(Error::ChecksumMismatch(stored_checksum, checksum)); - } - A::decode(&buf[..A::SIZE]).map_err(Error::Codec) + A::decode(buf.as_ref()).map_err(Error::Codec).map(Some) } /// Returns a stream of all items starting from the given section. @@ -180,21 +158,22 @@ impl> Journal { start_section: u64, buffer: NonZeroUsize, ) -> Result> + '_, Error> { + // Pre-create readers from blobs (async operation) let mut blob_info = Vec::new(); for (§ion, blob) in self.manager.sections_from(start_section) { - let size = blob.size().await; - blob_info.push((section, blob.clone(), size)); + let blob_size = blob.size().await; + let reader = blob.as_blob_reader(buffer).await?; + blob_info.push((section, blob.clone(), reader, blob_size)); } - Ok( - stream::iter(blob_info).flat_map(move |(section, blob, blob_size)| { - let reader = Read::new(blob, blob_size, buffer); + Ok(stream::iter(blob_info).flat_map( + move |(section, blob, reader, blob_size)| { let buf = vec![0u8; Self::CHUNK_SIZE]; stream::unfold( - (section, buf, reader, 0u64, 0u64), - move |(section, mut buf, mut reader, offset, valid_size)| async move { - if offset >= reader.blob_size() { + (section, buf, blob, reader, 0u64, 0u64, blob_size), + move |(section, mut buf, blob, mut reader, offset, valid_size, blob_size)| async move { + if offset >= blob_size { return None; } @@ -202,25 +181,13 @@ impl> Journal { match reader.read_exact(&mut buf, Self::CHUNK_SIZE).await { Ok(()) => { let next_offset = offset + Self::CHUNK_SIZE_U64; - match Self::verify_integrity(&buf) { + match A::decode(buf.as_slice()).map_err(Error::Codec) { Ok(item) => Some(( Ok((section, position, item)), - (section, buf, reader, next_offset, next_offset), + (section, buf, blob, reader, next_offset, next_offset, blob_size), )), - Err(Error::ChecksumMismatch(expected, found)) => { - warn!( - section, - position, - expected, - found, - new_size = valid_size, - "corruption detected: truncating" - ); - reader.resize(valid_size).await.ok()?; - None - } Err(err) => { - Some((Err(err), (section, buf, reader, offset, valid_size))) + Some((Err(err), (section, buf, blob, reader, offset, valid_size, blob_size))) } } } @@ -231,21 +198,21 @@ impl> Journal { new_size = valid_size, "trailing bytes detected: truncating" ); - reader.resize(valid_size).await.ok()?; + blob.resize(valid_size).await.ok()?; None } Err(err) => { warn!(section, position, ?err, "unexpected error"); Some(( Err(Error::Runtime(err)), - (section, buf, reader, offset, valid_size), + (section, buf, blob, reader, offset, valid_size, blob_size), )) } } }, ) - }), - ) + }, + )) } /// Sync the given section to storage. 
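// Not part of the patch: a minimal sketch of the layout the checksum-free fixed format
// relies on. Each entry is exactly `A::SIZE` bytes with no per-item CRC, so a blob of
// `blob_size` bytes holds `blob_size / chunk_size` complete items; any remainder is
// trailing garbage that replay truncates back to `valid_size` (integrity is now covered
// by the buffer pool's page-level CRC records rather than per-entry checksums).
fn fixed_blob_layout(blob_size: u64, chunk_size: u64) -> (u64, u64) {
    let items = blob_size / chunk_size; // complete, decodable entries
    let valid_size = items * chunk_size; // size replay resizes the blob down to
    (items, valid_size)
}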
@@ -316,10 +283,11 @@ mod tests { use commonware_cryptography::{sha256::Digest, Hasher as _, Sha256}; use commonware_macros::test_traced; use commonware_runtime::{buffer::PoolRef, deterministic, Runner}; - use commonware_utils::NZUsize; + use commonware_utils::{NZUsize, NZU16}; + use core::num::NonZeroU16; use futures::{pin_mut, StreamExt}; - const PAGE_SIZE: NonZeroUsize = NZUsize!(44); + const PAGE_SIZE: NonZeroU16 = NZU16!(44); const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(3); fn test_digest(value: u64) -> Digest { diff --git a/storage/src/journal/segmented/manager.rs b/storage/src/journal/segmented/manager.rs index 3c4ca526e5..e424e3ffe8 100644 --- a/storage/src/journal/segmented/manager.rs +++ b/storage/src/journal/segmented/manager.rs @@ -5,7 +5,7 @@ use crate::journal::Error; use commonware_runtime::{ - buffer::{Append, PoolRef, Write}, + buffer::{pool::Append, PoolRef, Write}, telemetry::metrics::status::GaugeExt, Blob, Error as RError, Metrics, Storage, }; @@ -61,7 +61,7 @@ impl BufferFactory for AppendFactory { type Buffer = Append; async fn create(&self, blob: B, size: u64) -> Result { - Append::new(blob, size, self.write_buffer, self.pool_ref.clone()).await + Append::new(blob, size, self.write_buffer.get(), self.pool_ref.clone()).await } } diff --git a/storage/src/journal/segmented/oversized.rs b/storage/src/journal/segmented/oversized.rs index 18718f9397..baa8c864df 100644 --- a/storage/src/journal/segmented/oversized.rs +++ b/storage/src/journal/segmented/oversized.rs @@ -444,7 +444,7 @@ mod tests { use commonware_codec::{FixedSize, Read, ReadExt, Write}; use commonware_macros::test_traced; use commonware_runtime::{buffer::PoolRef, deterministic, Blob as _, Runner}; - use commonware_utils::NZUsize; + use commonware_utils::{NZUsize, NZU16}; /// Convert offset + size to byte end position (for truncation tests). fn byte_end(offset: u64, size: u32) -> u64 { @@ -512,7 +512,7 @@ mod tests { Config { index_partition: "test_index".to_string(), value_partition: "test_values".to_string(), - index_buffer_pool: PoolRef::new(NZUsize!(64), NZUsize!(8)), + index_buffer_pool: PoolRef::new(NZU16!(64), NZUsize!(8)), index_write_buffer: NZUsize!(1024), value_write_buffer: NZUsize!(1024), compression: None, @@ -900,7 +900,18 @@ mod tests { fn test_recovery_corrupted_last_index_entry() { let executor = deterministic::Runner::default(); executor.start(|context| async move { - let cfg = test_cfg(); + // Use page size = entry size so each entry is on its own page. + // This allows corrupting just the last entry's page without affecting others. + // Physical page size = TestEntry::SIZE (20) + 12 (CRC record) = 32 bytes. 
+ let cfg = Config { + index_partition: "test_index".to_string(), + value_partition: "test_values".to_string(), + index_buffer_pool: PoolRef::new(NZU16!(TestEntry::SIZE as u16), NZUsize!(8)), + index_write_buffer: NZUsize!(1024), + value_write_buffer: NZUsize!(1024), + compression: None, + codec_config: (), + }; // Create and populate let mut oversized: Oversized<_, TestEntry, TestValue> = @@ -908,7 +919,7 @@ mod tests { .await .expect("Failed to init"); - // Append 5 entries + // Append 5 entries (each on its own page) for i in 0..5u8 { let value: TestValue = [i; 16]; let entry = TestEntry::new(i as u64, 0, 0); @@ -920,34 +931,36 @@ mod tests { oversized.sync(1).await.expect("Failed to sync"); drop(oversized); - // Corrupt the last index entry's checksum + // Corrupt the last page's CRC to trigger page-level integrity failure let (blob, size) = context .open(&cfg.index_partition, &1u64.to_be_bytes()) .await .expect("Failed to open blob"); - // Each entry is TestEntry::SIZE (16) + 4 (CRC32) = 20 bytes - // Corrupt the CRC of the last entry - let last_entry_crc_offset = size - 4; - blob.write_at(vec![0xFF, 0xFF, 0xFF, 0xFF], last_entry_crc_offset) + // Physical page size = 20 + 12 = 32 bytes + // 5 entries = 5 pages = 160 bytes total + // Last page CRC starts at offset 160 - 12 = 148 + assert_eq!(size, 160); + let last_page_crc_offset = size - 12; + blob.write_at(vec![0xFF; 12], last_page_crc_offset) .await .expect("Failed to corrupt"); blob.sync().await.expect("Failed to sync"); drop(blob); - // Reinitialize - should detect corruption and scan backwards + // Reinitialize - should detect page corruption and truncate let mut oversized: Oversized<_, TestEntry, TestValue> = Oversized::init(context.clone(), cfg) .await .expect("Failed to reinit"); - // First 4 entries should be valid + // First 4 entries should be valid (on pages 0-3) for i in 0..4u8 { let entry = oversized.get(1, i as u64).await.expect("Failed to get"); assert_eq!(entry.id, i as u64); } - // Entry 4 should be gone (corrupted and rewound) + // Entry 4 should be gone (its page was corrupted) assert!(oversized.get(1, 4).await.is_err()); // Should be able to append after recovery @@ -1423,7 +1436,18 @@ mod tests { fn test_recovery_glob_synced_but_index_not() { let executor = deterministic::Runner::default(); executor.start(|context| async move { - let cfg = test_cfg(); + // Use page size = entry size so each entry is exactly one page. + // This allows truncating by entry count to equal truncating by full pages, + // maintaining page-level integrity. 
+ let cfg = Config { + index_partition: "test_index".to_string(), + value_partition: "test_values".to_string(), + index_buffer_pool: PoolRef::new(NZU16!(TestEntry::SIZE as u16), NZUsize!(8)), + index_write_buffer: NZUsize!(1024), + value_write_buffer: NZUsize!(1024), + compression: None, + codec_config: (), + }; // Create and populate let mut oversized: Oversized<_, TestEntry, TestValue> = @@ -1452,9 +1476,10 @@ mod tests { .await .expect("Failed to open blob"); - // Keep only first 2 index entries - let chunk_size = (TestEntry::SIZE + u32::SIZE) as u64; // entry + CRC32 - blob.resize(2 * chunk_size) + // Keep only first 2 index entries (2 full pages) + // Physical page size = logical (20) + CRC record (12) = 32 bytes + let physical_page_size = (TestEntry::SIZE + 12) as u64; + blob.resize(2 * physical_page_size) .await .expect("Failed to truncate"); blob.sync().await.expect("Failed to sync"); @@ -1692,7 +1717,18 @@ mod tests { // Simulates crash where index was rewound but glob wasn't let executor = deterministic::Runner::default(); executor.start(|context| async move { - let cfg = test_cfg(); + // Use page size = entry size so each entry is exactly one page. + // This allows truncating by entry count to equal truncating by full pages, + // maintaining page-level integrity. + let cfg = Config { + index_partition: "test_index".to_string(), + value_partition: "test_values".to_string(), + index_buffer_pool: PoolRef::new(NZU16!(TestEntry::SIZE as u16), NZUsize!(8)), + index_write_buffer: NZUsize!(1024), + value_write_buffer: NZUsize!(1024), + compression: None, + codec_config: (), + }; // Create and populate let mut oversized: Oversized<_, TestEntry, TestValue> = @@ -1719,8 +1755,9 @@ mod tests { .open(&cfg.index_partition, &1u64.to_be_bytes()) .await .expect("Failed to open blob"); - let chunk_size = (TestEntry::SIZE + u32::SIZE) as u64; - blob.resize(2 * chunk_size) + // Physical page size = logical (20) + CRC record (12) = 32 bytes + let physical_page_size = (TestEntry::SIZE + 12) as u64; + blob.resize(2 * physical_page_size) .await .expect("Failed to truncate"); blob.sync().await.expect("Failed to sync"); @@ -1845,15 +1882,20 @@ mod tests { // Size 0 - should fail assert!(oversized.get_value(1, offset, 0).await.is_err()); - // Size < CRC_SIZE (1, 2, 3 bytes) - should fail with BlobInsufficientLength + // Size < value size - should fail with codec error, checksum mismatch, or + // insufficient length (if size < 4 bytes for checksum) for size in 1..4u32 { let result = oversized.get_value(1, offset, size).await; - assert!(matches!( - result, - Err(Error::Runtime( - commonware_runtime::Error::BlobInsufficientLength - )) - )); + assert!( + matches!( + result, + Err(Error::Codec(_)) + | Err(Error::ChecksumMismatch(_, _)) + | Err(Error::Runtime(_)) + ), + "expected error, got: {:?}", + result + ); } oversized.destroy().await.expect("Failed to destroy"); @@ -1877,9 +1919,17 @@ mod tests { .expect("Failed to append"); oversized.sync(1).await.expect("Failed to sync"); - // Size too small (but >= CRC_SIZE) - checksum mismatch + // Size too small - will fail to decode or checksum mismatch + // (checksum mismatch can occur because we read wrong bytes as the checksum) let result = oversized.get_value(1, offset, correct_size - 1).await; - assert!(matches!(result, Err(Error::ChecksumMismatch(_, _)))); + assert!( + matches!( + result, + Err(Error::Codec(_)) | Err(Error::ChecksumMismatch(_, _)) + ), + "expected Codec or ChecksumMismatch error, got: {:?}", + result + ); 
oversized.destroy().await.expect("Failed to destroy"); }); @@ -2394,7 +2444,16 @@ mod tests { // when added to size is detected as invalid during recovery. let executor = deterministic::Runner::default(); executor.start(|context| async move { - let cfg = test_cfg(); + // Use page size = entry size so one entry per page + let cfg = Config { + index_partition: "test_index".to_string(), + value_partition: "test_values".to_string(), + index_buffer_pool: PoolRef::new(NZU16!(TestEntry::SIZE as u16), NZUsize!(8)), + index_write_buffer: NZUsize!(1024), + value_write_buffer: NZUsize!(1024), + compression: None, + codec_config: (), + }; // Create and populate with valid entry let mut oversized: Oversized<_, TestEntry, TestValue> = @@ -2411,25 +2470,38 @@ mod tests { oversized.sync(1).await.expect("Failed to sync"); drop(oversized); - // Corrupt the index entry to have offset near u64::MAX - // Entry format: id (8) + value_offset (8) + value_size (4) + CRC32 (4) = 24 bytes + // Build a corrupted entry with offset near u64::MAX that would overflow. + // We need to write a valid page (with correct page-level CRC) containing + // the semantically-invalid entry data. let (blob, _) = context .open(&cfg.index_partition, &1u64.to_be_bytes()) .await .expect("Failed to open blob"); - // Write a corrupted entry with offset = u64::MAX - 10 and size = 100 - // This would overflow when computing offset + size - let mut corrupted_entry = Vec::new(); - 1u64.write(&mut corrupted_entry); // id - (u64::MAX - 10).write(&mut corrupted_entry); // value_offset (near max) - 100u32.write(&mut corrupted_entry); // value_size - let checksum = crc32fast::hash(&corrupted_entry); - corrupted_entry.put_u32(checksum); - - blob.write_at(corrupted_entry, 0) - .await - .expect("Failed to write corrupted entry"); + // Build entry data: id (8) + value_offset (8) + value_size (4) = 20 bytes + let mut entry_data = Vec::new(); + 1u64.write(&mut entry_data); // id + (u64::MAX - 10).write(&mut entry_data); // value_offset (near max) + 100u32.write(&mut entry_data); // value_size (offset + size overflows) + assert_eq!(entry_data.len(), TestEntry::SIZE); + + // Build page-level CRC record (12 bytes): + // len1 (2) + crc1 (4) + len2 (2) + crc2 (4) + let crc = crc32fast::hash(&entry_data); + let len1 = TestEntry::SIZE as u16; + let mut crc_record = Vec::new(); + crc_record.extend_from_slice(&len1.to_be_bytes()); // len1 + crc_record.extend_from_slice(&crc.to_be_bytes()); // crc1 + crc_record.extend_from_slice(&0u16.to_be_bytes()); // len2 (unused) + crc_record.extend_from_slice(&0u32.to_be_bytes()); // crc2 (unused) + assert_eq!(crc_record.len(), 12); + + // Write the complete physical page: entry_data + crc_record + let mut page = entry_data; + page.extend_from_slice(&crc_record); + blob.write_at(page, 0) + .await + .expect("Failed to write corrupted page"); blob.sync().await.expect("Failed to sync"); drop(blob); diff --git a/storage/src/journal/segmented/variable.rs b/storage/src/journal/segmented/variable.rs index f545f33c9e..2a040540a5 100644 --- a/storage/src/journal/segmented/variable.rs +++ b/storage/src/journal/segmented/variable.rs @@ -11,21 +11,13 @@ //! Within a `section`, data is appended as an `item` with the following format: //! //! ```text -//! +---+---+---+---+---+---+---+---+---+---+---+---+ -//! | 0 ~ 4 | ... | 8 | 9 |10 |11 | -//! +---+---+---+---+---+---+---+---+---+---+---+---+ -//! | Size (varint u32) | Data | C(u32) | -//! +---+---+---+---+---+---+---+---+---+---+---+---+ -//! -//! 
Size = u32 as varint (1 to 5 bytes) -//! C = CRC32(Size | Data) +//! +---+---+---+---+---+---+---+---+ +//! | 0 ~ 4 | ... | +//! +---+---+---+---+---+---+---+---+ +//! | Size (varint u32) | Data | +//! +---+---+---+---+---+---+---+---+ //! ``` //! -//! _To ensure data returned by `Journal` is correct, a checksum (CRC32) is stored at the end of -//! each item. If the checksum of the read data does not match the stored checksum, an error is -//! returned. This checksum is only verified when data is accessed and not at startup (which would -//! require reading all data in `Journal`)._ -//! //! # Open Blobs //! //! `Journal` uses 1 `commonware-storage::Blob` per `section` to store data. All `Blobs` in a given @@ -64,7 +56,7 @@ //! ```rust //! use commonware_runtime::{Spawner, Runner, deterministic, buffer::PoolRef}; //! use commonware_storage::journal::segmented::variable::{Journal, Config}; -//! use commonware_utils::NZUsize; +//! use commonware_utils::{NZUsize, NZU16}; //! //! let executor = deterministic::Runner::default(); //! executor.start(|context| async move { @@ -73,7 +65,7 @@ //! partition: "partition".to_string(), //! compression: None, //! codec_config: (), -//! buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)), +//! buffer_pool: PoolRef::new(NZU16!(1024), NZUsize!(10)), //! write_buffer: NZUsize!(1024 * 1024), //! }).await.unwrap(); //! @@ -88,13 +80,13 @@ use super::manager::{AppendFactory, Config as ManagerConfig, Manager}; use crate::journal::Error; use bytes::{Buf, BufMut}; -use commonware_codec::{varint::UInt, Codec, EncodeSize, FixedSize, ReadExt, Write as CodecWrite}; +use commonware_codec::{varint::UInt, Codec, EncodeSize, ReadExt, Write as CodecWrite}; use commonware_runtime::{ - buffer::{Append, PoolRef, Read}, + buffer::pool::{Append, PoolRef, Read}, Blob, Error as RError, Metrics, Storage, }; use futures::stream::{self, Stream, StreamExt}; -use std::{io::Cursor, num::NonZeroUsize}; +use std::{borrow::Cow, io::Cursor, num::NonZeroUsize}; use tracing::{trace, warn}; use zstd::{bulk::compress, decode_all}; @@ -118,10 +110,69 @@ pub struct Config { pub write_buffer: NonZeroUsize, } -/// Minimum size of any item: 1 byte varint (size=0) + 0 bytes data + 4 bytes checksum. -/// This is also the max varint size for u32, so we can always read this many bytes -/// at the start of an item to get the complete varint. -const MIN_ITEM_SIZE: usize = 5; +/// Maximum size of a varint for u32 (also the minimum useful read size for parsing item headers). +const MAX_VARINT_SIZE: usize = 5; + +/// Decodes a varint length prefix from a buffer. +/// Returns (item_size, varint_len). +#[inline] +fn decode_length_prefix(buf: &[u8]) -> Result<(usize, usize), Error> { + let mut cursor = buf; + let size = UInt::::read(&mut cursor)?.0 as usize; + let varint_len = buf.len() - cursor.remaining(); + Ok((size, varint_len)) +} + +/// Result of finding an item in a buffer. +enum Item<'a> { + /// All item data is available in the buffer. + Complete(&'a [u8]), + /// Need to read more bytes. Buffer has been allocated and prefix copied. + Incomplete { + buffer: Vec, + filled: usize, + /// Offset to read remaining bytes from (for offset-based readers). + read_offset: u64, + }, +} + +/// Find an item in a buffer by decoding its length prefix. +/// +/// Returns (next_offset, size, item). 
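// Not part of the patch: a self-contained sketch of the new frame layout parsed by
// `decode_length_prefix`/`find_item` -- a u32 varint length prefix followed immediately
// by the item bytes, with no trailing checksum. The codec calls mirror those already
// used in this file; treat the helpers below as illustrative, not as the shipped API.
fn frame(item: &[u8]) -> Vec<u8> {
    use bytes::BufMut;
    use commonware_codec::{varint::UInt, EncodeSize, Write as CodecWrite};
    let mut buf = Vec::with_capacity(UInt(item.len() as u32).encode_size() + item.len());
    UInt(item.len() as u32).write(&mut buf); // 1-5 byte varint size prefix
    buf.put_slice(item); // raw (possibly compressed) item bytes
    buf
}

// Round-trip check against the `decode_length_prefix` helper defined above:
fn frame_round_trip() -> Result<(), Error> {
    let framed = frame(b"hello");
    let (size, varint_len) = decode_length_prefix(&framed)?;
    assert_eq!(size, 5);
    assert_eq!(&framed[varint_len..varint_len + size], b"hello");
    Ok(())
}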
+fn find_item(buf: &[u8], available: usize, offset: u64) -> Result<(u64, u32, Item<'_>), Error> { + let (size, varint_len) = decode_length_prefix(&buf[..available])?; + let next_offset = offset + .checked_add(varint_len as u64) + .ok_or(Error::OffsetOverflow)? + .checked_add(size as u64) + .ok_or(Error::OffsetOverflow)?; + let buffered = available.saturating_sub(varint_len); + + let item = if buffered >= size { + Item::Complete(&buf[varint_len..varint_len + size]) + } else { + let mut buffer = vec![0u8; size]; + buffer[..buffered].copy_from_slice(&buf[varint_len..varint_len + buffered]); + Item::Incomplete { + buffer, + filled: buffered, + read_offset: offset + varint_len as u64 + buffered as u64, + } + }; + + Ok((next_offset, size as u32, item)) +} + +/// Decode item data with optional decompression. +fn decode_item(item_data: &[u8], cfg: &V::Cfg, compressed: bool) -> Result { + if compressed { + let decompressed = + decode_all(Cursor::new(item_data)).map_err(|_| Error::DecompressionFailed)?; + V::decode_cfg(decompressed.as_ref(), cfg).map_err(Error::Codec) + } else { + V::decode_cfg(item_data, cfg).map_err(Error::Codec) + } +} /// Implementation of `Journal` storage. pub struct Journal { @@ -164,108 +215,77 @@ impl Journal { blob: &Append, offset: u64, ) -> Result<(u64, u32, V), Error> { - // Read varint size (max 5 bytes for u32) - let mut hasher = crc32fast::Hasher::new(); - let varint_buf = blob.read_at(vec![0; MIN_ITEM_SIZE], offset).await?; - let mut varint = varint_buf.as_ref(); - let size = UInt::::read(&mut varint).map_err(Error::Codec)?.0 as usize; - let varint_len = MIN_ITEM_SIZE - varint.remaining(); - hasher.update(&varint_buf.as_ref()[..varint_len]); - let offset = offset - .checked_add(varint_len as u64) - .ok_or(Error::OffsetOverflow)?; - - // Read remaining - let buf_size = size.checked_add(u32::SIZE).ok_or(Error::OffsetOverflow)?; - let buf = blob.read_at(vec![0u8; buf_size], offset).await?; - let buf = buf.as_ref(); - let next_offset = offset - .checked_add(buf_size as u64) - .ok_or(Error::OffsetOverflow)?; - - // Read item - let item = &buf[..size]; - hasher.update(item); - - // Verify integrity - let checksum = hasher.finalize(); - let stored_checksum = u32::from_be_bytes(buf[size..].try_into().unwrap()); - if checksum != stored_checksum { - return Err(Error::ChecksumMismatch(stored_checksum, checksum)); - } - - // If compression is enabled, decompress the item - let item = if compressed { - let decompressed = - decode_all(Cursor::new(&item)).map_err(|_| Error::DecompressionFailed)?; - V::decode_cfg(decompressed.as_ref(), cfg).map_err(Error::Codec)? - } else { - V::decode_cfg(item, cfg).map_err(Error::Codec)? + // Read varint header (max 5 bytes for u32) + let buf = vec![0u8; MAX_VARINT_SIZE]; + let (buf, available) = blob.read_up_to(buf, offset).await?; + let (next_offset, size, item) = find_item(buf.as_ref(), available, offset)?; + + // Get item bytes - either from buffer directly or by reading more + let item_data: Cow<'_, [u8]> = match item { + Item::Complete(data) => Cow::Borrowed(data), + Item::Incomplete { + mut buffer, + filled, + read_offset, + } => { + blob.read_into(&mut buffer[filled..], read_offset).await?; + Cow::Owned(buffer) + } }; - // Return item - Ok((next_offset, size as u32, item)) + // Decode item (with optional decompression) + let decoded = decode_item::(item_data.as_ref(), cfg, compressed)?; + Ok((next_offset, size, decoded)) } /// Helper function to read an item from a [Read]. 
+ /// + /// The `varint_buf` parameter is a reusable buffer for reading the varint header to avoid + /// allocating a new buffer for every item. async fn read_buffered( - reader: &mut Read>, + reader: &mut Read, offset: u64, cfg: &V::Cfg, compressed: bool, + varint_buf: &mut Vec, ) -> Result<(u64, u64, u32, V), Error> { // If we're not at the right position, seek to it if reader.position() != offset { reader.seek_to(offset).map_err(Error::Runtime)?; } - // Read varint size (max 5 bytes for u32, and min item size is 5 bytes) - let mut hasher = crc32fast::Hasher::new(); - let mut varint_buf = [0u8; MIN_ITEM_SIZE]; - reader - .read_exact(&mut varint_buf, MIN_ITEM_SIZE) - .await - .map_err(Error::Runtime)?; - let mut varint = varint_buf.as_ref(); - let size = UInt::::read(&mut varint).map_err(Error::Codec)?.0 as usize; - let varint_len = MIN_ITEM_SIZE - varint.remaining(); - hasher.update(&varint_buf[..varint_len]); - - // Read remaining data+checksum (we already have some bytes from the varint read) - let buf_size = size.checked_add(u32::SIZE).ok_or(Error::OffsetOverflow)?; - let already_read = MIN_ITEM_SIZE - varint_len; - let mut buf = vec![0u8; buf_size]; - buf[..already_read].copy_from_slice(&varint_buf[varint_len..]); - if buf_size > already_read { - reader - .read_exact(&mut buf[already_read..], buf_size - already_read) - .await - .map_err(Error::Runtime)?; - } - - // Read item - let item = &buf[..size]; - hasher.update(item); - - // Verify integrity - let checksum = hasher.finalize(); - let stored_checksum = u32::from_be_bytes(buf[size..].try_into().unwrap()); - if checksum != stored_checksum { - return Err(Error::ChecksumMismatch(stored_checksum, checksum)); - } - - // If compression is enabled, decompress the item - let item = if compressed { - let decompressed = - decode_all(Cursor::new(&item)).map_err(|_| Error::DecompressionFailed)?; - V::decode_cfg(decompressed.as_ref(), cfg).map_err(Error::Codec)? - } else { - V::decode_cfg(item, cfg).map_err(Error::Codec)? + // Read varint header (max 5 bytes for u32). Reuse the provided buffer. + varint_buf.clear(); + varint_buf.resize(MAX_VARINT_SIZE, 0); + let buf = std::mem::take(varint_buf); + let (buf, available) = reader.read_up_to(buf).await?; + let (next_offset, size, item) = find_item(buf.as_ref(), available, offset)?; + + // Get item bytes - either from buffer directly or by reading more + let item_data: Cow<'_, [u8]> = match item { + Item::Complete(data) => { + // We already have all the data we need, but reader position may be ahead. + // Seek to the correct next position. + reader.seek_to(next_offset).map_err(Error::Runtime)?; + Cow::Borrowed(data) + } + Item::Incomplete { + mut buffer, filled, .. 
+ } => { + reader + .read_exact(&mut buffer[filled..], size as usize - filled) + .await + .map_err(Error::Runtime)?; + Cow::Owned(buffer) + } }; - // Calculate next offset - let next_offset = reader.position(); - Ok((next_offset, next_offset, size as u32, item)) + // Decode item (with optional decompression) + let decoded = decode_item::(item_data.as_ref(), cfg, compressed)?; + + // Restore the buffer for reuse in the next iteration + *varint_buf = buf.into(); + Ok((next_offset, next_offset, size, decoded)) } /// Returns an ordered stream of all items in the journal starting with the item at the given @@ -286,7 +306,7 @@ impl Journal { mut offset: u64, buffer: NonZeroUsize, ) -> Result> + '_, Error> { - // Collect all blobs to replay + // Collect all blobs to replay (keeping blob reference for potential resize) let codec_config = self.codec_config.clone(); let compressed = self.compression.is_some(); let mut blobs = Vec::new(); @@ -295,6 +315,7 @@ impl Journal { blobs.push(( section, blob.clone(), + blob.as_blob_reader(buffer).await?, blob_size, codec_config.clone(), compressed, @@ -304,9 +325,7 @@ impl Journal { // Replay all blobs in order and stream items as they are read (to avoid occupying too much // memory with buffered data) Ok(stream::iter(blobs).flat_map( - move |(section, blob, blob_size, codec_config, compressed)| { - // Created buffered reader - let mut reader = Read::new(blob, blob_size, buffer); + move |(section, blob, mut reader, blob_size, codec_config, compressed)| { if section == start_section && offset != 0 { if let Err(err) = reader.seek_to(offset) { warn!(section, offset, ?err, "failed to seek to offset"); @@ -317,25 +336,29 @@ impl Journal { offset = 0; } - // Read over the blob + // Read over the blob. Include a reusable buffer for varint parsing. stream::unfold( ( section, + blob, reader, offset, 0u64, blob_size, codec_config, compressed, + Vec::with_capacity(MAX_VARINT_SIZE), ), move |( section, + blob, mut reader, offset, valid_size, blob_size, codec_config, compressed, + mut varint_buf, )| async move { // Check if we are at the end of the blob if offset >= blob_size { @@ -343,8 +366,14 @@ impl Journal { } // Read an item from the buffer - match Self::read_buffered(&mut reader, offset, &codec_config, compressed) - .await + match Self::read_buffered( + &mut reader, + offset, + &codec_config, + compressed, + &mut varint_buf, + ) + .await { Ok((next_offset, next_valid_size, size, item)) => { trace!(blob = section, cursor = offset, "replayed item"); @@ -352,30 +381,17 @@ impl Journal { Ok((section, offset, size, item)), ( section, + blob, reader, next_offset, next_valid_size, blob_size, codec_config, compressed, + varint_buf, ), )) } - Err(Error::ChecksumMismatch(expected, found)) => { - // If we encounter corruption, we prune to the last valid item. This - // can happen during an unclean file close (where pending data is not - // fully synced to disk). - warn!( - blob = section, - bad_offset = offset, - new_size = valid_size, - expected, - found, - "corruption detected: truncating" - ); - reader.resize(valid_size).await.ok()?; - None - } Err(Error::Runtime(RError::BlobInsufficientLength)) => { // If we encounter trailing bytes, we prune to the last // valid item. 
This can happen during an unclean file close (where @@ -386,7 +402,7 @@ impl Journal { new_size = valid_size, "trailing bytes detected: truncating" ); - reader.resize(valid_size).await.ok()?; + blob.resize(valid_size).await.ok()?; None } Err(err) => { @@ -397,12 +413,14 @@ impl Journal { Err(err), ( section, + blob, reader, offset, valid_size, blob_size, codec_config, compressed, + varint_buf, ), )) } @@ -417,16 +435,8 @@ impl Journal { /// Appends an item to `Journal` in a given `section`, returning the offset /// where the item was written and the size of the item (which may now be smaller /// than the encoded size from the codec, if compression is enabled). - /// - /// # Warning - /// - /// If there exist trailing bytes in the `Blob` of a particular `section` and - /// `replay` is not called before this, it is likely that subsequent data added - /// to the `Blob` will be considered corrupted (as the trailing bytes will fail - /// the checksum verification). It is recommended to call `replay` before calling - /// `append` to prevent this. pub async fn append(&mut self, section: u64, item: V) -> Result<(u64, u32), Error> { - // Create buffer with item data + // Create buffer with item data (no checksum, no alignment) let (buf, item_len) = if let Some(compression) = self.compression { // Compressed: encode first, then compress let encoded = item.encode(); @@ -440,14 +450,11 @@ impl Journal { let size_len = UInt(item_len_u32).encode_size(); let entry_len = size_len .checked_add(item_len) - .and_then(|v| v.checked_add(4)) .ok_or(Error::OffsetOverflow)?; let mut buf = Vec::with_capacity(entry_len); UInt(item_len_u32).write(&mut buf); buf.put_slice(&compressed); - let checksum = crc32fast::hash(&buf); - buf.put_u32(checksum); (buf, item_len) } else { @@ -460,14 +467,11 @@ impl Journal { let size_len = UInt(item_len_u32).encode_size(); let entry_len = size_len .checked_add(item_len) - .and_then(|v| v.checked_add(4)) .ok_or(Error::OffsetOverflow)?; let mut buf = Vec::with_capacity(entry_len); UInt(item_len_u32).write(&mut buf); item.write(&mut buf); - let checksum = crc32fast::hash(&buf); - buf.put_u32(checksum); (buf, item_len) }; @@ -475,11 +479,11 @@ impl Journal { // Get or create blob let blob = self.manager.get_or_create(section).await?; - // Get current position - this is where we'll write + // Get current position - this is where we'll write (no alignment) let offset = blob.size().await; // Append item to blob - blob.append(buf).await?; + blob.append(&buf).await?; trace!(blob = section, offset, "appended item"); Ok((offset, item_len as u32)) } @@ -596,11 +600,12 @@ mod tests { use super::*; use bytes::BufMut; use commonware_macros::test_traced; - use commonware_runtime::{deterministic, Runner}; - use commonware_utils::NZUsize; + use commonware_runtime::{deterministic, Blob, Runner, Storage}; + use commonware_utils::{NZUsize, NZU16}; use futures::{pin_mut, StreamExt}; + use std::num::NonZeroU16; - const PAGE_SIZE: NonZeroUsize = NZUsize!(1024); + const PAGE_SIZE: NonZeroU16 = NZU16!(1024); const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(10); #[test_traced] @@ -1110,7 +1115,7 @@ mod tests { let item_size: u32 = 10; // Size indicates 10 bytes of data let mut buf = Vec::new(); UInt(item_size).write(&mut buf); // Varint encoding - let data = [2u8; 5]; // Only 5 bytes, not 10 + 4 (checksum) + let data = [2u8; 5]; BufMut::put_slice(&mut buf, &data); blob.write_at(buf, 0) .await @@ -1343,23 +1348,17 @@ mod tests { } drop(journal); - // Verify that only non-corrupted items were replayed - 
assert_eq!(items.len(), 3); + // Verify that replay stopped after corruption detected (the second blob). + assert_eq!(items.len(), 1); assert_eq!(items[0].0, 1); assert_eq!(items[0].1, 1); - assert_eq!(items[1].0, data_items[0].0); - assert_eq!(items[1].1, data_items[0].1); - assert_eq!(items[2].0, data_items[1].0); - assert_eq!(items[2].1, data_items[1].1); - // Confirm blob is expected length - // entry = 1 (varint for 4) + 4 (data) + 4 (checksum) = 9 bytes - // Item 2 ends at position 9 + 9 = 18 + // Confirm second blob was truncated. let (_, blob_size) = context .open(&cfg.partition, &2u64.to_be_bytes()) .await .expect("Failed to open blob"); - assert_eq!(blob_size, 18); + assert_eq!(blob_size, 0); // Attempt to replay journal after truncation let mut journal = Journal::init(context.clone(), cfg.clone()) @@ -1383,34 +1382,21 @@ mod tests { } // Verify that only non-corrupted items were replayed - assert_eq!(items.len(), 3); + assert_eq!(items.len(), 1); assert_eq!(items[0].0, 1); assert_eq!(items[0].1, 1); - assert_eq!(items[1].0, data_items[0].0); - assert_eq!(items[1].1, data_items[0].1); - assert_eq!(items[2].0, data_items[1].0); - assert_eq!(items[2].1, data_items[1].1); // Append a new item to truncated partition - let (offset, _) = journal.append(2, 5).await.expect("Failed to append data"); + let (_offset, _) = journal.append(2, 5).await.expect("Failed to append data"); journal.sync(2).await.expect("Failed to sync blob"); - // Get the new item - let item = journal.get(2, offset).await.expect("Failed to get item"); + // Get the new item (offset is 0 since blob was truncated) + let item = journal.get(2, 0).await.expect("Failed to get item"); assert_eq!(item, 5); // Drop the journal (data already synced) drop(journal); - // Confirm blob is expected length - // Items 1 and 2 at positions 0 and 9, item 3 (value 5) at position 18 - // Item 3 = 1 (varint) + 4 (data) + 4 (checksum) = 9 bytes, ends at 27 - let (_, blob_size) = context - .open(&cfg.partition, &2u64.to_be_bytes()) - .await - .expect("Failed to open blob"); - assert_eq!(blob_size, 27); - // Re-initialize the journal to simulate a restart let journal = Journal::init(context.clone(), cfg.clone()) .await @@ -1433,15 +1419,11 @@ mod tests { } // Verify that only non-corrupted items were replayed - assert_eq!(items.len(), 4); + assert_eq!(items.len(), 2); assert_eq!(items[0].0, 1); assert_eq!(items[0].1, 1); - assert_eq!(items[1].0, data_items[0].0); - assert_eq!(items[1].1, data_items[0].1); - assert_eq!(items[2].0, data_items[1].0); - assert_eq!(items[2].1, data_items[1].1); - assert_eq!(items[3].0, 2); - assert_eq!(items[3].1, 5); + assert_eq!(items[1].0, 2); + assert_eq!(items[1].1, 5); }); } @@ -1624,6 +1606,67 @@ mod tests { }); } + #[test_traced] + fn test_journal_small_items() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let cfg = Config { + partition: "test_partition".into(), + compression: None, + codec_config: (), + buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + write_buffer: NZUsize!(1024), + }; + + let mut journal = Journal::init(context.clone(), cfg.clone()) + .await + .expect("Failed to initialize journal"); + + // Append many small (1-byte) items to the same section + let num_items = 100; + let mut offsets = Vec::new(); + for i in 0..num_items { + let (offset, size) = journal + .append(1, i as u8) + .await + .expect("Failed to append data"); + assert_eq!(size, 1, "u8 should encode to 1 byte"); + offsets.push(offset); + } + 
journal.sync(1).await.expect("Failed to sync"); + + // Read each item back via random access + for (i, &offset) in offsets.iter().enumerate() { + let item: u8 = journal.get(1, offset).await.expect("Failed to get item"); + assert_eq!(item, i as u8, "Item mismatch at offset {offset}"); + } + + // Drop and reopen to test replay + drop(journal); + let journal = Journal::<_, u8>::init(context, cfg) + .await + .expect("Failed to re-initialize journal"); + + // Replay and verify all items + let stream = journal + .replay(0, 0, NZUsize!(1024)) + .await + .expect("Failed to setup replay"); + pin_mut!(stream); + + let mut count = 0; + while let Some(result) = stream.next().await { + let (section, offset, size, item) = result.expect("Failed to replay item"); + assert_eq!(section, 1); + assert_eq!(offset, offsets[count]); + assert_eq!(size, 1); + assert_eq!(item, count as u8); + count += 1; + } + assert_eq!(count, num_items, "Should replay all items"); + }); + } + #[test_traced] fn test_journal_rewind_many_sections() { let executor = deterministic::Runner::default(); diff --git a/storage/src/mmr/journaled.rs b/storage/src/mmr/journaled.rs index 4ddb5cc82f..eeed74c371 100644 --- a/storage/src/mmr/journaled.rs +++ b/storage/src/mmr/journaled.rs @@ -835,14 +835,15 @@ mod tests { }; use commonware_macros::test_traced; use commonware_runtime::{buffer::PoolRef, deterministic, Blob as _, Runner}; - use commonware_utils::{hex, NZUsize, NZU64}; + use commonware_utils::{hex, NZUsize, NZU16, NZU64}; + use std::num::NonZeroU16; fn test_digest(v: usize) -> Digest { Sha256::hash(&v.to_be_bytes()) } - const PAGE_SIZE: usize = 111; - const PAGE_CACHE_SIZE: usize = 5; + const PAGE_SIZE: NonZeroU16 = NZU16!(111); + const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(5); fn test_config() -> Config { Config { @@ -851,7 +852,7 @@ mod tests { items_per_blob: NZU64!(7), write_buffer: NZUsize!(1024), thread_pool: None, - buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)), + buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), } } @@ -1134,16 +1135,17 @@ mod tests { drop(mmr); // The very last element we added (pos=495) resulted in new parents at positions 496 & - // 497. Simulate a partial write by corrupting the last parent's checksum by truncating + // 497. Simulate a partial write by corrupting the last page's checksum by truncating // the last blob by a single byte. let partition: String = "journal_partition".into(); let (blob, len) = context .open(&partition, &71u64.to_be_bytes()) .await .expect("Failed to open blob"); - assert_eq!(len, 36); // N+4 = 36 bytes per node, 1 node in the last blob + // A full page w/ CRC should have been written on sync. + assert_eq!(len, PAGE_SIZE.get() as u64 + 12); - // truncate the blob by one byte to corrupt the checksum of the last parent node. + // truncate the blob by one byte to corrupt the page CRC. blob.resize(len - 1).await.expect("Failed to corrupt blob"); blob.sync().await.expect("Failed to sync blob"); @@ -1161,33 +1163,6 @@ mod tests { .await .unwrap(); assert_eq!(mmr.size(), 498); - drop(mmr); - - // Repeat partial write test though this time truncate the leaf itself not just some - // parent. The leaf is in the *previous* blob so we'll have to delete the most recent - // blob, then appropriately truncate the previous one. 
- context - .remove(&partition, Some(&71u64.to_be_bytes())) - .await - .expect("Failed to remove blob"); - let (blob, len) = context - .open(&partition, &70u64.to_be_bytes()) - .await - .expect("Failed to open blob"); - assert_eq!(len, 36 * 7); // this blob should be full. - - // The last leaf should be in slot 5 of this blob, truncate last byte of its checksum. - blob.resize(36 * 5 + 35) - .await - .expect("Failed to corrupt blob"); - blob.sync().await.expect("Failed to sync blob"); - - let mmr = Mmr::init(context.clone(), &mut hasher, test_config()) - .await - .unwrap(); - // Since the leaf was corrupted, it should not have been recovered, and the journal's - // size will be the last-valid size. - assert_eq!(mmr.size(), 495); mmr.destroy().await.unwrap(); }); @@ -1464,7 +1439,7 @@ mod tests { items_per_blob: NZU64!(7), write_buffer: NZUsize!(1024), thread_pool: None, - buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)), + buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), }, ) .await @@ -1515,7 +1490,7 @@ mod tests { items_per_blob: NZU64!(7), write_buffer: NZUsize!(1024), thread_pool: None, - buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)), + buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), }, ) .await @@ -1540,7 +1515,7 @@ mod tests { items_per_blob: NZU64!(7), write_buffer: NZUsize!(1024), thread_pool: None, - buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)), + buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), }, ) .await diff --git a/storage/src/qmdb/any/mod.rs b/storage/src/qmdb/any/mod.rs index d2c5b05273..c682e2c9e1 100644 --- a/storage/src/qmdb/any/mod.rs +++ b/storage/src/qmdb/any/mod.rs @@ -155,11 +155,12 @@ pub(crate) mod test { qmdb::any::{FixedConfig, VariableConfig}, translator::TwoCap, }; - use commonware_utils::{NZUsize, NZU64}; + use commonware_utils::{NZUsize, NZU16, NZU64}; + use std::num::NonZeroU16; // Janky page & cache sizes to exercise boundary conditions. - const PAGE_SIZE: usize = 101; - const PAGE_CACHE_SIZE: usize = 11; + const PAGE_SIZE: NonZeroU16 = NZU16!(101); + const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(11); pub(super) fn fixed_db_config(suffix: &str) -> FixedConfig { FixedConfig { @@ -172,7 +173,7 @@ pub(crate) mod test { log_write_buffer: NZUsize!(1024), translator: TwoCap, thread_pool: None, - buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)), + buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), } } @@ -189,7 +190,7 @@ pub(crate) mod test { log_codec_config: (), translator: TwoCap, thread_pool: None, - buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)), + buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), } } diff --git a/storage/src/qmdb/any/ordered/fixed.rs b/storage/src/qmdb/any/ordered/fixed.rs index 14e10fcc0a..969d2cf577 100644 --- a/storage/src/qmdb/any/ordered/fixed.rs +++ b/storage/src/qmdb/any/ordered/fixed.rs @@ -87,13 +87,16 @@ mod test { deterministic::{self, Context}, Runner as _, }; - use commonware_utils::{sequence::FixedBytes, NZUsize, NZU64}; + use commonware_utils::{sequence::FixedBytes, NZUsize, NZU16, NZU64}; use rand::{rngs::StdRng, seq::IteratorRandom, RngCore, SeedableRng}; - use std::collections::{BTreeMap, HashMap}; + use std::{ + collections::{BTreeMap, HashMap}, + num::{NonZeroU16, NonZeroUsize}, + }; // Janky page & cache sizes to exercise boundary conditions. 
- const PAGE_SIZE: usize = 103; - const PAGE_CACHE_SIZE: usize = 13; + const PAGE_SIZE: NonZeroU16 = NZU16!(103); + const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(13); fn any_db_config(suffix: &str) -> Config { Config { @@ -106,7 +109,7 @@ mod test { log_write_buffer: NZUsize!(1024), translator: TwoCap, thread_pool: None, - buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)), + buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), } } @@ -138,7 +141,7 @@ mod test { log_write_buffer: NZUsize!(64), translator: t, thread_pool: None, - buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)), + buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), } } diff --git a/storage/src/qmdb/any/unordered/fixed/mod.rs b/storage/src/qmdb/any/unordered/fixed/mod.rs index 33e7c05291..a840b74b24 100644 --- a/storage/src/qmdb/any/unordered/fixed/mod.rs +++ b/storage/src/qmdb/any/unordered/fixed/mod.rs @@ -92,12 +92,13 @@ pub(super) mod test { deterministic::{self, Context}, Runner as _, }; - use commonware_utils::{NZUsize, NZU64}; + use commonware_utils::{NZUsize, NZU16, NZU64}; use rand::{rngs::StdRng, RngCore, SeedableRng}; + use std::num::{NonZeroU16, NonZeroUsize}; // Janky page & cache sizes to exercise boundary conditions. - const PAGE_SIZE: usize = 101; - const PAGE_CACHE_SIZE: usize = 11; + const PAGE_SIZE: NonZeroU16 = NZU16!(101); + const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(11); pub(crate) fn any_db_config(suffix: &str) -> Config { Config { @@ -110,7 +111,7 @@ pub(super) mod test { log_write_buffer: NZUsize!(1024), translator: TwoCap, thread_pool: None, - buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)), + buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), } } @@ -143,7 +144,7 @@ pub(super) mod test { log_write_buffer: NZUsize!(64), translator: TwoCap, thread_pool: None, - buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)), + buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), } } diff --git a/storage/src/qmdb/any/unordered/fixed/sync.rs b/storage/src/qmdb/any/unordered/fixed/sync.rs index d3462c390c..7f10afc0e2 100644 --- a/storage/src/qmdb/any/unordered/fixed/sync.rs +++ b/storage/src/qmdb/any/unordered/fixed/sync.rs @@ -12,7 +12,7 @@ use crate::{ use commonware_codec::CodecFixed; use commonware_cryptography::{DigestOf, Hasher}; use commonware_runtime::{ - buffer::Append, telemetry::metrics::status::GaugeExt, Blob, Clock, Metrics, Storage, + buffer::pool::Append, telemetry::metrics::status::GaugeExt, Blob, Clock, Metrics, Storage, }; use commonware_utils::Array; use prometheus_client::metrics::{counter::Counter, gauge::Gauge}; @@ -225,7 +225,13 @@ pub(crate) async fn init_journal_at_size 0 { tail.resize(tail_size).await?; } @@ -275,13 +281,13 @@ mod tests { deterministic::{self, Context}, Runner as _, }; - use commonware_utils::{NZUsize, NZU64}; + use commonware_utils::{NZUsize, NZU16, NZU64}; use rstest::rstest; - use std::num::NonZeroU64; + use std::num::{NonZeroU16, NonZeroU64, NonZeroUsize}; // Janky sizes to test boundary conditions. 
-    const PAGE_SIZE: usize = 99;
-    const PAGE_CACHE_SIZE: usize = 3;
+    const PAGE_SIZE: NonZeroU16 = NZU16!(99);
+    const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(3);
 
     fn test_digest(value: u64) -> Digest {
         Sha256::hash(&value.to_be_bytes())
@@ -440,7 +446,7 @@ mod tests {
             partition: "test_fresh_start".into(),
             items_per_blob: NZU64!(5),
             write_buffer: NZUsize!(1024),
-            buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+            buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
         };
 
         // Initialize journal with sync boundaries when no existing data exists
@@ -490,7 +496,7 @@ mod tests {
             partition: "test_overlap".into(),
             items_per_blob: NZU64!(4),
             write_buffer: NZUsize!(1024),
-            buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+            buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
         };
 
         // Create initial journal with 20 operations
@@ -556,7 +562,7 @@ mod tests {
             partition: "test_exact_match".into(),
             items_per_blob: NZU64!(3),
             write_buffer: NZUsize!(1024),
-            buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+            buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
         };
 
         // Create initial journal with 20 operations (0-19)
@@ -622,7 +628,7 @@ mod tests {
             partition: "test_unexpected_data".into(),
             items_per_blob: NZU64!(4),
             write_buffer: NZUsize!(1024),
-            buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+            buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
         };
 
         // Create initial journal with 30 operations (0-29)
@@ -663,7 +669,7 @@ mod tests {
             partition: "test_invalid_range".into(),
             items_per_blob: NZU64!(4),
             write_buffer: NZUsize!(1024),
-            buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+            buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
         };
 
         let lower_bound = 6;
@@ -686,7 +692,7 @@ mod tests {
             partition: "test_init_at_size".into(),
             items_per_blob: NZU64!(5),
             write_buffer: NZUsize!(1024),
-            buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+            buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
         };
 
         // Test 1: Initialize at size 0 (empty journal)
@@ -753,7 +759,7 @@ mod tests {
         // Operations 5-6 should be unreadable (dummy data in tail blob)
         for i in 5..7 {
             let result = journal.read(i).await;
-            assert!(result.is_err()); // Should fail due to invalid data
+            assert_eq!(result.unwrap(), Sha256::fill(0)); // dummy data is all 0s
         }
 
         // Should be able to append from position 7
@@ -781,10 +787,10 @@ mod tests {
             assert!(matches!(result, Err(journal::Error::ItemPruned(_))));
         }
 
-        // Operations 20-22 should be unreadable (dummy data in tail blob)
+        // Operations 20-22 should be all 0s (dummy data in tail blob)
         for i in 20..23 {
-            let result = journal.read(i).await;
-            assert!(result.is_err()); // Should fail due to invalid data
+            let result = journal.read(i).await.unwrap();
+            assert_eq!(result, Sha256::fill(0));
         }
 
         // Should be able to append from position 23
diff --git a/storage/src/qmdb/any/unordered/variable/mod.rs b/storage/src/qmdb/any/unordered/variable/mod.rs
index 471088a318..bbad906155 100644
--- a/storage/src/qmdb/any/unordered/variable/mod.rs
+++ b/storage/src/qmdb/any/unordered/variable/mod.rs
@@ -96,11 +96,12 @@ pub(super) mod test {
     use commonware_macros::test_traced;
     use commonware_math::algebra::Random;
     use commonware_runtime::{buffer::PoolRef, deterministic, Runner as _};
-    use commonware_utils::{NZUsize, NZU64};
+    use commonware_utils::{NZUsize, NZU16, NZU64};
     use rand::RngCore;
+    use std::num::{NonZeroU16, NonZeroUsize};
 
-    const PAGE_SIZE: usize = 77;
-    const PAGE_CACHE_SIZE: usize = 9;
+    const PAGE_SIZE: NonZeroU16 = NZU16!(77);
+    const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(9);
 
     fn db_config(suffix: &str) -> VariableConfig, ())> {
         VariableConfig {
@@ -115,7 +116,7 @@ pub(super) mod test {
             log_codec_config: ((0..=10000).into(), ()),
             translator: TwoCap,
             thread_pool: None,
-            buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+            buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
         }
     }
 
diff --git a/storage/src/qmdb/any/unordered/variable/sync.rs b/storage/src/qmdb/any/unordered/variable/sync.rs
index a5f9ced5b9..101e7528f9 100644
--- a/storage/src/qmdb/any/unordered/variable/sync.rs
+++ b/storage/src/qmdb/any/unordered/variable/sync.rs
@@ -137,13 +137,13 @@ mod tests {
         buffer::PoolRef,
         deterministic::{self, Context},
     };
-    use commonware_utils::{NZUsize, NZU64};
+    use commonware_utils::{NZUsize, NZU16, NZU64};
    use rand::{rngs::StdRng, RngCore as _, SeedableRng as _};
    use rstest::rstest;
-    use std::num::NonZeroU64;
+    use std::num::{NonZeroU16, NonZeroU64, NonZeroUsize};
 
-    const PAGE_SIZE: usize = 99;
-    const PAGE_CACHE_SIZE: usize = 3;
+    const PAGE_SIZE: NonZeroU16 = NZU16!(99);
+    const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(3);
 
     type VarConfig = qmdb::any::VariableConfig, ())>;
@@ -160,7 +160,7 @@ mod tests {
             log_codec_config: ((0..=10000).into(), ()),
             translator: TwoCap,
             thread_pool: None,
-            buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+            buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
         }
     }
 
diff --git a/storage/src/qmdb/benches/fixed/mod.rs b/storage/src/qmdb/benches/fixed/mod.rs
index 1c2ba8e723..f099c14b64 100644
--- a/storage/src/qmdb/benches/fixed/mod.rs
+++ b/storage/src/qmdb/benches/fixed/mod.rs
@@ -24,9 +24,9 @@ use commonware_storage::{
     },
     translator::EightCap,
 };
-use commonware_utils::{NZUsize, NZU64};
+use commonware_utils::{NZUsize, NZU16, NZU64};
 use rand::{rngs::StdRng, RngCore, SeedableRng};
-use std::num::{NonZeroU64, NonZeroUsize};
+use std::num::{NonZeroU16, NonZeroU64, NonZeroUsize};
 
 pub mod generate;
 pub mod init;
@@ -77,7 +77,7 @@ const CHUNK_SIZE: usize = 32;
 const THREADS: NonZeroUsize = NZUsize!(8);
 
 /// Use a "prod sized" page size to test the performance of the journal.
-const PAGE_SIZE: NonZeroUsize = NZUsize!(16384);
+const PAGE_SIZE: NonZeroU16 = NZU16!(16384);
 
 /// The number of pages to cache in the buffer pool.
 const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(10_000);
diff --git a/storage/src/qmdb/benches/keyless_generate.rs b/storage/src/qmdb/benches/keyless_generate.rs
index f3e1455d1c..62a6dc913a 100644
--- a/storage/src/qmdb/benches/keyless_generate.rs
+++ b/storage/src/qmdb/benches/keyless_generate.rs
@@ -10,11 +10,11 @@ use commonware_storage::qmdb::{
     keyless::{Config as KConfig, Keyless},
     NonDurable, Unmerkleized,
 };
-use commonware_utils::{NZUsize, NZU64};
+use commonware_utils::{NZUsize, NZU16, NZU64};
 use criterion::{criterion_group, Criterion};
 use rand::{rngs::StdRng, RngCore, SeedableRng};
 use std::{
-    num::{NonZeroU64, NonZeroUsize},
+    num::{NonZeroU16, NonZeroU64, NonZeroUsize},
     time::{Duration, Instant},
 };
@@ -24,7 +24,7 @@ const ITEMS_PER_BLOB: NonZeroU64 = NZU64!(50_000);
 const PARTITION_SUFFIX: &str = "keyless_bench_partition";
 
 /// Use a "prod sized" page size to test the performance of the journal.
-const PAGE_SIZE: NonZeroUsize = NZUsize!(16384);
+const PAGE_SIZE: NonZeroU16 = NZU16!(16384);
 
 /// The number of pages to cache in the buffer pool.
 const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(10_000);
diff --git a/storage/src/qmdb/benches/variable/mod.rs b/storage/src/qmdb/benches/variable/mod.rs
index 853d510864..7f70a7eacf 100644
--- a/storage/src/qmdb/benches/variable/mod.rs
+++ b/storage/src/qmdb/benches/variable/mod.rs
@@ -16,9 +16,9 @@ use commonware_storage::{
     },
     translator::EightCap,
 };
-use commonware_utils::{NZUsize, NZU64};
+use commonware_utils::{NZUsize, NZU16, NZU64};
 use rand::{rngs::StdRng, RngCore, SeedableRng};
-use std::num::{NonZeroU64, NonZeroUsize};
+use std::num::{NonZeroU16, NonZeroU64, NonZeroUsize};
 
 pub mod generate;
 pub mod init;
@@ -50,7 +50,7 @@ const PARTITION_SUFFIX: &str = "any_variable_bench_partition";
 const THREADS: NonZeroUsize = NZUsize!(8);
 
 /// Use a "prod sized" page size to test the performance of the journal.
-const PAGE_SIZE: NonZeroUsize = NZUsize!(16384);
+const PAGE_SIZE: NonZeroU16 = NZU16!(16384);
 
 /// The number of pages to cache in the buffer pool.
 const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(10_000);
diff --git a/storage/src/qmdb/current/ordered/fixed.rs b/storage/src/qmdb/current/ordered/fixed.rs
index 8484a8c05c..73fa824e2f 100644
--- a/storage/src/qmdb/current/ordered/fixed.rs
+++ b/storage/src/qmdb/current/ordered/fixed.rs
@@ -988,13 +988,16 @@ pub mod test {
     use commonware_cryptography::{sha256::Digest, Sha256};
     use commonware_macros::test_traced;
     use commonware_runtime::{buffer::PoolRef, deterministic, Runner as _};
-    use commonware_utils::{NZUsize, NZU64};
+    use commonware_utils::{NZUsize, NZU16, NZU64};
     use rand::{rngs::StdRng, RngCore, SeedableRng};
-    use std::collections::HashMap;
+    use std::{
+        collections::HashMap,
+        num::{NonZeroU16, NonZeroUsize},
+    };
     use tracing::warn;
 
-    const PAGE_SIZE: usize = 88;
-    const PAGE_CACHE_SIZE: usize = 8;
+    const PAGE_SIZE: NonZeroU16 = NZU16!(88);
+    const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(8);
 
     fn current_db_config(partition_prefix: &str) -> Config {
         Config {
@@ -1008,7 +1011,7 @@ pub mod test {
             bitmap_metadata_partition: format!("{partition_prefix}_bitmap_metadata_partition"),
             translator: OneCap,
             thread_pool: None,
-            buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+            buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
         }
     }
 
diff --git a/storage/src/qmdb/current/unordered/fixed.rs b/storage/src/qmdb/current/unordered/fixed.rs
index 5ddee2788d..7c91484669 100644
--- a/storage/src/qmdb/current/unordered/fixed.rs
+++ b/storage/src/qmdb/current/unordered/fixed.rs
@@ -869,13 +869,16 @@ pub mod test {
     use commonware_cryptography::{sha256::Digest, Sha256};
     use commonware_macros::test_traced;
     use commonware_runtime::{buffer::PoolRef, deterministic, Runner as _};
-    use commonware_utils::{NZUsize, NZU64};
+    use commonware_utils::{NZUsize, NZU16, NZU64};
     use rand::{rngs::StdRng, RngCore, SeedableRng};
-    use std::collections::HashMap;
+    use std::{
+        collections::HashMap,
+        num::{NonZeroU16, NonZeroUsize},
+    };
     use tracing::warn;
 
-    const PAGE_SIZE: usize = 88;
-    const PAGE_CACHE_SIZE: usize = 8;
+    const PAGE_SIZE: NonZeroU16 = NZU16!(88);
+    const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(8);
 
     fn current_db_config(partition_prefix: &str) -> Config {
         Config {
@@ -889,7 +892,7 @@ pub mod test {
             bitmap_metadata_partition: format!("{partition_prefix}_bitmap_metadata_partition"),
             translator: TwoCap,
             thread_pool: None,
-            buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+            buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
         }
     }
 
diff --git a/storage/src/qmdb/immutable/mod.rs b/storage/src/qmdb/immutable/mod.rs
index 397c14133f..bd64b300f0 100644
--- a/storage/src/qmdb/immutable/mod.rs
+++ b/storage/src/qmdb/immutable/mod.rs
@@ -598,10 +598,11 @@ pub(super) mod test {
         deterministic::{self},
         Runner as _,
     };
-    use commonware_utils::{NZUsize, NZU64};
+    use commonware_utils::{NZUsize, NZU16, NZU64};
+    use std::num::NonZeroU16;
 
-    const PAGE_SIZE: usize = 77;
-    const PAGE_CACHE_SIZE: usize = 9;
+    const PAGE_SIZE: NonZeroU16 = NZU16!(77);
+    const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(9);
     const ITEMS_PER_SECTION: u64 = 5;
 
     pub(crate) fn db_config(
@@ -619,7 +620,7 @@ pub(super) mod test {
             log_write_buffer: NZUsize!(1024),
             translator: TwoCap,
             thread_pool: None,
-            buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+            buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
         }
     }
 
diff --git a/storage/src/qmdb/immutable/sync/mod.rs b/storage/src/qmdb/immutable/sync/mod.rs
index 3698541083..2798acf078 100644
--- a/storage/src/qmdb/immutable/sync/mod.rs
+++ b/storage/src/qmdb/immutable/sync/mod.rs
@@ -168,13 +168,13 @@ mod tests {
     use commonware_macros::test_traced;
     use commonware_math::algebra::Random;
     use commonware_runtime::{buffer::PoolRef, deterministic, Runner as _, RwLock};
-    use commonware_utils::{NZUsize, NZU64};
+    use commonware_utils::{NZUsize, NZU16, NZU64};
     use futures::{channel::mpsc, SinkExt as _};
     use rand::{rngs::StdRng, RngCore as _, SeedableRng as _};
     use rstest::rstest;
     use std::{
         collections::HashMap,
-        num::{NonZeroU64, NonZeroUsize},
+        num::{NonZeroU16, NonZeroU64, NonZeroUsize},
         sync::Arc,
     };
 
@@ -200,7 +200,7 @@ mod tests {
     /// Create a simple config for sync tests
     fn create_sync_config(suffix: &str) -> immutable::Config {
-        const PAGE_SIZE: NonZeroUsize = NZUsize!(77);
+        const PAGE_SIZE: NonZeroU16 = NZU16!(77);
         const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(9);
         const ITEMS_PER_SECTION: NonZeroU64 = NZU64!(5);
 
diff --git a/storage/src/qmdb/keyless/mod.rs b/storage/src/qmdb/keyless/mod.rs
index 845e86e3fc..f17080fbcf 100644
--- a/storage/src/qmdb/keyless/mod.rs
+++ b/storage/src/qmdb/keyless/mod.rs
@@ -405,12 +405,13 @@ mod test {
     use commonware_cryptography::Sha256;
     use commonware_macros::test_traced;
     use commonware_runtime::{deterministic, Runner as _};
-    use commonware_utils::{NZUsize, NZU64};
+    use commonware_utils::{NZUsize, NZU16, NZU64};
     use rand::Rng;
+    use std::num::NonZeroU16;
 
     // Use some weird sizes here to test boundary conditions.
-    const PAGE_SIZE: usize = 101;
-    const PAGE_CACHE_SIZE: usize = 11;
+    const PAGE_SIZE: NonZeroU16 = NZU16!(101);
+    const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(11);
 
     fn db_config(suffix: &str) -> Config<(commonware_codec::RangeCfg, ())> {
         Config {
@@ -424,7 +425,7 @@ mod test {
             log_codec_config: ((0..=10000).into(), ()),
             log_items_per_section: NZU64!(7),
             thread_pool: None,
-            buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+            buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
         }
     }
 
diff --git a/storage/src/qmdb/store/db.rs b/storage/src/qmdb/store/db.rs
index 9e34b99655..d7f75445cd 100644
--- a/storage/src/qmdb/store/db.rs
+++ b/storage/src/qmdb/store/db.rs
@@ -18,12 +18,13 @@
 //!     qmdb::store::db::{Config, Db},
 //!     translator::TwoCap,
 //! };
-//! use commonware_utils::{NZUsize, NZU64};
+//! use commonware_utils::{NZUsize, NZU16, NZU64};
 //! use commonware_cryptography::{blake3::Digest, Digest as _};
 //! use commonware_math::algebra::Random;
 //! use commonware_runtime::{buffer::PoolRef, deterministic::Runner, Metrics, Runner as _};
 //!
-//! const PAGE_SIZE: usize = 8192;
+//! use std::num::NonZeroU16;
+//! const PAGE_SIZE: NonZeroU16 = NZU16!(8192);
 //! const PAGE_CACHE_SIZE: usize = 100;
 //!
 //! let executor = Runner::default();
@@ -35,7 +36,7 @@
 //!         log_codec_config: (),
 //!         log_items_per_section: NZU64!(4),
 //!         translator: TwoCap,
-//!         buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+//!         buffer_pool: PoolRef::new(PAGE_SIZE, NZUsize!(PAGE_CACHE_SIZE)),
 //!     };
 //!     let db =
 //!         Db::<_, Digest, Digest, TwoCap>::init(ctx.with_label("store"), config)
@@ -596,10 +597,11 @@ mod test {
     use commonware_macros::test_traced;
     use commonware_math::algebra::Random;
     use commonware_runtime::{deterministic, Runner};
-    use commonware_utils::{NZUsize, NZU64};
+    use commonware_utils::{NZUsize, NZU16, NZU64};
+    use std::num::NonZeroU16;
 
-    const PAGE_SIZE: usize = 77;
-    const PAGE_CACHE_SIZE: usize = 9;
+    const PAGE_SIZE: NonZeroU16 = NZU16!(77);
+    const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(9);
 
     /// The type of the store used in tests.
     type TestStore = Db, TwoCap, Durable>;
@@ -612,7 +614,7 @@ mod test {
             log_codec_config: ((0..=10000).into(), ()),
             log_items_per_section: NZU64!(7),
             translator: TwoCap,
-            buffer_pool: PoolRef::new(NZUsize!(PAGE_SIZE), NZUsize!(PAGE_CACHE_SIZE)),
+            buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
         };
         TestStore::init(context, cfg).await.unwrap()
     }
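Reviewer note: every hunk above converges on the same construction pattern for the buffer pool, sketched below outside any one test module. This is a minimal illustration only; it assumes the `commonware_runtime`/`commonware_utils` items the tests already import (`PoolRef`, `NZU16!`, `NZUsize!`), and the constant values simply mirror one of the test configs rather than any canonical default.

```rust
// Minimal sketch of the post-migration pattern (imports assumed from the test modules above).
use commonware_runtime::buffer::PoolRef;
use commonware_utils::{NZUsize, NZU16};
use std::num::{NonZeroU16, NonZeroUsize};

// The page size is now a NonZeroU16 built with NZU16!; the cache size stays a NonZeroUsize.
const PAGE_SIZE: NonZeroU16 = NZU16!(99);
const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(3);

fn main() {
    // Constants are passed straight through; the old NZUsize!(PAGE_SIZE) wrapping at the
    // call site is gone.
    let _buffer_pool = PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE);
}
```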