diff --git a/consensus/src/marshal/actor.rs b/consensus/src/marshal/actor.rs
index 0a8d917555..8ea189bb26 100644
--- a/consensus/src/marshal/actor.rs
+++ b/consensus/src/marshal/actor.rs
@@ -180,8 +180,9 @@ where
             partition_prefix: format!("{}-cache", config.partition_prefix.clone()),
             prunable_items_per_section: config.prunable_items_per_section,
             replay_buffer: config.replay_buffer,
-            write_buffer: config.write_buffer,
-            freezer_journal_buffer_pool: config.buffer_pool.clone(),
+            key_write_buffer: config.key_write_buffer,
+            value_write_buffer: config.value_write_buffer,
+            key_buffer_pool: config.buffer_pool.clone(),
         };
         let cache = cache::Manager::init(
             context.with_label("cache"),
diff --git a/consensus/src/marshal/cache.rs b/consensus/src/marshal/cache.rs
index bb1abe6ac0..92c7b972af 100644
--- a/consensus/src/marshal/cache.rs
+++ b/consensus/src/marshal/cache.rs
@@ -28,8 +28,9 @@ pub(crate) struct Config {
     pub partition_prefix: String,
     pub prunable_items_per_section: NonZero<u64>,
     pub replay_buffer: NonZeroUsize,
-    pub write_buffer: NonZeroUsize,
-    pub freezer_journal_buffer_pool: PoolRef,
+    pub key_write_buffer: NonZeroUsize,
+    pub value_write_buffer: NonZeroUsize,
+    pub key_buffer_pool: PoolRef,
 }

 /// Prunable archives for a single epoch.
@@ -189,14 +190,16 @@ impl Manager<
     ) -> prunable::Archive {
         let start = Instant::now();
         let cfg = prunable::Config {
-            partition: format!("{}-cache-{epoch}-{name}", self.cfg.partition_prefix),
             translator: TwoCap,
+            key_partition: format!("{}-cache-{epoch}-{name}-key", self.cfg.partition_prefix),
+            key_buffer_pool: self.cfg.key_buffer_pool.clone(),
+            value_partition: format!("{}-cache-{epoch}-{name}-value", self.cfg.partition_prefix),
             items_per_section: self.cfg.prunable_items_per_section,
             compression: None,
             codec_config,
-            buffer_pool: self.cfg.freezer_journal_buffer_pool.clone(),
             replay_buffer: self.cfg.replay_buffer,
-            write_buffer: self.cfg.write_buffer,
+            key_write_buffer: self.cfg.key_write_buffer,
+            value_write_buffer: self.cfg.value_write_buffer,
         };
         let archive = prunable::Archive::init(self.context.with_label(name), cfg)
             .await
diff --git a/consensus/src/marshal/config.rs b/consensus/src/marshal/config.rs
index 49f1380494..3449fd41ea 100644
--- a/consensus/src/marshal/config.rs
+++ b/consensus/src/marshal/config.rs
@@ -39,8 +39,11 @@ where
     /// The size of the replay buffer for storage archives.
     pub replay_buffer: NonZeroUsize,

-    /// The size of the write buffer for storage archives.
-    pub write_buffer: NonZeroUsize,
+    /// The size of the write buffer for the key journal of storage archives.
+    pub key_write_buffer: NonZeroUsize,
+
+    /// The size of the write buffer for the value journal of storage archives.
+    pub value_write_buffer: NonZeroUsize,

     /// Codec configuration for block type.
     pub block_codec_config: B::Cfg,
diff --git a/consensus/src/marshal/mod.rs b/consensus/src/marshal/mod.rs
index 9b5b52e6c8..3e483a9a39 100644
--- a/consensus/src/marshal/mod.rs
+++ b/consensus/src/marshal/mod.rs
@@ -189,7 +189,8 @@ mod tests {
             partition_prefix: format!("validator-{}", validator.clone()),
             prunable_items_per_section: NZU64!(10),
             replay_buffer: NZUsize!(1024),
-            write_buffer: NZUsize!(1024),
+            key_write_buffer: NZUsize!(1024),
+            value_write_buffer: NZUsize!(1024),
             buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
         };

@@ -237,13 +238,17 @@ mod tests {
                 freezer_table_initial_size: 64,
                 freezer_table_resize_frequency: 10,
                 freezer_table_resize_chunk_size: 10,
-                freezer_journal_partition: format!(
-                    "{}-finalizations-by-height-freezer-journal",
+                freezer_key_partition: format!(
+                    "{}-finalizations-by-height-freezer-key",
                     config.partition_prefix
                 ),
-                freezer_journal_target_size: 1024,
-                freezer_journal_compression: None,
-                freezer_journal_buffer_pool: config.buffer_pool.clone(),
+                freezer_key_buffer_pool: config.buffer_pool.clone(),
+                freezer_value_partition: format!(
+                    "{}-finalizations-by-height-freezer-value",
+                    config.partition_prefix
+                ),
+                freezer_value_target_size: 1024,
+                freezer_value_compression: None,
                 ordinal_partition: format!(
                     "{}-finalizations-by-height-ordinal",
                     config.partition_prefix
@@ -251,7 +256,9 @@ mod tests {
                 items_per_section: NZU64!(10),
                 codec_config: S::certificate_codec_config_unbounded(),
                 replay_buffer: config.replay_buffer,
-                write_buffer: config.write_buffer,
+                freezer_key_write_buffer: config.key_write_buffer,
+                freezer_value_write_buffer: config.value_write_buffer,
+                ordinal_write_buffer: config.key_write_buffer,
             },
         )
         .await
@@ -274,18 +281,24 @@ mod tests {
                 freezer_table_initial_size: 64,
                 freezer_table_resize_frequency: 10,
                 freezer_table_resize_chunk_size: 10,
-                freezer_journal_partition: format!(
-                    "{}-finalized_blocks-freezer-journal",
+                freezer_key_partition: format!(
+                    "{}-finalized_blocks-freezer-key",
+                    config.partition_prefix
+                ),
+                freezer_key_buffer_pool: config.buffer_pool.clone(),
+                freezer_value_partition: format!(
+                    "{}-finalized_blocks-freezer-value",
                     config.partition_prefix
                 ),
-                freezer_journal_target_size: 1024,
-                freezer_journal_compression: None,
-                freezer_journal_buffer_pool: config.buffer_pool.clone(),
+                freezer_value_target_size: 1024,
+                freezer_value_compression: None,
                 ordinal_partition: format!("{}-finalized_blocks-ordinal", config.partition_prefix),
                 items_per_section: NZU64!(10),
                 codec_config: config.block_codec_config,
                 replay_buffer: config.replay_buffer,
-                write_buffer: config.write_buffer,
+                freezer_key_write_buffer: config.key_write_buffer,
+                freezer_value_write_buffer: config.value_write_buffer,
+                ordinal_write_buffer: config.key_write_buffer,
             },
         )
         .await
diff --git a/examples/reshare/src/engine.rs b/examples/reshare/src/engine.rs
index 14fbd6a7a9..331f3dc9aa 100644
--- a/examples/reshare/src/engine.rs
+++ b/examples/reshare/src/engine.rs
@@ -41,8 +41,8 @@
 const PRUNABLE_ITEMS_PER_SECTION: NonZero<u64> = NZU64!(4_096);
 const IMMUTABLE_ITEMS_PER_SECTION: NonZero<u64> = NZU64!(262_144);
 const FREEZER_TABLE_RESIZE_FREQUENCY: u8 = 4;
 const FREEZER_TABLE_RESIZE_CHUNK_SIZE: u32 = 2u32.pow(16); // 3MB
-const FREEZER_JOURNAL_TARGET_SIZE: u64 = 1024 * 1024 * 1024; // 1GB
-const FREEZER_JOURNAL_COMPRESSION: Option<u8> = Some(3);
+const FREEZER_VALUE_TARGET_SIZE: u64 = 1024 * 1024 * 1024; // 1GB
+const FREEZER_VALUE_COMPRESSION: Option<u8> = Some(3);
 const REPLAY_BUFFER: NonZero<usize> = NZUsize!(8 * 1024 * 1024); // 8MB
 const WRITE_BUFFER: NonZero<usize> = NZUsize!(1024 * 1024); // 1MB
 const BUFFER_POOL_PAGE_SIZE: NonZero<usize> = NZUsize!(4_096); // 4KB
@@ -171,13 +175,17 @@ where
             freezer_table_initial_size: config.freezer_table_initial_size,
             freezer_table_resize_frequency: FREEZER_TABLE_RESIZE_FREQUENCY,
             freezer_table_resize_chunk_size: FREEZER_TABLE_RESIZE_CHUNK_SIZE,
-            freezer_journal_partition: format!(
-                "{}-finalizations-by-height-freezer-journal",
+            freezer_key_partition: format!(
+                "{}-finalizations-by-height-freezer-key",
                 config.partition_prefix
             ),
-            freezer_journal_target_size: FREEZER_JOURNAL_TARGET_SIZE,
-            freezer_journal_compression: FREEZER_JOURNAL_COMPRESSION,
-            freezer_journal_buffer_pool: buffer_pool.clone(),
+            freezer_key_buffer_pool: buffer_pool.clone(),
+            freezer_value_partition: format!(
+                "{}-finalizations-by-height-freezer-value",
+                config.partition_prefix
+            ),
+            freezer_value_target_size: FREEZER_VALUE_TARGET_SIZE,
+            freezer_value_compression: FREEZER_VALUE_COMPRESSION,
             ordinal_partition: format!(
                 "{}-finalizations-by-height-ordinal",
                 config.partition_prefix
@@ -185,7 +189,9 @@ where
             items_per_section: IMMUTABLE_ITEMS_PER_SECTION,
             codec_config: S::certificate_codec_config_unbounded(),
             replay_buffer: REPLAY_BUFFER,
-            write_buffer: WRITE_BUFFER,
+            freezer_key_write_buffer: WRITE_BUFFER,
+            freezer_value_write_buffer: WRITE_BUFFER,
+            ordinal_write_buffer: WRITE_BUFFER,
         },
     )
     .await
@@ -208,18 +214,24 @@ where
             freezer_table_initial_size: config.freezer_table_initial_size,
             freezer_table_resize_frequency: FREEZER_TABLE_RESIZE_FREQUENCY,
             freezer_table_resize_chunk_size: FREEZER_TABLE_RESIZE_CHUNK_SIZE,
-            freezer_journal_partition: format!(
-                "{}-finalized_blocks-freezer-journal",
+            freezer_key_partition: format!(
+                "{}-finalized_blocks-freezer-key",
+                config.partition_prefix
+            ),
+            freezer_key_buffer_pool: buffer_pool.clone(),
+            freezer_value_partition: format!(
+                "{}-finalized_blocks-freezer-value",
                 config.partition_prefix
             ),
-            freezer_journal_target_size: FREEZER_JOURNAL_TARGET_SIZE,
-            freezer_journal_compression: FREEZER_JOURNAL_COMPRESSION,
-            freezer_journal_buffer_pool: buffer_pool.clone(),
+            freezer_value_target_size: FREEZER_VALUE_TARGET_SIZE,
+            freezer_value_compression: FREEZER_VALUE_COMPRESSION,
             ordinal_partition: format!("{}-finalized_blocks-ordinal", config.partition_prefix),
             items_per_section: IMMUTABLE_ITEMS_PER_SECTION,
             codec_config: num_participants,
             replay_buffer: REPLAY_BUFFER,
-            write_buffer: WRITE_BUFFER,
+            freezer_key_write_buffer: WRITE_BUFFER,
+            freezer_value_write_buffer: WRITE_BUFFER,
+            ordinal_write_buffer: WRITE_BUFFER,
         },
     )
    .await
@@ -259,7 +271,8 @@ where
             prunable_items_per_section: PRUNABLE_ITEMS_PER_SECTION,
             buffer_pool: buffer_pool.clone(),
             replay_buffer: REPLAY_BUFFER,
-            write_buffer: WRITE_BUFFER,
+            key_write_buffer: WRITE_BUFFER,
+            value_write_buffer: WRITE_BUFFER,
             block_codec_config: num_participants,
             max_repair: MAX_REPAIR,
         },
diff --git a/runtime/src/deterministic.rs b/runtime/src/deterministic.rs
index 71aa8ee710..db4b28149a 100644
--- a/runtime/src/deterministic.rs
+++ b/runtime/src/deterministic.rs
@@ -941,6 +941,11 @@ impl Context {
         self.executor().auditor.clone()
     }

+    /// Compute a [Sha256] digest of all storage contents.
+    pub fn storage_audit(&self) -> Digest {
+        self.storage.inner().inner().audit()
+    }
+
     /// Register a DNS mapping for a hostname.
     ///
     /// If `addrs` is `None`, the mapping is removed.
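The `storage_audit` helper added above reaches the in-memory backend's new `audit()` digest through the `metered` and `audited` wrappers (via the `inner()` accessors introduced below). A minimal sketch of how a test might use it; this is not part of the diff, and the partition and blob names are illustrative (`to_vec()` mirrors how the conformance tests later in this patch consume the digest):

```rust
use commonware_runtime::{deterministic, Blob as _, Runner, Storage as _};

#[test]
fn storage_audit_is_deterministic() {
    // Identical seeds and workloads must yield identical storage digests.
    let run = |seed: u64| {
        let runner = deterministic::Runner::seeded(seed);
        runner.start(|context| async move {
            let (blob, _) = context.open("demo", b"blob").await.unwrap();
            blob.write_at(b"hello".to_vec(), 0).await.unwrap();
            blob.sync().await.unwrap();
            // Digest covers every partition name, blob name, and blob content.
            context.storage_audit().to_vec()
        })
    };
    assert_eq!(run(42), run(42));
}
```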
diff --git a/runtime/src/storage/audited.rs b/runtime/src/storage/audited.rs
index 484df67003..c79bebe024 100644
--- a/runtime/src/storage/audited.rs
+++ b/runtime/src/storage/audited.rs
@@ -13,6 +13,11 @@ impl<S: crate::Storage> Storage<S> {
     pub const fn new(inner: S, auditor: Arc<Auditor>) -> Self {
         Self { inner, auditor }
     }
+
+    /// Get a reference to the inner storage.
+    pub const fn inner(&self) -> &S {
+        &self.inner
+    }
 }

 impl<S: crate::Storage> crate::Storage for Storage<S> {
diff --git a/runtime/src/storage/memory.rs b/runtime/src/storage/memory.rs
index 9403785641..aa3705dce4 100644
--- a/runtime/src/storage/memory.rs
+++ b/runtime/src/storage/memory.rs
@@ -1,6 +1,7 @@
 use super::Header;
 use commonware_codec::Encode;
 use commonware_utils::{hex, StableBuf};
+use sha2::{Digest, Sha256};
 use std::{
     collections::BTreeMap,
     ops::RangeInclusive,
@@ -21,6 +22,24 @@ impl Default for Storage {
     }
 }

+impl Storage {
+    /// Compute a [Sha256] digest of all blob contents.
+    pub fn audit(&self) -> [u8; 32] {
+        let partitions = self.partitions.lock().unwrap();
+        let mut hasher = Sha256::new();
+
+        for (partition_name, blobs) in partitions.iter() {
+            for (blob_name, content) in blobs.iter() {
+                hasher.update(partition_name.as_bytes());
+                hasher.update(blob_name);
+                hasher.update(content);
+            }
+        }
+
+        hasher.finalize().into()
+    }
+}
+
 impl crate::Storage for Storage {
     type Blob = Blob;
diff --git a/runtime/src/storage/metered.rs b/runtime/src/storage/metered.rs
index 5380e56de1..37ad7d1c10 100644
--- a/runtime/src/storage/metered.rs
+++ b/runtime/src/storage/metered.rs
@@ -72,6 +72,11 @@ impl<S: crate::Storage> Storage<S> {
             metrics: Metrics::new(registry).into(),
         }
     }
+
+    /// Get a reference to the inner storage.
+    pub const fn inner(&self) -> &S {
+        &self.inner
+    }
 }

 impl<S: crate::Storage> crate::Storage for Storage<S> {
diff --git a/storage/conformance.toml b/storage/conformance.toml
index 9e4d9e6926..02770d0057 100644
--- a/storage/conformance.toml
+++ b/storage/conformance.toml
@@ -1,10 +1,18 @@
+["commonware_storage::archive::conformance::ArchiveImmutable"]
+n_cases = 128
+hash = "8e578ed38733486716d072e565e62fe5d9ba7185ffb6e26ec7db8611c69b90b8"
+
+["commonware_storage::archive::conformance::ArchivePrunable"]
+n_cases = 128
+hash = "674e81c769c06a3965dc691b1f8c0327374f427e8a4bf67895c6ad4e566fed20"
+
 ["commonware_storage::archive::immutable::storage::conformance::CodecConformance"]
 n_cases = 65536
 hash = "892ba87ed9fa60ee60f694f927de60c3ee7fcdddd5a04d330c7511533cc2ca8d"

-["commonware_storage::archive::prunable::storage::conformance::CodecConformance>>"]
+["commonware_storage::archive::prunable::storage::conformance::CodecConformance>"]
 n_cases = 65536
-hash = "e681f76832577cad99b881684c56d820f9059d117ed1fef9fa3c7735c2eb24f9"
+hash = "3cb6882637c1c1a929a50b3ab425311f3ef342184dc46a80b1eae616ca7b64a4"

 ["commonware_storage::bmt::tests::conformance::CodecConformance>"]
 n_cases = 65536
@@ -28,23 +36,39 @@
 hash = "2066c89982e914e02272e613a7577bd22d15f4f7ce0a03c6b2fb5d3bdaff78fd"

 ["commonware_storage::freezer::storage::conformance::CodecConformance"]
 n_cases = 65536
-hash = "074b9674e0997e724949866ca8ae5c52a15d270140c1ae82626ea6d917e089f8"
+hash = "fa5e588e9074776440c9d7e46729cc60c8d0cea87e5afc4326640106abcea7ba"

 ["commonware_storage::freezer::storage::conformance::CodecConformance"]
 n_cases = 65536
-hash = "8c43d6d3a0ffb749db46d02a461cdc131c14b331aed670ee36e76bd716cdbaec"
+hash = "572cd3da262e0bb13d2c6780c5be7bbadb21f93e4dbf2c787febecdba62671df"

-["commonware_storage::freezer::storage::conformance::CodecConformance>"]
+["commonware_storage::freezer::storage::conformance::CodecConformance>"]
 n_cases = 65536
-hash = "b4f2eb51a158d964317fb1714dbe708ffbe6673453dc648eabdd72409fb30440"
+hash = "13b3e99a8c74b50dc18150194a92306de670b94e6642758feb6d9b6e9881f827"
+
+["commonware_storage::journal::conformance::ContiguousFixed"]
+n_cases = 512
+hash = "134bb8b838241c2dedf98d96130f014bea19f1bc7580307c9798540466eb81c6"
+
+["commonware_storage::journal::conformance::ContiguousVariable"]
+n_cases = 512
+hash = "29d37f2309943dd27d4344710a900bb3b992c0a1089ff9734cddbfa78c039200"
+
+["commonware_storage::journal::conformance::SegmentedFixed"]
+n_cases = 512
+hash = "505611ba11d6380254c159eb6234f87cc19a62b0919bc96d59e83de498b458fa"
+
+["commonware_storage::journal::conformance::SegmentedGlob"]
+n_cases = 512
+hash = "adb1efeef12c203c05879ce4d1d03ef443c767737a6c6b57433189100eec9197"

-["commonware_storage::journal::conformance::FixedJournal"]
+["commonware_storage::journal::conformance::SegmentedOversized"]
 n_cases = 512
-hash = "9cd764e31b5dbc0bd78cd0908851ba1d645f083884beacd2c8a63f66de0fb9db"
+hash = "b98d56d2eb039657b3452135666795eeeefdc83e9d6f3cb070e7ca114b4621a1"

-["commonware_storage::journal::conformance::VariableJournal"]
+["commonware_storage::journal::conformance::SegmentedVariable"]
 n_cases = 512
-hash = "c0af6899248693a3262f31b9a8554cd64c014d9b59f2514840c8828ad74ddf85"
+hash = "cd79e09ca53917f78c290e67efe08bf17b3ec0d0faf1b5f6507d4665749574b1"

 ["commonware_storage::mmr::proof::tests::conformance::CodecConformance>"]
 n_cases = 65536
@@ -86,22 +110,6 @@
 hash = "f5456580eb69727a23bfb0281e33f467acc0c91273efebd334d3e82c9770ae76"

 n_cases = 65536
 hash = "a3fbb5f749fa5b73a684e9f0bebd8973c456e5ee43a0a3db75a99aa550dee302"

-["commonware_storage::qmdb::any::ordered::operation::tests::conformance::CodecConformance>"]
-n_cases = 65536
-hash = "3256e6553b9d4f0eb7230b8a70c8c9d6366cd01fe1c284e355445c9a26cb3210"
-
-["commonware_storage::qmdb::any::ordered::operation::tests::conformance::CodecConformance>>"]
-n_cases = 65536
-hash = "1cf77ea327f4785cebf25d5d854a985cd84d07fc84f0c295cc845078ed1f0b40"
-
-["commonware_storage::qmdb::any::unordered::operation::tests::conformance::CodecConformance>"]
-n_cases = 65536
-hash = "6fe29c8b0ca576ab04c8beedb314c123ab003099aaa6d90680469f7e58f0acad"
-
-["commonware_storage::qmdb::any::unordered::operation::tests::conformance::CodecConformance>>"]
-n_cases = 65536
-hash = "c483b36790002dbbe4e5d4cd937dbd96659df946d74ee173b113f19d47984d30"
-
 ["commonware_storage::qmdb::immutable::operation::tests::conformance::CodecConformance>"]
 n_cases = 65536
 hash = "fdca5df62d243b28676ee15034663694cd219d5ef80749079126b0ed73effe0d"
diff --git a/storage/fuzz/Cargo.toml b/storage/fuzz/Cargo.toml
index 33ce9a53ef..0a9daedba3 100644
--- a/storage/fuzz/Cargo.toml
+++ b/storage/fuzz/Cargo.toml
@@ -174,13 +174,6 @@
 test = false
 doc = false
 bench = false

-[[bin]]
-name = "cache_operations"
-path = "fuzz_targets/cache_operations.rs"
-test = false
-doc = false
-bench = false
-
 [[bin]]
 name = "qmdb_immutable"
 path = "fuzz_targets/qmdb_immutable.rs"
@@ -222,3 +215,17 @@
 path = "fuzz_targets/proof_store.rs"
 test = false
 doc = false
 bench = false
+
+[[bin]]
+name = "cache_operations"
+path = "fuzz_targets/cache_operations.rs"
+test = false
+doc = false
+bench = false
+
+[[bin]]
+name = "oversized_recovery"
+path = "fuzz_targets/oversized_recovery.rs"
+test = false
+doc = false
+bench = false
diff --git a/storage/fuzz/fuzz_targets/archive_operations.rs b/storage/fuzz/fuzz_targets/archive_operations.rs
index 2edae579ed..1a2ffe1fa6 100644
--- a/storage/fuzz/fuzz_targets/archive_operations.rs
+++ b/storage/fuzz/fuzz_targets/archive_operations.rs
@@ -48,14 +48,16 @@ fn fuzz(data: FuzzInput) {
     runner.start(|context| async move {
         let cfg = Config {
-            partition: "test".into(),
-            items_per_section: NZU64!(1024),
-            write_buffer: NZUsize!(1024),
             translator: EightCap,
-            replay_buffer: NZUsize!(1024*1024),
+            key_partition: "test_key".into(),
+            key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
+            value_partition: "test_value".into(),
+            items_per_section: NZU64!(1024),
+            key_write_buffer: NZUsize!(1024),
+            value_write_buffer: NZUsize!(1024),
+            replay_buffer: NZUsize!(1024 * 1024),
             compression: None,
             codec_config: (),
-            buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
         };

         let mut archive = Archive::<_, _, Key, Value>::init(context.clone(), cfg.clone()).await.expect("init failed");
diff --git a/storage/fuzz/fuzz_targets/freezer_operations.rs b/storage/fuzz/fuzz_targets/freezer_operations.rs
index 950c3131bb..44a4ae4e89 100644
--- a/storage/fuzz/fuzz_targets/freezer_operations.rs
+++ b/storage/fuzz/fuzz_targets/freezer_operations.rs
@@ -48,11 +48,13 @@ fn fuzz(input: FuzzInput) {
     runner.start(|context| async move {
         let cfg = Config {
-            journal_partition: "fuzz_journal".into(),
-            journal_compression: None,
-            journal_write_buffer: NZUsize!(1024 * 1024),
-            journal_target_size: 10 * 1024 * 1024,
-            journal_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
+            key_partition: "fuzz_key".into(),
+            key_write_buffer: NZUsize!(1024 * 1024),
+            key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
+            value_partition: "fuzz_value".into(),
+            value_compression: None,
+            value_write_buffer: NZUsize!(1024 * 1024),
+            value_target_size: 10 * 1024 * 1024,
             table_partition: "fuzz_table".into(),
             table_initial_size: 256,
             table_resize_frequency: 4,
diff --git a/storage/fuzz/fuzz_targets/oversized_recovery.rs b/storage/fuzz/fuzz_targets/oversized_recovery.rs
new file mode 100644
index 0000000000..bb52d6348f
--- /dev/null
+++ b/storage/fuzz/fuzz_targets/oversized_recovery.rs
@@ -0,0 +1,329 @@
+#![no_main]
+
+//! Fuzz test for oversized journal crash recovery.
+//!
+//! This test creates valid data, randomly corrupts storage, and verifies
+//! that recovery doesn't panic and leaves the journal in a consistent state.
+
+use arbitrary::{Arbitrary, Result, Unstructured};
+use bytes::{Buf, BufMut};
+use commonware_codec::{FixedSize, Read, ReadExt, Write};
+use commonware_runtime::{buffer::PoolRef, deterministic, Blob as _, Runner, Storage as _};
+use commonware_storage::journal::segmented::oversized::{Config, Oversized, Record};
+use commonware_utils::NZUsize;
+use libfuzzer_sys::fuzz_target;
+use std::num::NonZeroUsize;
+
+/// Test index entry that stores a u64 id and references a value.
+#[derive(Debug, Clone, PartialEq)]
+struct TestEntry {
+    id: u64,
+    value_offset: u64,
+    value_size: u32,
+}
+
+impl TestEntry {
+    fn new(id: u64) -> Self {
+        Self {
+            id,
+            value_offset: 0,
+            value_size: 0,
+        }
+    }
+}
+
+impl Write for TestEntry {
+    fn write(&self, buf: &mut impl BufMut) {
+        self.id.write(buf);
+        self.value_offset.write(buf);
+        self.value_size.write(buf);
+    }
+}
+
+impl Read for TestEntry {
+    type Cfg = ();
+
+    fn read_cfg(
+        buf: &mut impl Buf,
+        _: &Self::Cfg,
+    ) -> std::result::Result<Self, commonware_codec::Error> {
+        let id = u64::read(buf)?;
+        let value_offset = u64::read(buf)?;
+        let value_size = u32::read(buf)?;
+        Ok(Self {
+            id,
+            value_offset,
+            value_size,
+        })
+    }
+}
+
+impl FixedSize for TestEntry {
+    const SIZE: usize = u64::SIZE + u64::SIZE + u32::SIZE;
+}
+
+impl Record for TestEntry {
+    fn value_location(&self) -> (u64, u32) {
+        (self.value_offset, self.value_size)
+    }
+
+    fn with_location(mut self, offset: u64, size: u32) -> Self {
+        self.value_offset = offset;
+        self.value_size = size;
+        self
+    }
+}
+
+type TestValue = [u8; 16];
+
+#[derive(Debug, Clone)]
+enum CorruptionType {
+    /// Truncate index to a random size
+    TruncateIndex { section: u64, size_factor: u8 },
+    /// Truncate glob to a random size
+    TruncateGlob { section: u64, size_factor: u8 },
+    /// Write random bytes at a random offset in index
+    CorruptIndexBytes {
+        section: u64,
+        offset_factor: u8,
+        data: [u8; 4],
+    },
+    /// Write random bytes at a random offset in glob
+    CorruptGlobBytes {
+        section: u64,
+        offset_factor: u8,
+        data: [u8; 4],
+    },
+    /// Delete index section
+    DeleteIndex { section: u64 },
+    /// Delete glob section
+    DeleteGlob { section: u64 },
+    /// Extend index with garbage
+    ExtendIndex { section: u64, garbage: [u8; 32] },
+    /// Extend glob with garbage
+    ExtendGlob { section: u64, garbage: [u8; 64] },
+}
+
+impl<'a> Arbitrary<'a> for CorruptionType {
+    fn arbitrary(u: &mut Unstructured<'a>) -> Result<Self> {
+        let variant = u.int_in_range(0..=7)?;
+        match variant {
+            0 => Ok(CorruptionType::TruncateIndex {
+                section: u.int_in_range(1..=3)?,
+                size_factor: u.arbitrary()?,
+            }),
+            1 => Ok(CorruptionType::TruncateGlob {
+                section: u.int_in_range(1..=3)?,
+                size_factor: u.arbitrary()?,
+            }),
+            2 => Ok(CorruptionType::CorruptIndexBytes {
+                section: u.int_in_range(1..=3)?,
+                offset_factor: u.arbitrary()?,
+                data: u.arbitrary()?,
+            }),
+            3 => Ok(CorruptionType::CorruptGlobBytes {
+                section: u.int_in_range(1..=3)?,
+                offset_factor: u.arbitrary()?,
+                data: u.arbitrary()?,
+            }),
+            4 => Ok(CorruptionType::DeleteIndex {
+                section: u.int_in_range(1..=3)?,
+            }),
+            5 => Ok(CorruptionType::DeleteGlob {
+                section: u.int_in_range(1..=3)?,
+            }),
+            6 => Ok(CorruptionType::ExtendIndex {
+                section: u.int_in_range(1..=3)?,
+                garbage: u.arbitrary()?,
+            }),
+            _ => Ok(CorruptionType::ExtendGlob {
+                section: u.int_in_range(1..=3)?,
+                garbage: u.arbitrary()?,
+            }),
+        }
+    }
+}
+
+#[derive(Arbitrary, Debug)]
+struct FuzzInput {
+    /// Number of entries per section (1-10)
+    entries_per_section: [u8; 3],
+    /// Corruptions to apply before recovery
+    corruptions: Vec<CorruptionType>,
+    /// Whether to sync before corruption
+    sync_before_corrupt: bool,
+}
+
+const PAGE_SIZE: NonZeroUsize = NZUsize!(128);
+const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(4);
+const INDEX_PARTITION: &str = "fuzz_index";
+const VALUE_PARTITION: &str = "fuzz_values";
+
+fn test_cfg() -> Config<()> {
+    Config {
+        index_partition: INDEX_PARTITION.to_string(),
+        value_partition: VALUE_PARTITION.to_string(),
+        index_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
+        index_write_buffer: NZUsize!(512),
+        value_write_buffer: NZUsize!(512),
+        compression: None,
+        codec_config: (),
+    }
+}
+
+fn fuzz(input: FuzzInput) {
+    let runner = deterministic::Runner::default();
+
+    runner.start(|context| async move {
+        let cfg = test_cfg();
+
+        // Phase 1: Create valid data
+        let mut oversized: Oversized<_, TestEntry, TestValue> =
+            Oversized::init(context.clone(), cfg.clone())
+                .await
+                .expect("Failed to init");
+
+        let mut entry_id = 0u64;
+        for (section_idx, &count) in input.entries_per_section.iter().enumerate() {
+            let section = (section_idx + 1) as u64;
+            let count = (count % 10) + 1; // 1-10 entries per section
+
+            for _ in 0..count {
+                let value: TestValue = [entry_id as u8; 16];
+                let entry = TestEntry::new(entry_id);
+                let _ = oversized.append(section, entry, &value).await;
+                entry_id += 1;
+            }
+            let _ = oversized.sync(section).await;
+        }
+
+        if input.sync_before_corrupt {
+            let _ = oversized.sync_all().await;
+        }
+        drop(oversized);
+
+        // Phase 2: Apply corruptions
+        for corruption in &input.corruptions {
+            match corruption {
+                CorruptionType::TruncateIndex {
+                    section,
+                    size_factor,
+                } => {
+                    if let Ok((blob, size)) =
+                        context.open(INDEX_PARTITION, &section.to_be_bytes()).await
+                    {
+                        let new_size = (size * (*size_factor as u64)) / 256;
+                        let _ = blob.resize(new_size).await;
+                        let _ = blob.sync().await;
+                    }
+                }
+                CorruptionType::TruncateGlob {
+                    section,
+                    size_factor,
+                } => {
+                    if let Ok((blob, size)) =
+                        context.open(VALUE_PARTITION, &section.to_be_bytes()).await
+                    {
+                        let new_size = (size * (*size_factor as u64)) / 256;
+                        let _ = blob.resize(new_size).await;
+                        let _ = blob.sync().await;
+                    }
+                }
+                CorruptionType::CorruptIndexBytes {
+                    section,
+                    offset_factor,
+                    data,
+                } => {
+                    if let Ok((blob, size)) =
+                        context.open(INDEX_PARTITION, &section.to_be_bytes()).await
+                    {
+                        if size > 0 {
+                            let offset = (size * (*offset_factor as u64)) / 256;
+                            let _ = blob.write_at(data.to_vec(), offset).await;
+                            let _ = blob.sync().await;
+                        }
+                    }
+                }
+                CorruptionType::CorruptGlobBytes {
+                    section,
+                    offset_factor,
+                    data,
+                } => {
+                    if let Ok((blob, size)) =
+                        context.open(VALUE_PARTITION, &section.to_be_bytes()).await
+                    {
+                        if size > 0 {
+                            let offset = (size * (*offset_factor as u64)) / 256;
+                            let _ = blob.write_at(data.to_vec(), offset).await;
+                            let _ = blob.sync().await;
+                        }
+                    }
+                }
+                CorruptionType::DeleteIndex { section } => {
+                    let _ = context
+                        .remove(INDEX_PARTITION, Some(&section.to_be_bytes()))
+                        .await;
+                }
+                CorruptionType::DeleteGlob { section } => {
+                    let _ = context
+                        .remove(VALUE_PARTITION, Some(&section.to_be_bytes()))
+                        .await;
+                }
+                CorruptionType::ExtendIndex { section, garbage } => {
+                    if let Ok((blob, size)) =
+                        context.open(INDEX_PARTITION, &section.to_be_bytes()).await
+                    {
+                        let _ = blob.write_at(garbage.to_vec(), size).await;
+                        let _ = blob.sync().await;
+                    }
+                }
+                CorruptionType::ExtendGlob { section, garbage } => {
+                    if let Ok((blob, size)) =
+                        context.open(VALUE_PARTITION, &section.to_be_bytes()).await
+                    {
+                        let _ = blob.write_at(garbage.to_vec(), size).await;
+                        let _ = blob.sync().await;
+                    }
+                }
+            }
+        }
+
+        // Phase 3: Recovery - this should not panic
+        let mut recovered: Oversized<_, TestEntry, TestValue> =
+            Oversized::init(context.clone(), cfg.clone())
+                .await
+                .expect("Recovery should not fail");
+
+        // Phase 4: Verify get operations don't panic
+        // Note: Value checksums are verified lazily on read, not during recovery.
+        // So an entry may exist but get_value() may return ChecksumMismatch - this is expected.
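+        // The loop below walks positions 0.. in each section until the first
+        // read error; it asserts only panic-freedom, not how many entries
+        // survived the injected corruption.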
+        for section in 1u64..=3 {
+            let mut pos = 0u64;
+            while let Ok(entry) = recovered.get(section, pos).await {
+                // Entry exists, verify get_value doesn't panic (may return error)
+                let (offset, size) = entry.value_location();
+                let _ = recovered.get_value(section, offset, size).await;
+                pos += 1;
+            }
+        }
+
+        // Phase 5: Verify we can append after recovery
+        for section in 1u64..=3 {
+            let value: TestValue = [0xFF; 16];
+            let entry = TestEntry::new(u64::MAX);
+            let append_result = recovered.append(section, entry, &value).await;
+
+            // Append should succeed (recovery should have left journal in appendable state)
+            assert!(
+                append_result.is_ok(),
+                "Should be able to append to section {section} after recovery"
+            );
+        }
+
+        let _ = recovered.destroy().await;
+    });
+}
+
+fuzz_target!(|input: FuzzInput| {
+    fuzz(input);
+});
diff --git a/storage/src/archive/benches/get.rs b/storage/src/archive/benches/get.rs
index 408fe4db3c..cb6a81a276 100644
--- a/storage/src/archive/benches/get.rs
+++ b/storage/src/archive/benches/get.rs
@@ -13,8 +13,16 @@
 use rand::{rngs::StdRng, Rng, SeedableRng};
 use std::{hint::black_box, time::Instant};

 /// Items pre-loaded into the archive.
+#[cfg(not(full_bench))]
+const ITEMS: u64 = 10_000;
+#[cfg(full_bench)]
 const ITEMS: u64 = 250_000;

+#[cfg(not(full_bench))]
+const READS: [usize; 1] = [1_000];
+#[cfg(full_bench)]
+const READS: [usize; 3] = [1_000, 10_000, 50_000];
+
 fn select_keys(keys: &[Key], reads: usize) -> Vec<Key> {
     let mut rng = StdRng::seed_from_u64(42);
     let mut selected_keys = Vec::with_capacity(reads);
@@ -76,7 +84,7 @@ fn bench_get(c: &mut Criterion) {
     let runner = tokio::Runner::new(cfg.clone());
     for mode in ["serial", "concurrent"] {
         for pattern in ["key", "index"] {
-            for reads in [1_000, 10_000, 50_000] {
+            for reads in READS {
                 let label = format!(
                     "{}/variant={} mode={} pattern={} comp={} reads={}",
                     module_path!(),
diff --git a/storage/src/archive/benches/put.rs b/storage/src/archive/benches/put.rs
index 3413348a32..677411ab26 100644
--- a/storage/src/archive/benches/put.rs
+++ b/storage/src/archive/benches/put.rs
@@ -4,11 +4,16 @@
 use commonware_storage::archive::Archive as _;
 use criterion::{criterion_group, Criterion};
 use std::time::{Duration, Instant};

+#[cfg(not(full_bench))]
+const ITEMS: [u64; 1] = [10_000];
+#[cfg(full_bench)]
+const ITEMS: [u64; 3] = [10_000, 50_000, 100_000];
+
 fn bench_put(c: &mut Criterion) {
     let runner = tokio::Runner::default();
     for variant in [Variant::Prunable, Variant::Immutable] {
         for compression in [None, Some(3)] {
-            for items in [10_000, 50_000, 100_000] {
+            for items in ITEMS {
                 let label = format!(
                     "{}/variant={} items={} comp={}",
                     module_path!(),
diff --git a/storage/src/archive/benches/restart.rs b/storage/src/archive/benches/restart.rs
index f9635c7958..4b7cef964c 100644
--- a/storage/src/archive/benches/restart.rs
+++ b/storage/src/archive/benches/restart.rs
@@ -8,12 +8,17 @@
 use commonware_storage::archive::Archive as _;
 use criterion::{criterion_group, Criterion};
 use std::time::{Duration, Instant};

+#[cfg(not(full_bench))]
+const ITEMS: [u64; 1] = [10_000];
+#[cfg(full_bench)]
+const ITEMS: [u64; 4] = [10_000, 50_000, 100_000, 500_000];
+
 fn bench_restart(c: &mut Criterion) {
     // Create a config we can use across all benchmarks (with a fixed `storage_directory`).
     let cfg = Config::default();
     for variant in [Variant::Prunable, Variant::Immutable] {
         for compression in [None, Some(3)] {
-            for items in [10_000, 50_000, 100_000, 500_000] {
+            for items in ITEMS {
                 let builder = commonware_runtime::tokio::Runner::new(cfg.clone());
                 builder.start(|ctx| async move {
                     let mut a = Archive::init(ctx, variant, compression).await;
diff --git a/storage/src/archive/benches/utils.rs b/storage/src/archive/benches/utils.rs
index 21c08b5b7e..45d13694e6 100644
--- a/storage/src/archive/benches/utils.rs
+++ b/storage/src/archive/benches/utils.rs
@@ -1,5 +1,6 @@
 //! Helpers shared by the Archive benchmarks.

+use commonware_codec::config::RangeCfg;
 use commonware_runtime::{buffer::PoolRef, tokio::Context};
 use commonware_storage::{
     archive::{immutable, prunable, Archive as ArchiveTrait, Identifier},
@@ -11,23 +12,26 @@
 use std::num::NonZeroUsize;

 /// Number of bytes that can be buffered in a section before being written to a
 /// [commonware_runtime::Blob].
-const WRITE_BUFFER: usize = 1024 * 1024; // 1MB
+const WRITE_BUFFER: usize = 8 * 1024 * 1024; // 8MB

 /// Number of items per section (the granularity of pruning).
-const ITEMS_PER_SECTION: u64 = 1_024;
+const ITEMS_PER_SECTION: u64 = 16_384;

 /// Number of bytes to buffer when replaying a [commonware_runtime::Blob].
 const REPLAY_BUFFER: usize = 1024 * 1024; // 1MB

-/// Use a "prod sized" page size to test the performance of the journal.
-const PAGE_SIZE: NonZeroUsize = NZUsize!(16_384);
+/// Page size for the index buffer pool.
+const PAGE_SIZE: NonZeroUsize = NZUsize!(4_096);

-/// The number of pages to cache in the buffer pool.
-const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(10_000);
+/// The number of pages to cache in the buffer pool (8,192 × 4KB = 32MB).
+const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(8_192);

-/// Fixed-length key and value types.
+/// Key type (fixed-length) and value type (variable-length for large values).
 pub type Key = FixedBytes<64>;
-pub type Val = FixedBytes<32>;
+pub type Val = Vec<u8>;
+
+/// Size of values in bytes (64KB, representative of block data).
+const VALUE_SIZE: usize = 65536;

 /// Archive variant to benchmark.
 #[derive(Debug, Clone, Copy)]
@@ -63,28 +67,33 @@ impl Archive {
                     freezer_table_initial_size: 131_072,
                     freezer_table_resize_frequency: 4,
                     freezer_table_resize_chunk_size: 1024,
-                    freezer_journal_partition: "archive_bench_journal".into(),
-                    freezer_journal_target_size: 1024 * 1024 * 10, // 10MB
-                    freezer_journal_compression: compression,
-                    freezer_journal_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
+                    freezer_key_partition: "archive_bench_key".into(),
+                    freezer_key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
+                    freezer_value_partition: "archive_bench_value".into(),
+                    freezer_value_target_size: 128 * 1024 * 1024,
+                    freezer_value_compression: compression,
                     ordinal_partition: "archive_bench_ordinal".into(),
                     items_per_section: NZU64!(ITEMS_PER_SECTION),
-                    write_buffer: NZUsize!(WRITE_BUFFER),
+                    freezer_key_write_buffer: NZUsize!(WRITE_BUFFER),
+                    freezer_value_write_buffer: NZUsize!(WRITE_BUFFER),
+                    ordinal_write_buffer: NZUsize!(WRITE_BUFFER),
                     replay_buffer: NZUsize!(REPLAY_BUFFER),
-                    codec_config: (),
+                    codec_config: (RangeCfg::new(..), ()),
                 };
                 Self::Immutable(immutable::Archive::init(ctx, cfg).await.unwrap())
             }
             Variant::Prunable => {
                 let cfg = prunable::Config {
-                    partition: "archive_bench_partition".into(),
                     translator: TwoCap,
+                    key_partition: "archive_bench_key".into(),
+                    key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
+                    value_partition: "archive_bench_value".into(),
                     compression,
-                    codec_config: (),
+                    codec_config: (RangeCfg::new(..), ()),
                     items_per_section: NZU64!(ITEMS_PER_SECTION),
-                    write_buffer: NZUsize!(WRITE_BUFFER),
+                    key_write_buffer: NZUsize!(WRITE_BUFFER),
+                    value_write_buffer: NZUsize!(WRITE_BUFFER),
                     replay_buffer: NZUsize!(REPLAY_BUFFER),
-                    buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
                 };
                 Self::Prunable(prunable::Archive::init(ctx, cfg).await.unwrap())
             }
@@ -183,15 +192,16 @@ impl ArchiveTrait for Archive {
 pub async fn append_random(archive: &mut Archive, count: u64) -> Vec<Key> {
     let mut rng = StdRng::seed_from_u64(0);
     let mut key_buf = [0u8; 64];
-    let mut val_buf = [0u8; 32];
     let mut keys = Vec::with_capacity(count as usize);
     for i in 0..count {
         rng.fill_bytes(&mut key_buf);
         let key = Key::new(key_buf);
         keys.push(key.clone());
+
+        let mut val_buf = vec![0u8; VALUE_SIZE];
         rng.fill_bytes(&mut val_buf);
-        archive.put(i, key, Val::new(val_buf)).await.unwrap();
+        archive.put(i, key, val_buf).await.unwrap();
     }
     archive.sync().await.unwrap();
     keys
diff --git a/storage/src/archive/conformance.rs b/storage/src/archive/conformance.rs
new file mode 100644
index 0000000000..556ff7febe
--- /dev/null
+++ b/storage/src/archive/conformance.rs
@@ -0,0 +1,111 @@
+//! Archive conformance tests

+use crate::{
+    archive::{immutable, prunable, Archive as _},
+    translator::TwoCap,
+};
+use commonware_codec::DecodeExt;
+use commonware_conformance::{conformance_tests, Conformance};
+use commonware_runtime::{buffer::PoolRef, deterministic, Metrics, Runner};
+use commonware_utils::{sequence::FixedBytes, NZUsize, NZU64};
+use core::num::{NonZeroU64, NonZeroUsize};
+use rand::Rng;
+
+const WRITE_BUFFER: NonZeroUsize = NZUsize!(1024);
+const ITEMS_PER_SECTION: NonZeroU64 = NZU64!(1024);
+const PAGE_SIZE: NonZeroUsize = NZUsize!(1024);
+const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(10);
+
+struct ArchivePrunable;
+
+impl Conformance for ArchivePrunable {
+    async fn commit(seed: u64) -> Vec<u8> {
+        let runner = deterministic::Runner::seeded(seed);
+        runner.start(|mut context| async move {
+            let config = prunable::Config {
+                translator: TwoCap,
+                key_partition: format!("archive-prunable-key-{seed}"),
+                key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
+                value_partition: format!("archive-prunable-value-{seed}"),
+                compression: None,
+                codec_config: (),
+                items_per_section: ITEMS_PER_SECTION,
+                key_write_buffer: WRITE_BUFFER,
+                value_write_buffer: WRITE_BUFFER,
+                replay_buffer: WRITE_BUFFER,
+            };
+            let mut archive = prunable::Archive::<_, _, FixedBytes<64>, i32>::init(
+                context.with_label("archive"),
+                config,
+            )
+            .await
+            .unwrap();
+
+            // Write random items
+            let items_count = context.gen_range(100..500);
+            for i in 0..items_count {
+                let mut key_bytes = [0u8; 64];
+                context.fill(&mut key_bytes);
+                let key = FixedBytes::<64>::decode(key_bytes.as_ref()).unwrap();
+                let value: i32 = context.gen();
+                archive.put(i as u64, key, value).await.unwrap();
+            }
+            archive.sync().await.unwrap();
+
+            context.storage_audit().to_vec()
+        })
+    }
+}
+
+struct ArchiveImmutable;
+
+impl Conformance for ArchiveImmutable {
+    async fn commit(seed: u64) -> Vec<u8> {
+        let runner = deterministic::Runner::seeded(seed);
+        runner.start(|mut context| async move {
+            let config = immutable::Config {
+                metadata_partition: format!("archive-immutable-metadata-{seed}"),
+                freezer_table_partition: format!("archive-immutable-table-{seed}"),
+                freezer_table_initial_size: 64,
+                freezer_table_resize_frequency: 2,
+                freezer_table_resize_chunk_size: 32,
+                freezer_key_partition: format!("archive-immutable-key-{seed}"),
+                freezer_key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
+                freezer_value_partition: format!("archive-immutable-value-{seed}"),
+                freezer_value_target_size: 1024 * 1024,
+                freezer_value_compression: None,
+                ordinal_partition: format!("archive-immutable-ordinal-{seed}"),
+                items_per_section: ITEMS_PER_SECTION,
+                freezer_key_write_buffer: WRITE_BUFFER,
+                freezer_value_write_buffer: WRITE_BUFFER,
+                ordinal_write_buffer: WRITE_BUFFER,
+                replay_buffer: WRITE_BUFFER,
+                codec_config: (),
+            };
+            let mut archive = immutable::Archive::<_, FixedBytes<64>, i32>::init(
+                context.with_label("archive"),
+                config,
+            )
+            .await
+            .unwrap();
+
+            // Write random items
+            let items_count = context.gen_range(100..500);
+            for i in 0..items_count {
+                let mut key_bytes = [0u8; 64];
+                context.fill(&mut key_bytes);
+                let key = FixedBytes::<64>::decode(key_bytes.as_ref()).unwrap();
+                let value: i32 = context.gen();
+                archive.put(i as u64, key, value).await.unwrap();
+            }
+            archive.sync().await.unwrap();
+
+            context.storage_audit().to_vec()
+        })
+    }
+}
+
+conformance_tests! {
+    ArchivePrunable => 128,
+    ArchiveImmutable => 128,
+}
diff --git a/storage/src/archive/immutable/mod.rs b/storage/src/archive/immutable/mod.rs
index a68acdde24..cc1cc85d19 100644
--- a/storage/src/archive/immutable/mod.rs
+++ b/storage/src/archive/immutable/mod.rs
@@ -43,13 +43,16 @@
 //!         freezer_table_initial_size: 65_536,
 //!         freezer_table_resize_frequency: 4,
 //!         freezer_table_resize_chunk_size: 16_384,
-//!         freezer_journal_partition: "journal".into(),
-//!         freezer_journal_target_size: 1024,
-//!         freezer_journal_compression: Some(3),
-//!         freezer_journal_buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)),
+//!         freezer_key_partition: "key".into(),
+//!         freezer_key_buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)),
+//!         freezer_value_partition: "value".into(),
+//!         freezer_value_target_size: 1024,
+//!         freezer_value_compression: Some(3),
 //!         ordinal_partition: "ordinal".into(),
 //!         items_per_section: NZU64!(1024),
-//!         write_buffer: NZUsize!(1024),
+//!         freezer_key_write_buffer: NZUsize!(1024),
+//!         freezer_value_write_buffer: NZUsize!(1024),
+//!         ordinal_write_buffer: NZUsize!(1024),
 //!         replay_buffer: NZUsize!(1024),
 //!         codec_config: (),
 //!     };
@@ -85,17 +88,20 @@ pub struct Config {
     /// The number of items to move during each resize operation (many may be required to complete a resize).
     pub freezer_table_resize_chunk_size: u32,

-    /// The partition to use for the archive's freezer journal.
-    pub freezer_journal_partition: String,
+    /// The partition to use for the archive's freezer keys.
+    pub freezer_key_partition: String,

-    /// The target size of the archive's freezer journal.
-    pub freezer_journal_target_size: u64,
+    /// The buffer pool to use for the archive's freezer keys.
+    pub freezer_key_buffer_pool: PoolRef,

-    /// The compression level to use for the archive's freezer journal.
-    pub freezer_journal_compression: Option<u8>,
+    /// The partition to use for the archive's freezer values.
+    pub freezer_value_partition: String,

-    /// The buffer pool to use for the archive's freezer journal.
-    pub freezer_journal_buffer_pool: PoolRef,
+    /// The target size of the archive's freezer value sections.
+    pub freezer_value_target_size: u64,
+
+    /// The compression level to use for the archive's freezer values.
+    pub freezer_value_compression: Option<u8>,

     /// The partition to use for the archive's ordinal.
     pub ordinal_partition: String,
@@ -103,9 +109,17 @@ pub struct Config {
     /// The number of items per section.
     pub items_per_section: NonZeroU64,

-    /// The amount of bytes that can be buffered in a section before being written to a
-    /// [commonware_runtime::Blob].
-    pub write_buffer: NonZeroUsize,
+    /// The amount of bytes that can be buffered for the freezer key journal before being
+    /// written to a [commonware_runtime::Blob].
+    pub freezer_key_write_buffer: NonZeroUsize,
+
+    /// The amount of bytes that can be buffered for the freezer value journal before being
+    /// written to a [commonware_runtime::Blob].
+    pub freezer_value_write_buffer: NonZeroUsize,
+
+    /// The amount of bytes that can be buffered for the ordinal journal before being
+    /// written to a [commonware_runtime::Blob].
+    pub ordinal_write_buffer: NonZeroUsize,

     /// The buffer size to use when replaying a [commonware_runtime::Blob].
     pub replay_buffer: NonZeroUsize,
@@ -135,13 +149,16 @@ mod tests {
             freezer_table_initial_size: 8192, // Must be power of 2
             freezer_table_resize_frequency: 4,
             freezer_table_resize_chunk_size: 8192,
-            freezer_journal_partition: "test_journal2".into(),
-            freezer_journal_target_size: 1024 * 1024,
-            freezer_journal_compression: Some(3),
-            freezer_journal_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
+            freezer_key_partition: "test_key2".into(),
+            freezer_key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
+            freezer_value_partition: "test_value2".into(),
+            freezer_value_target_size: 1024 * 1024,
+            freezer_value_compression: Some(3),
             ordinal_partition: "test_ordinal2".into(),
             items_per_section: NZU64!(512),
-            write_buffer: NZUsize!(1024),
+            freezer_key_write_buffer: NZUsize!(1024),
+            freezer_value_write_buffer: NZUsize!(1024),
+            ordinal_write_buffer: NZUsize!(1024),
             replay_buffer: NZUsize!(1024),
             codec_config: (),
         };
diff --git a/storage/src/archive/immutable/storage.rs b/storage/src/archive/immutable/storage.rs
index c49e2de0fa..a67cfa7a4f 100644
--- a/storage/src/archive/immutable/storage.rs
+++ b/storage/src/archive/immutable/storage.rs
@@ -126,11 +126,13 @@ impl Archive {
         let freezer = Freezer::init_with_checkpoint(
             context.with_label("freezer"),
             freezer::Config {
-                journal_partition: cfg.freezer_journal_partition,
-                journal_compression: cfg.freezer_journal_compression,
-                journal_write_buffer: cfg.write_buffer,
-                journal_target_size: cfg.freezer_journal_target_size,
-                journal_buffer_pool: cfg.freezer_journal_buffer_pool,
+                key_partition: cfg.freezer_key_partition,
+                key_write_buffer: cfg.freezer_key_write_buffer,
+                key_buffer_pool: cfg.freezer_key_buffer_pool,
+                value_partition: cfg.freezer_value_partition,
+                value_compression: cfg.freezer_value_compression,
+                value_write_buffer: cfg.freezer_value_write_buffer,
+                value_target_size: cfg.freezer_value_target_size,
                 table_partition: cfg.freezer_table_partition,
                 table_initial_size: cfg.freezer_table_initial_size,
                 table_resize_frequency: cfg.freezer_table_resize_frequency,
@@ -165,7 +167,7 @@ impl Archive {
             ordinal::Config {
                 partition: cfg.ordinal_partition,
                 items_per_blob: cfg.items_per_section,
-                write_buffer: cfg.write_buffer,
+                write_buffer: cfg.ordinal_write_buffer,
                 replay_buffer: cfg.replay_buffer,
             },
             Some(section_bits),
diff --git a/storage/src/archive/mod.rs b/storage/src/archive/mod.rs
index 1767c2e084..012f348be7 100644
--- a/storage/src/archive/mod.rs
+++ b/storage/src/archive/mod.rs
@@ -11,6 +11,9 @@
 use thiserror::Error;
 pub mod immutable;
 pub mod prunable;

+#[cfg(all(test, feature = "arbitrary"))]
+mod conformance;
+
 /// Subject of a `get` or `has` operation.
 pub enum Identifier<'a, K: Array> {
     Index(u64),
@@ -139,14 +142,16 @@ mod tests {
         compression: Option<u8>,
     ) -> impl Archive<Key = FixedBytes<64>, Value = i32> {
         let cfg = prunable::Config {
-            partition: "test".into(),
             translator: TwoCap,
+            key_partition: "test_key".into(),
+            key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
+            value_partition: "test_value".into(),
             compression,
             codec_config: (),
             items_per_section: NZU64!(1024),
-            write_buffer: NZUsize!(1024),
+            key_write_buffer: NZUsize!(1024),
+            value_write_buffer: NZUsize!(1024),
             replay_buffer: NZUsize!(1024),
-            buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
         };
         prunable::Archive::init(context, cfg).await.unwrap()
     }
@@ -161,13 +166,16 @@ mod tests {
             freezer_table_initial_size: 64,
             freezer_table_resize_frequency: 2,
             freezer_table_resize_chunk_size: 32,
-            freezer_journal_partition: "test_journal".into(),
-            freezer_journal_target_size: 1024 * 1024,
-            freezer_journal_compression: compression,
-            freezer_journal_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
+            freezer_key_partition: "test_key".into(),
+            freezer_key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
+            freezer_value_partition: "test_value".into(),
+            freezer_value_target_size: 1024 * 1024,
+            freezer_value_compression: compression,
             ordinal_partition: "test_ordinal".into(),
             items_per_section: NZU64!(1024),
-            write_buffer: NZUsize!(1024 * 1024),
+            freezer_key_write_buffer: NZUsize!(1024 * 1024),
+            freezer_value_write_buffer: NZUsize!(1024 * 1024),
+            ordinal_write_buffer: NZUsize!(1024 * 1024),
             replay_buffer: NZUsize!(1024 * 1024),
             codec_config: (),
         };
diff --git a/storage/src/archive/prunable/mod.rs b/storage/src/archive/prunable/mod.rs
index dcbb79fe6f..a6a7c2f550 100644
--- a/storage/src/archive/prunable/mod.rs
+++ b/storage/src/archive/prunable/mod.rs
@@ -1,23 +1,31 @@
 //! A prunable key-value store for ordered data.
 //!
-//! Data is stored in [crate::journal::segmented::variable::Journal] (an append-only log) and the
-//! location of written data is stored in-memory by both index and key (via
-//! [crate::index::unordered::Index]) to enable **single-read lookups** for both query patterns over
-//! archived data.
+//! Data is stored across two backends: [crate::journal::segmented::fixed] for fixed-size index entries and
+//! [crate::journal::segmented::glob::Glob] for values (managed by [crate::journal::segmented::oversized]).
+//! The location of written data is stored in-memory by both index and key (via [crate::index::unordered::Index])
+//! to enable efficient lookups (on average).
 //!
 //! _Notably, [Archive] does not make use of compaction nor on-disk indexes (and thus has no read
 //! nor write amplification during normal operation)._
 //!
 //! # Format
 //!
-//! [Archive] stores data in the following format:
+//! [Archive] uses a two-journal structure for efficient buffer pool usage:
 //!
+//! **Index Journal (segmented/fixed)** - Fixed-size entries for fast startup replay:
 //! ```text
-//! +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
-//! | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |10 |11 |12 |    ...    |
-//! +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
-//! |   Index(u64)  | Key(Fixed Size) |            Data             |
-//! +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+//! +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+//! | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |10 |11 |12 |13 |14 |15 |16 |17 |18 |19 |20 |21 |22 |23 |
+//! +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+//! |   Index(u64)  |Key(Fixed Size)|        val_offset(u64)        |         val_size(u32)         |
+//! +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
+//! ```
+//!
+//! **Value Blob** - Raw values with CRC32 checksums (direct reads, no buffer pool):
+//! ```text
+//! +---+---+---+---+---+---+---+---+---+---+---+---+
+//! |       Compressed Data (variable)      | CRC32 |
+//! +---+---+---+---+---+---+---+---+---+---+---+---+
 //! ```
 //!
 //! # Uniqueness
 //!
@@ -46,15 +54,13 @@
 //! _To avoid random memory reads in the common case, the in-memory index directly stores the first
 //! item in the linked list instead of a pointer to the first item._
 //!
-//! `index` is the key to the map used to serve lookups by `index` that stores the location of data
-//! in a given `Blob` (selected by `section = index & section_mask` to minimize the number of open
-//! [crate::journal::segmented::variable::Journal]s):
+//! `index` is the key to the map used to serve lookups by `index` that stores the position in the
+//! index journal (selected by `section = index / items_per_section * items_per_section` to minimize
+//! the number of open blobs):
 //!
-//! ```rust
-//! struct Location {
-//!     offset: u32,
-//!     len: u32,
-//! }
+//! ```text
+//! // Maps index -> position in index journal
+//! indices: BTreeMap<u64, u64>
 //! ```
 //!
 //! _If the [Translator] provided by the caller does not uniformly distribute keys across the key
@@ -64,7 +70,7 @@
 //! ## Memory Overhead
 //!
 //! [Archive] uses two maps to enable lookups by both index and key. The memory used to track each
-//! index item is `8 + 4 + 4` (where `8` is the index, `4` is the offset, and `4` is the length).
+//! index item is `8 + 8` (where `8` is the index and `8` is the position in the index journal).
 //! The memory used to track each key item is `~translated(key).len() + 16` bytes (where `16` is the
 //! size of the `Record` struct). This means that an [Archive] employing a [Translator] that uses
 //! the first `8` bytes of a key will use `~40` bytes to index each key.
@@ -78,17 +84,17 @@
 //! ## Lazy Index Cleanup
 //!
 //! Instead of performing a full iteration of the in-memory index, storing an additional in-memory
-//! index per `section`, or replaying a `section` of [crate::journal::segmented::variable::Journal],
+//! index per `section`, or replaying a `section` of the value blob,
 //! [Archive] lazily cleans up the [crate::index::unordered::Index] after pruning. When a new key is
 //! stored that overlaps (same translated value) with a pruned key, the pruned key is removed from
 //! the in-memory index.
 //!
-//! # Single Operation Reads
+//! # Read Path
 //!
-//! To enable single operation reads (i.e. reading all of an item in a single call to
-//! [commonware_runtime::Blob]), [Archive] caches the length of each item in its in-memory index.
-//! While it increases the footprint per key stored, the benefit of only ever performing a single
-//! operation to read a key (when there are no conflicts) is worth the tradeoff.
+//! All reads (by index or key) first read the index entry from the index journal to get the
+//! value location (offset and size), then read the value from the value blob. The index journal
+//! uses a buffer pool for caching, so hot entries are served from memory. Values are read directly
+//! from disk without caching to avoid polluting the buffer pool with large values.
 //!
 //! # Compression
 //!
@@ -121,13 +127,15 @@
 //!     // Create an archive
 //!     let cfg = Config {
 //!         translator: FourCap,
-//!         partition: "demo".into(),
+//!         key_partition: "demo_index".into(),
+//!         key_buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)),
+//!         value_partition: "demo_value".into(),
 //!         compression: Some(3),
 //!         codec_config: (),
 //!         items_per_section: NZU64!(1024),
-//!         write_buffer: NZUsize!(1024 * 1024),
+//!         key_write_buffer: NZUsize!(1024 * 1024),
+//!         value_write_buffer: NZUsize!(1024 * 1024),
 //!         replay_buffer: NZUsize!(4096),
-//!         buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)),
 //!     };
 //!     let mut archive = Archive::init(context, cfg).await.unwrap();
 //!
@@ -155,10 +163,16 @@ pub struct Config {
     /// If that is not the case, lookups may be O(n) instead of O(1).
     pub translator: T,

-    /// The partition to use for the archive's [crate::journal] storage.
-    pub partition: String,
+    /// The partition to use for the key journal (stores index+key metadata).
+    pub key_partition: String,
+
+    /// The buffer pool to use for the key journal.
+    pub key_buffer_pool: PoolRef,

-    /// The compression level to use for the archive's [crate::journal] storage.
+    /// The partition to use for the value blob (stores values).
+    pub value_partition: String,
+
+    /// The compression level to use for the value blob.
     pub compression: Option<u8>,

     /// The [commonware_codec::Codec] configuration to use for the value stored in the archive.
@@ -167,15 +181,16 @@ pub struct Config {
     /// The number of items per section (the granularity of pruning).
     pub items_per_section: NonZeroU64,

-    /// The amount of bytes that can be buffered in a section before being written to a
+    /// The amount of bytes that can be buffered for the key journal before being written to a
+    /// [commonware_runtime::Blob].
+    pub key_write_buffer: NonZeroUsize,
+
+    /// The amount of bytes that can be buffered for the value journal before being written to a
     /// [commonware_runtime::Blob].
-    pub write_buffer: NonZeroUsize,
+    pub value_write_buffer: NonZeroUsize,

     /// The buffer size to use when replaying a [commonware_runtime::Blob].
     pub replay_buffer: NonZeroUsize,
-
-    /// The buffer pool to use for the archive's [crate::journal] storage.
-    pub buffer_pool: PoolRef,
 }

 #[cfg(test)]
 mod tests {
@@ -186,7 +201,7 @@ mod tests {
         journal::Error as JournalError,
         translator::{FourCap, TwoCap},
     };
-    use commonware_codec::{varint::UInt, DecodeExt, EncodeSize, Error as CodecError};
+    use commonware_codec::{DecodeExt, Error as CodecError};
     use commonware_macros::{test_group, test_traced};
     use commonware_runtime::{deterministic, Blob, Metrics, Runner, Storage};
     use commonware_utils::{sequence::FixedBytes, NZUsize, NZU64};
@@ -214,14 +229,16 @@ mod tests {
         executor.start(|context| async move {
             // Initialize the archive
             let cfg = Config {
-                partition: "test_partition".into(),
                 translator: FourCap,
+                key_partition: "test_index".into(),
+                key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
+                value_partition: "test_value".into(),
                 codec_config: (),
                 compression: Some(3),
-                write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER),
+                key_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER),
+                value_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER),
                 replay_buffer: NZUsize!(DEFAULT_REPLAY_BUFFER),
                 items_per_section: NZU64!(DEFAULT_ITEMS_PER_SECTION),
-                buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
             };
             let mut archive = Archive::init(context.clone(), cfg.clone())
                 .await
@@ -240,21 +257,33 @@ mod tests {
             archive.sync().await.expect("Failed to sync archive");
             drop(archive);

-            // Initialize the archive again without compression
+            // Initialize the archive again without compression.
+            // Index journal replay succeeds (no compression), but value reads will fail.
             let cfg = Config {
-                partition: "test_partition".into(),
                 translator: FourCap,
+                key_partition: "test_index".into(),
+                key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
+                value_partition: "test_value".into(),
                 codec_config: (),
                 compression: None,
-                write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER),
+                key_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER),
+                value_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER),
                 replay_buffer: NZUsize!(DEFAULT_REPLAY_BUFFER),
                 items_per_section: NZU64!(DEFAULT_ITEMS_PER_SECTION),
-                buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
             };
-            let result = Archive::<_, _, FixedBytes<64>, i32>::init(context, cfg.clone()).await;
+            let archive = Archive::<_, _, FixedBytes<64>, i32>::init(context, cfg.clone())
+                .await
+                .unwrap();
+
+            // Getting the value should fail because compression settings mismatch.
+            // Without compression, the codec sees extra bytes after decoding the value
+            // (because the compressed data doesn't match the expected format).
+            let result: Result<Option<i32>, _> = archive.get(Identifier::Index(index)).await;
             assert!(matches!(
                 result,
-                Err(Error::Journal(JournalError::Codec(CodecError::EndOfBuffer)))
+                Err(Error::Journal(JournalError::Codec(CodecError::ExtraData(
+                    _
+                ))))
             ));
         });
     }
@@ -266,14 +295,16 @@ mod tests {
         executor.start(|context| async move {
             // Initialize the archive
             let cfg = Config {
-                partition: "test_partition".into(),
                 translator: FourCap,
+                key_partition: "test_index".into(),
+                key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
+                value_partition: "test_value".into(),
                 codec_config: (),
                 compression: None,
-                write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER),
+                key_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER),
+                value_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER),
                 replay_buffer: NZUsize!(DEFAULT_REPLAY_BUFFER),
                 items_per_section: NZU64!(DEFAULT_ITEMS_PER_SECTION),
-                buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
             };
             let mut archive = Archive::init(context.clone(), cfg.clone())
                 .await
@@ -293,31 +324,33 @@ mod tests {
             archive.sync().await.expect("Failed to sync archive");
             drop(archive);

-            // Corrupt the value
+            // Corrupt the index journal
             let section = (index / DEFAULT_ITEMS_PER_SECTION) * DEFAULT_ITEMS_PER_SECTION;
             let (blob, _) = context
-                .open("test_partition", &section.to_be_bytes())
+                .open("test_index", &section.to_be_bytes())
                 .await
                 .unwrap();
-            let value_location = 4 /* journal size */ + UInt(1u64).encode_size() as u64 /* index */ + 64 + 4 /* value length */;
-            blob.write_at(b"testdaty".to_vec(), value_location).await.unwrap();
+            blob.write_at(b"corrupt!".to_vec(), 8).await.unwrap();
             blob.sync().await.unwrap();

             // Initialize the archive again
             let archive = Archive::<_, _, FixedBytes<64>, i32>::init(
                 context,
                 Config {
-                    partition: "test_partition".into(),
                     translator: FourCap,
+                    key_partition: "test_index".into(),
+                    key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
+                    value_partition: "test_value".into(),
                     codec_config: (),
                     compression: None,
-                    write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER),
+                    key_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER),
+                    value_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER),
                     replay_buffer: NZUsize!(DEFAULT_REPLAY_BUFFER),
                     items_per_section: NZU64!(DEFAULT_ITEMS_PER_SECTION),
-                    buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
                 },
             )
-            .await.expect("Failed to initialize archive");
+            .await
+            .expect("Failed to initialize archive");

             // Check that the archive is empty
             let retrieved: Option<i32> = archive
@@ -335,14 +368,16 @@ mod tests {
         executor.start(|context| async move {
             // Initialize the archive
             let cfg = Config {
-                partition: "test_partition".into(),
                 translator: FourCap,
+                key_partition: "test_index".into(),
+                key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
+                value_partition: "test_value".into(),
                 codec_config: (),
                 compression: None,
-                write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER),
+                key_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER),
+                value_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER),
                 replay_buffer: NZUsize!(DEFAULT_REPLAY_BUFFER),
                 items_per_section: NZU64!(DEFAULT_ITEMS_PER_SECTION),
-                buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
             };
             let mut archive = Archive::init(context.clone(), cfg.clone())
                 .await
@@ -398,14 +433,16 @@ mod tests {
         executor.start(|context| async move {
             // Initialize the archive
             let cfg = Config {
-                partition: "test_partition".into(),
                 translator: FourCap,
+                key_partition: "test_index".into(),
+                key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
+                value_partition: "test_value".into(),
                 codec_config: (),
                 compression: None,
-                write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER),
+                key_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER),
+                value_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER),
                 replay_buffer: NZUsize!(DEFAULT_REPLAY_BUFFER),
                 items_per_section: NZU64!(DEFAULT_ITEMS_PER_SECTION),
-                buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
             };
             let mut archive = Archive::init(context.clone(), cfg.clone())
                 .await
@@ -455,14 +492,16 @@ mod tests {
         executor.start(|context| async move {
             // Initialize the archive
             let cfg = Config {
-                partition: "test_partition".into(),
                 translator: FourCap,
+                key_partition: "test_index".into(),
+                key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
+                value_partition: "test_value".into(),
                 codec_config: (),
                 compression: None,
-                write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER),
+                key_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER),
+                value_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER),
                 replay_buffer: NZUsize!(DEFAULT_REPLAY_BUFFER),
                 items_per_section: NZU64!(1), // no mask - each item is its own section
-                buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
             };
             let mut archive = Archive::init(context.clone(), cfg.clone())
                 .await
@@ -541,14 +580,16 @@ mod tests {
             // Initialize the archive
             let items_per_section = 256u64;
             let cfg = Config {
-                partition: "test_partition".into(),
                 translator: TwoCap,
+                key_partition: "test_index".into(),
+                key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
+                value_partition: "test_value".into(),
                 codec_config: (),
                 compression: None,
-                write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER),
+                key_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER),
+                value_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER),
                 replay_buffer: NZUsize!(DEFAULT_REPLAY_BUFFER),
                 items_per_section: NZU64!(items_per_section),
-                buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
             };
             let mut archive = Archive::init(context.clone(), cfg.clone())
                 .await
@@ -600,14 +641,16 @@ mod tests {
             // Reinitialize the archive
             let cfg = Config {
-                partition: "test_partition".into(),
                 translator: TwoCap,
+                key_partition: "test_index".into(),
+                key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
+                value_partition: "test_value".into(),
                 codec_config: (),
                 compression: None,
-                write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER),
+                key_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER),
+                value_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER),
                 replay_buffer: NZUsize!(DEFAULT_REPLAY_BUFFER),
                 items_per_section: NZU64!(items_per_section),
-                buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
             };
             let mut archive =
                 Archive::<_, _, _, FixedBytes<1024>>::init(context.clone(), cfg.clone())
diff --git a/storage/src/archive/prunable/storage.rs b/storage/src/archive/prunable/storage.rs
index 639a0ad504..9b1f48d318 100644
--- a/storage/src/archive/prunable/storage.rs
+++ b/storage/src/archive/prunable/storage.rs
@@ -2,11 +2,13 @@
 use super::{Config, Translator};
 use crate::{
     archive::{Error, Identifier},
     index::{unordered::Index, Unordered},
-    journal::segmented::variable::{Config as JConfig, Journal},
+    journal::segmented::oversized::{
+        Config as OversizedConfig, Oversized, Record as OversizedRecord,
+    },
     rmap::RMap,
 };
 use bytes::{Buf, BufMut};
-use commonware_codec::{varint::UInt, Codec, EncodeSize, Read, ReadExt, Write};
+use commonware_codec::{Codec, FixedSize, Read, ReadExt, Write};
 use commonware_runtime::{telemetry::metrics::status::GaugeExt, Metrics, Storage};
 use commonware_utils::Array;
 use futures::{future::try_join_all, pin_mut, StreamExt};
 use prometheus_client::metrics::{counter::Counter, gauge::Gauge};
 use std::collections::{BTreeMap, BTreeSet};
 use tracing::debug;

-/// Record stored in the `Archive`.
-struct Record<K: Array, V: Codec> {
+/// Index entry for the archive.
+#[derive(Debug, Clone, PartialEq)]
+struct Record<K: Array> {
+    /// The index for this entry.
     index: u64,
+    /// The key for this entry.
     key: K,
-    value: V,
+    /// Byte offset in value journal (same section).
+    value_offset: u64,
+    /// Size of value data in the value journal.
+    value_size: u32,
 }

-impl<K: Array, V: Codec> Record<K, V> {
-    /// Create a new `Record`.
-    const fn new(index: u64, key: K, value: V) -> Self {
-        Self { index, key, value }
+impl<K: Array> Record<K> {
+    /// Create a new [Record].
+    const fn new(index: u64, key: K, value_offset: u64, value_size: u32) -> Self {
+        Self {
+            index,
+            key,
+            value_offset,
+            value_size,
+        }
     }
 }

-impl<K: Array, V: Codec> Write for Record<K, V> {
+impl<K: Array> Write for Record<K> {
     fn write(&self, buf: &mut impl BufMut) {
-        UInt(self.index).write(buf);
+        self.index.write(buf);
         self.key.write(buf);
-        self.value.write(buf);
+        self.value_offset.write(buf);
+        self.value_size.write(buf);
     }
 }

-impl<K: Array, V: Codec> Read for Record<K, V> {
-    type Cfg = V::Cfg;
+impl<K: Array> Read for Record<K> {
+    type Cfg = ();

-    fn read_cfg(buf: &mut impl Buf, cfg: &Self::Cfg) -> Result<Self, commonware_codec::Error> {
-        let index = UInt::read(buf)?.into();
+    fn read_cfg(buf: &mut impl Buf, _: &Self::Cfg) -> Result<Self, commonware_codec::Error> {
+        let index = u64::read(buf)?;
         let key = K::read(buf)?;
-        let value = V::read_cfg(buf, cfg)?;
-        Ok(Self { index, key, value })
+        let value_offset = u64::read(buf)?;
+        let value_size = u32::read(buf)?;
+        Ok(Self {
+            index,
+            key,
+            value_offset,
+            value_size,
+        })
     }
 }

-impl<K: Array, V: Codec> EncodeSize for Record<K, V> {
-    fn encode_size(&self) -> usize {
-        UInt(self.index).encode_size() + K::SIZE + self.value.encode_size()
+impl<K: Array> FixedSize for Record<K> {
+    // index + key + value_offset + value_size
+    const SIZE: usize = u64::SIZE + K::SIZE + u64::SIZE + u32::SIZE;
+}
+
+impl<K: Array> OversizedRecord for Record<K> {
+    fn value_location(&self) -> (u64, u32) {
+        (self.value_offset, self.value_size)
+    }
+
+    fn with_location(mut self, offset: u64, size: u32) -> Self {
+        self.value_offset = offset;
+        self.value_size = size;
+        self
     }
 }

 #[cfg(feature = "arbitrary")]
-impl arbitrary::Arbitrary<'_> for Record<K, V>
+impl arbitrary::Arbitrary<'_> for Record<K>
 where
     K: for<'a> arbitrary::Arbitrary<'a>,
-    V: for<'a> arbitrary::Arbitrary<'a>,
 {
     fn arbitrary(u: &mut arbitrary::Unstructured<'_>) -> arbitrary::Result<Self> {
-        Ok(Self::new(
-            u.arbitrary::<u64>()?,
-            u.arbitrary::<K>()?,
-            u.arbitrary::<V>()?,
-        ))
+        Ok(Self {
+            index: u64::arbitrary(u)?,
+            key: K::arbitrary(u)?,
+            value_offset: u64::arbitrary(u)?,
+            value_size: u32::arbitrary(u)?,
+        })
     }
 }

 /// Implementation of `Archive` storage.
 pub struct Archive {
     items_per_section: u64,
-    journal: Journal<E, Record<K, V>>,
+
+    /// Combined index + value storage with crash recovery.
+    oversized: Oversized<E, Record<K>, V>,
+
     pending: BTreeSet<u64>,

-    // Oldest allowed section to read from. This is updated when `prune` is called.
+    /// Oldest allowed section to read from. Updated when `prune` is called.
     oldest_allowed: Option<u64>,

-    // To efficiently serve `get` and `has` requests, we map a translated representation of each key
-    // to its corresponding index. To avoid iterating over this keys map during pruning, we map said
-    // indexes to their locations in the journal.
+    /// Maps translated key representation to its corresponding index.
     keys: Index<T, u64>,
-    indices: BTreeMap<u64, Location>,
+
+    /// Maps index to position in index journal.
+    indices: BTreeMap<u64, u64>,
+
+    /// Interval tracking for gap detection.
intervals: RMap, + // Metrics items_tracked: Gauge, indices_pruned: Counter, unnecessary_reads: Counter, @@ -101,41 +138,40 @@ impl Archive) -> Result { - // Initialize journal - let journal = Journal::>::init( - context.with_label("journal"), - JConfig { - partition: cfg.partition, - compression: cfg.compression, - codec_config: cfg.codec_config, - buffer_pool: cfg.buffer_pool, - write_buffer: cfg.write_buffer, - }, - ) - .await?; - - // Initialize keys and run corruption check + // Initialize oversized journal + let oversized_cfg = OversizedConfig { + index_partition: cfg.key_partition, + value_partition: cfg.value_partition, + index_buffer_pool: cfg.key_buffer_pool, + index_write_buffer: cfg.key_write_buffer, + value_write_buffer: cfg.value_write_buffer, + compression: cfg.compression, + codec_config: cfg.codec_config, + }; + let oversized: Oversized, V> = + Oversized::init(context.with_label("oversized"), oversized_cfg).await?; + + // Initialize keys and replay index journal (no values read!) let mut indices = BTreeMap::new(); let mut keys = Index::new(context.with_label("index"), cfg.translator.clone()); let mut intervals = RMap::new(); { - debug!("initializing archive"); - let stream = journal.replay(0, 0, cfg.replay_buffer).await?; + debug!("initializing archive from index journal"); + let stream = oversized.replay(0, cfg.replay_buffer).await?; pin_mut!(stream); while let Some(result) = stream.next().await { - // Extract key from record - let (_, offset, _, data) = result?; + let (_section, position, entry) = result?; - // Store index - indices.insert(data.index, offset); + // Store index location (position in index journal) + indices.insert(entry.index, position); // Store index in keys - keys.insert(&data.key, data.index); + keys.insert(&entry.key, entry.index); // Store index in intervals - intervals.insert(data.index); + intervals.insert(entry.index); } debug!("archive initialized"); } @@ -170,7 +206,7 @@ impl Archive Archive *offset, + let position = match self.indices.get(&index) { + Some(pos) => *pos, None => return Ok(None), }; - // Fetch item from disk + // Fetch index entry to get value location let section = self.section(index); - let record = self.journal.get(section, offset).await?; - Ok(Some(record.value)) + let entry = self.oversized.get(section, position).await?; + let (value_offset, value_size) = entry.value_location(); + + // Fetch value directly from blob storage (bypasses buffer pool) + let value = self + .oversized + .get_value(section, value_offset, value_size) + .await?; + Ok(Some(value)) } async fn get_key(&self, key: &K) -> Result, Error> { @@ -214,14 +257,22 @@ impl Archive Archive Archive crate::archive::Ar return Ok(()); } - // Store item in journal - let record = Record::new(index, key.clone(), data); + // Write value and index entry atomically (glob first, then index) let section = self.section(index); - let (offset, _) = self.journal.append(section, record).await?; + let entry = Record::new(index, key.clone(), 0, 0); + let (position, _, _) = self.oversized.append(section, entry, &data).await?; - // Store index - self.indices.insert(index, offset); + // Store index location + self.indices.insert(index, position); // Store interval self.intervals.insert(index); @@ -345,12 +395,14 @@ impl crate::archive::Ar } async fn sync(&mut self) -> Result<(), Error> { - let mut syncs = Vec::with_capacity(self.pending.len()); - for section in self.pending.iter() { - syncs.push(self.journal.sync(*section)); - self.syncs.inc(); - } + // Collect pending sections and 
update metrics + let pending: Vec = self.pending.iter().copied().collect(); + self.syncs.inc_by(pending.len() as u64); + + // Sync oversized journal (handles both index and values) + let syncs: Vec<_> = pending.iter().map(|s| self.oversized.sync(*s)).collect(); try_join_all(syncs).await?; + self.pending.clear(); Ok(()) } @@ -376,7 +428,7 @@ impl crate::archive::Ar } async fn destroy(self) -> Result<(), Error> { - self.journal.destroy().await.map_err(Error::Journal) + Ok(self.oversized.destroy().await?) } } @@ -387,6 +439,6 @@ mod conformance { use commonware_utils::sequence::U64; commonware_conformance::conformance_tests! { - CodecConformance>> + CodecConformance> } } diff --git a/storage/src/cache/storage.rs b/storage/src/cache/storage.rs index 8fb9d0bdca..c1f681dc5f 100644 --- a/storage/src/cache/storage.rs +++ b/storage/src/cache/storage.rs @@ -66,7 +66,7 @@ pub struct Cache { // Oldest allowed section to read from. This is updated when `prune` is called. oldest_allowed: Option, - indices: BTreeMap, + indices: BTreeMap, intervals: RMap, items_tracked: Gauge, diff --git a/storage/src/freezer/benches/get.rs b/storage/src/freezer/benches/get.rs index b989dc0a58..8bb885a427 100644 --- a/storage/src/freezer/benches/get.rs +++ b/storage/src/freezer/benches/get.rs @@ -11,8 +11,16 @@ use rand::{rngs::StdRng, Rng, SeedableRng}; use std::{hint::black_box, time::Instant}; /// Items pre-loaded into the store. +#[cfg(not(full_bench))] +const ITEMS: u64 = 10_000; +#[cfg(full_bench)] const ITEMS: u64 = 250_000; +#[cfg(not(full_bench))] +const READS: [usize; 1] = [1_000]; +#[cfg(full_bench)] +const READS: [usize; 3] = [1_000, 10_000, 50_000]; + /// Select random keys for benchmarking. pub fn select_keys(count: usize, keys: &[Key]) -> Vec { let mut rng = StdRng::seed_from_u64(42); @@ -65,7 +73,7 @@ fn bench_get(c: &mut Criterion) { let runner = tokio::Runner::new(cfg.clone()); for pattern in ["random", "recent"] { for mode in ["serial", "concurrent"] { - for reads in [1_000, 10_000, 50_000] { + for reads in READS { let label = format!( "{}/pattern={} mode={} reads={}", module_path!(), diff --git a/storage/src/freezer/benches/put.rs b/storage/src/freezer/benches/put.rs index 9a5bfd80b5..cfabed5bbd 100644 --- a/storage/src/freezer/benches/put.rs +++ b/storage/src/freezer/benches/put.rs @@ -3,9 +3,14 @@ use commonware_runtime::benchmarks::{context, tokio}; use criterion::{criterion_group, Criterion}; use std::time::{Duration, Instant}; +#[cfg(not(full_bench))] +const ITEMS: [u64; 1] = [10_000]; +#[cfg(full_bench)] +const ITEMS: [u64; 4] = [10_000, 50_000, 100_000, 250_000]; + fn bench_put(c: &mut Criterion) { let runner = tokio::Runner::default(); - for items in [10_000, 50_000, 100_000, 250_000] { + for items in ITEMS { let label = format!("{}/items={}", module_path!(), items); c.bench_function(&label, |b| { b.to_async(&runner).iter_custom(move |iters| async move { diff --git a/storage/src/freezer/benches/restart.rs b/storage/src/freezer/benches/restart.rs index 1767744a32..f15780d74a 100644 --- a/storage/src/freezer/benches/restart.rs +++ b/storage/src/freezer/benches/restart.rs @@ -7,9 +7,14 @@ use commonware_runtime::{ use criterion::{criterion_group, Criterion}; use std::time::{Duration, Instant}; +#[cfg(not(full_bench))] +const ITEMS: [u64; 1] = [10_000]; +#[cfg(full_bench)] +const ITEMS: [u64; 4] = [10_000, 50_000, 100_000, 500_000]; + fn bench_restart(c: &mut Criterion) { let cfg = Config::default(); - for items in [10_000, 50_000, 100_000, 500_000] { + for items in ITEMS { // Populate the freezer 
with random keys let builder = commonware_runtime::tokio::Runner::new(cfg.clone()); builder.start(|ctx| async move { diff --git a/storage/src/freezer/benches/utils.rs b/storage/src/freezer/benches/utils.rs index 7899c5a1cb..87ebc0d555 100644 --- a/storage/src/freezer/benches/utils.rs +++ b/storage/src/freezer/benches/utils.rs @@ -7,10 +7,10 @@ use rand::{rngs::StdRng, RngCore, SeedableRng}; use std::num::NonZeroUsize; /// Number of bytes that can be buffered before being written to disk. -const JOURNAL_WRITE_BUFFER: usize = 1024 * 1024; // 1MB +const WRITE_BUFFER: usize = 1024 * 1024; // 1MB -/// Target size of each journal section before creating a new one. -const JOURNAL_TARGET_SIZE: u64 = 100 * 1024 * 1024; // 100MB +/// Target size of each value section before creating a new one. +const VALUE_TARGET_SIZE: u64 = 100 * 1024 * 1024; // 100MB /// Initial size of the table. const TABLE_INITIAL_SIZE: u32 = 65_536; @@ -24,8 +24,11 @@ const TABLE_RESIZE_CHUNK_SIZE: u32 = 1024; /// Size of the replay buffer when scanning the table. const TABLE_REPLAY_BUFFER: usize = 1024 * 1024; // 1MB -/// Partition for [Freezer] journal benchmarks. -pub const JOURNAL_PARTITION: &str = "freezer_bench_journal"; +/// Partition for [Freezer] keys. +pub const KEY_PARTITION: &str = "freezer_bench_key"; + +/// Partition for [Freezer] values. +pub const VALUE_PARTITION: &str = "freezer_bench_value"; /// Partition for [Freezer] table benchmarks. pub const TABLE_PARTITION: &str = "freezer_bench_table"; @@ -46,11 +49,13 @@ pub type FreezerType = Freezer; /// Open (or create) a freezer store. pub async fn init(ctx: Context) -> FreezerType { let cfg = Config { - journal_partition: JOURNAL_PARTITION.into(), - journal_compression: None, - journal_write_buffer: NZUsize!(JOURNAL_WRITE_BUFFER), - journal_target_size: JOURNAL_TARGET_SIZE, - journal_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + key_partition: KEY_PARTITION.into(), + key_write_buffer: NZUsize!(WRITE_BUFFER), + key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + value_partition: VALUE_PARTITION.into(), + value_compression: None, + value_write_buffer: NZUsize!(WRITE_BUFFER), + value_target_size: VALUE_TARGET_SIZE, table_partition: TABLE_PARTITION.into(), table_initial_size: TABLE_INITIAL_SIZE, table_resize_frequency: TABLE_RESIZE_FREQUENCY, diff --git a/storage/src/freezer/mod.rs b/storage/src/freezer/mod.rs index 7dda5392ae..f30492cedc 100644 --- a/storage/src/freezer/mod.rs +++ b/storage/src/freezer/mod.rs @@ -10,8 +10,13 @@ //! //! # Format //! -//! The [Freezer] uses a two-level architecture: an extendible hash table (written in a single [commonware_runtime::Blob]) -//! that maps keys to locations and a [crate::journal::segmented::variable::Journal] that stores key-value data. +//! The [Freezer] uses a three-level architecture: +//! 1. An extendible hash table (written in a single [commonware_runtime::Blob]) that maps keys to locations +//! 2. A key index journal ([crate::journal::segmented::fixed]) that stores keys and collision chain pointers +//! 3. A value journal ([crate::journal::segmented::glob]) that stores the actual values +//! +//! These journals are combined via [crate::journal::segmented::oversized], which coordinates +//! crash recovery between them. //! //! ```text //! +-----------------------------------------------------------------+ @@ -23,10 +28,18 @@ //! | | | | | | //! v v v v v v //! +-----------------------------------------------------------------+ -//! | Journal | -//! | Section 0: [Record 0][Record 1][Record 2]... 
| -//! | Section 1: [Record 10][Record 11][Record 12]... | -//! | Section N: [Record 100][Record 101][Record 102]... | +//! | Key Index Journal | +//! | Section 0: [Entry 0][Entry 1][Entry 2]... | +//! | Section 1: [Entry 10][Entry 11][Entry 12]... | +//! | Section N: [Entry 100][Entry 101][Entry 102]... | +//! +-------|---------|---------|---------|---------|---------|-------+ +//! | | | | | | +//! v v v v v v +//! +-----------------------------------------------------------------+ +//! | Value Journal | +//! | Section 0: [Value 0][Value 1][Value 2]... | +//! | Section 1: [Value 10][Value 11][Value 12]... | +//! | Section N: [Value 100][Value 101][Value 102]... | //! +-----------------------------------------------------------------+ //! ``` //! @@ -50,50 +63,61 @@ //! +-----------------+-------------------+ //! ``` //! -//! The journal stores variable-sized records, each containing a key-value pair and an optional pointer -//! to the next record in the collision chain (for keys that hash to the same table index). +//! The key index journal stores fixed-size entries containing a key, a pointer to the value in the +//! value journal, and an optional pointer to the next entry in the collision chain (for keys that +//! hash to the same table index). //! //! ```text //! +-------------------------------------+ -//! | Journal Record | +//! | Key Index Entry | //! +-------------------------------------+ -//! | Key: Array | -//! | Value: Codec | -//! | Next: Option<(u64, u32)> | +//! | Key: Array | +//! | Value Offset: u64 | +//! | Value Size: u32 | +//! | Next: Option<(u64, u32)> | //! +-------------------------------------+ //! ``` //! +//! The value journal stores the actual encoded values at the offsets referenced by the key index entries. +//! //! # Traversing Conflicts //! -//! When multiple keys hash to the same table index, they form a linked list within the journal: +//! When multiple keys hash to the same table index, they form a linked list within the key index +//! journal. Each key index entry points to its value in the value journal: //! //! ```text //! Hash Table: -//! [Index 42] +-------------------+ -//! | section: 2 | -//! | offset: 768 | -//! +---------+---------+ -//! | -//! Journal: v -//! [Section 2] +-----+------------+-----+-----+-----+-----+-----+-----+ -//! | ... | Key: "foo" | ... | ... | ... | ... | ... | ... | -//! | | Value: 42 | | | | | | | -//! | | Next:(1,512)---+ | | | | | | -//! +-----+------------+---+-+-----+-----+-----+-----+-----+ -//! | -//! v -//! [Section 1] +-----+-----+-----+------------+-----+-----+-----+-----+ -//! | ... | ... | ... | Key: "bar" | ... | ... | ... | ... | -//! | | | | Value: 84 | | | | | -//! | | | +---| Next:(0,256) | | | | -//! +-----+-----+-+---+------------+-----+-----+-----+-----+ -//! | -//! v -//! [Section 0] +-----+------------+-----+-----+-----+-----+-----+-----+ -//! | ... | Key: "baz" | ... | ... | ... | ... | ... | ... | -//! | | Value: 126 | | | | | | | -//! | | Next: None | | | | | | | -//! +-----+------------+-----+-----+-----+-----+-----+-----+ +//! [Index 42] +-------------------+ +//! | section: 2 | +//! | offset: 768 | +//! +---------+---------+ +//! | +//! Key Index Journal: v +//! [Section 2] +-----------------------+ +//! | Key: "foo" | +//! | ValOff: 100 | +//! | ValSize: 20 | +//! | Next: (1, 512) -------+---+ +//! +-----------------------+ | +//! v +//! [Section 1] +-----------------------+ +//! | Key: "bar" | +//! | ValOff: 50 | +//! | ValSize: 20 | +//! | Next: (0, 256) -------+---+ +//! 
+-----------------------+ | +//! v +//! [Section 0] +-----------------------+ +//! | Key: "baz" | +//! | ValOff: 0 | +//! | ValSize: 20 | +//! | Next: None | +//! +-----------------------+ +//! +//! Value Journal: +//! [Section 0] [Value: 126 @ offset 0 ] +//! [Section 1] [Value: 84 @ offset 50] +//! [Section 2] [Value: 42 @ offset 100] //! ``` //! //! New entries are prepended to the chain, becoming the new head. During lookup, the chain @@ -146,11 +170,13 @@ //! executor.start(|context| async move { //! // Create a freezer //! let cfg = Config { -//! journal_partition: "freezer_journal".into(), -//! journal_compression: Some(3), -//! journal_write_buffer: NZUsize!(1024 * 1024), // 1MB -//! journal_target_size: 100 * 1024 * 1024, // 100MB -//! journal_buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)), +//! key_partition: "freezer_key_index".into(), +//! key_write_buffer: NZUsize!(1024 * 1024), // 1MB +//! key_buffer_pool: PoolRef::new(NZUsize!(1024), NZUsize!(10)), +//! value_partition: "freezer_value_journal".into(), +//! value_compression: Some(3), +//! value_write_buffer: NZUsize!(1024 * 1024), // 1MB +//! value_target_size: 100 * 1024 * 1024, // 100MB //! table_partition: "freezer_table".into(), //! table_initial_size: 65_536, // ~3MB initial table size //! table_resize_frequency: 4, // Force resize once 4 writes to the same entry occur @@ -203,20 +229,26 @@ pub enum Error { /// Configuration for [Freezer]. #[derive(Clone)] pub struct Config { - /// The [commonware_runtime::Storage] partition to use for storing the journal. - pub journal_partition: String, + /// The [commonware_runtime::Storage] partition for the key index journal. + pub key_partition: String, + + /// The size of the write buffer for the key index journal. + pub key_write_buffer: NonZeroUsize, + + /// The buffer pool for the key index journal. + pub key_buffer_pool: PoolRef, - /// The compression level to use for the [crate::journal::segmented::variable::Journal]. - pub journal_compression: Option, + /// The [commonware_runtime::Storage] partition for the value journal. + pub value_partition: String, - /// The size of the write buffer to use for the journal. - pub journal_write_buffer: NonZeroUsize, + /// The compression level for the value journal. + pub value_compression: Option, - /// The target size of each journal before creating a new one. - pub journal_target_size: u64, + /// The size of the write buffer for the value journal. + pub value_write_buffer: NonZeroUsize, - /// The buffer pool to use for the journal. - pub journal_buffer_pool: PoolRef, + /// The target size of each value journal section before creating a new one. + pub value_target_size: u64, /// The [commonware_runtime::Storage] partition to use for storing the table. 
pub table_partition: String, @@ -247,8 +279,8 @@ mod tests { use commonware_utils::{hex, sequence::FixedBytes, NZUsize}; use rand::{Rng, RngCore}; - const DEFAULT_JOURNAL_WRITE_BUFFER: usize = 1024; - const DEFAULT_JOURNAL_TARGET_SIZE: u64 = 10 * 1024 * 1024; + const DEFAULT_WRITE_BUFFER: usize = 1024; + const DEFAULT_VALUE_TARGET_SIZE: u64 = 10 * 1024 * 1024; const DEFAULT_TABLE_INITIAL_SIZE: u32 = 256; const DEFAULT_TABLE_RESIZE_FREQUENCY: u8 = 4; const DEFAULT_TABLE_RESIZE_CHUNK_SIZE: u32 = 128; // force multiple chunks @@ -270,11 +302,13 @@ mod tests { executor.start(|context| async move { // Initialize the freezer let cfg = Config { - journal_partition: "test_journal".into(), - journal_compression: compression, - journal_write_buffer: NZUsize!(DEFAULT_JOURNAL_WRITE_BUFFER), - journal_target_size: DEFAULT_JOURNAL_TARGET_SIZE, - journal_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + key_partition: "test_key_index".into(), + key_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), + key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + value_partition: "test_value_journal".into(), + value_compression: compression, + value_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), + value_target_size: DEFAULT_VALUE_TARGET_SIZE, table_partition: "test_table".into(), table_initial_size: DEFAULT_TABLE_INITIAL_SIZE, table_resize_frequency: DEFAULT_TABLE_RESIZE_FREQUENCY, @@ -338,11 +372,13 @@ mod tests { executor.start(|context| async move { // Initialize the freezer let cfg = Config { - journal_partition: "test_journal".into(), - journal_compression: None, - journal_write_buffer: NZUsize!(DEFAULT_JOURNAL_WRITE_BUFFER), - journal_target_size: DEFAULT_JOURNAL_TARGET_SIZE, - journal_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + key_partition: "test_key_index".into(), + key_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), + key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + value_partition: "test_value_journal".into(), + value_compression: None, + value_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), + value_target_size: DEFAULT_VALUE_TARGET_SIZE, table_partition: "test_table".into(), table_initial_size: DEFAULT_TABLE_INITIAL_SIZE, table_resize_frequency: DEFAULT_TABLE_RESIZE_FREQUENCY, @@ -389,11 +425,13 @@ mod tests { executor.start(|context| async move { // Initialize the freezer with a very small table to force collisions let cfg = Config { - journal_partition: "test_journal".into(), - journal_compression: None, - journal_write_buffer: NZUsize!(DEFAULT_JOURNAL_WRITE_BUFFER), - journal_target_size: DEFAULT_JOURNAL_TARGET_SIZE, - journal_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + key_partition: "test_key_index".into(), + key_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), + key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + value_partition: "test_value_journal".into(), + value_compression: None, + value_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), + value_target_size: DEFAULT_VALUE_TARGET_SIZE, table_partition: "test_table".into(), table_initial_size: 4, // Very small to force collisions table_resize_frequency: DEFAULT_TABLE_RESIZE_FREQUENCY, @@ -450,11 +488,13 @@ mod tests { let executor = deterministic::Runner::default(); executor.start(|context| async move { let cfg = Config { - journal_partition: "test_journal".into(), - journal_compression: None, - journal_write_buffer: NZUsize!(DEFAULT_JOURNAL_WRITE_BUFFER), - journal_target_size: DEFAULT_JOURNAL_TARGET_SIZE, - journal_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + key_partition: 
"test_key_index".into(), + key_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), + key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + value_partition: "test_value_journal".into(), + value_compression: None, + value_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), + value_target_size: DEFAULT_VALUE_TARGET_SIZE, table_partition: "test_table".into(), table_initial_size: DEFAULT_TABLE_INITIAL_SIZE, table_resize_frequency: DEFAULT_TABLE_RESIZE_FREQUENCY, @@ -520,11 +560,13 @@ mod tests { let executor = deterministic::Runner::default(); executor.start(|context| async move { let cfg = Config { - journal_partition: "test_journal".into(), - journal_compression: None, - journal_write_buffer: NZUsize!(DEFAULT_JOURNAL_WRITE_BUFFER), - journal_target_size: DEFAULT_JOURNAL_TARGET_SIZE, - journal_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + key_partition: "test_key_index".into(), + key_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), + key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + value_partition: "test_value_journal".into(), + value_compression: None, + value_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), + value_target_size: DEFAULT_VALUE_TARGET_SIZE, table_partition: "test_table".into(), table_initial_size: DEFAULT_TABLE_INITIAL_SIZE, table_resize_frequency: DEFAULT_TABLE_RESIZE_FREQUENCY, @@ -619,11 +661,13 @@ mod tests { executor.start(|context| async move { // Initialize the freezer let cfg = Config { - journal_partition: "test_journal".into(), - journal_compression: None, - journal_write_buffer: NZUsize!(DEFAULT_JOURNAL_WRITE_BUFFER), - journal_target_size: DEFAULT_JOURNAL_TARGET_SIZE, - journal_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + key_partition: "test_key_index".into(), + key_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), + key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + value_partition: "test_value_journal".into(), + value_compression: None, + value_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), + value_target_size: DEFAULT_VALUE_TARGET_SIZE, table_partition: "test_table".into(), table_initial_size: DEFAULT_TABLE_INITIAL_SIZE, table_resize_frequency: DEFAULT_TABLE_RESIZE_FREQUENCY, @@ -678,11 +722,13 @@ mod tests { executor.start(|context| async move { // Initialize the freezer let cfg = Config { - journal_partition: "test_journal".into(), - journal_compression: None, - journal_write_buffer: NZUsize!(DEFAULT_JOURNAL_WRITE_BUFFER), - journal_target_size: DEFAULT_JOURNAL_TARGET_SIZE, - journal_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + key_partition: "test_key_index".into(), + key_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), + key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + value_partition: "test_value_journal".into(), + value_compression: None, + value_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), + value_target_size: DEFAULT_VALUE_TARGET_SIZE, table_partition: "test_table".into(), table_initial_size: DEFAULT_TABLE_INITIAL_SIZE, table_resize_frequency: DEFAULT_TABLE_RESIZE_FREQUENCY, @@ -736,11 +782,13 @@ mod tests { let executor = deterministic::Runner::default(); executor.start(|context| async move { let cfg = Config { - journal_partition: "test_journal".into(), - journal_compression: None, - journal_write_buffer: NZUsize!(DEFAULT_JOURNAL_WRITE_BUFFER), - journal_target_size: DEFAULT_JOURNAL_TARGET_SIZE, - journal_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + key_partition: "test_key_index".into(), + key_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), + key_buffer_pool: PoolRef::new(PAGE_SIZE, 
PAGE_CACHE_SIZE), + value_partition: "test_value_journal".into(), + value_compression: None, + value_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), + value_target_size: DEFAULT_VALUE_TARGET_SIZE, table_partition: "test_table".into(), table_initial_size: DEFAULT_TABLE_INITIAL_SIZE, table_resize_frequency: DEFAULT_TABLE_RESIZE_FREQUENCY, @@ -800,11 +848,13 @@ mod tests { let executor = deterministic::Runner::default(); executor.start(|context| async move { let cfg = Config { - journal_partition: "test_journal".into(), - journal_compression: None, - journal_write_buffer: NZUsize!(DEFAULT_JOURNAL_WRITE_BUFFER), - journal_target_size: DEFAULT_JOURNAL_TARGET_SIZE, - journal_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + key_partition: "test_key_index".into(), + key_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), + key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + value_partition: "test_value_journal".into(), + value_compression: None, + value_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), + value_target_size: DEFAULT_VALUE_TARGET_SIZE, table_partition: "test_table".into(), table_initial_size: DEFAULT_TABLE_INITIAL_SIZE, table_resize_frequency: DEFAULT_TABLE_RESIZE_FREQUENCY, @@ -875,11 +925,13 @@ mod tests { executor.start(|context| async move { // Initialize the freezer let cfg = Config { - journal_partition: "test_journal".into(), - journal_compression: None, - journal_write_buffer: NZUsize!(DEFAULT_JOURNAL_WRITE_BUFFER), - journal_target_size: DEFAULT_JOURNAL_TARGET_SIZE, - journal_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + key_partition: "test_key_index".into(), + key_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), + key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + value_partition: "test_value_journal".into(), + value_compression: None, + value_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), + value_target_size: DEFAULT_VALUE_TARGET_SIZE, table_partition: "test_table".into(), table_initial_size: 2, // Very small initial size to force multiple resizes table_resize_frequency: 2, // Resize after 2 items per entry @@ -944,11 +996,13 @@ mod tests { let executor = deterministic::Runner::default(); executor.start(|context| async move { let cfg = Config { - journal_partition: "test_journal".into(), - journal_compression: None, - journal_write_buffer: NZUsize!(DEFAULT_JOURNAL_WRITE_BUFFER), - journal_target_size: DEFAULT_JOURNAL_TARGET_SIZE, - journal_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + key_partition: "test_key_index".into(), + key_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), + key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + value_partition: "test_value_journal".into(), + value_compression: None, + value_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), + value_target_size: DEFAULT_VALUE_TARGET_SIZE, table_partition: "test_table".into(), table_initial_size: 2, table_resize_frequency: 1, @@ -1012,11 +1066,13 @@ mod tests { let executor = deterministic::Runner::default(); executor.start(|context| async move { let cfg = Config { - journal_partition: "test_journal".into(), - journal_compression: None, - journal_write_buffer: NZUsize!(DEFAULT_JOURNAL_WRITE_BUFFER), - journal_target_size: DEFAULT_JOURNAL_TARGET_SIZE, - journal_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + key_partition: "test_key_index".into(), + key_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), + key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + value_partition: "test_value_journal".into(), + value_compression: None, + value_write_buffer: 
NZUsize!(DEFAULT_WRITE_BUFFER), + value_target_size: DEFAULT_VALUE_TARGET_SIZE, table_partition: "test_table".into(), table_initial_size: 2, table_resize_frequency: 1, @@ -1074,11 +1130,13 @@ mod tests { executor.start(|mut context| async move { // Initialize the freezer let cfg = Config { - journal_partition: "test_journal".into(), - journal_compression: None, - journal_write_buffer: NZUsize!(DEFAULT_JOURNAL_WRITE_BUFFER), - journal_target_size: 128, // Force multiple journal sections - journal_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + key_partition: "test_key_index".into(), + key_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), + key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + value_partition: "test_value_journal".into(), + value_compression: None, + value_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), + value_target_size: 128, // Force multiple journal sections table_partition: "test_table".into(), table_initial_size: 8, // Small table to force collisions table_resize_frequency: 2, // Force resize frequently @@ -1228,11 +1286,13 @@ mod tests { executor.start(|context| async move { // Initialize the freezer let cfg = Config { - journal_partition: "test_journal".into(), - journal_compression: None, - journal_write_buffer: NZUsize!(DEFAULT_JOURNAL_WRITE_BUFFER), - journal_target_size: DEFAULT_JOURNAL_TARGET_SIZE, - journal_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + key_partition: "test_key_index".into(), + key_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), + key_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + value_partition: "test_value_journal".into(), + value_compression: None, + value_write_buffer: NZUsize!(DEFAULT_WRITE_BUFFER), + value_target_size: DEFAULT_VALUE_TARGET_SIZE, table_partition: "test_table".into(), table_initial_size: DEFAULT_TABLE_INITIAL_SIZE, table_resize_frequency: DEFAULT_TABLE_RESIZE_FREQUENCY, diff --git a/storage/src/freezer/storage.rs b/storage/src/freezer/storage.rs index 51a0fd74a8..5b1c133f8f 100644 --- a/storage/src/freezer/storage.rs +++ b/storage/src/freezer/storage.rs @@ -1,10 +1,12 @@ use super::{Config, Error, Identifier}; use crate::{ - journal::segmented::variable::{Config as JournalConfig, Journal}, + journal::segmented::oversized::{ + Config as OversizedConfig, Oversized, Record as OversizedRecord, + }, kv, Persistable, }; use bytes::{Buf, BufMut}; -use commonware_codec::{Codec, Encode, EncodeSize, FixedSize, Read, ReadExt, Write as CodecWrite}; +use commonware_codec::{Codec, Encode, FixedSize, Read, ReadExt, Write as CodecWrite}; use commonware_runtime::{buffer, Blob, Clock, Metrics, Storage}; use commonware_utils::{Array, Span}; use futures::future::{try_join, try_join_all}; @@ -25,14 +27,15 @@ const RESIZE_THRESHOLD: u64 = 50; #[derive(Hash, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] #[repr(transparent)] -pub struct Cursor([u8; u64::SIZE + u32::SIZE]); +pub struct Cursor([u8; u64::SIZE + u64::SIZE + u32::SIZE]); impl Cursor { /// Create a new [Cursor]. 
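Aside (illustration, not part of the patch): the widened `Cursor` below packs `(section: u64, offset: u64, size: u32)` big-endian into 20 bytes, so a cursor alone carries everything needed to read a value: which section, where in the value journal, and how many bytes to fetch. A standalone round-trip of the same layout, with hypothetical helper names:

fn pack_cursor(section: u64, offset: u64, size: u32) -> [u8; 20] {
    let mut buf = [0u8; 20];
    buf[..8].copy_from_slice(&section.to_be_bytes());
    buf[8..16].copy_from_slice(&offset.to_be_bytes());
    buf[16..].copy_from_slice(&size.to_be_bytes());
    buf
}

fn unpack_cursor(buf: &[u8; 20]) -> (u64, u64, u32) {
    (
        u64::from_be_bytes(buf[..8].try_into().unwrap()),
        u64::from_be_bytes(buf[8..16].try_into().unwrap()),
        u32::from_be_bytes(buf[16..].try_into().unwrap()),
    )
}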
- fn new(section: u64, offset: u32) -> Self { - let mut buf = [0u8; u64::SIZE + u32::SIZE]; + fn new(section: u64, offset: u64, size: u32) -> Self { + let mut buf = [0u8; u64::SIZE + u64::SIZE + u32::SIZE]; buf[..u64::SIZE].copy_from_slice(§ion.to_be_bytes()); - buf[u64::SIZE..].copy_from_slice(&offset.to_be_bytes()); + buf[u64::SIZE..u64::SIZE + u64::SIZE].copy_from_slice(&offset.to_be_bytes()); + buf[u64::SIZE + u64::SIZE..].copy_from_slice(&size.to_be_bytes()); Self(buf) } @@ -42,8 +45,13 @@ impl Cursor { } /// Get the offset of the cursor. - fn offset(&self) -> u32 { - u32::from_be_bytes(self.0[u64::SIZE..].try_into().unwrap()) + fn offset(&self) -> u64 { + u64::from_be_bytes(self.0[u64::SIZE..u64::SIZE + u64::SIZE].try_into().unwrap()) + } + + /// Get the size of the value. + fn size(&self) -> u32 { + u32::from_be_bytes(self.0[u64::SIZE + u64::SIZE..].try_into().unwrap()) } } @@ -51,7 +59,7 @@ impl Read for Cursor { type Cfg = (); fn read_cfg(buf: &mut impl Buf, _: &Self::Cfg) -> Result { - <[u8; u64::SIZE + u32::SIZE]>::read(buf).map(Self) + <[u8; u64::SIZE + u64::SIZE + u32::SIZE]>::read(buf).map(Self) } } @@ -62,7 +70,7 @@ impl CodecWrite for Cursor { } impl FixedSize for Cursor { - const SIZE: usize = u64::SIZE + u32::SIZE; + const SIZE: usize = u64::SIZE + u64::SIZE + u32::SIZE; } impl Span for Cursor {} @@ -86,9 +94,10 @@ impl std::fmt::Debug for Cursor { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, - "Cursor(section={}, offset={})", + "Cursor(section={}, offset={}, size={})", self.section(), - self.offset() + self.offset(), + self.size() ) } } @@ -97,9 +106,10 @@ impl std::fmt::Display for Cursor { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, - "Cursor(section={}, offset={})", + "Cursor(section={}, offset={}, size={})", self.section(), - self.offset() + self.offset(), + self.size() ) } } @@ -115,8 +125,8 @@ pub struct Checkpoint { epoch: u64, /// The section of the last committed operation. section: u64, - /// The size of the journal in the last committed section. - size: u64, + /// The size of the oversized index journal in the last committed section. + oversized_size: u64, /// The size of the table. table_size: u32, } @@ -128,7 +138,7 @@ impl Checkpoint { table_size, epoch: 0, section: 0, - size: 0, + oversized_size: 0, } } } @@ -138,12 +148,12 @@ impl Read for Checkpoint { fn read_cfg(buf: &mut impl Buf, _: &()) -> Result { let epoch = u64::read(buf)?; let section = u64::read(buf)?; - let size = u64::read(buf)?; + let oversized_size = u64::read(buf)?; let table_size = u32::read(buf)?; Ok(Self { epoch, section, - size, + oversized_size, table_size, }) } @@ -153,7 +163,7 @@ impl CodecWrite for Checkpoint { fn write(&self, buf: &mut impl BufMut) { self.epoch.write(buf); self.section.write(buf); - self.size.write(buf); + self.oversized_size.write(buf); self.table_size.write(buf); } } @@ -173,11 +183,11 @@ struct Entry { epoch: u64, // Section in which this slot was written section: u64, - // Offset in the section where this slot was written - offset: u32, + // Position in the key index for this section + position: u64, // Number of items added to this entry since last resize added: u8, - // CRC of (epoch | section | offset | added) + // CRC of (epoch | section | position | added) crc: u32, } @@ -186,46 +196,46 @@ impl Entry { const FULL_SIZE: usize = Self::SIZE * 2; /// Compute a checksum for [Entry]. 
- fn compute_crc(epoch: u64, section: u64, offset: u32, added: u8) -> u32 { + fn compute_crc(epoch: u64, section: u64, position: u64, added: u8) -> u32 { let mut hasher = crc32fast::Hasher::new(); hasher.update(&epoch.to_be_bytes()); hasher.update(§ion.to_be_bytes()); - hasher.update(&offset.to_be_bytes()); + hasher.update(&position.to_be_bytes()); hasher.update(&added.to_be_bytes()); hasher.finalize() } /// Create a new [Entry]. - fn new(epoch: u64, section: u64, offset: u32, added: u8) -> Self { + fn new(epoch: u64, section: u64, position: u64, added: u8) -> Self { Self { epoch, section, - offset, + position, added, - crc: Self::compute_crc(epoch, section, offset, added), + crc: Self::compute_crc(epoch, section, position, added), } } /// Check if this entry is empty (all zeros). const fn is_empty(&self) -> bool { - self.section == 0 && self.offset == 0 && self.crc == 0 + self.section == 0 && self.position == 0 && self.crc == 0 } /// Check if this entry is valid. fn is_valid(&self) -> bool { - Self::compute_crc(self.epoch, self.section, self.offset, self.added) == self.crc + Self::compute_crc(self.epoch, self.section, self.position, self.added) == self.crc } } impl FixedSize for Entry { - const SIZE: usize = u64::SIZE + u64::SIZE + u32::SIZE + u8::SIZE + u32::SIZE; + const SIZE: usize = u64::SIZE + u64::SIZE + u64::SIZE + u8::SIZE + u32::SIZE; } impl CodecWrite for Entry { fn write(&self, buf: &mut impl BufMut) { self.epoch.write(buf); self.section.write(buf); - self.offset.write(buf); + self.position.write(buf); self.added.write(buf); self.crc.write(buf); } @@ -236,70 +246,126 @@ impl Read for Entry { fn read_cfg(buf: &mut impl Buf, _: &Self::Cfg) -> Result { let epoch = u64::read(buf)?; let section = u64::read(buf)?; - let offset = u32::read(buf)?; + let position = u64::read(buf)?; let added = u8::read(buf)?; let crc = u32::read(buf)?; Ok(Self { epoch, section, - offset, + position, added, crc, }) } } -/// A key-value pair stored in the [Journal]. -struct Record { +/// Sentinel value indicating no next entry in the collision chain. +const NO_NEXT_SECTION: u64 = u64::MAX; +const NO_NEXT_POSITION: u64 = u64::MAX; + +/// Key entry stored in the segmented/fixed key index journal. +/// +/// All fields are fixed size, enabling efficient collision chain traversal +/// without reading large values. +/// +/// The `next` pointer uses sentinel values (u64::MAX, u64::MAX) to indicate +/// "no next entry" instead of Option, ensuring fixed-size encoding. +#[derive(Debug, Clone, PartialEq)] +struct Record { + /// The key for this entry. key: K, - value: V, - next: Option<(u64, u32)>, + /// Pointer to next entry in collision chain (section, position in key index). + /// Uses (u64::MAX, u64::MAX) as sentinel for "no next". + next_section: u64, + next_position: u64, + /// Byte offset in value journal (same section). + value_offset: u64, + /// Size of value data in the value journal. + value_size: u32, } -impl Record { +impl Record { /// Create a new [Record]. - const fn new(key: K, value: V, next: Option<(u64, u32)>) -> Self { - Self { key, value, next } + fn new(key: K, next: Option<(u64, u64)>, value_offset: u64, value_size: u32) -> Self { + let (next_section, next_position) = next.unwrap_or((NO_NEXT_SECTION, NO_NEXT_POSITION)); + Self { + key, + next_section, + next_position, + value_offset, + value_size, + } + } + + /// Get the next entry in the collision chain, if any. 
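Aside (illustration, not part of the patch): the sentinel convention above exists because the key index is now a fixed-size journal, where the Nth entry must sit at a computable offset, so every field must encode to the same width; an `Option`-style encoding with a tag byte and an optional payload would make records variable-length. A standalone sketch of the difference (the tag-byte scheme is a common `Option` encoding shown for contrast, not necessarily commonware_codec's exact wire format):

// Sentinel: always 16 bytes, whether or not a next entry exists.
fn encode_next_sentinel(next: Option<(u64, u64)>) -> [u8; 16] {
    let (section, position) = next.unwrap_or((u64::MAX, u64::MAX));
    let mut buf = [0u8; 16];
    buf[..8].copy_from_slice(&section.to_be_bytes());
    buf[8..].copy_from_slice(&position.to_be_bytes());
    buf
}

// Tag byte: 1 byte for None, 17 bytes for Some -- variable length.
fn encode_next_tagged(next: Option<(u64, u64)>) -> Vec<u8> {
    match next {
        None => vec![0u8],
        Some((section, position)) => {
            let mut buf = vec![1u8];
            buf.extend_from_slice(&section.to_be_bytes());
            buf.extend_from_slice(&position.to_be_bytes());
            buf
        }
    }
}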
+ const fn next(&self) -> Option<(u64, u64)> { + if self.next_section == NO_NEXT_SECTION && self.next_position == NO_NEXT_POSITION { + None + } else { + Some((self.next_section, self.next_position)) + } } } -impl CodecWrite for Record { +impl CodecWrite for Record { fn write(&self, buf: &mut impl BufMut) { self.key.write(buf); - self.value.write(buf); - self.next.write(buf); + self.next_section.write(buf); + self.next_position.write(buf); + self.value_offset.write(buf); + self.value_size.write(buf); } } -impl Read for Record { - type Cfg = V::Cfg; - fn read_cfg(buf: &mut impl Buf, cfg: &Self::Cfg) -> Result { +impl Read for Record { + type Cfg = (); + fn read_cfg(buf: &mut impl Buf, _: &Self::Cfg) -> Result { let key = K::read(buf)?; - let value = V::read_cfg(buf, cfg)?; - let next = Option::<(u64, u32)>::read_cfg(buf, &((), ()))?; + let next_section = u64::read(buf)?; + let next_position = u64::read(buf)?; + let value_offset = u64::read(buf)?; + let value_size = u32::read(buf)?; - Ok(Self { key, value, next }) + Ok(Self { + key, + next_section, + next_position, + value_offset, + value_size, + }) } } -impl EncodeSize for Record { - fn encode_size(&self) -> usize { - K::SIZE + self.value.encode_size() + self.next.encode_size() +impl FixedSize for Record { + // key + next_section + next_position + value_offset + value_size + const SIZE: usize = K::SIZE + u64::SIZE + u64::SIZE + u64::SIZE + u32::SIZE; +} + +impl OversizedRecord for Record { + fn value_location(&self) -> (u64, u32) { + (self.value_offset, self.value_size) + } + + fn with_location(mut self, offset: u64, size: u32) -> Self { + self.value_offset = offset; + self.value_size = size; + self } } #[cfg(feature = "arbitrary")] -impl arbitrary::Arbitrary<'_> for Record +impl arbitrary::Arbitrary<'_> for Record where K: for<'a> arbitrary::Arbitrary<'a>, - V: for<'a> arbitrary::Arbitrary<'a>, { fn arbitrary(u: &mut arbitrary::Unstructured<'_>) -> arbitrary::Result { Ok(Self { key: K::arbitrary(u)?, - value: V::arbitrary(u)?, - next: Option::<(u64, u32)>::arbitrary(u)?, + next_section: u64::arbitrary(u)?, + next_position: u64::arbitrary(u)?, + value_offset: u64::arbitrary(u)?, + value_size: u32::arbitrary(u)?, }) } } @@ -316,12 +382,14 @@ pub struct Freezer { table_resize_frequency: u8, table_resize_chunk_size: u32, - // Table blob that maps slots to journal chain heads + // Table blob that maps slots to key index chain heads table: E::Blob, - // Variable journal for storing entries - journal: Journal>, - journal_target_size: u64, + // Combined key index + value storage with crash recovery + oversized: Oversized, V>, + + // Target size for value blob sections + blob_target_size: u64, // Current section for new writes current_section: u64, @@ -491,20 +559,20 @@ impl Freezer { } /// Read the latest valid entry from two table slots. 
- fn read_latest_entry(entry1: &Entry, entry2: &Entry) -> Option<(u64, u32, u8)> { + fn read_latest_entry(entry1: &Entry, entry2: &Entry) -> Option<(u64, u64, u8)> { match ( !entry1.is_empty() && entry1.is_valid(), !entry2.is_empty() && entry2.is_valid(), ) { (true, true) => match entry1.epoch.cmp(&entry2.epoch) { - Ordering::Greater => Some((entry1.section, entry1.offset, entry1.added)), - Ordering::Less => Some((entry2.section, entry2.offset, entry2.added)), + Ordering::Greater => Some((entry1.section, entry1.position, entry1.added)), + Ordering::Less => Some((entry2.section, entry2.position, entry2.added)), Ordering::Equal => { unreachable!("two valid entries with the same epoch") } }, - (true, false) => Some((entry1.section, entry1.offset, entry1.added)), - (false, true) => Some((entry2.section, entry2.offset, entry2.added)), + (true, false) => Some((entry1.section, entry1.position, entry1.added)), + (false, true) => Some((entry2.section, entry2.position, entry2.added)), (false, false) => None, } } @@ -556,15 +624,18 @@ impl Freezer { "table_initial_size must be a power of 2" ); - // Initialize variable journal with a separate partition - let journal_config = JournalConfig { - partition: config.journal_partition, - compression: config.journal_compression, + // Initialize oversized journal (handles crash recovery) + let oversized_cfg = OversizedConfig { + index_partition: config.key_partition.clone(), + value_partition: config.value_partition.clone(), + index_buffer_pool: config.key_buffer_pool.clone(), + index_write_buffer: config.key_write_buffer, + value_write_buffer: config.value_write_buffer, + compression: config.value_compression, codec_config: config.codec_config, - write_buffer: config.journal_write_buffer, - buffer_pool: config.journal_buffer_pool, }; - let mut journal = Journal::init(context.with_label("journal"), journal_config).await?; + let mut oversized: Oversized, V> = + Oversized::init(context.with_label("oversized"), oversized_cfg).await?; // Open table blob let (table, table_len) = context @@ -583,7 +654,7 @@ impl Freezer { (0, Some(checkpoint)) => { assert_eq!(checkpoint.epoch, 0); assert_eq!(checkpoint.section, 0); - assert_eq!(checkpoint.size, 0); + assert_eq!(checkpoint.oversized_size, 0); assert_eq!(checkpoint.table_size, 0); Self::init_table(&table, config.table_initial_size).await?; @@ -597,9 +668,13 @@ impl Freezer { "table_size must be a power of 2" ); - // Rewind the journal to the committed section and offset - journal.rewind(checkpoint.section, checkpoint.size).await?; - journal.sync(checkpoint.section).await?; + // Rewind oversized to the committed section and key size + oversized + .rewind(checkpoint.section, checkpoint.oversized_size) + .await?; + + // Sync oversized + oversized.sync(checkpoint.section).await?; // Resize table if needed let expected_table_len = Self::table_offset(checkpoint.table_size); @@ -649,11 +724,14 @@ impl Freezer { table.sync().await?; } + // Get sizes from oversized (crash recovery already ran during init) + let oversized_size = oversized.size(max_section).await?; + ( Checkpoint { epoch: max_epoch, section: max_section, - size: journal.size(max_section).await?, + oversized_size, table_size, }, resizable, @@ -693,8 +771,8 @@ impl Freezer { table_resize_frequency: config.table_resize_frequency, table_resize_chunk_size: config.table_resize_chunk_size, table, - journal, - journal_target_size: config.journal_target_size, + oversized, + blob_target_size: config.value_target_size, current_section: checkpoint.section, next_epoch: 
checkpoint.epoch.checked_add(1).expect("epoch overflow"), modified_sections: BTreeSet::new(), @@ -730,15 +808,19 @@ impl Freezer { self.resizable as u64 >= self.table_resize_threshold } - /// Determine which journal section to write to based on current journal size. + /// Determine which blob section to write to based on current blob size. async fn update_section(&mut self) -> Result<(), Error> { - // Get the current section size - let size = self.journal.size(self.current_section).await?; + // Get the current value blob section size + let value_size = self.oversized.value_size(self.current_section).await?; // If the current section has reached the target size, create a new section - if size >= self.journal_target_size { + if value_size >= self.blob_target_size { self.current_section += 1; - debug!(size, section = self.current_section, "updated section"); + debug!( + size = value_size, + section = self.current_section, + "updated section" + ); } Ok(()) @@ -757,15 +839,19 @@ impl Freezer { let (entry1, entry2) = Self::read_table(&self.table, table_index).await?; let head = Self::read_latest_entry(&entry1, &entry2); - // Create new head of the chain - let entry = Record::new( + // Create key entry with pointer to previous head (value location set by oversized.append) + let key_entry = Record::new( key, - value, - head.map(|(section, offset, _)| (section, offset)), + head.map(|(section, position, _)| (section, position)), + 0, + 0, ); - // Append entry to the variable journal - let (offset, _) = self.journal.append(self.current_section, entry).await?; + // Write value and key entry (glob first, then index) + let (position, value_offset, value_size) = self + .oversized + .append(self.current_section, key_entry, &value) + .await?; // Update the number of items added to the entry. // @@ -780,7 +866,7 @@ impl Freezer { // Update the old position self.modified_sections.insert(self.current_section); - let new_entry = Entry::new(self.next_epoch, self.current_section, offset, added); + let new_entry = Entry::new(self.next_epoch, self.current_section, position, added); Self::update_head(&self.table, table_index, &entry1, &entry2, new_entry).await?; // If we're mid-resize and this entry has already been processed, update the new position too @@ -797,20 +883,23 @@ impl Freezer { // // The entries are still identical to the old ones, so we don't need to read them again. let new_table_index = self.table_size + table_index; - let new_entry = Entry::new(self.next_epoch, self.current_section, offset, added); + let new_entry = Entry::new(self.next_epoch, self.current_section, position, added); Self::update_head(&self.table, new_table_index, &entry1, &entry2, new_entry) .await?; } } - Ok(Cursor::new(self.current_section, offset)) + Ok(Cursor::new(self.current_section, value_offset, value_size)) } /// Get the value for a given [Cursor]. async fn get_cursor(&self, cursor: Cursor) -> Result { - let entry = self.journal.get(cursor.section(), cursor.offset()).await?; + let value = self + .oversized + .get_value(cursor.section(), cursor.offset(), cursor.size()) + .await?; - Ok(entry.value) + Ok(value) } /// Get the first value for a given key. 
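Aside (illustration, not part of the patch): the `put` path above relies on write ordering rather than atomicity. `Oversized::append` writes the value bytes first ("glob first") and only then the fixed-size key entry that points at them, so a crash between the two writes can orphan value bytes but can never publish an index entry that references data that was never written. The invariant in miniature, with `Vec`s standing in for the two journals; the types and names here are illustrative, not the crate's API:

struct TwoJournals {
    values: Vec<u8>,        // stand-in for the value (glob) journal
    index: Vec<(u64, u32)>, // stand-in for the fixed-size key index
}

impl TwoJournals {
    fn append(&mut self, value: &[u8]) -> u64 {
        // 1) Write the value first...
        let offset = self.values.len() as u64;
        self.values.extend_from_slice(value);
        // 2) ...then publish its location in the index. A crash between
        //    1) and 2) leaves orphaned value bytes, which a replay of the
        //    index never references.
        self.index.push((offset, value.len() as u32));
        (self.index.len() - 1) as u64 // position of the new index entry
    }
}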
@@ -820,29 +909,33 @@ impl Freezer { // Get head of the chain from table let table_index = self.table_index(key); let (entry1, entry2) = Self::read_table(&self.table, table_index).await?; - let Some((mut section, mut offset, _)) = Self::read_latest_entry(&entry1, &entry2) else { + let Some((mut section, mut position, _)) = Self::read_latest_entry(&entry1, &entry2) else { return Ok(None); }; // Follow the linked list chain to find the first matching key loop { - // Get the entry from the variable journal - let entry = self.journal.get(section, offset).await?; + // Get the key entry from the fixed key index (efficient, good cache locality) + let key_entry = self.oversized.get(section, position).await?; // Check if this key matches - if entry.key.as_ref() == key.as_ref() { - return Ok(Some(entry.value)); + if key_entry.key.as_ref() == key.as_ref() { + let value = self + .oversized + .get_value(section, key_entry.value_offset, key_entry.value_size) + .await?; + return Ok(Some(value)); } // Increment unnecessary reads self.unnecessary_reads.inc(); // Follow the chain - let Some(next) = entry.next else { + let Some(next) = key_entry.next() else { break; // End of chain }; section = next.0; - offset = next.1; + position = next.1; } Ok(None) @@ -918,7 +1011,7 @@ impl Freezer { let (entry1, entry2) = Self::parse_entries(entry_buf)?; // Get the current head - let (section, offset, added) = + let (section, position, added) = Self::read_latest_entry(&entry1, &entry2).unwrap_or((0, 0, 0)); // If the entry was over the threshold, decrement the resizable entries @@ -927,7 +1020,7 @@ impl Freezer { } // Rewrite the entries - let reset_entry = Entry::new(self.next_epoch, section, offset, 0); + let reset_entry = Entry::new(self.next_epoch, section, position, 0); Self::rewrite_entries(&mut writes, &entry1, &entry2, &reset_entry); } @@ -965,12 +1058,13 @@ impl Freezer { /// Each sync will process up to `table_resize_chunk_size` entries until the resize /// is complete. pub async fn sync(&mut self) -> Result { - // Sync all modified journal sections - let mut updates = Vec::with_capacity(self.modified_sections.len()); - for section in &self.modified_sections { - updates.push(self.journal.sync(*section)); - } - try_join_all(updates).await?; + // Sync all modified sections for oversized journal + let syncs: Vec<_> = self + .modified_sections + .iter() + .map(|section| self.oversized.sync(*section)) + .collect(); + try_join_all(syncs).await?; self.modified_sections.clear(); // Start a resize (if needed) @@ -988,10 +1082,13 @@ impl Freezer { let stored_epoch = self.next_epoch; self.next_epoch = self.next_epoch.checked_add(1).expect("epoch overflow"); + // Get size from oversized + let oversized_size = self.oversized.size(self.current_section).await?; + Ok(Checkpoint { epoch: stored_epoch, section: self.current_section, - size: self.journal.size(self.current_section).await?, + oversized_size, table_size: self.table_size, }) } @@ -1011,8 +1108,8 @@ impl Freezer { /// Close and remove any underlying blobs created by the [Freezer]. 
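Aside (illustration, not part of the patch): the lookup loop above deserializes only small fixed-size key entries while walking a collision chain; the potentially large, possibly compressed value is fetched exactly once, at the matching entry. The traversal in miniature, standalone, with an in-memory slice standing in for the key index journal:

struct ChainEntry {
    key: u64,
    value_loc: (u64, u32), // (offset, size) in the value journal
    next: Option<usize>,   // next entry in the collision chain
}

fn lookup(entries: &[ChainEntry], head: usize, key: u64) -> Option<(u64, u32)> {
    let mut cursor = Some(head);
    while let Some(i) = cursor {
        let entry = &entries[i];
        if entry.key == key {
            return Some(entry.value_loc); // only now would value bytes be read
        }
        // A non-matching entry is an "unnecessary read", but a cheap,
        // fixed-size one -- no value bytes are touched.
        cursor = entry.next;
    }
    None
}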
pub async fn destroy(self) -> Result<(), Error> { - // Destroy the journal (removes all journal sections) - self.journal.destroy().await?; + // Destroy oversized journal + self.oversized.destroy().await?; // Destroy the table drop(self.table); @@ -1085,6 +1182,6 @@ mod conformance { CodecConformance, CodecConformance, CodecConformance, - CodecConformance> + CodecConformance> } } diff --git a/storage/src/journal/conformance.rs b/storage/src/journal/conformance.rs index a67f4157f1..48936ef478 100644 --- a/storage/src/journal/conformance.rs +++ b/storage/src/journal/conformance.rs @@ -1,11 +1,16 @@ //! Journal conformance tests -use crate::journal::contiguous::{fixed, variable}; -use commonware_codec::RangeCfg; +use crate::journal::{ + contiguous::{fixed, variable}, + segmented::{fixed as segmented_fixed, glob, oversized, variable as segmented_variable}, +}; +use bytes::{Buf, BufMut}; +use commonware_codec::{FixedSize, RangeCfg, Read, ReadExt, Write}; use commonware_conformance::{conformance_tests, Conformance}; -use commonware_runtime::{buffer::PoolRef, deterministic, Blob, Metrics, Runner}; +use commonware_runtime::{buffer::PoolRef, deterministic, Metrics, Runner}; use commonware_utils::{NZUsize, NZU64}; use core::num::{NonZeroU64, NonZeroUsize}; +use oversized::Record; use rand::Rng; const WRITE_BUFFER: NonZeroUsize = NZUsize!(1024); @@ -13,14 +18,14 @@ const ITEMS_PER_BLOB: NonZeroU64 = NZU64!(4096); const PAGE_SIZE: NonZeroUsize = NZUsize!(1024); const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(10); -struct FixedJournal; +struct ContiguousFixed; -impl Conformance for FixedJournal { +impl Conformance for ContiguousFixed { async fn commit(seed: u64) -> Vec { let runner = deterministic::Runner::seeded(seed); runner.start(|mut context| async move { let config = fixed::Config { - partition: format!("fixed-journal-conformance-{seed}"), + partition: format!("contiguous-fixed-conformance-{seed}"), items_per_blob: ITEMS_PER_BLOB, buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), write_buffer: WRITE_BUFFER, @@ -37,35 +42,21 @@ impl Conformance for FixedJournal { journal.append(*item).await.unwrap(); } journal.sync().await.unwrap(); + drop(journal); - assert_eq!( - journal.blobs.len(), - data_to_write.len() / ITEMS_PER_BLOB.get() as usize - ); - - let mut contents: Vec = Vec::with_capacity(data_to_write.len() * size_of::()); - - // Read all blobs and the tail into a single buffer. 
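// The rewritten tests below commit to on-disk state via the runtime's
// `storage_audit` instead of hashing individual blobs by hand. A standalone
// sketch of the underlying idea (this mock is not the runtime's implementation):
// hash every (partition, blob name, contents) triple in sorted order, so any
// change to any persisted byte changes the digest.
use std::collections::BTreeMap;

fn audit(partitions: &BTreeMap<String, BTreeMap<Vec<u8>, Vec<u8>>>) -> u32 {
    let mut hasher = crc32fast::Hasher::new();
    for (partition, blobs) in partitions {
        for (name, contents) in blobs {
            hasher.update(partition.as_bytes());
            hasher.update(name);
            hasher.update(contents);
        }
    }
    hasher.finalize()
}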
- for (_, blob) in journal.blobs.iter() { - let buf = vec![0u8; blob.size().await as usize]; - contents.extend(blob.read_at(buf, 0).await.unwrap().as_ref()); - } - let buf = vec![0u8; journal.tail.size().await as usize]; - contents.extend(journal.tail.read_at(buf, 0).await.unwrap().as_ref()); - - contents + context.storage_audit().to_vec() }) } } -struct VariableJournal; +struct ContiguousVariable; -impl Conformance for VariableJournal { +impl Conformance for ContiguousVariable { async fn commit(seed: u64) -> Vec { let runner = deterministic::Runner::seeded(seed); runner.start(|mut context| async move { let config = variable::Config { - partition: format!("variable-journal-conformance-{seed}"), + partition: format!("contiguous-variable-conformance-{seed}"), items_per_section: ITEMS_PER_BLOB, buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), write_buffer: WRITE_BUFFER, @@ -84,41 +75,249 @@ impl Conformance for VariableJournal { item.resize(size, 0); context.fill(item.as_mut_slice()); } - let data_len = data_to_write.len(); - let data_flat_len = data_to_write.iter().map(|v| v.len()).sum(); for item in data_to_write { journal.append(item).await.unwrap(); } journal.sync().await.unwrap(); + drop(journal); + + context.storage_audit().to_vec() + }) + } +} + +struct SegmentedFixed; + +impl Conformance for SegmentedFixed { + async fn commit(seed: u64) -> Vec { + let runner = deterministic::Runner::seeded(seed); + runner.start(|mut context| async move { + let config = segmented_fixed::Config { + partition: format!("segmented-fixed-conformance-{seed}"), + buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + write_buffer: WRITE_BUFFER, + }; + let mut journal = + segmented_fixed::Journal::<_, u64>::init(context.with_label("journal"), config) + .await + .unwrap(); + + // Write items across multiple sections + let items_count = context.gen_range(0..(ITEMS_PER_BLOB.get() as usize) * 4); + let mut data_to_write = vec![0u64; items_count]; + context.fill(&mut data_to_write[..]); + + // Distribute items across sections 0, 1, 2 + for (i, item) in data_to_write.iter().enumerate() { + let section = (i % 3) as u64; + journal.append(section, *item).await.unwrap(); + } + + // Sync all sections + for section in 0..3 { + journal.sync(section).await.unwrap(); + } + drop(journal); + + context.storage_audit().to_vec() + }) + } +} + +struct SegmentedGlob; + +impl Conformance for SegmentedGlob { + async fn commit(seed: u64) -> Vec { + let runner = deterministic::Runner::seeded(seed); + runner.start(|mut context| async move { + let config = glob::Config { + partition: format!("segmented-glob-conformance-{seed}"), + compression: None, + codec_config: (RangeCfg::new(0..256), ()), + write_buffer: WRITE_BUFFER, + }; + let mut journal = glob::Glob::<_, Vec>::init(context.with_label("journal"), config) + .await + .unwrap(); + + // Write variable-size items across multiple sections + let items_count = context.gen_range(0..(ITEMS_PER_BLOB.get() as usize) * 4); + let mut data_to_write = vec![Vec::new(); items_count]; + for item in data_to_write.iter_mut() { + let size = context.gen_range(0..256); + item.resize(size, 0); + context.fill(item.as_mut_slice()); + } + + // Distribute items across sections 0, 1, 2 + for (i, item) in data_to_write.iter().enumerate() { + let section = (i % 3) as u64; + journal.append(section, item).await.unwrap(); + } + + // Sync all sections + for section in 0..3 { + journal.sync(section).await.unwrap(); + } + drop(journal); + + context.storage_audit().to_vec() + }) + } +} - assert_eq!( - 
journal.data.blobs.len(), - data_len.div_ceil(ITEMS_PER_BLOB.get() as usize), - ); +struct SegmentedVariable; - let mut contents: Vec = Vec::with_capacity(data_flat_len); +impl Conformance for SegmentedVariable { + async fn commit(seed: u64) -> Vec { + let runner = deterministic::Runner::seeded(seed); + runner.start(|mut context| async move { + let config = segmented_variable::Config { + partition: format!("segmented-variable-conformance-{seed}"), + buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + write_buffer: WRITE_BUFFER, + compression: None, + codec_config: (RangeCfg::new(0..256), ()), + }; + let mut journal = segmented_variable::Journal::<_, Vec>::init( + context.with_label("journal"), + config, + ) + .await + .unwrap(); + + // Write variable-size items across multiple sections + let items_count = context.gen_range(0..(ITEMS_PER_BLOB.get() as usize) * 4); + let mut data_to_write = vec![Vec::new(); items_count]; + for item in data_to_write.iter_mut() { + let size = context.gen_range(0..256); + item.resize(size, 0); + context.fill(item.as_mut_slice()); + } + + // Distribute items across sections 0, 1, 2 + for (i, item) in data_to_write.iter().enumerate() { + let section = (i % 3) as u64; + journal.append(section, item.clone()).await.unwrap(); + } + + // Sync all sections + for section in 0..3 { + journal.sync(section).await.unwrap(); + } + drop(journal); + + context.storage_audit().to_vec() + }) + } +} + +/// Test entry for SegmentedOversized conformance. +#[derive(Clone)] +struct TestEntry { + id: u64, + value_offset: u64, + value_size: u32, +} + +impl Write for TestEntry { + fn write(&self, buf: &mut impl BufMut) { + self.id.write(buf); + self.value_offset.write(buf); + self.value_size.write(buf); + } +} + +impl Read for TestEntry { + type Cfg = (); + + fn read_cfg(buf: &mut impl Buf, _: &Self::Cfg) -> Result { + let id = u64::read(buf)?; + let value_offset = u64::read(buf)?; + let value_size = u32::read(buf)?; + Ok(Self { + id, + value_offset, + value_size, + }) + } +} + +impl FixedSize for TestEntry { + const SIZE: usize = u64::SIZE + u64::SIZE + u32::SIZE; +} + +impl Record for TestEntry { + fn value_location(&self) -> (u64, u32) { + (self.value_offset, self.value_size) + } + + fn with_location(mut self, offset: u64, size: u32) -> Self { + self.value_offset = offset; + self.value_size = size; + self + } +} + +struct SegmentedOversized; + +impl Conformance for SegmentedOversized { + async fn commit(seed: u64) -> Vec { + let runner = deterministic::Runner::seeded(seed); + runner.start(|mut context| async move { + let config = oversized::Config { + index_partition: format!("segmented-oversized-index-conformance-{seed}"), + value_partition: format!("segmented-oversized-value-conformance-{seed}"), + index_buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + index_write_buffer: WRITE_BUFFER, + value_write_buffer: WRITE_BUFFER, + compression: None, + codec_config: (RangeCfg::new(0..256), ()), + }; + let mut journal = oversized::Oversized::<_, TestEntry, Vec>::init( + context.with_label("journal"), + config, + ) + .await + .unwrap(); + + // Write variable-size items across multiple sections + let items_count = context.gen_range(0..(ITEMS_PER_BLOB.get() as usize) * 4); + let mut data_to_write = vec![Vec::new(); items_count]; + for item in data_to_write.iter_mut() { + let size = context.gen_range(0..256); + item.resize(size, 0); + context.fill(item.as_mut_slice()); + } - // Read all of the data journal's blobs into the buffer. 
- for (_, blob) in journal.data.blobs.iter() { - let buf = vec![0u8; blob.size().await as usize]; - contents.extend(blob.read_at(buf, 0).await.unwrap().as_ref()); + // Distribute items across sections 0, 1, 2 + for (i, item) in data_to_write.iter().enumerate() { + let section = (i % 3) as u64; + let entry = TestEntry { + id: i as u64, + value_offset: 0, + value_size: 0, + }; + journal.append(section, entry, item).await.unwrap(); } - // Read all of the offsets journal's blobs into the buffer. - for (_, blob) in journal.offsets.blobs.iter() { - let buf = vec![0u8; blob.size().await as usize]; - contents.extend(blob.read_at(buf, 0).await.unwrap().as_ref()); + // Sync all sections + for section in 0..3 { + journal.sync(section).await.unwrap(); } - let buf = vec![0u8; journal.offsets.tail.size().await as usize]; - contents.extend(journal.offsets.tail.read_at(buf, 0).await.unwrap().as_ref()); + drop(journal); - contents + context.storage_audit().to_vec() }) } } conformance_tests! { - FixedJournal => 512, - VariableJournal => 512, + ContiguousFixed => 512, + ContiguousVariable => 512, + SegmentedFixed => 512, + SegmentedGlob => 512, + SegmentedVariable => 512, + SegmentedOversized => 512, } diff --git a/storage/src/journal/contiguous/fixed.rs b/storage/src/journal/contiguous/fixed.rs index d2b70df875..cf88ea435d 100644 --- a/storage/src/journal/contiguous/fixed.rs +++ b/storage/src/journal/contiguous/fixed.rs @@ -137,7 +137,7 @@ pub struct Journal { } impl> Journal { - pub(crate) const CHUNK_SIZE: usize = u32::SIZE + A::SIZE; + const CHUNK_SIZE: usize = u32::SIZE + A::SIZE; pub(crate) const CHUNK_SIZE_U64: u64 = Self::CHUNK_SIZE as u64; /// Initialize a new `Journal` instance. @@ -332,10 +332,11 @@ impl> Journal { let mut size = self.tail.size().await; assert!(size < self.cfg.items_per_blob.get() * Self::CHUNK_SIZE_U64); assert_eq!(size % Self::CHUNK_SIZE_U64, 0); + + // Pre-allocate exact size and write directly to avoid copying let mut buf: Vec = Vec::with_capacity(Self::CHUNK_SIZE); - let item = item.encode(); - let checksum = crc32fast::hash(&item); - buf.extend_from_slice(&item); + item.write(&mut buf); + let checksum = crc32fast::hash(&buf); buf.put_u32(checksum); // Write the item to the blob @@ -697,10 +698,7 @@ mod tests { use super::*; use commonware_cryptography::{sha256::Digest, Hasher as _, Sha256}; use commonware_macros::test_traced; - use commonware_runtime::{ - deterministic::{self, Context}, - Blob, Runner, Storage, - }; + use commonware_runtime::{deterministic, Blob, Runner, Storage}; use commonware_utils::{NZUsize, NZU64}; use futures::{pin_mut, StreamExt}; @@ -1489,71 +1487,4 @@ mod tests { journal.destroy().await.unwrap(); }); } - - /// Protect against accidental changes to the journal disk format. 
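// Sketch of the chunk framing produced by the reworked `append` above: the item's
// bytes fill a buffer of exactly CHUNK_SIZE, then a CRC32 of those bytes is
// appended big-endian (`put_u32` writes big-endian). `crc32fast` is the crate the
// diff uses; the u64 item type is for illustration only.
fn frame_chunk(item: u64) -> Vec<u8> {
    let mut buf = Vec::with_capacity(8 + 4);
    buf.extend_from_slice(&item.to_be_bytes());
    let checksum = crc32fast::hash(&buf);
    buf.extend_from_slice(&checksum.to_be_bytes());
    buf
}

fn verify_chunk(buf: &[u8]) -> Option<u64> {
    let (item, stored) = buf.split_at(8);
    let stored = u32::from_be_bytes(stored.try_into().ok()?);
    if crc32fast::hash(item) != stored {
        return None; // checksum mismatch: corruption
    }
    Some(u64::from_be_bytes(item.try_into().ok()?))
}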
-    #[test_traced]
-    fn test_journal_conformance() {
-        // Initialize the deterministic context
-        let executor = deterministic::Runner::default();
-
-        // Start the test within the executor
-        executor.start(|context| async move {
-            // Create a journal configuration
-            let cfg = test_cfg(NZU64!(60));
-
-            // Initialize the journal
-            let mut journal = Journal::init(context.clone(), cfg.clone())
-                .await
-                .expect("failed to initialize journal");
-
-            // Append 100 items to the journal
-            for i in 0..100 {
-                journal
-                    .append(test_digest(i))
-                    .await
-                    .expect("Failed to append data");
-            }
-
-            // Sync and drop the journal
-            journal.sync().await.expect("Failed to sync journal");
-            drop(journal);
-
-            // Hash blob contents
-            let (blob, size) = context
-                .open(&cfg.partition, &0u64.to_be_bytes())
-                .await
-                .expect("Failed to open blob");
-            assert!(size > 0);
-            let buf = blob
-                .read_at(vec![0u8; size as usize], 0)
-                .await
-                .expect("Failed to read blob");
-            let digest = Sha256::hash(buf.as_ref());
-            assert_eq!(
-                hex(&digest),
-                "ed2ea67208cde2ee8c16cca5aa4f369f55b1402258c6b7760e5baf134e38944a",
-            );
-            blob.sync().await.expect("Failed to sync blob");
-            let (blob, size) = context
-                .open(&cfg.partition, &1u64.to_be_bytes())
-                .await
-                .expect("Failed to open blob");
-            assert!(size > 0);
-            let buf = blob
-                .read_at(vec![0u8; size as usize], 0)
-                .await
-                .expect("Failed to read blob");
-            let digest = Sha256::hash(buf.as_ref());
-            assert_eq!(
-                hex(&digest),
-                "cc7efd4fc999aff36b9fd4213ba8da5810dc1849f92ae2ddf7c6dc40545f9aff",
-            );
-            blob.sync().await.expect("Failed to sync blob");
-
-            let journal = Journal::<Context, Digest>::init(context.clone(), cfg.clone())
-                .await
-                .expect("failed to initialize journal");
-            journal.destroy().await.unwrap();
-        });
-    }
 }
diff --git a/storage/src/journal/contiguous/variable.rs b/storage/src/journal/contiguous/variable.rs
index 8da7bebecd..990cf32f7f 100644
--- a/storage/src/journal/contiguous/variable.rs
+++ b/storage/src/journal/contiguous/variable.rs
@@ -119,11 +119,11 @@ impl Config {
 /// before the offsets journal.
 pub struct Journal {
     /// The underlying variable-length data journal.
-    pub(crate) data: variable::Journal,
+    data: variable::Journal,
 
     /// Index mapping positions to byte offsets within the data journal.
     /// The section can be calculated from the position using items_per_section.
-    pub(crate) offsets: fixed::Journal,
+    offsets: fixed::Journal,
 
     /// The number of items per section.
     ///
@@ -597,13 +597,13 @@ impl Journal {
     /// Returns `(oldest_retained_pos, size)` for the contiguous journal.
     async fn align_journals(
         data: &mut variable::Journal,
-        offsets: &mut fixed::Journal,
+        offsets: &mut fixed::Journal,
         items_per_section: u64,
     ) -> Result<(u64, u64), Error> {
         // === Handle empty data journal case ===
-        let items_in_last_section = match data.blobs.last_key_value() {
-            Some((last_section, _)) => {
-                let stream = data.replay(*last_section, 0, REPLAY_BUFFER_SIZE).await?;
+        let items_in_last_section = match data.newest_section() {
+            Some(last_section) => {
+                let stream = data.replay(last_section, 0, REPLAY_BUFFER_SIZE).await?;
                 futures::pin_mut!(stream);
                 let mut count = 0u64;
                 while let Some(result) = stream.next().await {
@@ -619,16 +619,17 @@ impl Journal {
         // The latter should only occur if a crash occurred after opening a data journal blob but
         // before writing to it.
let data_empty = - data.blobs.is_empty() || (data.blobs.len() == 1 && items_in_last_section == 0); + data.is_empty() || (data.num_sections() == 1 && items_in_last_section == 0); if data_empty { let size = offsets.size(); - if !data.blobs.is_empty() { + if !data.is_empty() { // A section exists but contains 0 items. This can happen in two cases: // 1. Rewind crash: we rewound the data journal but crashed before rewinding offsets // 2. First append crash: we opened the first section blob but crashed before writing to it // In both cases, calculate target position from the first remaining section - let first_section = *data.blobs.first_key_value().unwrap().0; + // SAFETY: data is non-empty (checked above) + let first_section = data.oldest_section().unwrap(); let target_pos = first_section * items_per_section; info!("crash repair: rewinding offsets from {size} to {target_pos}"); @@ -655,9 +656,9 @@ impl Journal { // === Handle non-empty data journal case === let (data_oldest_pos, data_size) = { - // Data exists -- count items - let first_section = *data.blobs.first_key_value().unwrap().0; - let last_section = *data.blobs.last_key_value().unwrap().0; + // SAFETY: data is non-empty (empty case returns early above) + let first_section = data.oldest_section().unwrap(); + let last_section = data.newest_section().unwrap(); let oldest_pos = first_section * items_per_section; @@ -733,16 +734,16 @@ impl Journal { /// /// # Warning /// - /// - Panics if `data.blobs` is empty + /// - Panics if data journal is empty /// - Panics if `offsets_size` >= `data.size()` async fn add_missing_offsets( data: &variable::Journal, - offsets: &mut fixed::Journal, + offsets: &mut fixed::Journal, offsets_size: u64, items_per_section: u64, ) -> Result<(), Error> { assert!( - !data.blobs.is_empty(), + !data.is_empty(), "rebuild_offsets called with empty data journal" ); @@ -756,14 +757,14 @@ impl Journal { (last_section, last_offset, true) } else { // Offsets fully pruned but data has items -- start from first data section - // SAFETY: data.blobs is non-empty (checked above) - let first_section = *data.blobs.first_key_value().unwrap().0; + // SAFETY: data is non-empty (checked above) + let first_section = data.oldest_section().unwrap(); (first_section, 0, false) } } else { // Offsets empty -- start from first data section - // SAFETY: data.blobs is non-empty (checked above) - let first_section = *data.blobs.first_key_value().unwrap().0; + // SAFETY: data is non-empty (checked above) + let first_section = data.oldest_section().unwrap(); (first_section, 0, false) }; diff --git a/storage/src/journal/mod.rs b/storage/src/journal/mod.rs index ace260bbcb..9f57cce711 100644 --- a/storage/src/journal/mod.rs +++ b/storage/src/journal/mod.rs @@ -55,6 +55,8 @@ pub enum Error { CompressionFailed, #[error("decompression failed")] DecompressionFailed, + #[error("value too large (> u32::MAX)")] + ValueTooLarge, #[error("corruption detected: {0}")] Corruption(String), #[error("invalid configuration: {0}")] diff --git a/storage/src/journal/segmented/fixed.rs b/storage/src/journal/segmented/fixed.rs new file mode 100644 index 0000000000..05cc2e0a68 --- /dev/null +++ b/storage/src/journal/segmented/fixed.rs @@ -0,0 +1,738 @@ +//! Segmented journal for fixed-size items. +//! +//! # Format +//! +//! Data is stored in one blob per section. Within each blob, items are stored with +//! their checksum (CRC32): +//! +//! ```text +//! +--------+-----------+--------+-----------+--------+----------+-------------+ +//! 
| item_0 | C(Item_0) | item_1 | C(Item_1) | ... | item_n-1 | C(Item_n-1) | +//! +--------+-----------+--------+-----------+--------+----------+-------------+ +//! +//! C = CRC32 +//! ``` +//! +//! # Sync +//! +//! Data written to `Journal` may not be immediately persisted to `Storage`. Use the +//! `sync` method to force pending data to be written. +//! +//! # Pruning +//! +//! All data must be assigned to a `section`. This allows pruning entire sections +//! (and their corresponding blobs) independently. + +use super::manager::{AppendFactory, Config as ManagerConfig, Manager}; +use crate::journal::Error; +use bytes::BufMut; +use commonware_codec::{CodecFixed, DecodeExt as _, FixedSize}; +use commonware_runtime::{ + buffer::{PoolRef, Read}, + Blob, Error as RError, Metrics, Storage, +}; +use futures::{ + stream::{self, Stream}, + StreamExt, +}; +use std::{marker::PhantomData, num::NonZeroUsize}; +use tracing::{trace, warn}; + +/// Configuration for the fixed segmented journal. +#[derive(Clone)] +pub struct Config { + /// The partition to use for storing blobs. + pub partition: String, + + /// The buffer pool to use for caching data. + pub buffer_pool: PoolRef, + + /// The size of the write buffer to use for each blob. + pub write_buffer: NonZeroUsize, +} + +/// A segmented journal with fixed-size entries. +/// +/// Each section is stored in a separate blob. Within each blob, items are +/// fixed-size with a CRC32 checksum appended. +pub struct Journal { + manager: Manager, + _array: PhantomData, +} + +impl> Journal { + /// Size of each entry: item + CRC32 checksum. + pub const CHUNK_SIZE: usize = A::SIZE + u32::SIZE; + const CHUNK_SIZE_U64: u64 = Self::CHUNK_SIZE as u64; + + /// Initialize a new `Journal` instance. + /// + /// All backing blobs are opened but not read during initialization. Use `replay` + /// to iterate over all items. + /// + /// # Repair + /// + /// Corrupted trailing data in blobs is automatically truncated during replay. + pub async fn init(context: E, cfg: Config) -> Result { + let manager_cfg = ManagerConfig { + partition: cfg.partition, + factory: AppendFactory { + write_buffer: cfg.write_buffer, + pool_ref: cfg.buffer_pool, + }, + }; + let manager = Manager::init(context, manager_cfg).await?; + Ok(Self { + manager, + _array: PhantomData, + }) + } + + /// Append a new item to the journal in the given section. + /// + /// Returns the position of the item within the section (0-indexed). + pub async fn append(&mut self, section: u64, item: A) -> Result { + let blob = self.manager.get_or_create(section).await?; + + let size = blob.size().await; + if !size.is_multiple_of(Self::CHUNK_SIZE_U64) { + return Err(Error::InvalidBlobSize(section, size)); + } + let position = size / Self::CHUNK_SIZE_U64; + + // Pre-allocate exact size and write directly to avoid copying + let mut buf: Vec = Vec::with_capacity(Self::CHUNK_SIZE); + item.write(&mut buf); + let checksum = crc32fast::hash(&buf); + buf.put_u32(checksum); + + blob.append(buf).await?; + trace!(section, position, "appended item"); + + Ok(position) + } + + /// Read the item at the given section and position. + /// + /// # Errors + /// + /// - [Error::AlreadyPrunedToSection] if the section has been pruned. + /// - [Error::SectionOutOfRange] if the section doesn't exist. + /// - [Error::ItemOutOfRange] if the position is beyond the blob size. + pub async fn get(&self, section: u64, position: u64) -> Result { + let blob = self + .manager + .get(section)? 
+ .ok_or(Error::SectionOutOfRange(section))?; + + let offset = position + .checked_mul(Self::CHUNK_SIZE_U64) + .ok_or(Error::ItemOutOfRange(position))?; + let end = offset + .checked_add(Self::CHUNK_SIZE_U64) + .ok_or(Error::ItemOutOfRange(position))?; + if end > blob.size().await { + return Err(Error::ItemOutOfRange(position)); + } + + let buf = blob.read_at(vec![0u8; Self::CHUNK_SIZE], offset).await?; + Self::verify_integrity(buf.as_ref()) + } + + /// Read the last item in a section, if any. + pub async fn last(&self, section: u64) -> Result, Error> { + let blob = self + .manager + .get(section)? + .ok_or(Error::SectionOutOfRange(section))?; + + let size = blob.size().await; + if size < Self::CHUNK_SIZE_U64 { + return Ok(None); + } + + let last_position = (size / Self::CHUNK_SIZE_U64) - 1; + let offset = last_position * Self::CHUNK_SIZE_U64; + let buf = blob.read_at(vec![0u8; Self::CHUNK_SIZE], offset).await?; + Self::verify_integrity(buf.as_ref()).map(Some) + } + + /// Verify the integrity of the item + checksum in `buf`. + fn verify_integrity(buf: &[u8]) -> Result { + let stored_checksum = + u32::from_be_bytes(buf[A::SIZE..].try_into().expect("checksum is 4 bytes")); + let checksum = crc32fast::hash(&buf[..A::SIZE]); + if checksum != stored_checksum { + return Err(Error::ChecksumMismatch(stored_checksum, checksum)); + } + A::decode(&buf[..A::SIZE]).map_err(Error::Codec) + } + + /// Returns a stream of all items starting from the given section. + /// + /// Each item is returned as (section, position, item). + /// + /// # Repair + /// + /// Corrupted trailing data is automatically truncated. + pub async fn replay( + &self, + start_section: u64, + buffer: NonZeroUsize, + ) -> Result> + '_, Error> { + let mut blob_info = Vec::new(); + for (§ion, blob) in self.manager.sections_from(start_section) { + let size = blob.size().await; + blob_info.push((section, blob.clone(), size)); + } + + Ok( + stream::iter(blob_info).flat_map(move |(section, blob, blob_size)| { + let reader = Read::new(blob, blob_size, buffer); + let buf = vec![0u8; Self::CHUNK_SIZE]; + + stream::unfold( + (section, buf, reader, 0u64, 0u64), + move |(section, mut buf, mut reader, offset, valid_size)| async move { + if offset >= reader.blob_size() { + return None; + } + + let position = offset / Self::CHUNK_SIZE_U64; + match reader.read_exact(&mut buf, Self::CHUNK_SIZE).await { + Ok(()) => { + let next_offset = offset + Self::CHUNK_SIZE_U64; + match Self::verify_integrity(&buf) { + Ok(item) => Some(( + Ok((section, position, item)), + (section, buf, reader, next_offset, next_offset), + )), + Err(Error::ChecksumMismatch(expected, found)) => { + warn!( + section, + position, + expected, + found, + new_size = valid_size, + "corruption detected: truncating" + ); + reader.resize(valid_size).await.ok()?; + None + } + Err(err) => { + Some((Err(err), (section, buf, reader, offset, valid_size))) + } + } + } + Err(RError::BlobInsufficientLength) => { + warn!( + section, + position, + new_size = valid_size, + "trailing bytes detected: truncating" + ); + reader.resize(valid_size).await.ok()?; + None + } + Err(err) => { + warn!(section, position, ?err, "unexpected error"); + Some(( + Err(Error::Runtime(err)), + (section, buf, reader, offset, valid_size), + )) + } + } + }, + ) + }), + ) + } + + /// Sync the given section to storage. + pub async fn sync(&self, section: u64) -> Result<(), Error> { + self.manager.sync(section).await + } + + /// Sync all sections to storage. 
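// Standalone sketch of the repair behavior in `replay` above: chunks are scanned
// in order, `valid_size` tracks the end of the last good chunk, and the blob is
// truncated at the first checksum mismatch or short read. The 8-byte item +
// CRC32 framing matches the earlier sketch; an in-memory Vec stands in for a blob.
fn scan_and_repair(blob: &mut Vec<u8>) -> Vec<u64> {
    const CHUNK: usize = 8 + 4;
    let mut items = Vec::new();
    let mut valid_size = 0;
    while valid_size + CHUNK <= blob.len() {
        let chunk = &blob[valid_size..valid_size + CHUNK];
        let (item, stored) = chunk.split_at(8);
        let stored = u32::from_be_bytes(stored.try_into().unwrap());
        if crc32fast::hash(item) != stored {
            break; // corruption: keep only the chunks before it
        }
        items.push(u64::from_be_bytes(item.try_into().unwrap()));
        valid_size += CHUNK;
    }
    blob.truncate(valid_size); // drops bad chunks and trailing partial bytes
    items
}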
+ pub async fn sync_all(&self) -> Result<(), Error> { + self.manager.sync_all().await + } + + /// Prune all sections less than `min`. Returns true if any were pruned. + pub async fn prune(&mut self, min: u64) -> Result { + self.manager.prune(min).await + } + + /// Returns the oldest section number, if any blobs exist. + pub fn oldest_section(&self) -> Option { + self.manager.oldest_section() + } + + /// Returns the newest section number, if any blobs exist. + pub fn newest_section(&self) -> Option { + self.manager.newest_section() + } + + /// Returns an iterator over all section numbers. + pub fn sections(&self) -> impl Iterator + '_ { + self.manager.sections_from(0).map(|(section, _)| *section) + } + + /// Returns the number of items in the given section. + pub async fn section_len(&self, section: u64) -> Result { + let size = self.manager.size(section).await?; + Ok(size / Self::CHUNK_SIZE_U64) + } + + /// Returns the byte size of the given section. + pub async fn size(&self, section: u64) -> Result { + self.manager.size(section).await + } + + /// Rewind the journal to a specific section and byte offset. + /// + /// This truncates the section to the given size. All sections + /// after `section` are removed. + pub async fn rewind(&mut self, section: u64, offset: u64) -> Result<(), Error> { + self.manager.rewind(section, offset).await + } + + /// Rewind only the given section to a specific byte offset. + /// + /// Unlike `rewind`, this does not affect other sections. + pub async fn rewind_section(&mut self, section: u64, size: u64) -> Result<(), Error> { + self.manager.rewind_section(section, size).await + } + + /// Remove all underlying blobs. + pub async fn destroy(self) -> Result<(), Error> { + self.manager.destroy().await + } +} + +#[cfg(test)] +mod tests { + use super::*; + use commonware_cryptography::{sha256::Digest, Hasher as _, Sha256}; + use commonware_macros::test_traced; + use commonware_runtime::{buffer::PoolRef, deterministic, Runner}; + use commonware_utils::NZUsize; + use futures::{pin_mut, StreamExt}; + + const PAGE_SIZE: NonZeroUsize = NZUsize!(44); + const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(3); + + fn test_digest(value: u64) -> Digest { + Sha256::hash(&value.to_be_bytes()) + } + + fn test_cfg() -> Config { + Config { + partition: "test_partition".into(), + buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + write_buffer: NZUsize!(2048), + } + } + + #[test_traced] + fn test_segmented_fixed_append_and_get() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let cfg = test_cfg(); + let mut journal = Journal::init(context.clone(), cfg.clone()) + .await + .expect("failed to init"); + + let pos0 = journal + .append(1, test_digest(0)) + .await + .expect("failed to append"); + assert_eq!(pos0, 0); + + let pos1 = journal + .append(1, test_digest(1)) + .await + .expect("failed to append"); + assert_eq!(pos1, 1); + + let pos2 = journal + .append(2, test_digest(2)) + .await + .expect("failed to append"); + assert_eq!(pos2, 0); + + let item0 = journal.get(1, 0).await.expect("failed to get"); + assert_eq!(item0, test_digest(0)); + + let item1 = journal.get(1, 1).await.expect("failed to get"); + assert_eq!(item1, test_digest(1)); + + let item2 = journal.get(2, 0).await.expect("failed to get"); + assert_eq!(item2, test_digest(2)); + + let err = journal.get(1, 2).await; + assert!(matches!(err, Err(Error::ItemOutOfRange(2)))); + + let err = journal.get(3, 0).await; + assert!(matches!(err, Err(Error::SectionOutOfRange(3)))); + + 
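// Sketch of the position -> byte-offset math behind `get` (and the ItemOutOfRange
// errors asserted above), using the illustrative 8-byte item + CRC32 framing: the
// multiplication and addition are overflow-checked before the bounds test against
// the blob size.
fn chunk_range(position: u64, blob_size: u64) -> Option<(u64, u64)> {
    const CHUNK: u64 = 8 + 4;
    let offset = position.checked_mul(CHUNK)?;
    let end = offset.checked_add(CHUNK)?;
    if end > blob_size {
        return None; // ItemOutOfRange
    }
    Some((offset, end))
}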
journal.destroy().await.expect("failed to destroy"); + }); + } + + #[test_traced] + fn test_segmented_fixed_replay() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let cfg = test_cfg(); + let mut journal = Journal::init(context.clone(), cfg.clone()) + .await + .expect("failed to init"); + + for i in 0u64..10 { + journal + .append(1, test_digest(i)) + .await + .expect("failed to append"); + } + for i in 10u64..20 { + journal + .append(2, test_digest(i)) + .await + .expect("failed to append"); + } + + journal.sync_all().await.expect("failed to sync"); + drop(journal); + + let journal = Journal::<_, Digest>::init(context.clone(), cfg.clone()) + .await + .expect("failed to re-init"); + + let items = { + let stream = journal + .replay(0, NZUsize!(1024)) + .await + .expect("failed to replay"); + pin_mut!(stream); + + let mut items = Vec::new(); + while let Some(result) = stream.next().await { + match result { + Ok((section, pos, item)) => items.push((section, pos, item)), + Err(err) => panic!("replay error: {err}"), + } + } + items + }; + + assert_eq!(items.len(), 20); + for (i, item) in items.iter().enumerate().take(10) { + assert_eq!(item.0, 1); + assert_eq!(item.1, i as u64); + assert_eq!(item.2, test_digest(i as u64)); + } + for (i, item) in items.iter().enumerate().skip(10).take(10) { + assert_eq!(item.0, 2); + assert_eq!(item.1, (i - 10) as u64); + assert_eq!(item.2, test_digest(i as u64)); + } + + journal.destroy().await.expect("failed to destroy"); + }); + } + + #[test_traced] + fn test_segmented_fixed_prune() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let cfg = test_cfg(); + let mut journal = Journal::init(context.clone(), cfg.clone()) + .await + .expect("failed to init"); + + for section in 1u64..=5 { + journal + .append(section, test_digest(section)) + .await + .expect("failed to append"); + } + journal.sync_all().await.expect("failed to sync"); + + journal.prune(3).await.expect("failed to prune"); + + let err = journal.get(1, 0).await; + assert!(matches!(err, Err(Error::AlreadyPrunedToSection(3)))); + + let err = journal.get(2, 0).await; + assert!(matches!(err, Err(Error::AlreadyPrunedToSection(3)))); + + let item = journal.get(3, 0).await.expect("should exist"); + assert_eq!(item, test_digest(3)); + + journal.destroy().await.expect("failed to destroy"); + }); + } + + #[test_traced] + fn test_segmented_fixed_rewind() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let cfg = test_cfg(); + let mut journal = Journal::init(context.clone(), cfg.clone()) + .await + .expect("failed to init"); + + // Create sections 1, 2, 3 + for section in 1u64..=3 { + journal + .append(section, test_digest(section)) + .await + .expect("failed to append"); + } + journal.sync_all().await.expect("failed to sync"); + + // Verify all sections exist + for section in 1u64..=3 { + let size = journal.size(section).await.expect("failed to get size"); + assert!(size > 0, "section {section} should have data"); + } + + // Rewind to section 1 (should remove sections 2, 3) + let size = journal.size(1).await.expect("failed to get size"); + journal.rewind(1, size).await.expect("failed to rewind"); + + // Verify section 1 still has data + let size = journal.size(1).await.expect("failed to get size"); + assert!(size > 0, "section 1 should still have data"); + + // Verify sections 2, 3 are removed + for section in 2u64..=3 { + let size = journal.size(section).await.expect("failed 
to get size"); + assert_eq!(size, 0, "section {section} should be removed"); + } + + // Verify data in section 1 is still readable + let item = journal.get(1, 0).await.expect("failed to get"); + assert_eq!(item, test_digest(1)); + + journal.destroy().await.expect("failed to destroy"); + }); + } + + #[test_traced] + fn test_segmented_fixed_rewind_many_sections() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let cfg = test_cfg(); + let mut journal = Journal::init(context.clone(), cfg.clone()) + .await + .expect("failed to init"); + + // Create sections 1-10 + for section in 1u64..=10 { + journal + .append(section, test_digest(section)) + .await + .expect("failed to append"); + } + journal.sync_all().await.expect("failed to sync"); + + // Rewind to section 5 (should remove sections 6-10) + let size = journal.size(5).await.expect("failed to get size"); + journal.rewind(5, size).await.expect("failed to rewind"); + + // Verify sections 1-5 still have data + for section in 1u64..=5 { + let size = journal.size(section).await.expect("failed to get size"); + assert!(size > 0, "section {section} should still have data"); + } + + // Verify sections 6-10 are removed + for section in 6u64..=10 { + let size = journal.size(section).await.expect("failed to get size"); + assert_eq!(size, 0, "section {section} should be removed"); + } + + // Verify data integrity via replay + { + let stream = journal + .replay(0, NZUsize!(1024)) + .await + .expect("failed to replay"); + pin_mut!(stream); + let mut items = Vec::new(); + while let Some(result) = stream.next().await { + let (section, _, item) = result.expect("failed to read"); + items.push((section, item)); + } + assert_eq!(items.len(), 5); + for (i, (section, item)) in items.iter().enumerate() { + assert_eq!(*section, (i + 1) as u64); + assert_eq!(*item, test_digest((i + 1) as u64)); + } + } + + journal.destroy().await.expect("failed to destroy"); + }); + } + + #[test_traced] + fn test_segmented_fixed_rewind_persistence() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let cfg = test_cfg(); + + // Create sections 1-5 + let mut journal = Journal::init(context.clone(), cfg.clone()) + .await + .expect("failed to init"); + for section in 1u64..=5 { + journal + .append(section, test_digest(section)) + .await + .expect("failed to append"); + } + journal.sync_all().await.expect("failed to sync"); + + // Rewind to section 2 + let size = journal.size(2).await.expect("failed to get size"); + journal.rewind(2, size).await.expect("failed to rewind"); + journal.sync_all().await.expect("failed to sync"); + drop(journal); + + // Re-init and verify only sections 1-2 exist + let journal = Journal::<_, Digest>::init(context.clone(), cfg.clone()) + .await + .expect("failed to re-init"); + + // Verify sections 1-2 have data + for section in 1u64..=2 { + let size = journal.size(section).await.expect("failed to get size"); + assert!(size > 0, "section {section} should have data after restart"); + } + + // Verify sections 3-5 are gone + for section in 3u64..=5 { + let size = journal.size(section).await.expect("failed to get size"); + assert_eq!(size, 0, "section {section} should be gone after restart"); + } + + // Verify data integrity + let item1 = journal.get(1, 0).await.expect("failed to get"); + assert_eq!(item1, test_digest(1)); + let item2 = journal.get(2, 0).await.expect("failed to get"); + assert_eq!(item2, test_digest(2)); + + journal.destroy().await.expect("failed to destroy"); 
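// Standalone sketch of the rewind semantics these tests exercise: sections after
// the target are removed newest-first (so a crash mid-rewind still leaves a
// contiguous range), then the target section is truncated to `size`. A BTreeMap
// of byte buffers stands in for the per-section blobs.
use std::collections::BTreeMap;

fn rewind(sections: &mut BTreeMap<u64, Vec<u8>>, section: u64, size: usize) {
    let later: Vec<u64> = sections.range(section + 1..).map(|(&s, _)| s).collect();
    for s in later.into_iter().rev() {
        sections.remove(&s); // newest first
    }
    if let Some(blob) = sections.get_mut(&section) {
        blob.truncate(size);
    }
}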
+ }); + } + + #[test_traced] + fn test_segmented_fixed_corruption_recovery() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let cfg = test_cfg(); + let mut journal = Journal::init(context.clone(), cfg.clone()) + .await + .expect("failed to init"); + + for i in 0u64..5 { + journal + .append(1, test_digest(i)) + .await + .expect("failed to append"); + } + journal.sync_all().await.expect("failed to sync"); + drop(journal); + + let (blob, size) = context + .open(&cfg.partition, &1u64.to_be_bytes()) + .await + .expect("failed to open blob"); + blob.resize(size - 1).await.expect("failed to truncate"); + blob.sync().await.expect("failed to sync"); + + let journal = Journal::<_, Digest>::init(context.clone(), cfg.clone()) + .await + .expect("failed to re-init"); + + let count = { + let stream = journal + .replay(0, NZUsize!(1024)) + .await + .expect("failed to replay"); + pin_mut!(stream); + + let mut count = 0; + while let Some(result) = stream.next().await { + result.expect("should be ok"); + count += 1; + } + count + }; + assert_eq!(count, 4); + + journal.destroy().await.expect("failed to destroy"); + }); + } + + #[test_traced] + fn test_segmented_fixed_persistence() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let cfg = test_cfg(); + + // Create and populate journal + let mut journal = Journal::init(context.clone(), cfg.clone()) + .await + .expect("failed to init"); + + for i in 0u64..5 { + journal + .append(1, test_digest(i)) + .await + .expect("failed to append"); + } + journal.sync_all().await.expect("failed to sync"); + drop(journal); + + // Reopen and verify data persisted + let journal = Journal::<_, Digest>::init(context.clone(), cfg) + .await + .expect("failed to re-init"); + + for i in 0u64..5 { + let item = journal.get(1, i).await.expect("failed to get"); + assert_eq!(item, test_digest(i)); + } + + journal.destroy().await.expect("failed to destroy"); + }); + } + + #[test_traced] + fn test_segmented_fixed_section_len() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let cfg = test_cfg(); + let mut journal = Journal::init(context.clone(), cfg.clone()) + .await + .expect("failed to init"); + + assert_eq!(journal.section_len(1).await.unwrap(), 0); + + for i in 0u64..5 { + journal + .append(1, test_digest(i)) + .await + .expect("failed to append"); + } + + assert_eq!(journal.section_len(1).await.unwrap(), 5); + assert_eq!(journal.section_len(2).await.unwrap(), 0); + + journal.destroy().await.expect("failed to destroy"); + }); + } +} diff --git a/storage/src/journal/segmented/glob.rs b/storage/src/journal/segmented/glob.rs new file mode 100644 index 0000000000..19c66c2a84 --- /dev/null +++ b/storage/src/journal/segmented/glob.rs @@ -0,0 +1,501 @@ +//! Simple section-based blob storage for values. +//! +//! This module provides a minimal blob storage optimized for storing values where +//! the size is tracked externally (in an index entry). Unlike the segmented variable +//! journal, this format does not include a size prefix since the caller already +//! knows the size. +//! +//! # Format +//! +//! Each entry is stored as: +//! +//! ```text +//! +---+---+---+---+---+---+---+---+---+---+---+---+ +//! | Compressed Data (variable) | CRC32 | +//! +---+---+---+---+---+---+---+---+---+---+---+---+ +//! ``` +//! +//! - **Compressed Data**: zstd compressed (if enabled) or raw codec output +//! - **CRC32**: 4-byte checksum of the compressed data +//! +//! 
# Read Flow +//! +//! 1. Get `(offset, size)` from index entry +//! 2. Read `size` bytes directly from blob at byte offset +//! 3. Last 4 bytes are CRC32, verify it +//! 4. Decompress remaining bytes if compression enabled +//! 5. Decode value + +use super::manager::{Config as ManagerConfig, Manager, WriteFactory}; +use crate::journal::Error; +use bytes::BufMut; +use commonware_codec::{Codec, FixedSize}; +use commonware_runtime::{Blob as _, Error as RError, Metrics, Storage}; +use std::{io::Cursor, num::NonZeroUsize}; +use zstd::{bulk::compress, decode_all}; + +/// Configuration for blob storage. +#[derive(Clone)] +pub struct Config { + /// The partition to use for storing blobs. + pub partition: String, + + /// Optional compression level (using `zstd`) to apply to data before storing. + pub compression: Option, + + /// The codec configuration to use for encoding and decoding items. + pub codec_config: C, + + /// The size of the write buffer to use for each blob. + pub write_buffer: NonZeroUsize, +} + +/// Simple section-based blob storage for values. +/// +/// Uses [`buffer::Write`](commonware_runtime::buffer::Write) for batching writes. +/// Reads go directly to blobs without any caching (ideal for large values that +/// shouldn't pollute a buffer pool cache). +pub struct Glob { + manager: Manager, + + /// Compression level (if enabled). + compression: Option, + + /// Codec configuration. + codec_config: V::Cfg, +} + +impl Glob { + /// Initialize blob storage, opening existing section blobs. + pub async fn init(context: E, cfg: Config) -> Result { + let manager_cfg = ManagerConfig { + partition: cfg.partition, + factory: WriteFactory { + capacity: cfg.write_buffer, + }, + }; + let manager = Manager::init(context, manager_cfg).await?; + + Ok(Self { + manager, + compression: cfg.compression, + codec_config: cfg.codec_config, + }) + } + + /// Append value to section, returns (offset, size). + /// + /// The returned offset is the byte offset where the entry was written. + /// The returned size is the total bytes written (compressed_data + crc32). + /// Both should be stored in the index entry for later retrieval. + pub async fn append(&mut self, section: u64, value: &V) -> Result<(u64, u32), Error> { + // Encode and optionally compress, then append checksum + let buf = if let Some(level) = self.compression { + // Compressed: encode first, then compress, then append checksum + let encoded = value.encode(); + let mut compressed = + compress(&encoded, level as i32).map_err(|_| Error::CompressionFailed)?; + let checksum = crc32fast::hash(&compressed); + compressed.put_u32(checksum); + compressed + } else { + // Uncompressed: pre-allocate exact size to avoid copying + let entry_size = value.encode_size() + u32::SIZE; + let mut buf = Vec::with_capacity(entry_size); + value.write(&mut buf); + let checksum = crc32fast::hash(&buf); + buf.put_u32(checksum); + buf + }; + + // Write to blob + let entry_size = u32::try_from(buf.len()).map_err(|_| Error::ValueTooLarge)?; + let writer = self.manager.get_or_create(section).await?; + let offset = writer.size().await; + writer.write_at(buf, offset).await.map_err(Error::Runtime)?; + + Ok((offset, entry_size)) + } + + /// Read value at offset with known size (from index entry). + /// + /// The offset should be the byte offset returned by `append()`. + /// Reads directly from blob without any caching. + pub async fn get(&self, section: u64, offset: u64, size: u32) -> Result { + let writer = self + .manager + .get(section)? 
+ .ok_or(Error::SectionOutOfRange(section))?; + + let size_usize = size as usize; + + // Read via buffered writer (handles read-through for buffered data) + let buf = writer.read_at(vec![0u8; size_usize], offset).await?; + let buf = buf.as_ref(); + + // Entry format: [compressed_data] [crc32 (4 bytes)] + if buf.len() < u32::SIZE { + return Err(Error::Runtime(RError::BlobInsufficientLength)); + } + + let data_len = buf.len() - u32::SIZE; + let compressed_data = &buf[..data_len]; + let stored_checksum = + u32::from_be_bytes(buf[data_len..].try_into().expect("checksum is 4 bytes")); + + // Verify checksum + let checksum = crc32fast::hash(compressed_data); + if checksum != stored_checksum { + return Err(Error::ChecksumMismatch(stored_checksum, checksum)); + } + + // Decompress if needed and decode + let value = if self.compression.is_some() { + let decompressed = + decode_all(Cursor::new(compressed_data)).map_err(|_| Error::DecompressionFailed)?; + V::decode_cfg(decompressed.as_ref(), &self.codec_config).map_err(Error::Codec)? + } else { + V::decode_cfg(compressed_data, &self.codec_config).map_err(Error::Codec)? + }; + + Ok(value) + } + + /// Sync section to disk (flushes write buffer). + pub async fn sync(&self, section: u64) -> Result<(), Error> { + self.manager.sync(section).await + } + + /// Sync all sections to disk. + pub async fn sync_all(&self) -> Result<(), Error> { + self.manager.sync_all().await + } + + /// Get the current size of a section (including buffered data). + pub async fn size(&self, section: u64) -> Result { + self.manager.size(section).await + } + + /// Rewind to a specific section and size. + /// + /// Truncates the section to the given size and removes all sections after it. + pub async fn rewind(&mut self, section: u64, size: u64) -> Result<(), Error> { + self.manager.rewind(section, size).await + } + + /// Rewind only the given section to a specific size. + /// + /// Unlike `rewind`, this does not affect other sections. + pub async fn rewind_section(&mut self, section: u64, size: u64) -> Result<(), Error> { + self.manager.rewind_section(section, size).await + } + + /// Prune sections before min. + pub async fn prune(&mut self, min: u64) -> Result { + self.manager.prune(min).await + } + + /// Returns the number of the oldest section. + pub fn oldest_section(&self) -> Option { + self.manager.oldest_section() + } + + /// Returns the number of the newest section. + pub fn newest_section(&self) -> Option { + self.manager.newest_section() + } + + /// Returns an iterator over all section numbers. + pub fn sections(&self) -> impl Iterator + '_ { + self.manager.sections() + } + + /// Remove a specific section. Returns true if the section existed and was removed. + pub async fn remove_section(&mut self, section: u64) -> Result { + self.manager.remove_section(section).await + } + + /// Close all blobs (syncs first). + pub async fn close(&mut self) -> Result<(), Error> { + self.sync_all().await + } + + /// Destroy all blobs. 
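// Sketch of the read path in `get` above: the index entry already supplies
// (offset, size), so the entry is fetched in one ranged read; the trailing
// 4 bytes are the CRC32 and the rest is (optionally compressed) codec output.
// `crc32fast::hash` and `zstd::decode_all` are the calls the diff itself uses.
fn read_glob_entry(entry: &[u8], compressed: bool) -> Result<Vec<u8>, String> {
    if entry.len() < 4 {
        return Err("entry shorter than its checksum".into());
    }
    let (data, stored) = entry.split_at(entry.len() - 4);
    let stored = u32::from_be_bytes(stored.try_into().unwrap());
    if crc32fast::hash(data) != stored {
        return Err(format!("checksum mismatch: stored {stored}"));
    }
    if compressed {
        zstd::decode_all(std::io::Cursor::new(data)).map_err(|_| "decompression failed".into())
    } else {
        Ok(data.to_vec())
    }
}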
+ pub async fn destroy(self) -> Result<(), Error> { + self.manager.destroy().await + } +} + +#[cfg(test)] +mod tests { + use super::*; + use commonware_macros::test_traced; + use commonware_runtime::{deterministic, Runner}; + use commonware_utils::NZUsize; + + fn test_cfg() -> Config<()> { + Config { + partition: "test_partition".to_string(), + compression: None, + codec_config: (), + write_buffer: NZUsize!(1024), + } + } + + #[test_traced] + fn test_glob_append_and_get() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let mut glob: Glob<_, i32> = Glob::init(context.clone(), test_cfg()) + .await + .expect("Failed to init glob"); + + // Append a value + let value: i32 = 42; + let (offset, size) = glob.append(1, &value).await.expect("Failed to append"); + assert_eq!(offset, 0); + + // Get the value back + let retrieved = glob.get(1, offset, size).await.expect("Failed to get"); + assert_eq!(retrieved, value); + + // Sync and verify + glob.sync(1).await.expect("Failed to sync"); + let retrieved = glob.get(1, offset, size).await.expect("Failed to get"); + assert_eq!(retrieved, value); + + glob.destroy().await.expect("Failed to destroy"); + }); + } + + #[test_traced] + fn test_glob_multiple_values() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let mut glob: Glob<_, i32> = Glob::init(context.clone(), test_cfg()) + .await + .expect("Failed to init glob"); + + // Append multiple values + let values: Vec = vec![1, 2, 3, 4, 5]; + let mut locations = Vec::new(); + + for value in &values { + let (offset, size) = glob.append(1, value).await.expect("Failed to append"); + locations.push((offset, size)); + } + + // Get all values back + for (i, (offset, size)) in locations.iter().enumerate() { + let retrieved = glob.get(1, *offset, *size).await.expect("Failed to get"); + assert_eq!(retrieved, values[i]); + } + + glob.destroy().await.expect("Failed to destroy"); + }); + } + + #[test_traced] + fn test_glob_with_compression() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let cfg = Config { + partition: "test_partition".to_string(), + compression: Some(3), // zstd level 3 + codec_config: (), + write_buffer: NZUsize!(1024), + }; + let mut glob: Glob<_, [u8; 100]> = Glob::init(context.clone(), cfg) + .await + .expect("Failed to init glob"); + + // Append a value + let value: [u8; 100] = [0u8; 100]; // Compressible data + let (offset, size) = glob.append(1, &value).await.expect("Failed to append"); + + // Size should be smaller due to compression + assert!(size < 100 + 4); + + // Get the value back + let retrieved = glob.get(1, offset, size).await.expect("Failed to get"); + assert_eq!(retrieved, value); + + glob.destroy().await.expect("Failed to destroy"); + }); + } + + #[test_traced] + fn test_glob_prune() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let mut glob: Glob<_, i32> = Glob::init(context.clone(), test_cfg()) + .await + .expect("Failed to init glob"); + + // Append to multiple sections + for section in 1..=5 { + glob.append(section, &(section as i32)) + .await + .expect("Failed to append"); + glob.sync(section).await.expect("Failed to sync"); + } + + // Prune sections < 3 + glob.prune(3).await.expect("Failed to prune"); + + // Sections 1 and 2 should be gone + assert!(glob.get(1, 0, 8).await.is_err()); + assert!(glob.get(2, 0, 8).await.is_err()); + + // Sections 3-5 should still exist + 
assert!(glob.manager.blobs.contains_key(&3)); + assert!(glob.manager.blobs.contains_key(&4)); + assert!(glob.manager.blobs.contains_key(&5)); + + glob.destroy().await.expect("Failed to destroy"); + }); + } + + #[test_traced] + fn test_glob_checksum_mismatch() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let mut glob: Glob<_, i32> = Glob::init(context.clone(), test_cfg()) + .await + .expect("Failed to init glob"); + + // Append a value + let value: i32 = 42; + let (offset, size) = glob.append(1, &value).await.expect("Failed to append"); + glob.sync(1).await.expect("Failed to sync"); + + // Corrupt the data by writing directly to the underlying blob + let writer = glob.manager.blobs.get(&1).unwrap(); + writer + .write_at(vec![0xFF, 0xFF, 0xFF, 0xFF], offset) + .await + .expect("Failed to corrupt"); + writer.sync().await.expect("Failed to sync"); + + // Get should fail with checksum mismatch + let result = glob.get(1, offset, size).await; + assert!(matches!(result, Err(Error::ChecksumMismatch(_, _)))); + + glob.destroy().await.expect("Failed to destroy"); + }); + } + + #[test_traced] + fn test_glob_rewind() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let mut glob: Glob<_, i32> = Glob::init(context.clone(), test_cfg()) + .await + .expect("Failed to init glob"); + + // Append multiple values and track sizes + let values: Vec = vec![1, 2, 3, 4, 5]; + let mut locations = Vec::new(); + + for value in &values { + let (offset, size) = glob.append(1, value).await.expect("Failed to append"); + locations.push((offset, size)); + } + glob.sync(1).await.expect("Failed to sync"); + + // Rewind to after the third value + let (third_offset, third_size) = locations[2]; + let rewind_size = third_offset + u64::from(third_size); + glob.rewind_section(1, rewind_size) + .await + .expect("Failed to rewind"); + + // First three values should still be readable + for (i, (offset, size)) in locations.iter().take(3).enumerate() { + let retrieved = glob.get(1, *offset, *size).await.expect("Failed to get"); + assert_eq!(retrieved, values[i]); + } + + // Fourth and fifth values should fail (reading past end of blob) + let (fourth_offset, fourth_size) = locations[3]; + let result = glob.get(1, fourth_offset, fourth_size).await; + assert!(result.is_err()); + + glob.destroy().await.expect("Failed to destroy"); + }); + } + + #[test_traced] + fn test_glob_persistence() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let cfg = test_cfg(); + + // Create and populate glob + let mut glob: Glob<_, i32> = Glob::init(context.clone(), cfg.clone()) + .await + .expect("Failed to init glob"); + + let value: i32 = 42; + let (offset, size) = glob.append(1, &value).await.expect("Failed to append"); + glob.sync(1).await.expect("Failed to sync"); + drop(glob); + + // Reopen and verify + let glob: Glob<_, i32> = Glob::init(context.clone(), cfg) + .await + .expect("Failed to reinit glob"); + + let retrieved = glob.get(1, offset, size).await.expect("Failed to get"); + assert_eq!(retrieved, value); + + glob.destroy().await.expect("Failed to destroy"); + }); + } + + #[test_traced] + fn test_glob_get_invalid_size() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let mut glob: Glob<_, i32> = Glob::init(context.clone(), test_cfg()) + .await + .expect("Failed to init glob"); + + let (offset, _size) = glob.append(1, &42).await.expect("Failed to append"); + 
glob.sync(1).await.expect("Failed to sync"); + + // Size 0 - should fail + assert!(glob.get(1, offset, 0).await.is_err()); + + // Size < CRC_SIZE (1, 2, 3 bytes) - should fail with BlobInsufficientLength + for size in 1..4u32 { + let result = glob.get(1, offset, size).await; + assert!(matches!( + result, + Err(Error::Runtime(RError::BlobInsufficientLength)) + )); + } + + glob.destroy().await.expect("Failed to destroy"); + }); + } + + #[test_traced] + fn test_glob_get_wrong_size() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let mut glob: Glob<_, i32> = Glob::init(context.clone(), test_cfg()) + .await + .expect("Failed to init glob"); + + let (offset, correct_size) = glob.append(1, &42).await.expect("Failed to append"); + glob.sync(1).await.expect("Failed to sync"); + + // Size too small (but >= CRC_SIZE) - checksum mismatch + let result = glob.get(1, offset, correct_size - 1).await; + assert!(matches!(result, Err(Error::ChecksumMismatch(_, _)))); + + glob.destroy().await.expect("Failed to destroy"); + }); + } +} diff --git a/storage/src/journal/segmented/manager.rs b/storage/src/journal/segmented/manager.rs new file mode 100644 index 0000000000..3c4ca526e5 --- /dev/null +++ b/storage/src/journal/segmented/manager.rs @@ -0,0 +1,375 @@ +//! Common blob management for segmented journals. +//! +//! This module provides `Manager`, a reusable component that handles +//! section-based blob storage, pruning, syncing, and metrics. + +use crate::journal::Error; +use commonware_runtime::{ + buffer::{Append, PoolRef, Write}, + telemetry::metrics::status::GaugeExt, + Blob, Error as RError, Metrics, Storage, +}; +use commonware_utils::hex; +use futures::future::try_join_all; +use prometheus_client::metrics::{counter::Counter, gauge::Gauge}; +use std::{collections::BTreeMap, future::Future, num::NonZeroUsize}; +use tracing::debug; + +/// A buffer that wraps a blob and provides size information. +/// +/// Both [`Append`] and [`Write`] implement this trait. +pub trait SectionBuffer: Blob + Clone + Send + Sync { + /// Returns the current logical size of the buffer including any buffered data. + fn size(&self) -> impl Future + Send; +} + +impl SectionBuffer for Append { + async fn size(&self) -> u64 { + Self::size(self).await + } +} + +impl SectionBuffer for Write { + async fn size(&self) -> u64 { + Self::size(self).await + } +} + +/// Factory for creating section buffers from raw blobs. +pub trait BufferFactory: Clone + Send + Sync { + /// The buffer type produced by this factory. + type Buffer: SectionBuffer; + + /// Create a new buffer wrapping the given blob with the specified size. + fn create( + &self, + blob: B, + size: u64, + ) -> impl Future> + Send; +} + +/// Factory for creating [`Append`] buffers with pool caching. +#[derive(Clone)] +pub struct AppendFactory { + /// The size of the write buffer. + pub write_buffer: NonZeroUsize, + /// The buffer pool for read caching. + pub pool_ref: PoolRef, +} + +impl BufferFactory for AppendFactory { + type Buffer = Append; + + async fn create(&self, blob: B, size: u64) -> Result { + Append::new(blob, size, self.write_buffer, self.pool_ref.clone()).await + } +} + +/// Factory for creating [`Write`] buffers without caching. +#[derive(Clone)] +pub struct WriteFactory { + /// The capacity of the write buffer. 
+/// Factory for creating [`Write`] buffers without caching.
+#[derive(Clone)]
+pub struct WriteFactory {
+    /// The capacity of the write buffer.
+    pub capacity: NonZeroUsize,
+}
+
+impl<B: Blob> BufferFactory<B> for WriteFactory {
+    type Buffer = Write<B>;
+
+    async fn create(&self, blob: B, size: u64) -> Result<Self::Buffer, RError> {
+        Ok(Write::new(blob, size, self.capacity))
+    }
+}
+
+/// Configuration for blob management.
+#[derive(Clone)]
+pub struct Config<F> {
+    /// The partition to use for storing blobs.
+    pub partition: String,
+
+    /// The factory for creating section buffers.
+    pub factory: F,
+}
+
+/// Manages a collection of section-based blobs.
+///
+/// Each section is stored in a separate blob, named by its section number
+/// (big-endian u64). This component handles initialization, pruning, syncing,
+/// and metrics.
+pub struct Manager<E: Storage + Metrics, F: BufferFactory<E::Blob>> {
+    context: E,
+    partition: String,
+    factory: F,
+
+    /// One blob per section.
+    pub(crate) blobs: BTreeMap<u64, F::Buffer>,
+
+    /// A section number before which all sections have been pruned during
+    /// the current execution. Not persisted across restarts.
+    oldest_retained_section: u64,
+
+    tracked: Gauge,
+    synced: Counter,
+    pruned: Counter,
+}
+
+impl<E: Storage + Metrics, F: BufferFactory<E::Blob>> Manager<E, F> {
+    /// Initialize a new `Manager`.
+    ///
+    /// Scans the partition for existing blobs and opens them.
+    pub async fn init(context: E, cfg: Config<F>) -> Result<Self, Error> {
+        // Iterate over blobs in partition
+        let mut blobs = BTreeMap::new();
+        let stored_blobs = match context.scan(&cfg.partition).await {
+            Ok(blobs) => blobs,
+            Err(RError::PartitionMissing(_)) => Vec::new(),
+            Err(err) => return Err(Error::Runtime(err)),
+        };
+
+        for name in stored_blobs {
+            let (blob, size) = context.open(&cfg.partition, &name).await?;
+            let hex_name = hex(&name);
+            let section = match name.try_into() {
+                Ok(section) => u64::from_be_bytes(section),
+                Err(_) => return Err(Error::InvalidBlobName(hex_name)),
+            };
+            debug!(section, blob = hex_name, size, "loaded section");
+            let buffer = cfg.factory.create(blob, size).await?;
+            blobs.insert(section, buffer);
+        }
+
+        // Initialize metrics
+        let tracked = Gauge::default();
+        let synced = Counter::default();
+        let pruned = Counter::default();
+        context.register("tracked", "Number of blobs", tracked.clone());
+        context.register("synced", "Number of syncs", synced.clone());
+        context.register("pruned", "Number of blobs pruned", pruned.clone());
+        let _ = tracked.try_set(blobs.len());
+
+        Ok(Self {
+            context,
+            partition: cfg.partition,
+            factory: cfg.factory,
+            blobs,
+            oldest_retained_section: 0,
+            tracked,
+            synced,
+            pruned,
+        })
+    }
+
+    /// Ensures that a section pruned during the current execution is not accessed.
+    pub const fn prune_guard(&self, section: u64) -> Result<(), Error> {
+        if section < self.oldest_retained_section {
+            Err(Error::AlreadyPrunedToSection(self.oldest_retained_section))
+        } else {
+            Ok(())
+        }
+    }
+
+    /// Get a reference to a blob for a section, if it exists.
+    pub fn get(&self, section: u64) -> Result<Option<&F::Buffer>, Error> {
+        self.prune_guard(section)?;
+        Ok(self.blobs.get(&section))
+    }
+
+    /// Get a mutable reference to a blob, creating it if it doesn't exist.
+    pub async fn get_or_create(&mut self, section: u64) -> Result<&mut F::Buffer, Error> {
+        self.prune_guard(section)?;
+
+        if !self.blobs.contains_key(&section) {
+            let name = section.to_be_bytes();
+            let (blob, size) = self.context.open(&self.partition, &name).await?;
+            let buffer = self.factory.create(blob, size).await?;
+            self.tracked.inc();
+            self.blobs.insert(section, buffer);
+        }
+
+        Ok(self.blobs.get_mut(&section).unwrap())
+    }
+
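+    // For illustration (hypothetical sections): after `prune(10)` during this
+    // execution, `get(9)`, `get_or_create(9)`, and `sync(9)` all fail with
+    // `Error::AlreadyPrunedToSection(10)`. The guard resets after a restart
+    // because `oldest_retained_section` is not persisted.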
+    /// Sync the given section to storage.
+    pub async fn sync(&self, section: u64) -> Result<(), Error> {
+        self.prune_guard(section)?;
+        if let Some(blob) = self.blobs.get(&section) {
+            self.synced.inc();
+            blob.sync().await.map_err(Error::Runtime)?;
+        }
+        Ok(())
+    }
+
+    /// Sync all sections to storage.
+    pub async fn sync_all(&self) -> Result<(), Error> {
+        let futures: Vec<_> = self.blobs.values().map(|blob| blob.sync()).collect();
+        let results = try_join_all(futures).await.map_err(Error::Runtime)?;
+        self.synced.inc_by(results.len() as u64);
+        Ok(())
+    }
+
+    /// Prune all sections less than `min`. Returns true if any were pruned.
+    pub async fn prune(&mut self, min: u64) -> Result<bool, Error> {
+        // Prune any blobs that are smaller than the minimum
+        let mut pruned = false;
+        while let Some((&section, _)) = self.blobs.first_key_value() {
+            // Stop pruning if we reach the minimum
+            if section >= min {
+                break;
+            }
+
+            // Remove blob from map
+            let blob = self.blobs.remove(&section).unwrap();
+            let size = blob.size().await;
+            drop(blob);
+
+            // Remove blob from storage
+            self.context
+                .remove(&self.partition, Some(&section.to_be_bytes()))
+                .await?;
+            pruned = true;
+
+            debug!(section, size, "pruned blob");
+            self.tracked.dec();
+            self.pruned.inc();
+        }
+
+        if pruned {
+            self.oldest_retained_section = min;
+        }
+
+        Ok(pruned)
+    }
+
+    /// Returns the oldest section number, if any blobs exist.
+    pub fn oldest_section(&self) -> Option<u64> {
+        self.blobs.first_key_value().map(|(&s, _)| s)
+    }
+
+    /// Returns the newest section number, if any blobs exist.
+    pub fn newest_section(&self) -> Option<u64> {
+        self.blobs.last_key_value().map(|(&s, _)| s)
+    }
+
+    /// Returns true if no blobs exist.
+    pub fn is_empty(&self) -> bool {
+        self.blobs.is_empty()
+    }
+
+    /// Returns the number of sections (blobs).
+    pub fn num_sections(&self) -> usize {
+        self.blobs.len()
+    }
+
+    /// Returns an iterator over all sections starting from `start_section`.
+    pub fn sections_from(
+        &self,
+        start_section: u64,
+    ) -> impl Iterator<Item = (&u64, &F::Buffer)> {
+        self.blobs.range(start_section..)
+    }
+
+    /// Returns an iterator over all section numbers.
+    pub fn sections(&self) -> impl Iterator<Item = u64> + '_ {
+        self.blobs.keys().copied()
+    }
+
+    /// Remove a specific section. Returns true if the section existed and was removed.
+    pub async fn remove_section(&mut self, section: u64) -> Result<bool, Error> {
+        self.prune_guard(section)?;
+
+        if let Some(blob) = self.blobs.remove(&section) {
+            let size = blob.size().await;
+            drop(blob);
+            self.context
+                .remove(&self.partition, Some(&section.to_be_bytes()))
+                .await?;
+            self.tracked.dec();
+            debug!(section, size, "removed section");
+            Ok(true)
+        } else {
+            Ok(false)
+        }
+    }
+
+    /// Remove all underlying blobs.
+    pub async fn destroy(self) -> Result<(), Error> {
+        for (section, blob) in self.blobs.into_iter() {
+            let size = blob.size().await;
+            drop(blob);
+            debug!(section, size, "destroyed blob");
+            self.context
+                .remove(&self.partition, Some(&section.to_be_bytes()))
+                .await?;
+        }
+        match self.context.remove(&self.partition, None).await {
+            Ok(()) => {}
+            // Partition already removed or never existed.
+            Err(RError::PartitionMissing(_)) => {}
+            Err(err) => return Err(Error::Runtime(err)),
+        }
+        Ok(())
+    }
+
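+    // For illustration (hypothetical arguments): `rewind(5, 1024)` removes
+    // sections 6, 7, ... entirely and truncates section 5 down to 1024 bytes,
+    // whereas `rewind_section(5, 1024)` truncates section 5 but leaves any
+    // later sections untouched.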
+    /// Rewind by removing all sections after `section` and resizing the target section.
+    pub async fn rewind(&mut self, section: u64, size: u64) -> Result<(), Error> {
+        self.prune_guard(section)?;
+
+        // Remove sections in descending order (newest first) to maintain a contiguous record
+        // if a crash occurs during rewind.
+        let sections_to_remove: Vec<u64> = self
+            .blobs
+            .range((section + 1)..)
+            .rev()
+            .map(|(&s, _)| s)
+            .collect();
+
+        for s in sections_to_remove {
+            // Remove the underlying blob from storage
+            let blob = self.blobs.remove(&s).unwrap();
+            drop(blob);
+            self.context
+                .remove(&self.partition, Some(&s.to_be_bytes()))
+                .await?;
+            self.tracked.dec();
+            debug!(section = s, "removed blob during rewind");
+        }
+
+        // If the section exists, truncate it to the given size
+        if let Some(blob) = self.blobs.get(&section) {
+            let current_size = blob.size().await;
+            if size < current_size {
+                blob.resize(size).await?;
+                debug!(
+                    section,
+                    old_size = current_size,
+                    new_size = size,
+                    "rewound blob"
+                );
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Resize only the given section without affecting other sections.
+    pub async fn rewind_section(&mut self, section: u64, size: u64) -> Result<(), Error> {
+        self.prune_guard(section)?;
+
+        // Get the blob at the given section
+        if let Some(blob) = self.blobs.get(&section) {
+            // Truncate the blob to the given size
+            let current = blob.size().await;
+            if size < current {
+                blob.resize(size).await?;
+                debug!(section, from = current, to = size, "rewound section");
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Returns the byte size of the given section.
+    pub async fn size(&self, section: u64) -> Result<u64, Error> {
+        self.prune_guard(section)?;
+        match self.blobs.get(&section) {
+            Some(blob) => Ok(blob.size().await),
+            None => Ok(0),
+        }
+    }
+}
diff --git a/storage/src/journal/segmented/mod.rs b/storage/src/journal/segmented/mod.rs
index 7452304636..9b53fd95a0 100644
--- a/storage/src/journal/segmented/mod.rs
+++ b/storage/src/journal/segmented/mod.rs
@@ -3,4 +3,8 @@
 //! This module provides journal implementations that organize data into sections,
 //! where each section is stored in a separate blob.
 
+pub mod fixed;
+pub mod glob;
+mod manager;
+pub mod oversized;
 pub mod variable;
diff --git a/storage/src/journal/segmented/oversized.rs b/storage/src/journal/segmented/oversized.rs
new file mode 100644
index 0000000000..18718f9397
--- /dev/null
+++ b/storage/src/journal/segmented/oversized.rs
@@ -0,0 +1,2874 @@
+//! Segmented journal for oversized values.
+//!
+//! This module combines [super::fixed::Journal] with [super::glob::Glob] to handle
+//! entries that reference variable-length "oversized" values. It provides coordinated
+//! operations and built-in crash recovery.
+//!
+//! # Architecture
+//!
+//! ```text
+//! +-------------------+       +-------------------+
+//! |   Fixed Journal   |       |   Glob (Values)   |
+//! |  (Index Entries)  |       |                   |
+//! +-------------------+       +-------------------+
+//! | entry_0           |  -->  | value_0           |
+//! | entry_1           |  -->  | value_1           |
+//! | ...               |       | ...               |
+//! +-------------------+       +-------------------+
+//! ```
+//!
+//! Each index entry contains `(value_offset, value_size)` pointing to its value in glob.
+//!
+//! # Crash Recovery
+//!
+//! On unclean shutdown, the index journal and glob may have different lengths:
+//! - Index entry pointing to non-existent glob data (dangerous)
+//! - Glob value without index entry (orphan - acceptable but cleaned up)
+//! - Glob sections without corresponding index sections (orphan sections - removed)
+//!
+//! During initialization, crash recovery is performed:
+//! 1. Each index entry's glob reference is validated (`value_offset + value_size <= glob_size`)
+//! 2. Invalid entries are skipped and the index journal is rewound
+//! 3. Orphan value sections (sections in glob but not in index) are removed
+//!
+//! This allows async writes (glob first, then index) while ensuring consistency
+//! after recovery.
+//!
+//! _Recovery only validates that index entries point to valid byte ranges
+//! within the glob. It does **not** verify value checksums during recovery (this would
+//! require reading all values). Value checksums are verified lazily when values are
+//! read via `get_value()`. If the underlying storage is corrupted, `get_value()` will
+//! return a checksum error even though the index entry exists._
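+//!
+//! # Example
+//!
+//! A minimal sketch of the write/read flow (`MyEntry` stands in for a
+//! hypothetical [`Record`] implementation; setup and error handling elided):
+//!
+//! ```ignore
+//! let mut journal: Oversized<_, MyEntry, Vec<u8>> =
+//!     Oversized::init(context, cfg).await?;
+//!
+//! // The value is written to the glob first; the index entry is then
+//! // appended with its location filled in.
+//! let (pos, offset, size) = journal.append(1, MyEntry::default(), &value).await?;
+//! journal.sync(1).await?;
+//!
+//! // Entries and values are read back independently; the value's checksum
+//! // is verified on read.
+//! let entry = journal.get(1, pos).await?;
+//! let value = journal.get_value(1, offset, size).await?;
+//! ```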
+
+use super::{
+    fixed::{Config as FixedConfig, Journal as FixedJournal},
+    glob::{Config as GlobConfig, Glob},
+};
+use crate::journal::Error;
+use commonware_codec::{Codec, CodecFixed};
+use commonware_runtime::{Metrics, Storage};
+use futures::{future::try_join, stream::Stream};
+use std::{collections::HashSet, num::NonZeroUsize};
+use tracing::{debug, warn};
+
+/// Trait for index entries that reference oversized values in glob storage.
+///
+/// Implementations must provide access to the value location for crash recovery validation,
+/// and a way to set the location when appending.
+pub trait Record: CodecFixed + Clone {
+    /// Returns `(value_offset, value_size)` for crash recovery validation.
+    fn value_location(&self) -> (u64, u32);
+
+    /// Returns a new entry with the value location set.
+    ///
+    /// Called during `append` after the value is written to glob storage.
+    fn with_location(self, offset: u64, size: u32) -> Self;
+}
+
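+// For illustration (hypothetical `MyEntry` with inline location fields), an
+// implementation simply mirrors the two location fields:
+//
+//     impl Record for MyEntry {
+//         fn value_location(&self) -> (u64, u32) {
+//             (self.value_offset, self.value_size)
+//         }
+//
+//         fn with_location(mut self, offset: u64, size: u32) -> Self {
+//             self.value_offset = offset;
+//             self.value_size = size;
+//             self
+//         }
+//     }
+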
+/// Configuration for oversized journal.
+#[derive(Clone)]
+pub struct Config<C> {
+    /// Partition for the fixed index journal.
+    pub index_partition: String,
+
+    /// Partition for the glob value storage.
+    pub value_partition: String,
+
+    /// Buffer pool for index journal caching.
+    pub index_buffer_pool: commonware_runtime::buffer::PoolRef,
+
+    /// Write buffer size for the index journal.
+    pub index_write_buffer: NonZeroUsize,
+
+    /// Write buffer size for the value journal.
+    pub value_write_buffer: NonZeroUsize,
+
+    /// Optional compression level for values (using zstd).
+    pub compression: Option<u8>,
+
+    /// Codec configuration for values.
+    pub codec_config: C,
+}
+
+/// Segmented journal for entries with oversized values.
+///
+/// Combines a fixed-size index journal with glob storage for variable-length values.
+/// Provides coordinated operations and crash recovery.
+pub struct Oversized<E: Storage + Metrics, I: Record, V: Codec> {
+    index: FixedJournal<E, I>,
+    values: Glob<E, V>,
+}
+
+impl<E: Storage + Metrics, I: Record, V: Codec> Oversized<E, I, V> {
+    /// Initialize with crash recovery validation.
+    ///
+    /// Validates each index entry's glob reference during replay. Invalid entries
+    /// (pointing beyond glob size) are skipped, and the index journal is rewound
+    /// to exclude trailing invalid entries.
+    pub async fn init(context: E, cfg: Config<V::Cfg>) -> Result<Self, Error> {
+        // Initialize both journals
+        let index_cfg = FixedConfig {
+            partition: cfg.index_partition,
+            buffer_pool: cfg.index_buffer_pool,
+            write_buffer: cfg.index_write_buffer,
+        };
+        let index = FixedJournal::init(context.with_label("index"), index_cfg).await?;
+
+        let value_cfg = GlobConfig {
+            partition: cfg.value_partition,
+            compression: cfg.compression,
+            codec_config: cfg.codec_config,
+            write_buffer: cfg.value_write_buffer,
+        };
+        let values = Glob::init(context.with_label("values"), value_cfg).await?;
+
+        let mut oversized = Self { index, values };
+
+        // Perform crash recovery validation
+        oversized.recover().await?;
+
+        Ok(oversized)
+    }
+
+    /// Perform crash recovery by validating index entries against glob sizes.
+    ///
+    /// Checks the last entry in each section first. Since entries are appended sequentially
+    /// and value offsets are monotonically increasing within a section, if the last entry
+    /// is valid then all earlier entries must be valid too. If the last entry is invalid
+    /// or unreadable, earlier entries are scanned in reverse until a valid one is found.
+    async fn recover(&mut self) -> Result<(), Error> {
+        let chunk_size = FixedJournal::<E, I>::CHUNK_SIZE as u64;
+        let sections: Vec<u64> = self.index.sections().collect();
+
+        for section in sections {
+            let index_size = self.index.size(section).await?;
+            if index_size == 0 {
+                continue;
+            }
+
+            let glob_size = match self.values.size(section).await {
+                Ok(size) => size,
+                Err(Error::AlreadyPrunedToSection(oldest)) => {
+                    // This shouldn't happen in normal operation: prune() prunes the index
+                    // first, then the glob. A crash between these would leave the glob
+                    // NOT pruned (opposite of this case). We handle this defensively in
+                    // case of external manipulation or future changes.
+                    warn!(
+                        section,
+                        oldest, "index has section that glob already pruned"
+                    );
+                    0
+                }
+                Err(e) => return Err(e),
+            };
+
+            // Truncate any trailing partial entry
+            let entry_count = index_size / chunk_size;
+            let aligned_size = entry_count * chunk_size;
+            if aligned_size < index_size {
+                warn!(
+                    section,
+                    index_size, aligned_size, "trailing bytes detected: truncating"
+                );
+                self.index.rewind_section(section, aligned_size).await?;
+            }
+
+            // If there are no complete entries, rewind values to 0 and exit early
+            if entry_count == 0 {
+                warn!(
+                    section,
+                    index_size, "no complete entries: rewinding values to 0"
+                );
+                self.values.rewind_section(section, 0).await?;
+                continue;
+            }
+
+            // Find last valid entry and target glob size
+            let (valid_count, glob_target) = self
+                .find_last_valid_entry(section, entry_count, glob_size)
+                .await;
+
+            // Rewind index if any entries are invalid
+            if valid_count < entry_count {
+                let valid_size = valid_count * chunk_size;
+                debug!(section, entry_count, valid_count, "rewinding index");
+                self.index.rewind_section(section, valid_size).await?;
+            }
+
+            // Truncate glob trailing garbage (can occur when value was written but
+            // index entry wasn't, or when index was truncated but glob wasn't)
+            if glob_size > glob_target {
+                debug!(
+                    section,
+                    glob_size, glob_target, "truncating glob trailing garbage"
+                );
+                self.values.rewind_section(section, glob_target).await?;
+            }
+        }
+
+        // Clean up orphan value sections that don't exist in index
+        self.cleanup_orphan_value_sections().await?;
+
+        Ok(())
+    }
+
+    /// Remove any value sections that don't have corresponding index sections.
+    ///
+    /// This can happen if a crash occurs after writing to values but before
+    /// writing to index for a new section. Since sections don't have to be
+    /// contiguous, we compare the actual sets of sections rather than just
+    /// comparing the newest section numbers.
+    async fn cleanup_orphan_value_sections(&mut self) -> Result<(), Error> {
+        // Collect index sections into a set for O(1) lookup
+        let index_sections: HashSet<u64> = self.index.sections().collect();
+
+        // Find value sections that don't exist in index
+        let orphan_sections: Vec<u64> = self
+            .values
+            .sections()
+            .filter(|s| !index_sections.contains(s))
+            .collect();
+
+        // Remove each orphan section
+        for section in orphan_sections {
+            warn!(section, "removing orphan value section");
+            self.values.remove_section(section).await?;
+        }
+
+        Ok(())
+    }
+
+    /// Find the number of valid entries and the corresponding glob target size.
+    ///
+    /// Scans backwards from the last entry until a valid one is found.
+    /// Returns `(valid_count, glob_target)` where `glob_target` is the end offset
+    /// of the last valid entry's value.
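+    ///
+    /// For example (hypothetical layout): with three entries whose values end
+    /// at glob offsets 20, 40, and 60, a `glob_size` of 45 yields `(2, 40)`,
+    /// since the third entry's end offset (60) exceeds 45.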
+    async fn find_last_valid_entry(
+        &self,
+        section: u64,
+        entry_count: u64,
+        glob_size: u64,
+    ) -> (u64, u64) {
+        for pos in (0..entry_count).rev() {
+            match self.index.get(section, pos).await {
+                Ok(entry) => {
+                    let (offset, size) = entry.value_location();
+                    let entry_end = offset.saturating_add(u64::from(size));
+                    if entry_end <= glob_size {
+                        return (pos + 1, entry_end);
+                    }
+                    if pos == entry_count - 1 {
+                        warn!(
+                            section,
+                            pos, glob_size, entry_end, "invalid entry: glob truncated"
+                        );
+                    }
+                }
+                Err(_) => {
+                    if pos == entry_count - 1 {
+                        warn!(section, pos, "corrupted last entry, scanning backwards");
+                    }
+                }
+            }
+        }
+        (0, 0)
+    }
+
+    /// Append entry + value.
+    ///
+    /// Writes value to glob first, then writes index entry with the value location.
+    ///
+    /// Returns `(position, offset, size)` where:
+    /// - `position`: Position in the index journal
+    /// - `offset`: Byte offset in glob
+    /// - `size`: Size of value in glob (including checksum)
+    pub async fn append(
+        &mut self,
+        section: u64,
+        entry: I,
+        value: &V,
+    ) -> Result<(u64, u64, u32), Error> {
+        // Write value first (glob). This will typically write to an in-memory
+        // buffer and return quickly (only blocks when the buffer is full).
+        let (offset, size) = self.values.append(section, value).await?;
+
+        // Update entry with actual location and write to index
+        let entry_with_location = entry.with_location(offset, size);
+        let position = self.index.append(section, entry_with_location).await?;
+
+        Ok((position, offset, size))
+    }
+
+    /// Get entry at position (index entry only, not value).
+    pub async fn get(&self, section: u64, position: u64) -> Result<I, Error> {
+        self.index.get(section, position).await
+    }
+
+    /// Get the last entry for a section, if any.
+    pub async fn last(&self, section: u64) -> Result<Option<I>, Error> {
+        self.index.last(section).await
+    }
+
+    /// Get value using offset/size from entry.
+    ///
+    /// The offset should be the byte offset from `append()` or from the entry's `value_location()`.
+    pub async fn get_value(&self, section: u64, offset: u64, size: u32) -> Result<V, Error> {
+        self.values.get(section, offset, size).await
+    }
+
+    /// Replay index entries starting from given section.
+    ///
+    /// Returns a stream of `(section, position, entry)` tuples.
+    pub async fn replay(
+        &self,
+        start_section: u64,
+        buffer: NonZeroUsize,
+    ) -> Result<impl Stream<Item = Result<(u64, u64, I), Error>> + '_, Error> {
+        self.index.replay(start_section, buffer).await
+    }
+
+    /// Sync both journals for given section.
+    pub async fn sync(&self, section: u64) -> Result<(), Error> {
+        try_join(self.index.sync(section), self.values.sync(section))
+            .await
+            .map(|_| ())
+    }
+
+    /// Sync all sections.
+    pub async fn sync_all(&self) -> Result<(), Error> {
+        try_join(self.index.sync_all(), self.values.sync_all())
+            .await
+            .map(|_| ())
+    }
+
+    /// Prune both journals. Returns true if any sections were pruned.
+    ///
+    /// Prunes index first, then glob. This order ensures crash safety:
+    /// - If crash after index prune but before glob: orphan data in glob (acceptable)
+    /// - If crash before index prune: no change, retry works
+    pub async fn prune(&mut self, min: u64) -> Result<bool, Error> {
+        let index_pruned = self.index.prune(min).await?;
+        let value_pruned = self.values.prune(min).await?;
+        Ok(index_pruned || value_pruned)
+    }
+
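+    // For illustration (hypothetical call): `journal.prune(3)` drops sections
+    // 1 and 2 from the index first, then from the glob. A crash between the
+    // two steps leaves only orphan glob sections, which the next `init`
+    // removes via `cleanup_orphan_value_sections`.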
+    /// Rewind both journals to a specific section and index size.
+    ///
+    /// This rewinds the section to the given index size and removes all sections
+    /// after the given section. The value size is derived from the last entry.
+    pub async fn rewind(&mut self, section: u64, index_size: u64) -> Result<(), Error> {
+        // Rewind index first (this also removes sections after `section`)
+        self.index.rewind(section, index_size).await?;
+
+        // Derive value size from last entry
+        let value_size = match self.index.last(section).await? {
+            Some(entry) => {
+                let (offset, size) = entry.value_location();
+                offset
+                    .checked_add(u64::from(size))
+                    .ok_or(Error::OffsetOverflow)?
+            }
+            None => 0,
+        };
+
+        // Rewind values (this also removes sections after `section`)
+        self.values.rewind(section, value_size).await
+    }
+
+    /// Rewind only the given section to a specific index size.
+    ///
+    /// Unlike `rewind`, this does not affect other sections.
+    /// The value size is derived from the last entry after rewinding the index.
+    pub async fn rewind_section(&mut self, section: u64, index_size: u64) -> Result<(), Error> {
+        // Rewind index first
+        self.index.rewind_section(section, index_size).await?;
+
+        // Derive value size from last entry
+        let value_size = match self.index.last(section).await? {
+            Some(entry) => {
+                let (offset, size) = entry.value_location();
+                offset
+                    .checked_add(u64::from(size))
+                    .ok_or(Error::OffsetOverflow)?
+            }
+            None => 0,
+        };
+
+        // Rewind values
+        self.values.rewind_section(section, value_size).await
+    }
+
+    /// Get index size for checkpoint.
+    ///
+    /// The value size can be derived from the last entry's location when needed.
+    pub async fn size(&self, section: u64) -> Result<u64, Error> {
+        self.index.size(section).await
+    }
+
+    /// Get the value size for a section, derived from the last entry's location.
+    pub async fn value_size(&self, section: u64) -> Result<u64, Error> {
+        match self.index.last(section).await {
+            Ok(Some(entry)) => {
+                let (offset, size) = entry.value_location();
+                offset
+                    .checked_add(u64::from(size))
+                    .ok_or(Error::OffsetOverflow)
+            }
+            Ok(None) => Ok(0),
+            Err(Error::SectionOutOfRange(_)) => Ok(0),
+            Err(e) => Err(e),
+        }
+    }
+
+    /// Returns the oldest section number, if any exist.
+    pub fn oldest_section(&self) -> Option<u64> {
+        self.index.oldest_section()
+    }
+
+    /// Returns the newest section number, if any exist.
+    pub fn newest_section(&self) -> Option<u64> {
+        self.index.newest_section()
+    }
+
+    /// Destroy all underlying storage.
+    pub async fn destroy(self) -> Result<(), Error> {
+        try_join(self.index.destroy(), self.values.destroy())
+            .await
+            .map(|_| ())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use bytes::{Buf, BufMut};
+    use commonware_codec::{FixedSize, Read, ReadExt, Write};
+    use commonware_macros::test_traced;
+    use commonware_runtime::{buffer::PoolRef, deterministic, Blob as _, Runner};
+    use commonware_utils::NZUsize;
+
+    /// Convert offset + size to byte end position (for truncation tests).
+    fn byte_end(offset: u64, size: u32) -> u64 {
+        offset + u64::from(size)
+    }
+
+    /// Test index entry that stores a u64 id and references a value.
+    #[derive(Debug, Clone, PartialEq)]
+    struct TestEntry {
+        id: u64,
+        value_offset: u64,
+        value_size: u32,
+    }
+
+    impl TestEntry {
+        fn new(id: u64, value_offset: u64, value_size: u32) -> Self {
+            Self {
+                id,
+                value_offset,
+                value_size,
+            }
+        }
+    }
+
+    impl Write for TestEntry {
+        fn write(&self, buf: &mut impl BufMut) {
+            self.id.write(buf);
+            self.value_offset.write(buf);
+            self.value_size.write(buf);
+        }
+    }
+
+    impl Read for TestEntry {
+        type Cfg = ();
+
+        fn read_cfg(buf: &mut impl Buf, _: &Self::Cfg) -> Result<Self, commonware_codec::Error> {
+            let id = u64::read(buf)?;
+            let value_offset = u64::read(buf)?;
+            let value_size = u32::read(buf)?;
+            Ok(Self {
+                id,
+                value_offset,
+                value_size,
+            })
+        }
+    }
+
+    impl FixedSize for TestEntry {
+        const SIZE: usize = u64::SIZE + u64::SIZE + u32::SIZE;
+    }
+
+    impl Record for TestEntry {
+        fn value_location(&self) -> (u64, u32) {
+            (self.value_offset, self.value_size)
+        }
+
+        fn with_location(mut self, offset: u64, size: u32) -> Self {
+            self.value_offset = offset;
+            self.value_size = size;
+            self
+        }
+    }
+
+    fn test_cfg() -> Config<()> {
+        Config {
+            index_partition: "test_index".to_string(),
+            value_partition: "test_values".to_string(),
+            index_buffer_pool: PoolRef::new(NZUsize!(64), NZUsize!(8)),
+            index_write_buffer: NZUsize!(1024),
+            value_write_buffer: NZUsize!(1024),
+            compression: None,
+            codec_config: (),
+        }
+    }
+
+    /// Simple test value type with unit config.
+    type TestValue = [u8; 16];
+
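+    // Several recovery tests below simulate crashes by reopening the raw
+    // blobs via `context.open` and truncating or corrupting them before
+    // re-initializing the journal.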
+    #[test_traced]
+    fn test_oversized_append_and_get() {
+        let executor = deterministic::Runner::default();
+        executor.start(|context| async move {
+            let mut oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), test_cfg())
+                    .await
+                    .expect("Failed to init");
+
+            // Append entry with value
+            let value: TestValue = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+            let entry = TestEntry::new(42, 0, 0);
+            let (position, offset, size) = oversized
+                .append(1, entry, &value)
+                .await
+                .expect("Failed to append");
+
+            assert_eq!(position, 0);
+
+            // Get entry
+            let retrieved_entry = oversized.get(1, position).await.expect("Failed to get");
+            assert_eq!(retrieved_entry.id, 42);
+
+            // Get value
+            let retrieved_value = oversized
+                .get_value(1, offset, size)
+                .await
+                .expect("Failed to get value");
+            assert_eq!(retrieved_value, value);
+
+            oversized.destroy().await.expect("Failed to destroy");
+        });
+    }
+
+    #[test_traced]
+    fn test_oversized_crash_recovery() {
+        let executor = deterministic::Runner::default();
+        executor.start(|context| async move {
+            let cfg = test_cfg();
+
+            // Create and populate oversized journal
+            let mut oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), cfg.clone())
+                    .await
+                    .expect("Failed to init");
+
+            // Append multiple entries
+            let mut locations = Vec::new();
+            for i in 0..5u8 {
+                let value: TestValue = [i; 16];
+                let entry = TestEntry::new(i as u64, 0, 0);
+                let (position, offset, size) = oversized
+                    .append(1, entry, &value)
+                    .await
+                    .expect("Failed to append");
+                locations.push((position, offset, size));
+            }
+            oversized.sync(1).await.expect("Failed to sync");
+            drop(oversized);
+
+            // Simulate crash: truncate glob to lose last 2 values
+            let (blob, _) = context
+                .open(&cfg.value_partition, &1u64.to_be_bytes())
+                .await
+                .expect("Failed to open blob");
+
+            // Calculate size to keep first 3 entries
+            let keep_size = byte_end(locations[2].1, locations[2].2);
+            blob.resize(keep_size).await.expect("Failed to truncate");
+            blob.sync().await.expect("Failed to sync");
+            drop(blob);
+
+            // Reinitialize - should recover and rewind index
+            let oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), cfg.clone())
+                    .await
+                    .expect("Failed to reinit");
+
+            // First 3 entries should still be valid
+            for i in 0..3u8 {
+                let (position, offset, size) = locations[i as usize];
+                let entry = oversized.get(1, position).await.expect("Failed to get");
+                assert_eq!(entry.id, i as u64);
+
+                let value = oversized
+                    .get_value(1, offset, size)
+                    .await
+                    .expect("Failed to get value");
+                assert_eq!(value, [i; 16]);
+            }
+
+            // Entry at position 3 should fail (index was rewound)
+            let result = oversized.get(1, 3).await;
+            assert!(result.is_err());
+
+            oversized.destroy().await.expect("Failed to destroy");
+        });
+    }
+
+    #[test_traced]
+    fn test_oversized_persistence() {
+        let executor = deterministic::Runner::default();
+        executor.start(|context| async move {
+            let cfg = test_cfg();
+
+            // Create and populate
+            let mut oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), cfg.clone())
+                    .await
+                    .expect("Failed to init");
+
+            let value: TestValue = [42; 16];
+            let entry = TestEntry::new(123, 0, 0);
+            let (position, offset, size) = oversized
+                .append(1, entry, &value)
+                .await
+                .expect("Failed to append");
+            oversized.sync(1).await.expect("Failed to sync");
+            drop(oversized);
+
+            // Reopen and verify
+            let oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), cfg)
+                    .await
+                    .expect("Failed to reinit");
+
+            let retrieved_entry = oversized.get(1, position).await.expect("Failed to get");
+            assert_eq!(retrieved_entry.id, 123);
+
+            let retrieved_value = oversized
+                .get_value(1, offset, size)
+                .await
+                .expect("Failed to get value");
+            assert_eq!(retrieved_value, value);
+
+            oversized.destroy().await.expect("Failed to destroy");
+        });
+    }
+
+    #[test_traced]
+    fn test_oversized_prune() {
+        let executor = deterministic::Runner::default();
+        executor.start(|context| async move {
+            let mut oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), test_cfg())
+                    .await
+                    .expect("Failed to init");
+
+            // Append to multiple sections
+            for section in 1u64..=5 {
+                let value: TestValue = [section as u8; 16];
+                let entry = TestEntry::new(section, 0, 0);
+                oversized
+                    .append(section, entry, &value)
+                    .await
+                    .expect("Failed to append");
+                oversized.sync(section).await.expect("Failed to sync");
+            }
+
+            // Prune sections < 3
+            oversized.prune(3).await.expect("Failed to prune");
+
+            // Sections 1, 2 should be gone
+            assert!(oversized.get(1, 0).await.is_err());
+            assert!(oversized.get(2, 0).await.is_err());
+
+            // Sections 3, 4, 5 should exist
+            assert!(oversized.get(3, 0).await.is_ok());
+            assert!(oversized.get(4, 0).await.is_ok());
+            assert!(oversized.get(5, 0).await.is_ok());
+
+            oversized.destroy().await.expect("Failed to destroy");
+        });
+    }
+
+    #[test_traced]
+    fn test_recovery_empty_section() {
+        let executor = deterministic::Runner::default();
+        executor.start(|context| async move {
+            let cfg = test_cfg();
+
+            // Create oversized journal
+            let mut oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), cfg.clone())
+                    .await
+                    .expect("Failed to init");
+
+            // Append to section 2 only (section 1 is never created)
+            let value: TestValue = [42; 16];
+            let entry = TestEntry::new(1, 0, 0);
+            oversized
+                .append(2, entry, &value)
+                .await
+                .expect("Failed to append");
+            oversized.sync(2).await.expect("Failed to sync");
+            drop(oversized);
+
+            // Reinitialize - recovery should handle the empty/non-existent section 1
+            let oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), cfg)
+                    .await
+                    .expect("Failed to reinit");
+
+            // Section 2 entry should be valid
+            let entry = oversized.get(2, 0).await.expect("Failed to get");
+            assert_eq!(entry.id, 1);
+
+            oversized.destroy().await.expect("Failed to destroy");
+        });
+    }
+
+    #[test_traced]
+    fn test_recovery_all_entries_invalid() {
+        let executor = deterministic::Runner::default();
+        executor.start(|context| async move {
+            let cfg = test_cfg();
+
+            // Create and populate
+            let mut oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), cfg.clone())
+                    .await
+                    .expect("Failed to init");
+
+            // Append 5 entries
+            for i in 0..5u8 {
+                let value: TestValue = [i; 16];
+                let entry = TestEntry::new(i as u64, 0, 0);
+                oversized
+                    .append(1, entry, &value)
+                    .await
+                    .expect("Failed to append");
+            }
+            oversized.sync(1).await.expect("Failed to sync");
+            drop(oversized);
+
+            // Truncate glob to 0 bytes - ALL entries become invalid
+            let (blob, _) = context
+                .open(&cfg.value_partition, &1u64.to_be_bytes())
+                .await
+                .expect("Failed to open blob");
+            blob.resize(0).await.expect("Failed to truncate");
+            blob.sync().await.expect("Failed to sync");
+            drop(blob);
+
+            // Reinitialize - should recover and rewind index to 0
+            let mut oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), cfg)
+                    .await
+                    .expect("Failed to reinit");
+
+            // No entries should be accessible
+            let result = oversized.get(1, 0).await;
+            assert!(result.is_err());
+
+            // Should be able to append after recovery
+            let value: TestValue = [99; 16];
+            let entry = TestEntry::new(100, 0, 0);
+            let (pos, offset, size) = oversized
+                .append(1, entry, &value)
+                .await
+                .expect("Failed to append after recovery");
+            assert_eq!(pos, 0);
+
+            let retrieved = oversized.get(1, 0).await.expect("Failed to get");
+            assert_eq!(retrieved.id, 100);
+            let retrieved_value = oversized
+                .get_value(1, offset, size)
+                .await
+                .expect("Failed to get value");
+            assert_eq!(retrieved_value, value);
+
+            oversized.destroy().await.expect("Failed to destroy");
+        });
+    }
+
+    #[test_traced]
+    fn test_recovery_multiple_sections_mixed_validity() {
+        let executor = deterministic::Runner::default();
+        executor.start(|context| async move {
+            let cfg = test_cfg();
+
+            // Create and populate multiple sections
+            let mut oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), cfg.clone())
+                    .await
+                    .expect("Failed to init");
+
+            // Section 1: 3 entries
+            let mut section1_locations = Vec::new();
+            for i in 0..3u8 {
+                let value: TestValue = [i; 16];
+                let entry = TestEntry::new(i as u64, 0, 0);
+                let loc = oversized
+                    .append(1, entry, &value)
+                    .await
+                    .expect("Failed to append");
+                section1_locations.push(loc);
+            }
+            oversized.sync(1).await.expect("Failed to sync");
+
+            // Section 2: 5 entries
+            let mut section2_locations = Vec::new();
+            for i in 0..5u8 {
+                let value: TestValue = [10 + i; 16];
+                let entry = TestEntry::new(10 + i as u64, 0, 0);
+                let loc = oversized
+                    .append(2, entry, &value)
+                    .await
+                    .expect("Failed to append");
+                section2_locations.push(loc);
+            }
+            oversized.sync(2).await.expect("Failed to sync");
+
+            // Section 3: 2 entries
+            for i in 0..2u8 {
+                let value: TestValue = [20 + i; 16];
+                let entry = TestEntry::new(20 + i as u64, 0, 0);
+                oversized
+                    .append(3, entry, &value)
+                    .await
+                    .expect("Failed to append");
+            }
+            oversized.sync(3).await.expect("Failed to sync");
+            drop(oversized);
+
+            // Truncate section 1 glob to keep only first entry
+            let (blob, _) = context
+                .open(&cfg.value_partition, &1u64.to_be_bytes())
+                .await
+                .expect("Failed to open blob");
+            let keep_size = byte_end(section1_locations[0].1, section1_locations[0].2);
+            blob.resize(keep_size).await.expect("Failed to truncate");
+            blob.sync().await.expect("Failed to sync");
+            drop(blob);
+
+            // Truncate section 2 glob to keep first 3 entries
+            let (blob, _) = context
+                .open(&cfg.value_partition, &2u64.to_be_bytes())
+                .await
+                .expect("Failed to open blob");
+            let keep_size = byte_end(section2_locations[2].1, section2_locations[2].2);
+            blob.resize(keep_size).await.expect("Failed to truncate");
+            blob.sync().await.expect("Failed to sync");
+            drop(blob);
+
+            // Section 3 remains intact
+
+            // Reinitialize
+            let oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), cfg)
+                    .await
+                    .expect("Failed to reinit");
+
+            // Section 1: only position 0 valid
+            assert!(oversized.get(1, 0).await.is_ok());
+            assert!(oversized.get(1, 1).await.is_err());
+            assert!(oversized.get(1, 2).await.is_err());
+
+            // Section 2: positions 0,1,2 valid
+            assert!(oversized.get(2, 0).await.is_ok());
+            assert!(oversized.get(2, 1).await.is_ok());
+            assert!(oversized.get(2, 2).await.is_ok());
+            assert!(oversized.get(2, 3).await.is_err());
+            assert!(oversized.get(2, 4).await.is_err());
+
+            // Section 3: both positions valid
+            assert!(oversized.get(3, 0).await.is_ok());
+            assert!(oversized.get(3, 1).await.is_ok());
+
+            oversized.destroy().await.expect("Failed to destroy");
+        });
+    }
+
+    #[test_traced]
+    fn test_recovery_corrupted_last_index_entry() {
+        let executor = deterministic::Runner::default();
+        executor.start(|context| async move {
+            let cfg = test_cfg();
+
+            // Create and populate
+            let mut oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), cfg.clone())
+                    .await
+                    .expect("Failed to init");
+
+            // Append 5 entries
+            for i in 0..5u8 {
+                let value: TestValue = [i; 16];
+                let entry = TestEntry::new(i as u64, 0, 0);
+                oversized
+                    .append(1, entry, &value)
+                    .await
+                    .expect("Failed to append");
+            }
+            oversized.sync(1).await.expect("Failed to sync");
+            drop(oversized);
+
+            // Corrupt the last index entry's checksum
+            let (blob, size) = context
+                .open(&cfg.index_partition, &1u64.to_be_bytes())
+                .await
+                .expect("Failed to open blob");
+
+            // Each entry is TestEntry::SIZE (20) + 4 (CRC32) = 24 bytes
+            // Corrupt the CRC of the last entry
+            let last_entry_crc_offset = size - 4;
+            blob.write_at(vec![0xFF, 0xFF, 0xFF, 0xFF], last_entry_crc_offset)
+                .await
+                .expect("Failed to corrupt");
+            blob.sync().await.expect("Failed to sync");
+            drop(blob);
+
+            // Reinitialize - should detect corruption and scan backwards
+            let mut oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), cfg)
+                    .await
+                    .expect("Failed to reinit");
+
+            // First 4 entries should be valid
+            for i in 0..4u8 {
+                let entry = oversized.get(1, i as u64).await.expect("Failed to get");
+                assert_eq!(entry.id, i as u64);
+            }
+
+            // Entry 4 should be gone (corrupted and rewound)
+            assert!(oversized.get(1, 4).await.is_err());
+
+            // Should be able to append after recovery
+            let value: TestValue = [99; 16];
+            let entry = TestEntry::new(100, 0, 0);
+            let (pos, offset, size) = oversized
+                .append(1, entry, &value)
+                .await
+                .expect("Failed to append after recovery");
+            assert_eq!(pos, 4);
+
+            let retrieved = oversized.get(1, 4).await.expect("Failed to get");
+            assert_eq!(retrieved.id, 100);
+            let retrieved_value = oversized
+                .get_value(1, offset, size)
+                .await
+                .expect("Failed to get value");
+            assert_eq!(retrieved_value, value);
+
+            oversized.destroy().await.expect("Failed to destroy");
+        });
+    }
+
+    #[test_traced]
+    fn test_recovery_all_entries_valid() {
+        let executor = deterministic::Runner::default();
+        executor.start(|context| async move {
+            let cfg = test_cfg();
+
+            // Create and populate
+            let mut oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), cfg.clone())
+                    .await
+                    .expect("Failed to init");
+
+            // Append entries to multiple sections
+            for section in 1u64..=3 {
+                for i in 0..10u8 {
+                    let value: TestValue = [(section as u8) * 10 + i; 16];
+                    let entry = TestEntry::new(section * 100 + i as u64, 0, 0);
+                    oversized
+                        .append(section, entry, &value)
+                        .await
+                        .expect("Failed to append");
+                }
+                oversized.sync(section).await.expect("Failed to sync");
+            }
+            drop(oversized);
+
+            // Reinitialize with no corruption - should be fast
+            let oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), cfg)
+                    .await
+                    .expect("Failed to reinit");
+
+            // All entries should be valid
+            for section in 1u64..=3 {
+                for i in 0..10u8 {
+                    let entry = oversized
+                        .get(section, i as u64)
+                        .await
+                        .expect("Failed to get");
+                    assert_eq!(entry.id, section * 100 + i as u64);
+                }
+            }
+
+            oversized.destroy().await.expect("Failed to destroy");
+        });
+    }
+
+    #[test_traced]
+    fn test_recovery_single_entry_invalid() {
+        let executor = deterministic::Runner::default();
+        executor.start(|context| async move {
+            let cfg = test_cfg();
+
+            // Create and populate with single entry
+            let mut oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), cfg.clone())
+                    .await
+                    .expect("Failed to init");
+
+            let value: TestValue = [42; 16];
+            let entry = TestEntry::new(1, 0, 0);
+            oversized
+                .append(1, entry, &value)
+                .await
+                .expect("Failed to append");
+            oversized.sync(1).await.expect("Failed to sync");
+            drop(oversized);
+
+            // Truncate glob to 0 - single entry becomes invalid
+            let (blob, _) = context
+                .open(&cfg.value_partition, &1u64.to_be_bytes())
+                .await
+                .expect("Failed to open blob");
+            blob.resize(0).await.expect("Failed to truncate");
+            blob.sync().await.expect("Failed to sync");
+            drop(blob);
+
+            // Reinitialize
+            let oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), cfg)
+                    .await
+                    .expect("Failed to reinit");
+
+            // Entry should be gone
+            assert!(oversized.get(1, 0).await.is_err());
+
+            oversized.destroy().await.expect("Failed to destroy");
+        });
+    }
+
+    #[test_traced]
+    fn test_recovery_last_entry_off_by_one() {
+        let executor = deterministic::Runner::default();
+        executor.start(|context| async move {
+            let cfg = test_cfg();
+
+            // Create and populate
+            let mut oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), cfg.clone())
+                    .await
+                    .expect("Failed to init");
+
+            let mut locations = Vec::new();
+            for i in 0..3u8 {
+                let value: TestValue = [i; 16];
+                let entry = TestEntry::new(i as u64, 0, 0);
+                let loc = oversized
+                    .append(1, entry, &value)
+                    .await
+                    .expect("Failed to append");
+                locations.push(loc);
+            }
+            oversized.sync(1).await.expect("Failed to sync");
+            drop(oversized);
+
+            // Truncate glob to be off by 1 byte from last entry
+            let (blob, _) = context
+                .open(&cfg.value_partition, &1u64.to_be_bytes())
+                .await
+                .expect("Failed to open blob");
+
+            // Last entry needs: offset + size bytes
+            // Truncate to offset + size - 1 (missing 1 byte)
+            let last = &locations[2];
+            let truncate_to = byte_end(last.1, last.2) - 1;
+            blob.resize(truncate_to).await.expect("Failed to truncate");
+            blob.sync().await.expect("Failed to sync");
+            drop(blob);
+
+            // Reinitialize
+            let mut oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), cfg)
+                    .await
+                    .expect("Failed to reinit");
+
+            // First 2 entries should be valid
+            assert!(oversized.get(1, 0).await.is_ok());
+            assert!(oversized.get(1, 1).await.is_ok());
+
+            // Entry 2 should be gone (truncated)
+            assert!(oversized.get(1, 2).await.is_err());
+
+            // Should be able to append after recovery
+            let value: TestValue = [99; 16];
+            let entry = TestEntry::new(100, 0, 0);
+            let (pos, offset, size) = oversized
+                .append(1, entry, &value)
+                .await
+                .expect("Failed to append after recovery");
+            assert_eq!(pos, 2);
+
+            let retrieved = oversized.get(1, 2).await.expect("Failed to get");
+            assert_eq!(retrieved.id, 100);
+            let retrieved_value = oversized
+                .get_value(1, offset, size)
+                .await
+                .expect("Failed to get value");
+            assert_eq!(retrieved_value, value);
+
+            oversized.destroy().await.expect("Failed to destroy");
+        });
+    }
+
+    #[test_traced]
+    fn test_recovery_glob_missing_entirely() {
+        let executor = deterministic::Runner::default();
+        executor.start(|context| async move {
+            let cfg = test_cfg();
+
+            // Create and populate
+            let mut oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), cfg.clone())
+                    .await
+                    .expect("Failed to init");
+
+            for i in 0..3u8 {
+                let value: TestValue = [i; 16];
+                let entry = TestEntry::new(i as u64, 0, 0);
+                oversized
+                    .append(1, entry, &value)
+                    .await
+                    .expect("Failed to append");
+            }
+            oversized.sync(1).await.expect("Failed to sync");
+            drop(oversized);
+
+            // Delete the glob file entirely
+            context
+                .remove(&cfg.value_partition, Some(&1u64.to_be_bytes()))
+                .await
+                .expect("Failed to remove");
+
+            // Reinitialize - glob size will be 0, all entries invalid
+            let oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), cfg)
+                    .await
+                    .expect("Failed to reinit");
+
+            // All entries should be gone
+            assert!(oversized.get(1, 0).await.is_err());
+            assert!(oversized.get(1, 1).await.is_err());
+            assert!(oversized.get(1, 2).await.is_err());
+
+            oversized.destroy().await.expect("Failed to destroy");
+        });
+    }
+
+    #[test_traced]
+    fn test_recovery_can_append_after_recovery() {
+        let executor = deterministic::Runner::default();
+        executor.start(|context| async move {
+            let cfg = test_cfg();
+
+            // Create and populate
+            let mut oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), cfg.clone())
+                    .await
+                    .expect("Failed to init");
+
+            let mut locations = Vec::new();
+            for i in 0..5u8 {
+                let value: TestValue = [i; 16];
+                let entry = TestEntry::new(i as u64, 0, 0);
+                let loc = oversized
+                    .append(1, entry, &value)
+                    .await
+                    .expect("Failed to append");
+                locations.push(loc);
+            }
+            oversized.sync(1).await.expect("Failed to sync");
+            drop(oversized);
+
+            // Truncate glob to keep only first 2 entries
+            let (blob, _) = context
+                .open(&cfg.value_partition, &1u64.to_be_bytes())
+                .await
+                .expect("Failed to open blob");
+            let keep_size = byte_end(locations[1].1, locations[1].2);
+            blob.resize(keep_size).await.expect("Failed to truncate");
+            blob.sync().await.expect("Failed to sync");
+            drop(blob);
+
+            // Reinitialize
+            let mut oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), cfg.clone())
+                    .await
+                    .expect("Failed to reinit");
+
+            // Verify first 2 entries exist
+            assert!(oversized.get(1, 0).await.is_ok());
+            assert!(oversized.get(1, 1).await.is_ok());
+            assert!(oversized.get(1, 2).await.is_err());
+
+            // Append new entries after recovery
+            for i in 10..15u8 {
+                let value: TestValue = [i; 16];
+                let entry = TestEntry::new(i as u64, 0, 0);
+                oversized
+                    .append(1, entry, &value)
+                    .await
+                    .expect("Failed to append after recovery");
+            }
+            oversized.sync(1).await.expect("Failed to sync");
+
+            // Verify new entries at positions 2, 3, 4, 5, 6
+            for i in 0..5u8 {
+                let entry = oversized
+                    .get(1, 2 + i as u64)
+                    .await
+                    .expect("Failed to get new entry");
+                assert_eq!(entry.id, (10 + i) as u64);
+            }
+
+            oversized.destroy().await.expect("Failed to destroy");
+        });
+    }
+
+    #[test_traced]
+    fn test_recovery_glob_pruned_but_index_not() {
+        let executor = deterministic::Runner::default();
+        executor.start(|context| async move {
+            let cfg = test_cfg();
+
+            // Create and populate multiple sections
+            let mut oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), cfg.clone())
+                    .await
+                    .expect("Failed to init");
+
+            for section in 1u64..=3 {
+                let value: TestValue = [section as u8; 16];
+                let entry = TestEntry::new(section, 0, 0);
+                oversized
+                    .append(section, entry, &value)
+                    .await
+                    .expect("Failed to append");
+                oversized.sync(section).await.expect("Failed to sync");
+            }
+            drop(oversized);
+
+            // Simulate crash during prune: prune ONLY the glob, not the index
+            // This creates the "glob pruned but index not" scenario
+            use crate::journal::segmented::glob::{Config as GlobConfig, Glob};
+            let glob_cfg = GlobConfig {
+                partition: cfg.value_partition.clone(),
+                compression: cfg.compression,
+                codec_config: (),
+                write_buffer: cfg.value_write_buffer,
+            };
+            let mut glob: Glob<_, TestValue> = Glob::init(context.with_label("glob"), glob_cfg)
+                .await
+                .expect("Failed to init glob");
+            glob.prune(2).await.expect("Failed to prune glob");
+            glob.sync_all().await.expect("Failed to sync glob");
+            drop(glob);
+
+            // Reinitialize - should recover gracefully with warning
+            // Index section 1 will be rewound to 0 entries
+            let oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), cfg.clone())
+                    .await
+                    .expect("Failed to reinit");
+
+            // Section 1 entries should be gone (index rewound because the glob was pruned)
+            assert!(oversized.get(1, 0).await.is_err());
+
+            // Sections 2 and 3 should still be valid
+            assert!(oversized.get(2, 0).await.is_ok());
+            assert!(oversized.get(3, 0).await.is_ok());
+
+            oversized.destroy().await.expect("Failed to destroy");
+        });
+    }
+
+    #[test_traced]
+    fn test_recovery_index_partition_deleted() {
+        let executor = deterministic::Runner::default();
+        executor.start(|context| async move {
+            let cfg = test_cfg();
+
+            // Create and populate multiple sections
+            let mut oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), cfg.clone())
+                    .await
+                    .expect("Failed to init");
+
+            for section in 1u64..=3 {
+                let value: TestValue = [section as u8; 16];
+                let entry = TestEntry::new(section, 0, 0);
+                oversized
+                    .append(section, entry, &value)
+                    .await
+                    .expect("Failed to append");
+                oversized.sync(section).await.expect("Failed to sync");
+            }
+            drop(oversized);
+
+            // Delete index blob for section 2 (simulate corruption/loss)
+            context
+                .remove(&cfg.index_partition, Some(&2u64.to_be_bytes()))
+                .await
+                .expect("Failed to remove index");
+
+            // Reinitialize - should handle gracefully
+            // Section 2 is gone from index, orphan data in glob is acceptable
+            let oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), cfg.clone())
+                    .await
+                    .expect("Failed to reinit");
+
+            // Section 1 and 3 should still be valid
+            assert!(oversized.get(1, 0).await.is_ok());
+            assert!(oversized.get(3, 0).await.is_ok());
+
+            // Section 2 should be gone (index file deleted)
+            assert!(oversized.get(2, 0).await.is_err());
+
+            oversized.destroy().await.expect("Failed to destroy");
+        });
+    }
+
+    #[test_traced]
+    fn test_recovery_index_synced_but_glob_not() {
+        let executor = deterministic::Runner::default();
+        executor.start(|context| async move {
+            let cfg = test_cfg();
+
+            // Create and populate
+            let mut oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), cfg.clone())
+                    .await
+                    .expect("Failed to init");
+
+            // Append entries and sync
+            let mut locations = Vec::new();
+            for i in 0..3u8 {
+                let value: TestValue = [i; 16];
+                let entry = TestEntry::new(i as u64, 0, 0);
+                let loc = oversized
+                    .append(1, entry, &value)
+                    .await
+                    .expect("Failed to append");
+                locations.push(loc);
+            }
+            oversized.sync(1).await.expect("Failed to sync");
+
+            // Add more entries WITHOUT syncing (simulates unsynced writes)
+            for i in 10..15u8 {
+                let value: TestValue = [i; 16];
+                let entry = TestEntry::new(i as u64, 0, 0);
+                oversized
+                    .append(1, entry, &value)
+                    .await
+                    .expect("Failed to append");
+            }
+            // Note: NOT calling sync() here
+            drop(oversized);
+
+            // Simulate crash where index was synced but glob wasn't:
+            // Truncate glob back to the synced size (3 entries)
+            let (blob, _) = context
+                .open(&cfg.value_partition, &1u64.to_be_bytes())
+                .await
+                .expect("Failed to open blob");
+            let synced_size = byte_end(locations[2].1, locations[2].2);
+            blob.resize(synced_size).await.expect("Failed to truncate");
+            blob.sync().await.expect("Failed to sync");
+            drop(blob);
+
+            // Reinitialize - should rewind index to match glob
+            let oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), cfg)
+                    .await
+                    .expect("Failed to reinit");
+
+            // First 3 entries should be valid
+            for i in 0..3u8 {
+                let entry = oversized.get(1, i as u64).await.expect("Failed to get");
+                assert_eq!(entry.id, i as u64);
+            }
+
+            // Entries 3-7 should be gone (unsynced, index rewound)
+            assert!(oversized.get(1, 3).await.is_err());
+
+            oversized.destroy().await.expect("Failed to destroy");
+        });
+    }
+
+    #[test_traced]
+    fn test_recovery_glob_synced_but_index_not() {
+        let executor = deterministic::Runner::default();
+        executor.start(|context| async move {
+            let cfg = test_cfg();
+
+            // Create and populate
+            let mut oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), cfg.clone())
+                    .await
+                    .expect("Failed to init");
+
+            // Append entries and sync
+            let mut locations = Vec::new();
+            for i in 0..3u8 {
+                let value: TestValue = [i; 16];
+                let entry = TestEntry::new(i as u64, 0, 0);
+                let loc = oversized
+                    .append(1, entry, &value)
+                    .await
+                    .expect("Failed to append");
+                locations.push(loc);
+            }
+            oversized.sync(1).await.expect("Failed to sync");
+            drop(oversized);
+
+            // Simulate crash: truncate INDEX but leave GLOB intact
+            // This creates orphan data in glob (glob ahead of index)
+            let (blob, _size) = context
+                .open(&cfg.index_partition, &1u64.to_be_bytes())
+                .await
+                .expect("Failed to open blob");
+
+            // Keep only first 2 index entries
+            let chunk_size = (TestEntry::SIZE + u32::SIZE) as u64; // entry + CRC32
+            blob.resize(2 * chunk_size)
+                .await
+                .expect("Failed to truncate");
+            blob.sync().await.expect("Failed to sync");
+            drop(blob);
+
+            // Reinitialize - glob has orphan data from entry 3
+            let mut oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), cfg.clone())
+                    .await
+                    .expect("Failed to reinit");
+
+            // First 2 entries should be valid
+            for i in 0..2u8 {
+                let (position, offset, size) = locations[i as usize];
+                let entry = oversized.get(1, position).await.expect("Failed to get");
+                assert_eq!(entry.id, i as u64);
+
+                let value = oversized
+                    .get_value(1, offset, size)
+                    .await
+                    .expect("Failed to get value");
+                assert_eq!(value, [i; 16]);
+            }
+
+            // Entry at position 2 should fail (index was truncated)
+            assert!(oversized.get(1, 2).await.is_err());
+
+            // Append new entries - should work despite orphan data in glob
+            let mut new_locations = Vec::new();
+            for i in 10..13u8 {
+                let value: TestValue = [i; 16];
+                let entry = TestEntry::new(i as u64, 0, 0);
+                let (position, offset, size) = oversized
+                    .append(1, entry, &value)
+                    .await
+                    .expect("Failed to append after recovery");
+
+                // New entries start at position 2 (after the 2 valid entries)
+                assert_eq!(position, (i - 10 + 2) as u64);
+                new_locations.push((position, offset, size, i));
+
+                // Verify we can read the new entry
+                let retrieved = oversized.get(1, position).await.expect("Failed to get");
+                assert_eq!(retrieved.id, i as u64);
+
+                let retrieved_value = oversized
+                    .get_value(1, offset, size)
+                    .await
+                    .expect("Failed to get value");
+                assert_eq!(retrieved_value, value);
+            }
+
+            // Sync and restart again to verify persistence with orphan data
+            oversized.sync(1).await.expect("Failed to sync");
+            drop(oversized);
+
+            // Reinitialize after adding data on top of orphan glob data
+            let oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), cfg)
+                    .await
+                    .expect("Failed to reinit after append");
+
+            // Read all valid entries in the index
+            // First 2 entries from original data
+            for i in 0..2u8 {
+                let (position, offset, size) = locations[i as usize];
+                let entry = oversized.get(1, position).await.expect("Failed to get");
+                assert_eq!(entry.id, i as u64);
+
+                let value = oversized
+                    .get_value(1, offset, size)
+                    .await
+                    .expect("Failed to get value");
+                assert_eq!(value, [i; 16]);
+            }
+
+            // New entries added after recovery
+            for (position, offset, size, expected_id) in &new_locations {
+                let entry = oversized
+                    .get(1, *position)
+                    .await
+                    .expect("Failed to get new entry after restart");
+                assert_eq!(entry.id, *expected_id as u64);
+
+                let value = oversized
+                    .get_value(1, *offset, *size)
+                    .await
+                    .expect("Failed to get new value after restart");
+                assert_eq!(value, [*expected_id; 16]);
+            }
+
+            // Verify total entry count: 2 original + 3 new = 5
+            assert!(oversized.get(1, 4).await.is_ok());
+            assert!(oversized.get(1, 5).await.is_err());
+
+            oversized.destroy().await.expect("Failed to destroy");
+        });
+    }
+
+    #[test_traced]
+    fn test_recovery_partial_index_entry() {
+        let executor = deterministic::Runner::default();
+        executor.start(|context| async move {
+            let cfg = test_cfg();
+
+            // Create and populate
+            let mut oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), cfg.clone())
+                    .await
+                    .expect("Failed to init");
+
+            // Append 3 entries
+            for i in 0..3u8 {
+                let value: TestValue = [i; 16];
+                let entry = TestEntry::new(i as u64, 0, 0);
+                oversized
+                    .append(1, entry, &value)
+                    .await
+                    .expect("Failed to append");
+            }
+            oversized.sync(1).await.expect("Failed to sync");
+            drop(oversized);
+
+            // Simulate crash during write: truncate index to partial entry
+            // Each entry is TestEntry::SIZE (20) + 4 (CRC32) = 24 bytes
+            // Truncate to 3 full entries + 10 bytes of partial entry
+            let (blob, _) = context
+                .open(&cfg.index_partition, &1u64.to_be_bytes())
+                .await
+                .expect("Failed to open blob");
+            let partial_size = 3 * 24 + 10; // 3 full entries + partial
+            blob.resize(partial_size).await.expect("Failed to resize");
+            blob.sync().await.expect("Failed to sync");
+            drop(blob);
+
+            // Reinitialize - should handle partial entry gracefully
+            let mut oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), cfg.clone())
+                    .await
+                    .expect("Failed to reinit");
+
+            // First 3 entries should still be valid
+            for i in 0..3u8 {
+                let entry = oversized.get(1, i as u64).await.expect("Failed to get");
+                assert_eq!(entry.id, i as u64);
+            }
+
+            // Entry 3 should not exist (partial entry was removed)
+            assert!(oversized.get(1, 3).await.is_err());
+
+            // Append new entry after recovery
+            let value: TestValue = [42; 16];
+            let entry = TestEntry::new(100, 0, 0);
+            let (pos, offset, size) = oversized
+                .append(1, entry, &value)
+                .await
+                .expect("Failed to append after recovery");
+            assert_eq!(pos, 3);
+
+            // Verify we can read the new entry
+            let retrieved = oversized.get(1, 3).await.expect("Failed to get new entry");
+            assert_eq!(retrieved.id, 100);
+            let retrieved_value = oversized
+                .get_value(1, offset, size)
+                .await
+                .expect("Failed to get new value");
+            assert_eq!(retrieved_value, value);
+
+            oversized.destroy().await.expect("Failed to destroy");
+        });
+    }
+
+    #[test_traced]
+    fn test_recovery_only_partial_entry() {
+        let executor = deterministic::Runner::default();
+        executor.start(|context| async move {
+            let cfg = test_cfg();
+
+            // Create and populate with single entry
+            let mut oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), cfg.clone())
+                    .await
+                    .expect("Failed to init");
+
+            let value: TestValue = [42; 16];
+            let entry = TestEntry::new(1, 0, 0);
+            oversized
+                .append(1, entry, &value)
+                .await
+                .expect("Failed to append");
+            oversized.sync(1).await.expect("Failed to sync");
+            drop(oversized);
+
+            // Truncate index to only partial data (less than one full entry)
+            let (blob, _) = context
+                .open(&cfg.index_partition, &1u64.to_be_bytes())
+                .await
+                .expect("Failed to open blob");
+            blob.resize(10).await.expect("Failed to resize"); // Less than chunk size
+            blob.sync().await.expect("Failed to sync");
+            drop(blob);
+
+            // Reinitialize - should handle gracefully (rewind to 0)
+            let mut oversized: Oversized<_, TestEntry, TestValue> =
+                Oversized::init(context.clone(), cfg.clone())
+                    .await
+                    .expect("Failed to reinit");
+
+            // No entries should exist
+            assert!(oversized.get(1, 0).await.is_err());
+
+            // Should be able to append after recovery
+            let value: TestValue = [99; 16];
+            let entry = TestEntry::new(100, 0, 0);
+            let (pos, offset, size) = oversized
+                .append(1, entry, &value)
+                .await
+                .expect("Failed to append after recovery");
+            assert_eq!(pos, 0);
+
+            let retrieved = oversized.get(1, 0).await.expect("Failed to get");
+            assert_eq!(retrieved.id, 100);
+            let retrieved_value = oversized
+                .get_value(1, offset, size)
+                .await
+                .expect("Failed to get value");
+            assert_eq!(retrieved_value, value);
+
+            oversized.destroy().await.expect("Failed to destroy");
+        });
+    }
+
test_recovery_crash_during_rewind_index_ahead() { + // Simulates crash where index was rewound but glob wasn't + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let cfg = test_cfg(); + + // Create and populate + let mut oversized: Oversized<_, TestEntry, TestValue> = + Oversized::init(context.clone(), cfg.clone()) + .await + .expect("Failed to init"); + + let mut locations = Vec::new(); + for i in 0..5u8 { + let value: TestValue = [i; 16]; + let entry = TestEntry::new(i as u64, 0, 0); + let loc = oversized + .append(1, entry, &value) + .await + .expect("Failed to append"); + locations.push(loc); + } + oversized.sync(1).await.expect("Failed to sync"); + drop(oversized); + + // Simulate crash during rewind: truncate index to 2 entries but leave glob intact + // This simulates: rewind(index) succeeded, crash before rewind(glob) + let (blob, _) = context + .open(&cfg.index_partition, &1u64.to_be_bytes()) + .await + .expect("Failed to open blob"); + let chunk_size = (TestEntry::SIZE + u32::SIZE) as u64; + blob.resize(2 * chunk_size) + .await + .expect("Failed to truncate"); + blob.sync().await.expect("Failed to sync"); + drop(blob); + + // Reinitialize - recovery should succeed (glob has orphan data) + let mut oversized: Oversized<_, TestEntry, TestValue> = + Oversized::init(context.clone(), cfg.clone()) + .await + .expect("Failed to reinit"); + + // First 2 entries should be valid + for i in 0..2u8 { + let entry = oversized.get(1, i as u64).await.expect("Failed to get"); + assert_eq!(entry.id, i as u64); + } + + // Entries 2-4 should be gone (index was truncated) + assert!(oversized.get(1, 2).await.is_err()); + + // Should be able to append new entries + let (pos, _, _) = oversized + .append(1, TestEntry::new(100, 0, 0), &[100u8; 16]) + .await + .expect("Failed to append"); + assert_eq!(pos, 2); + + oversized.destroy().await.expect("Failed to destroy"); + }); + } + + #[test_traced] + fn test_recovery_crash_during_rewind_glob_ahead() { + // Simulates crash where glob was rewound but index wasn't + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let cfg = test_cfg(); + + // Create and populate + let mut oversized: Oversized<_, TestEntry, TestValue> = + Oversized::init(context.clone(), cfg.clone()) + .await + .expect("Failed to init"); + + let mut locations = Vec::new(); + for i in 0..5u8 { + let value: TestValue = [i; 16]; + let entry = TestEntry::new(i as u64, 0, 0); + let loc = oversized + .append(1, entry, &value) + .await + .expect("Failed to append"); + locations.push(loc); + } + oversized.sync(1).await.expect("Failed to sync"); + drop(oversized); + + // Simulate crash during rewind: truncate glob to 2 entries but leave index intact + // This simulates: rewind(glob) succeeded, crash before rewind(index) + let (blob, _) = context + .open(&cfg.value_partition, &1u64.to_be_bytes()) + .await + .expect("Failed to open blob"); + let keep_size = byte_end(locations[1].1, locations[1].2); + blob.resize(keep_size).await.expect("Failed to truncate"); + blob.sync().await.expect("Failed to sync"); + drop(blob); + + // Reinitialize - recovery should detect index entries pointing beyond glob + let mut oversized: Oversized<_, TestEntry, TestValue> = + Oversized::init(context.clone(), cfg.clone()) + .await + .expect("Failed to reinit"); + + // First 2 entries should be valid (index rewound to match glob) + for i in 0..2u8 { + let entry = oversized.get(1, i as u64).await.expect("Failed to get"); + assert_eq!(entry.id, i as 
u64); + } + + // Entries 2-4 should be gone (index rewound during recovery) + assert!(oversized.get(1, 2).await.is_err()); + + // Should be able to append after recovery + let value: TestValue = [99; 16]; + let entry = TestEntry::new(100, 0, 0); + let (pos, offset, size) = oversized + .append(1, entry, &value) + .await + .expect("Failed to append after recovery"); + assert_eq!(pos, 2); + + let retrieved = oversized.get(1, 2).await.expect("Failed to get"); + assert_eq!(retrieved.id, 100); + let retrieved_value = oversized + .get_value(1, offset, size) + .await + .expect("Failed to get value"); + assert_eq!(retrieved_value, value); + + oversized.destroy().await.expect("Failed to destroy"); + }); + } + + #[test_traced] + fn test_oversized_get_value_invalid_size() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let mut oversized: Oversized<_, TestEntry, TestValue> = + Oversized::init(context.clone(), test_cfg()) + .await + .expect("Failed to init"); + + let value: TestValue = [42; 16]; + let entry = TestEntry::new(1, 0, 0); + let (_, offset, _size) = oversized + .append(1, entry, &value) + .await + .expect("Failed to append"); + oversized.sync(1).await.expect("Failed to sync"); + + // Size 0 - should fail + assert!(oversized.get_value(1, offset, 0).await.is_err()); + + // Size < CRC_SIZE (1, 2, 3 bytes) - should fail with BlobInsufficientLength + for size in 1..4u32 { + let result = oversized.get_value(1, offset, size).await; + assert!(matches!( + result, + Err(Error::Runtime( + commonware_runtime::Error::BlobInsufficientLength + )) + )); + } + + oversized.destroy().await.expect("Failed to destroy"); + }); + } + + #[test_traced] + fn test_oversized_get_value_wrong_size() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let mut oversized: Oversized<_, TestEntry, TestValue> = + Oversized::init(context.clone(), test_cfg()) + .await + .expect("Failed to init"); + + let value: TestValue = [42; 16]; + let entry = TestEntry::new(1, 0, 0); + let (_, offset, correct_size) = oversized + .append(1, entry, &value) + .await + .expect("Failed to append"); + oversized.sync(1).await.expect("Failed to sync"); + + // Size too small (but >= CRC_SIZE) - checksum mismatch + let result = oversized.get_value(1, offset, correct_size - 1).await; + assert!(matches!(result, Err(Error::ChecksumMismatch(_, _)))); + + oversized.destroy().await.expect("Failed to destroy"); + }); + } + + #[test_traced] + fn test_recovery_values_has_orphan_section() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let cfg = test_cfg(); + + // Create and populate with sections 1 and 2 + let mut oversized: Oversized<_, TestEntry, TestValue> = + Oversized::init(context.clone(), cfg.clone()) + .await + .expect("Failed to init"); + + for section in 1u64..=2 { + let value: TestValue = [section as u8; 16]; + let entry = TestEntry::new(section, 0, 0); + oversized + .append(section, entry, &value) + .await + .expect("Failed to append"); + oversized.sync(section).await.expect("Failed to sync"); + } + drop(oversized); + + // Manually create an orphan value section (section 3) without corresponding index + let glob_cfg = GlobConfig { + partition: cfg.value_partition.clone(), + compression: cfg.compression, + codec_config: (), + write_buffer: cfg.value_write_buffer, + }; + let mut glob: Glob<_, TestValue> = Glob::init(context.with_label("glob"), glob_cfg) + .await + .expect("Failed to init glob"); + let orphan_value: 
TestValue = [99; 16]; + glob.append(3, &orphan_value) + .await + .expect("Failed to append orphan"); + glob.sync(3).await.expect("Failed to sync glob"); + drop(glob); + + // Reinitialize - should detect and remove the orphan section + let oversized: Oversized<_, TestEntry, TestValue> = + Oversized::init(context.clone(), cfg.clone()) + .await + .expect("Failed to reinit"); + + // Sections 1 and 2 should still be valid + assert!(oversized.get(1, 0).await.is_ok()); + assert!(oversized.get(2, 0).await.is_ok()); + + // Newest section should be 2 (orphan was removed) + assert_eq!(oversized.newest_section(), Some(2)); + + oversized.destroy().await.expect("Failed to destroy"); + }); + } + + #[test_traced] + fn test_recovery_values_has_multiple_orphan_sections() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let cfg = test_cfg(); + + // Create and populate with only section 1 + let mut oversized: Oversized<_, TestEntry, TestValue> = + Oversized::init(context.clone(), cfg.clone()) + .await + .expect("Failed to init"); + + let value: TestValue = [1; 16]; + let entry = TestEntry::new(1, 0, 0); + oversized + .append(1, entry, &value) + .await + .expect("Failed to append"); + oversized.sync(1).await.expect("Failed to sync"); + drop(oversized); + + // Manually create multiple orphan value sections (2, 3, 4) + let glob_cfg = GlobConfig { + partition: cfg.value_partition.clone(), + compression: cfg.compression, + codec_config: (), + write_buffer: cfg.value_write_buffer, + }; + let mut glob: Glob<_, TestValue> = Glob::init(context.with_label("glob"), glob_cfg) + .await + .expect("Failed to init glob"); + + for section in 2u64..=4 { + let orphan_value: TestValue = [section as u8; 16]; + glob.append(section, &orphan_value) + .await + .expect("Failed to append orphan"); + glob.sync(section).await.expect("Failed to sync glob"); + } + drop(glob); + + // Reinitialize - should detect and remove all orphan sections + let oversized: Oversized<_, TestEntry, TestValue> = + Oversized::init(context.clone(), cfg.clone()) + .await + .expect("Failed to reinit"); + + // Section 1 should still be valid + assert!(oversized.get(1, 0).await.is_ok()); + + // Newest section should be 1 (orphans removed) + assert_eq!(oversized.newest_section(), Some(1)); + + oversized.destroy().await.expect("Failed to destroy"); + }); + } + + #[test_traced] + fn test_recovery_index_empty_but_values_exist() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let cfg = test_cfg(); + + // Manually create value sections without any index entries + let glob_cfg = GlobConfig { + partition: cfg.value_partition.clone(), + compression: cfg.compression, + codec_config: (), + write_buffer: cfg.value_write_buffer, + }; + let mut glob: Glob<_, TestValue> = Glob::init(context.with_label("glob"), glob_cfg) + .await + .expect("Failed to init glob"); + + for section in 1u64..=3 { + let orphan_value: TestValue = [section as u8; 16]; + glob.append(section, &orphan_value) + .await + .expect("Failed to append orphan"); + glob.sync(section).await.expect("Failed to sync glob"); + } + drop(glob); + + // Initialize oversized - should remove all orphan value sections + let oversized: Oversized<_, TestEntry, TestValue> = + Oversized::init(context.clone(), cfg.clone()) + .await + .expect("Failed to init"); + + // No sections should exist + assert_eq!(oversized.newest_section(), None); + assert_eq!(oversized.oldest_section(), None); + + oversized.destroy().await.expect("Failed to 
destroy"); + }); + } + + #[test_traced] + fn test_recovery_orphan_section_append_after() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let cfg = test_cfg(); + + // Create and populate with section 1 + let mut oversized: Oversized<_, TestEntry, TestValue> = + Oversized::init(context.clone(), cfg.clone()) + .await + .expect("Failed to init"); + + let value: TestValue = [1; 16]; + let entry = TestEntry::new(1, 0, 0); + let (_, offset1, size1) = oversized + .append(1, entry, &value) + .await + .expect("Failed to append"); + oversized.sync(1).await.expect("Failed to sync"); + drop(oversized); + + // Manually create orphan value sections (2, 3) + let glob_cfg = GlobConfig { + partition: cfg.value_partition.clone(), + compression: cfg.compression, + codec_config: (), + write_buffer: cfg.value_write_buffer, + }; + let mut glob: Glob<_, TestValue> = Glob::init(context.with_label("glob"), glob_cfg) + .await + .expect("Failed to init glob"); + + for section in 2u64..=3 { + let orphan_value: TestValue = [section as u8; 16]; + glob.append(section, &orphan_value) + .await + .expect("Failed to append orphan"); + glob.sync(section).await.expect("Failed to sync glob"); + } + drop(glob); + + // Reinitialize - should remove orphan sections + let mut oversized: Oversized<_, TestEntry, TestValue> = + Oversized::init(context.clone(), cfg.clone()) + .await + .expect("Failed to reinit"); + + // Section 1 should still be valid + let entry = oversized.get(1, 0).await.expect("Failed to get"); + assert_eq!(entry.id, 1); + let value = oversized + .get_value(1, offset1, size1) + .await + .expect("Failed to get value"); + assert_eq!(value, [1; 16]); + + // Should be able to append to section 2 after recovery + let new_value: TestValue = [42; 16]; + let new_entry = TestEntry::new(42, 0, 0); + let (pos, offset, size) = oversized + .append(2, new_entry, &new_value) + .await + .expect("Failed to append after recovery"); + assert_eq!(pos, 0); + + // Verify the new entry + let retrieved = oversized.get(2, 0).await.expect("Failed to get"); + assert_eq!(retrieved.id, 42); + let retrieved_value = oversized + .get_value(2, offset, size) + .await + .expect("Failed to get value"); + assert_eq!(retrieved_value, new_value); + + // Sync and restart to verify persistence + oversized.sync(2).await.expect("Failed to sync"); + drop(oversized); + + let oversized: Oversized<_, TestEntry, TestValue> = + Oversized::init(context.clone(), cfg) + .await + .expect("Failed to reinit after append"); + + // Both sections should be valid + assert!(oversized.get(1, 0).await.is_ok()); + assert!(oversized.get(2, 0).await.is_ok()); + assert_eq!(oversized.newest_section(), Some(2)); + + oversized.destroy().await.expect("Failed to destroy"); + }); + } + + #[test_traced] + fn test_recovery_no_orphan_sections() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let cfg = test_cfg(); + + // Create and populate with sections 1, 2, 3 (no orphans) + let mut oversized: Oversized<_, TestEntry, TestValue> = + Oversized::init(context.clone(), cfg.clone()) + .await + .expect("Failed to init"); + + for section in 1u64..=3 { + let value: TestValue = [section as u8; 16]; + let entry = TestEntry::new(section, 0, 0); + oversized + .append(section, entry, &value) + .await + .expect("Failed to append"); + oversized.sync(section).await.expect("Failed to sync"); + } + drop(oversized); + + // Reinitialize - no orphan cleanup needed + let oversized: Oversized<_, TestEntry, TestValue> = 
+ Oversized::init(context.clone(), cfg) + .await + .expect("Failed to reinit"); + + // All sections should be valid + for section in 1u64..=3 { + let entry = oversized.get(section, 0).await.expect("Failed to get"); + assert_eq!(entry.id, section); + } + assert_eq!(oversized.newest_section(), Some(3)); + + oversized.destroy().await.expect("Failed to destroy"); + }); + } + + #[test_traced] + fn test_recovery_orphan_with_empty_index_section() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let cfg = test_cfg(); + + // Create and populate section 1 with entries + let mut oversized: Oversized<_, TestEntry, TestValue> = + Oversized::init(context.clone(), cfg.clone()) + .await + .expect("Failed to init"); + + let value: TestValue = [1; 16]; + let entry = TestEntry::new(1, 0, 0); + oversized + .append(1, entry, &value) + .await + .expect("Failed to append"); + oversized.sync(1).await.expect("Failed to sync"); + drop(oversized); + + // Manually create orphan value section 2 + let glob_cfg = GlobConfig { + partition: cfg.value_partition.clone(), + compression: cfg.compression, + codec_config: (), + write_buffer: cfg.value_write_buffer, + }; + let mut glob: Glob<_, TestValue> = Glob::init(context.with_label("glob"), glob_cfg) + .await + .expect("Failed to init glob"); + let orphan_value: TestValue = [2; 16]; + glob.append(2, &orphan_value) + .await + .expect("Failed to append orphan"); + glob.sync(2).await.expect("Failed to sync glob"); + drop(glob); + + // Now truncate index section 1 to 0 (making it empty but still tracked) + let (blob, _) = context + .open(&cfg.index_partition, &1u64.to_be_bytes()) + .await + .expect("Failed to open blob"); + blob.resize(0).await.expect("Failed to truncate"); + blob.sync().await.expect("Failed to sync"); + drop(blob); + + // Reinitialize - should handle empty index section and remove orphan value section + let oversized: Oversized<_, TestEntry, TestValue> = + Oversized::init(context.clone(), cfg) + .await + .expect("Failed to reinit"); + + // Section 1 should exist but have no entries (empty after truncation) + assert!(oversized.get(1, 0).await.is_err()); + + // Orphan section 2 should be removed + assert_eq!(oversized.newest_section(), Some(1)); + + oversized.destroy().await.expect("Failed to destroy"); + }); + } + + #[test_traced] + fn test_recovery_orphan_sections_with_gaps() { + // Test non-contiguous sections: index has [1, 3, 5], values has [1, 2, 3, 4, 5, 6] + // Orphan sections 2, 4, 6 should be removed + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let cfg = test_cfg(); + + // Create index with sections 1, 3, 5 (gaps) + let mut oversized: Oversized<_, TestEntry, TestValue> = + Oversized::init(context.clone(), cfg.clone()) + .await + .expect("Failed to init"); + + for section in [1u64, 3, 5] { + let value: TestValue = [section as u8; 16]; + let entry = TestEntry::new(section, 0, 0); + oversized + .append(section, entry, &value) + .await + .expect("Failed to append"); + oversized.sync(section).await.expect("Failed to sync"); + } + drop(oversized); + + // Manually create orphan value sections 2, 4, 6 (filling gaps and beyond) + let glob_cfg = GlobConfig { + partition: cfg.value_partition.clone(), + compression: cfg.compression, + codec_config: (), + write_buffer: cfg.value_write_buffer, + }; + let mut glob: Glob<_, TestValue> = Glob::init(context.with_label("glob"), glob_cfg) + .await + .expect("Failed to init glob"); + + for section in [2u64, 4, 6] { + let 
orphan_value: TestValue = [section as u8; 16]; + glob.append(section, &orphan_value) + .await + .expect("Failed to append orphan"); + glob.sync(section).await.expect("Failed to sync glob"); + } + drop(glob); + + // Reinitialize - should remove orphan sections 2, 4, 6 + let oversized: Oversized<_, TestEntry, TestValue> = + Oversized::init(context.clone(), cfg) + .await + .expect("Failed to reinit"); + + // Sections 1, 3, 5 should still be valid + for section in [1u64, 3, 5] { + let entry = oversized.get(section, 0).await.expect("Failed to get"); + assert_eq!(entry.id, section); + } + + // Verify only sections 1, 3, 5 exist (orphans removed) + assert_eq!(oversized.oldest_section(), Some(1)); + assert_eq!(oversized.newest_section(), Some(5)); + + oversized.destroy().await.expect("Failed to destroy"); + }); + } + + #[test_traced] + fn test_recovery_glob_trailing_garbage_truncated() { + // Tests the bug fix: when value is written to glob but index entry isn't + // (crash after value write, before index write), recovery should truncate + // the glob trailing garbage so subsequent appends start at correct offset. + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let cfg = test_cfg(); + + // Create and populate + let mut oversized: Oversized<_, TestEntry, TestValue> = + Oversized::init(context.clone(), cfg.clone()) + .await + .expect("Failed to init"); + + // Append 2 entries + let mut locations = Vec::new(); + for i in 0..2u8 { + let value: TestValue = [i; 16]; + let entry = TestEntry::new(i as u64, 0, 0); + let loc = oversized + .append(1, entry, &value) + .await + .expect("Failed to append"); + locations.push(loc); + } + oversized.sync(1).await.expect("Failed to sync"); + + // Record where next entry SHOULD start (end of entry 1) + let expected_next_offset = byte_end(locations[1].1, locations[1].2); + drop(oversized); + + // Simulate crash: write garbage to glob (simulating partial value write) + let (blob, size) = context + .open(&cfg.value_partition, &1u64.to_be_bytes()) + .await + .expect("Failed to open blob"); + assert_eq!(size, expected_next_offset); + + // Write 100 bytes of garbage (simulating partial/failed value write) + let garbage = vec![0xDE; 100]; + blob.write_at(garbage, size) + .await + .expect("Failed to write garbage"); + blob.sync().await.expect("Failed to sync"); + drop(blob); + + // Verify glob now has trailing garbage + let (blob, new_size) = context + .open(&cfg.value_partition, &1u64.to_be_bytes()) + .await + .expect("Failed to open blob"); + assert_eq!(new_size, expected_next_offset + 100); + drop(blob); + + // Reinitialize - should truncate the trailing garbage + let mut oversized: Oversized<_, TestEntry, TestValue> = + Oversized::init(context.clone(), cfg.clone()) + .await + .expect("Failed to reinit"); + + // First 2 entries should still be valid + for i in 0..2u8 { + let entry = oversized.get(1, i as u64).await.expect("Failed to get"); + assert_eq!(entry.id, i as u64); + } + + // Append new entry - should start at expected_next_offset, NOT at garbage end + let new_value: TestValue = [99; 16]; + let new_entry = TestEntry::new(99, 0, 0); + let (pos, offset, _size) = oversized + .append(1, new_entry, &new_value) + .await + .expect("Failed to append after recovery"); + + // Verify position is 2 (after the 2 existing entries) + assert_eq!(pos, 2); + + // Verify offset is at expected_next_offset (garbage was truncated) + assert_eq!(offset, expected_next_offset); + + // Verify we can read the new entry + let retrieved = 
oversized.get(1, 2).await.expect("Failed to get new entry"); + assert_eq!(retrieved.id, 99); + + oversized.destroy().await.expect("Failed to destroy"); + }); + } + + #[test_traced] + fn test_recovery_entry_with_overflow_offset() { + // Tests that an entry with offset near u64::MAX that would overflow + // when added to size is detected as invalid during recovery. + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let cfg = test_cfg(); + + // Create and populate with valid entry + let mut oversized: Oversized<_, TestEntry, TestValue> = + Oversized::init(context.clone(), cfg.clone()) + .await + .expect("Failed to init"); + + let value: TestValue = [1; 16]; + let entry = TestEntry::new(1, 0, 0); + oversized + .append(1, entry, &value) + .await + .expect("Failed to append"); + oversized.sync(1).await.expect("Failed to sync"); + drop(oversized); + + // Corrupt the index entry to have offset near u64::MAX + // Entry format: id (8) + value_offset (8) + value_size (4) + CRC32 (4) = 24 bytes + let (blob, _) = context + .open(&cfg.index_partition, &1u64.to_be_bytes()) + .await + .expect("Failed to open blob"); + + // Write a corrupted entry with offset = u64::MAX - 10 and size = 100 + // This would overflow when computing offset + size + let mut corrupted_entry = Vec::new(); + 1u64.write(&mut corrupted_entry); // id + (u64::MAX - 10).write(&mut corrupted_entry); // value_offset (near max) + 100u32.write(&mut corrupted_entry); // value_size + let checksum = crc32fast::hash(&corrupted_entry); + corrupted_entry.put_u32(checksum); + + blob.write_at(corrupted_entry, 0) + .await + .expect("Failed to write corrupted entry"); + blob.sync().await.expect("Failed to sync"); + drop(blob); + + // Reinitialize - recovery should detect the invalid entry + // (offset + size would overflow, and even with saturating_add it exceeds glob_size) + let mut oversized: Oversized<_, TestEntry, TestValue> = + Oversized::init(context.clone(), cfg.clone()) + .await + .expect("Failed to reinit"); + + // The corrupted entry should have been rewound (invalid) + assert!(oversized.get(1, 0).await.is_err()); + + // Should be able to append after recovery + let new_value: TestValue = [99; 16]; + let new_entry = TestEntry::new(99, 0, 0); + let (pos, new_offset, _) = oversized + .append(1, new_entry, &new_value) + .await + .expect("Failed to append after recovery"); + + // Position should be 0 (corrupted entry was removed) + assert_eq!(pos, 0); + // Offset should be 0 (glob was truncated to 0) + assert_eq!(new_offset, 0); + + oversized.destroy().await.expect("Failed to destroy"); + }); + } + + #[test_traced] + fn test_empty_section_persistence() { + // Tests that sections that become empty (all entries removed/rewound) + // are handled correctly across restart cycles. 
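+ // Scenario: section 1's index blob is truncated to zero bytes by hand while + // its value blob keeps the old data, so the section stays tracked but has no + // entries; appends to the empty section must still succeed and the result + // must survive a second restart.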
+ let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let cfg = test_cfg(); + + // Create and populate section 1 with entries + let mut oversized: Oversized<_, TestEntry, TestValue> = + Oversized::init(context.clone(), cfg.clone()) + .await + .expect("Failed to init"); + + for i in 0..3u8 { + let value: TestValue = [i; 16]; + let entry = TestEntry::new(i as u64, 0, 0); + oversized + .append(1, entry, &value) + .await + .expect("Failed to append"); + } + oversized.sync(1).await.expect("Failed to sync"); + + // Also create section 2 to ensure it survives + let value2: TestValue = [10; 16]; + let entry2 = TestEntry::new(10, 0, 0); + oversized + .append(2, entry2, &value2) + .await + .expect("Failed to append to section 2"); + oversized.sync(2).await.expect("Failed to sync section 2"); + drop(oversized); + + // Truncate section 1's index to 0 (making it empty) + let (blob, _) = context + .open(&cfg.index_partition, &1u64.to_be_bytes()) + .await + .expect("Failed to open blob"); + blob.resize(0).await.expect("Failed to truncate"); + blob.sync().await.expect("Failed to sync"); + drop(blob); + + // First restart - recovery should handle empty section 1 + let mut oversized: Oversized<_, TestEntry, TestValue> = + Oversized::init(context.clone(), cfg.clone()) + .await + .expect("Failed to reinit"); + + // Section 1 should exist but have no entries + assert!(oversized.get(1, 0).await.is_err()); + + // Section 2 should still be valid + let entry = oversized.get(2, 0).await.expect("Failed to get section 2"); + assert_eq!(entry.id, 10); + + // Section 1 should still be tracked (blob exists but is empty) + assert_eq!(oversized.oldest_section(), Some(1)); + + // Append to empty section 1 + // Note: When index is truncated to 0 but the index blob still exists, + // the glob is NOT truncated (the section isn't considered an orphan). + // The glob still has orphan DATA from the old entries, but this doesn't + // affect correctness - new entries simply append after the orphan data. + let new_value: TestValue = [99; 16]; + let new_entry = TestEntry::new(99, 0, 0); + let (pos, offset, size) = oversized + .append(1, new_entry, &new_value) + .await + .expect("Failed to append to empty section"); + assert_eq!(pos, 0); + // Glob offset is non-zero because orphan data wasn't truncated + assert!(offset > 0); + oversized.sync(1).await.expect("Failed to sync"); + + // Verify the new entry is readable despite orphan data before it + let entry = oversized.get(1, 0).await.expect("Failed to get"); + assert_eq!(entry.id, 99); + let value = oversized + .get_value(1, offset, size) + .await + .expect("Failed to get value"); + assert_eq!(value, new_value); + + drop(oversized); + + // Second restart - verify persistence + let oversized: Oversized<_, TestEntry, TestValue> = + Oversized::init(context.clone(), cfg.clone()) + .await + .expect("Failed to reinit again"); + + // Section 1's new entry should be valid + let entry = oversized.get(1, 0).await.expect("Failed to get"); + assert_eq!(entry.id, 99); + + // Section 2 should still be valid + let entry = oversized.get(2, 0).await.expect("Failed to get section 2"); + assert_eq!(entry.id, 10); + + oversized.destroy().await.expect("Failed to destroy"); + }); + } + + #[test_traced] + fn test_get_value_size_equals_crc_size() { + // Tests the boundary condition where size = 4 (just CRC, no data). + // This should fail because there's no actual data to decode. 
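+ // A size of exactly CRC_SIZE implies a zero-length payload: the four bytes + // read at the offset are really the first bytes of the stored value, so the + // checksum cannot verify and the call should error rather than silently + // return an empty value.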
+ let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let mut oversized: Oversized<_, TestEntry, TestValue> = + Oversized::init(context.clone(), test_cfg()) + .await + .expect("Failed to init"); + + let value: TestValue = [42; 16]; + let entry = TestEntry::new(1, 0, 0); + let (_, offset, _) = oversized + .append(1, entry, &value) + .await + .expect("Failed to append"); + oversized.sync(1).await.expect("Failed to sync"); + + // Size = 4 (exactly CRC_SIZE) means 0 bytes of actual data + // This should fail with ChecksumMismatch or decode error + let result = oversized.get_value(1, offset, 4).await; + assert!(result.is_err()); + + oversized.destroy().await.expect("Failed to destroy"); + }); + } + + #[test_traced] + fn test_get_value_size_just_over_crc() { + // Tests size = 5 (CRC + 1 byte of data). + // This should fail because the data is too short to decode. + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let mut oversized: Oversized<_, TestEntry, TestValue> = + Oversized::init(context.clone(), test_cfg()) + .await + .expect("Failed to init"); + + let value: TestValue = [42; 16]; + let entry = TestEntry::new(1, 0, 0); + let (_, offset, _) = oversized + .append(1, entry, &value) + .await + .expect("Failed to append"); + oversized.sync(1).await.expect("Failed to sync"); + + // Size = 5 means 1 byte of actual data (after stripping CRC) + // This should fail with checksum mismatch since we're reading wrong bytes + let result = oversized.get_value(1, offset, 5).await; + assert!(result.is_err()); + + oversized.destroy().await.expect("Failed to destroy"); + }); + } + + #[test_traced] + fn test_recovery_maximum_section_numbers() { + // Test recovery with very large section numbers near u64::MAX to check + // for overflow edge cases in section arithmetic. 
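+ // Section numbers are written verbatim as blob names (big-endian bytes), so + // values near u64::MAX exercise any section arithmetic in recovery that + // could overflow if implemented carelessly.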
+ let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let cfg = test_cfg(); + + // Use section numbers near u64::MAX + let large_sections = [u64::MAX - 3, u64::MAX - 2, u64::MAX - 1]; + + // Create and populate with large section numbers + let mut oversized: Oversized<_, TestEntry, TestValue> = + Oversized::init(context.clone(), cfg.clone()) + .await + .expect("Failed to init"); + + let mut locations = Vec::new(); + for &section in &large_sections { + let value: TestValue = [(section & 0xFF) as u8; 16]; + let entry = TestEntry::new(section, 0, 0); + let loc = oversized + .append(section, entry, &value) + .await + .expect("Failed to append"); + locations.push((section, loc)); + oversized.sync(section).await.expect("Failed to sync"); + } + drop(oversized); + + // Simulate crash: truncate glob for middle section + let middle_section = large_sections[1]; + let (blob, size) = context + .open(&cfg.value_partition, &middle_section.to_be_bytes()) + .await + .expect("Failed to open blob"); + blob.resize(size / 2).await.expect("Failed to truncate"); + blob.sync().await.expect("Failed to sync"); + drop(blob); + + // Reinitialize - should recover without overflow panics + let oversized: Oversized<_, TestEntry, TestValue> = + Oversized::init(context.clone(), cfg.clone()) + .await + .expect("Failed to reinit"); + + // First and last sections should still be valid + let entry = oversized + .get(large_sections[0], 0) + .await + .expect("Failed to get first section"); + assert_eq!(entry.id, large_sections[0]); + + let entry = oversized + .get(large_sections[2], 0) + .await + .expect("Failed to get last section"); + assert_eq!(entry.id, large_sections[2]); + + // Middle section should have been rewound (no entries) + assert!(oversized.get(middle_section, 0).await.is_err()); + + // Verify we can still append to these large sections + let new_value: TestValue = [0xAB; 16]; + let new_entry = TestEntry::new(999, 0, 0); + let mut oversized = oversized; + oversized + .append(middle_section, new_entry, &new_value) + .await + .expect("Failed to append after recovery"); + + oversized.destroy().await.expect("Failed to destroy"); + }); + } + + #[test_traced] + fn test_recovery_crash_during_recovery_rewind() { + // Tests a nested crash scenario: initial crash leaves inconsistent state, + // then a second crash occurs during recovery's rewind operation. + // This simulates the worst-case where recovery itself is interrupted.
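+ // The property exercised here is that recovery is idempotent: however many + // times it is interrupted, each attempt only rewinds toward the last index + // entry whose value lies entirely within the glob, so rerunning it converges + // to the same consistent state.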
+ let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let cfg = test_cfg(); + + // Phase 1: Create valid data with 5 entries + let mut oversized: Oversized<_, TestEntry, TestValue> = + Oversized::init(context.clone(), cfg.clone()) + .await + .expect("Failed to init"); + + let mut locations = Vec::new(); + for i in 0..5u8 { + let value: TestValue = [i; 16]; + let entry = TestEntry::new(i as u64, 0, 0); + let loc = oversized + .append(1, entry, &value) + .await + .expect("Failed to append"); + locations.push(loc); + } + oversized.sync(1).await.expect("Failed to sync"); + drop(oversized); + + // Phase 2: Simulate first crash - truncate glob to lose last 2 entries + let (blob, _) = context + .open(&cfg.value_partition, &1u64.to_be_bytes()) + .await + .expect("Failed to open blob"); + let keep_size = byte_end(locations[2].1, locations[2].2); + blob.resize(keep_size).await.expect("Failed to truncate"); + blob.sync().await.expect("Failed to sync"); + drop(blob); + + // Phase 3: Simulate crash during recovery's rewind + // Recovery would try to rewind index from 5 entries to 3 entries. + // Simulate partial rewind by manually truncating index to 4 entries + // (as if crash occurred mid-rewind). + let chunk_size = (TestEntry::SIZE + u32::SIZE) as u64; // entry + CRC32 + let (index_blob, _) = context + .open(&cfg.index_partition, &1u64.to_be_bytes()) + .await + .expect("Failed to open index blob"); + let partial_rewind_size = 4 * chunk_size; // 4 entries instead of 3 + index_blob + .resize(partial_rewind_size) + .await + .expect("Failed to resize"); + index_blob.sync().await.expect("Failed to sync"); + drop(index_blob); + + // Phase 4: Second recovery attempt should handle the inconsistent state + // Index has 4 entries, but glob only supports 3. + let oversized: Oversized<_, TestEntry, TestValue> = + Oversized::init(context.clone(), cfg.clone()) + .await + .expect("Failed to reinit after nested crash"); + + // Only first 3 entries should be valid (recovery should rewind again) + for i in 0..3u8 { + let entry = oversized.get(1, i as u64).await.expect("Failed to get"); + assert_eq!(entry.id, i as u64); + + let (_, offset, size) = locations[i as usize]; + let value = oversized + .get_value(1, offset, size) + .await + .expect("Failed to get value"); + assert_eq!(value, [i; 16]); + } + + // Entry 3 should not exist (index was rewound to match glob) + assert!(oversized.get(1, 3).await.is_err()); + + // Verify append works after nested crash recovery + let new_value: TestValue = [0xFF; 16]; + let new_entry = TestEntry::new(100, 0, 0); + let mut oversized = oversized; + let (pos, offset, _size) = oversized + .append(1, new_entry, &new_value) + .await + .expect("Failed to append"); + assert_eq!(pos, 3); // Should be position 3 (after the 3 valid entries) + + // Verify the offset starts where entry 2 ended (no gaps) + assert_eq!(offset, byte_end(locations[2].1, locations[2].2)); + + oversized.destroy().await.expect("Failed to destroy"); + }); + } + + #[test_traced] + fn test_recovery_crash_during_orphan_cleanup() { + // Tests crash during orphan section cleanup: recovery starts removing + // orphan value sections, but crashes mid-cleanup.
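+ // Orphan cleanup deletes whole value blobs that have no index counterpart, + // so a partial pass (here section 2 is already gone while 3 and 4 remain) + // simply leaves fewer orphans for the next initialization to remove.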
+ let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let cfg = test_cfg(); + + // Phase 1: Create valid data in section 1 + let mut oversized: Oversized<_, TestEntry, TestValue> = + Oversized::init(context.clone(), cfg.clone()) + .await + .expect("Failed to init"); + + let value: TestValue = [1; 16]; + let entry = TestEntry::new(1, 0, 0); + let (_, offset1, size1) = oversized + .append(1, entry, &value) + .await + .expect("Failed to append"); + oversized.sync(1).await.expect("Failed to sync"); + drop(oversized); + + // Phase 2: Create orphan value sections 2, 3, 4 (no index entries) + let glob_cfg = GlobConfig { + partition: cfg.value_partition.clone(), + compression: cfg.compression, + codec_config: (), + write_buffer: cfg.value_write_buffer, + }; + let mut glob: Glob<_, TestValue> = Glob::init(context.with_label("glob"), glob_cfg) + .await + .expect("Failed to init glob"); + + for section in 2u64..=4 { + let orphan_value: TestValue = [section as u8; 16]; + glob.append(section, &orphan_value) + .await + .expect("Failed to append orphan"); + glob.sync(section).await.expect("Failed to sync glob"); + } + drop(glob); + + // Phase 3: Simulate partial orphan cleanup (section 2 removed, 3 and 4 remain) + // This simulates a crash during cleanup_orphan_value_sections() + context + .remove(&cfg.value_partition, Some(&2u64.to_be_bytes())) + .await + .expect("Failed to remove section 2"); + + // Phase 4: Recovery should complete the cleanup + let mut oversized: Oversized<_, TestEntry, TestValue> = + Oversized::init(context.clone(), cfg.clone()) + .await + .expect("Failed to reinit"); + + // Section 1 should still be valid + let entry = oversized.get(1, 0).await.expect("Failed to get"); + assert_eq!(entry.id, 1); + let value = oversized + .get_value(1, offset1, size1) + .await + .expect("Failed to get value"); + assert_eq!(value, [1; 16]); + + // No orphan sections should remain + assert_eq!(oversized.oldest_section(), Some(1)); + assert_eq!(oversized.newest_section(), Some(1)); + + // Should be able to append to section 2 (now clean) + let new_value: TestValue = [42; 16]; + let new_entry = TestEntry::new(42, 0, 0); + let (pos, _, _) = oversized + .append(2, new_entry, &new_value) + .await + .expect("Failed to append to section 2"); + assert_eq!(pos, 0); // First entry in new section + + oversized.destroy().await.expect("Failed to destroy"); + }); + } +} diff --git a/storage/src/journal/segmented/variable.rs b/storage/src/journal/segmented/variable.rs index 7373132c12..f545f33c9e 100644 --- a/storage/src/journal/segmented/variable.rs +++ b/storage/src/journal/segmented/variable.rs @@ -33,15 +33,6 @@ //! number of open `Blobs`, they can group data into fewer `sections` and/or prune unused //! `sections`. //! -//! # Offset Alignment -//! -//! In practice, `Journal` users won't store `u64::MAX` bytes of data in a given `section` (the max -//! `Offset` provided by `Blob`). To reduce the memory usage for tracking offsets within `Journal`, -//! offsets are thus `u32` (4 bytes) and aligned to 16 bytes. This means that the maximum size of -//! any `section` is `u32::MAX * 17 = ~70GB` bytes (the last offset item can store up to `u32::MAX` -//! bytes). If more data is written to a `section` past this max, an `OffsetOverflow` error is -//! returned. -//! //! # Sync //! //! Data written to `Journal` may not be immediately persisted to `Storage`. It is up to the caller @@ -94,24 +85,17 @@ //! }); //! 
``` +use super::manager::{AppendFactory, Config as ManagerConfig, Manager}; use crate::journal::Error; use bytes::{Buf, BufMut}; -use commonware_codec::{varint::UInt, Codec, EncodeSize, ReadExt, Write as CodecWrite}; +use commonware_codec::{varint::UInt, Codec, EncodeSize, FixedSize, ReadExt, Write as CodecWrite}; use commonware_runtime::{ buffer::{Append, PoolRef, Read}, - telemetry::metrics::status::GaugeExt, Blob, Error as RError, Metrics, Storage, }; -use commonware_utils::hex; use futures::stream::{self, Stream, StreamExt}; -use prometheus_client::metrics::{counter::Counter, gauge::Gauge}; -use std::{ - collections::{btree_map::Entry, BTreeMap}, - io::Cursor, - marker::PhantomData, - num::NonZeroUsize, -}; -use tracing::{debug, trace, warn}; +use std::{io::Cursor, num::NonZeroUsize}; +use tracing::{trace, warn}; use zstd::{bulk::compress, decode_all}; /// Configuration for `Journal` storage. @@ -134,43 +118,20 @@ pub struct Config { pub write_buffer: NonZeroUsize, } -pub(crate) const ITEM_ALIGNMENT: u64 = 16; - /// Minimum size of any item: 1 byte varint (size=0) + 0 bytes data + 4 bytes checksum. /// This is also the max varint size for u32, so we can always read this many bytes /// at the start of an item to get the complete varint. const MIN_ITEM_SIZE: usize = 5; -/// Computes the next offset for an item using the underlying `u64` -/// offset of `Blob`. -#[inline] -fn compute_next_offset(mut offset: u64) -> Result { - let overage = offset % ITEM_ALIGNMENT; - if overage != 0 { - offset += ITEM_ALIGNMENT - overage; - } - let offset = offset / ITEM_ALIGNMENT; - let aligned_offset = offset.try_into().map_err(|_| Error::OffsetOverflow)?; - Ok(aligned_offset) -} - /// Implementation of `Journal` storage. pub struct Journal { - pub(crate) context: E, - pub(crate) cfg: Config, - - pub(crate) blobs: BTreeMap>, - - /// A section number before which all sections have been pruned. This value is not persisted, - /// and is initialized to 0 at startup. It's updated only during calls to `prune` during the - /// current execution, and therefore provides only a best effort lower-bound on the true value. - pub(crate) oldest_retained_section: u64, + manager: Manager, - pub(crate) tracked: Gauge, - pub(crate) synced: Counter, - pub(crate) pruned: Counter, + /// Compression level (if enabled). + compression: Option, - pub(crate) _phantom: PhantomData, + /// Codec configuration. + codec_config: V::Cfg, } impl Journal { @@ -180,67 +141,31 @@ impl Journal { /// initialization. The `replay` method can be used /// to iterate over all items in the `Journal`. 
pub async fn init(context: E, cfg: Config) -> Result { - // Iterate over blobs in partition - let mut blobs = BTreeMap::new(); - let stored_blobs = match context.scan(&cfg.partition).await { - Ok(blobs) => blobs, - Err(RError::PartitionMissing(_)) => Vec::new(), - Err(err) => return Err(Error::Runtime(err)), + let manager_cfg = ManagerConfig { + partition: cfg.partition, + factory: AppendFactory { + write_buffer: cfg.write_buffer, + pool_ref: cfg.buffer_pool, + }, }; - for name in stored_blobs { - let (blob, size) = context.open(&cfg.partition, &name).await?; - let hex_name = hex(&name); - let section = match name.try_into() { - Ok(section) => u64::from_be_bytes(section), - Err(_) => return Err(Error::InvalidBlobName(hex_name)), - }; - debug!(section, blob = hex_name, size, "loaded section"); - let blob = Append::new(blob, size, cfg.write_buffer, cfg.buffer_pool.clone()).await?; - blobs.insert(section, blob); - } + let manager = Manager::init(context, manager_cfg).await?; - // Initialize metrics - let tracked = Gauge::default(); - let synced = Counter::default(); - let pruned = Counter::default(); - context.register("tracked", "Number of blobs", tracked.clone()); - context.register("synced", "Number of syncs", synced.clone()); - context.register("pruned", "Number of blobs pruned", pruned.clone()); - let _ = tracked.try_set(blobs.len()); - - // Create journal instance Ok(Self { - context, - cfg, - blobs, - oldest_retained_section: 0, - tracked, - synced, - pruned, - - _phantom: PhantomData, + manager, + compression: cfg.compression, + codec_config: cfg.codec_config, }) } - /// Ensures that a section pruned during the current execution is not accessed. - const fn prune_guard(&self, section: u64) -> Result<(), Error> { - if section < self.oldest_retained_section { - Err(Error::AlreadyPrunedToSection(self.oldest_retained_section)) - } else { - Ok(()) - } - } - /// Reads an item from the blob at the given offset. - pub(crate) async fn read( + async fn read( compressed: bool, cfg: &V::Cfg, blob: &Append, - offset: u32, - ) -> Result<(u32, u32, V), Error> { + offset: u64, + ) -> Result<(u64, u32, V), Error> { // Read varint size (max 5 bytes for u32) let mut hasher = crc32fast::Hasher::new(); - let offset = offset as u64 * ITEM_ALIGNMENT; let varint_buf = blob.read_at(vec![0; MIN_ITEM_SIZE], offset).await?; let mut varint = varint_buf.as_ref(); let size = UInt::::read(&mut varint).map_err(Error::Codec)?.0 as usize; @@ -251,10 +176,10 @@ impl Journal { .ok_or(Error::OffsetOverflow)?; // Read remaining - let buf_size = size.checked_add(4).ok_or(Error::OffsetOverflow)?; + let buf_size = size.checked_add(u32::SIZE).ok_or(Error::OffsetOverflow)?; let buf = blob.read_at(vec![0u8; buf_size], offset).await?; let buf = buf.as_ref(); - let offset = offset + let next_offset = offset .checked_add(buf_size as u64) .ok_or(Error::OffsetOverflow)?; @@ -269,9 +194,6 @@ impl Journal { return Err(Error::ChecksumMismatch(stored_checksum, checksum)); } - // Compute next offset - let aligned_offset = compute_next_offset(offset)?; - // If compression is enabled, decompress the item let item = if compressed { let decompressed = @@ -282,22 +204,19 @@ impl Journal { }; // Return item - Ok((aligned_offset, size as u32, item)) + Ok((next_offset, size as u32, item)) } /// Helper function to read an item from a [Read]. 
async fn read_buffered( reader: &mut Read>, - offset: u32, + offset: u64, cfg: &V::Cfg, compressed: bool, - ) -> Result<(u32, u64, u32, V), Error> { - // Calculate absolute file offset from the item offset - let file_offset = offset as u64 * ITEM_ALIGNMENT; - + ) -> Result<(u64, u64, u32, V), Error> { // If we're not at the right position, seek to it - if reader.position() != file_offset { - reader.seek_to(file_offset).map_err(Error::Runtime)?; + if reader.position() != offset { + reader.seek_to(offset).map_err(Error::Runtime)?; } // Read varint size (max 5 bytes for u32, and min item size is 5 bytes) @@ -313,7 +232,7 @@ impl Journal { hasher.update(&varint_buf[..varint_len]); // Read remaining data+checksum (we already have some bytes from the varint read) - let buf_size = size.checked_add(4).ok_or(Error::OffsetOverflow)?; + let buf_size = size.checked_add(u32::SIZE).ok_or(Error::OffsetOverflow)?; let already_read = MIN_ITEM_SIZE - varint_len; let mut buf = vec![0u8; buf_size]; buf[..already_read].copy_from_slice(&varint_buf[varint_len..]); @@ -345,9 +264,8 @@ impl Journal { }; // Calculate next offset - let current_pos = reader.position(); - let aligned_offset = compute_next_offset(current_pos)?; - Ok((aligned_offset, current_pos, size as u32, item)) + let next_offset = reader.position(); + Ok((next_offset, next_offset, size as u32, item)) } /// Returns an ordered stream of all items in the journal starting with the item at the given @@ -365,20 +283,18 @@ impl Journal { pub async fn replay( &self, start_section: u64, - mut offset: u32, + mut offset: u64, buffer: NonZeroUsize, - ) -> Result> + '_, Error> { + ) -> Result> + '_, Error> { // Collect all blobs to replay - let codec_config = self.cfg.codec_config.clone(); - let compressed = self.cfg.compression.is_some(); - let mut blobs = Vec::with_capacity(self.blobs.len()); - for (section, blob) in self.blobs.range(start_section..) 
{ + let codec_config = self.codec_config.clone(); + let compressed = self.compression.is_some(); + let mut blobs = Vec::new(); + for (&section, blob) in self.manager.sections_from(start_section) { let blob_size = blob.size().await; - let max_offset = compute_next_offset(blob_size)?; blobs.push(( - *section, + section, blob.clone(), - max_offset, blob_size, codec_config.clone(), compressed, @@ -388,11 +304,11 @@ // Replay all blobs in order and stream items as they are read (to avoid occupying too much // memory with buffered data) Ok(stream::iter(blobs).flat_map( - move |(section, blob, max_offset, blob_size, codec_config, compressed)| { + move |(section, blob, blob_size, codec_config, compressed)| { // Created buffered reader let mut reader = Read::new(blob, blob_size, buffer); if section == start_section && offset != 0 { - if let Err(err) = reader.seek_to(offset as u64 * ITEM_ALIGNMENT) { + if let Err(err) = reader.seek_to(offset) { warn!(section, offset, ?err, "failed to seek to offset"); // Return early with the error to terminate the entire stream return stream::once(async move { Err(err.into()) }).left_stream(); @@ -403,95 +319,97 @@ // Read over the blob stream::unfold( - (section, reader, offset, 0u64, codec_config, compressed), - move |( - section, - mut reader, - offset, - valid_size, - codec_config, - compressed, - )| async move { - // Check if we are at the end of the blob - if offset >= max_offset { - return None; - } - - // Read an item from the buffer - match Self::read_buffered( - &mut reader, - offset, - &codec_config, - compressed, - ) + ( + section, + reader, + offset, + 0u64, + blob_size, + codec_config, + compressed, + ), + move |( + section, + mut reader, + offset, + valid_size, + blob_size, + codec_config, + compressed, + )| async move { + // Check if we are at the end of the blob + if offset >= blob_size { + return None; + } + + // Read an item from the buffer + match Self::read_buffered(&mut reader, offset, &codec_config, compressed) .await - { - Ok((next_offset, next_valid_size, size, item)) => { - trace!(blob = section, cursor = offset, "replayed item"); - Some(( - Ok((section, offset, size, item)), - ( - section, - reader, - next_offset, - next_valid_size, - codec_config, - compressed, - ), - )) - } - Err(Error::ChecksumMismatch(expected, found)) => { - // If we encounter corruption, we prune to the last valid item. This - // can happen during an unclean file close (where pending data is not - // fully synced to disk). - warn!( - blob = section, - bad_offset = offset, - new_size = valid_size, - expected, - found, - "corruption detected: truncating" - ); - reader.resize(valid_size).await.ok()?; - None - } - Err(Error::Runtime(RError::BlobInsufficientLength)) => { - // If we encounter trailing bytes, we prune to the last - // valid item. This can happen during an unclean file close (where - // pending data is not fully synced to disk). - warn!( - blob = section, - bad_offset = offset, - new_size = valid_size, - "trailing bytes detected: truncating" - ); - reader.resize(valid_size).await.ok()?; - None - } - Err(err) => { - // If we encounter an unexpected error, return it without attempting - // to fix anything.
- warn!( - blob = section, - cursor = offset, - ?err, - "unexpected error" - ); - Some(( - Err(err), - ( - section, - reader, - offset, - valid_size, - codec_config, - compressed, - ), - )) - } + { + Ok((next_offset, next_valid_size, size, item)) => { + trace!(blob = section, cursor = offset, "replayed item"); + Some(( + Ok((section, offset, size, item)), + ( + section, + reader, + next_offset, + next_valid_size, + blob_size, + codec_config, + compressed, + ), + )) + } + Err(Error::ChecksumMismatch(expected, found)) => { + // If we encounter corruption, we prune to the last valid item. This + // can happen during an unclean file close (where pending data is not + // fully synced to disk). + warn!( + blob = section, + bad_offset = offset, + new_size = valid_size, + expected, + found, + "corruption detected: truncating" + ); + reader.resize(valid_size).await.ok()?; + None + } + Err(Error::Runtime(RError::BlobInsufficientLength)) => { + // If we encounter trailing bytes, we prune to the last + // valid item. This can happen during an unclean file close (where + // pending data is not fully synced to disk). + warn!( + blob = section, + bad_offset = offset, + new_size = valid_size, + "trailing bytes detected: truncating" + ); + reader.resize(valid_size).await.ok()?; + None } - }, - ).right_stream() + Err(err) => { + // If we encounter an unexpected error, return it without attempting + // to fix anything. + warn!(blob = section, cursor = offset, ?err, "unexpected error"); + Some(( + Err(err), + ( + section, + reader, + offset, + valid_size, + blob_size, + codec_config, + compressed, + ), + )) + } + } + }, + ) + .right_stream() }, )) } @@ -507,73 +425,63 @@ impl Journal { /// to the `Blob` will be considered corrupted (as the trailing bytes will fail /// the checksum verification). It is recommended to call `replay` before calling /// `append` to prevent this. - pub async fn append(&mut self, section: u64, item: V) -> Result<(u32, u32), Error> { - // Check last pruned - self.prune_guard(section)?; - - // Create item - let encoded = item.encode(); - let encoded = if let Some(compression) = self.cfg.compression { - compress(&encoded, compression as i32).map_err(|_| Error::CompressionFailed)? 
+ pub async fn append(&mut self, section: u64, item: V) -> Result<(u64, u32), Error> { + // Create buffer with item data + let (buf, item_len) = if let Some(compression) = self.compression { + // Compressed: encode first, then compress + let encoded = item.encode(); + let compressed = + compress(&encoded, compression as i32).map_err(|_| Error::CompressionFailed)?; + let item_len = compressed.len(); + let item_len_u32: u32 = match item_len.try_into() { + Ok(len) => len, + Err(_) => return Err(Error::ItemTooLarge(item_len)), + }; + let size_len = UInt(item_len_u32).encode_size(); + let entry_len = size_len + .checked_add(item_len) + .and_then(|v| v.checked_add(4)) + .ok_or(Error::OffsetOverflow)?; + + let mut buf = Vec::with_capacity(entry_len); + UInt(item_len_u32).write(&mut buf); + buf.put_slice(&compressed); + let checksum = crc32fast::hash(&buf); + buf.put_u32(checksum); + + (buf, item_len) } else { - encoded.into() - }; - - // Ensure item is not too large - let item_len = encoded.len(); - let item_len = match item_len.try_into() { - Ok(len) => len, - Err(_) => return Err(Error::ItemTooLarge(item_len)), - }; - let size_len = UInt(item_len).encode_size(); - let entry_len = size_len + item_len as usize + 4; - - // Get existing blob or create new one - let blob = match self.blobs.entry(section) { - Entry::Occupied(entry) => entry.into_mut(), - Entry::Vacant(entry) => { - let name = section.to_be_bytes(); - let (blob, size) = self.context.open(&self.cfg.partition, &name).await?; - let blob = Append::new( - blob, - size, - self.cfg.write_buffer, - self.cfg.buffer_pool.clone(), - ) - .await?; - self.tracked.inc(); - entry.insert(blob) - } + // Uncompressed: pre-allocate exact size to avoid copying + let item_len = item.encode_size(); + let item_len_u32: u32 = match item_len.try_into() { + Ok(len) => len, + Err(_) => return Err(Error::ItemTooLarge(item_len)), + }; + let size_len = UInt(item_len_u32).encode_size(); + let entry_len = size_len + .checked_add(item_len) + .and_then(|v| v.checked_add(4)) + .ok_or(Error::OffsetOverflow)?; + + let mut buf = Vec::with_capacity(entry_len); + UInt(item_len_u32).write(&mut buf); + item.write(&mut buf); + let checksum = crc32fast::hash(&buf); + buf.put_u32(checksum); + + (buf, item_len) }; - // Calculate alignment - let cursor = blob.size().await; - let offset = compute_next_offset(cursor)?; - let aligned_cursor = offset as u64 * ITEM_ALIGNMENT; - let padding = (aligned_cursor - cursor) as usize; - - // Populate buffer - let mut buf = Vec::with_capacity(padding + entry_len); - - // Add padding bytes if necessary - if padding > 0 { - buf.resize(padding, 0); - } - - // Add entry data - let entry_start = buf.len(); - UInt(item_len).write(&mut buf); - buf.put_slice(&encoded); + // Get or create blob + let blob = self.manager.get_or_create(section).await?; - // Calculate checksum only for the entry data (without padding) - let checksum = crc32fast::hash(&buf[entry_start..]); - buf.put_u32(checksum); - assert_eq!(buf[entry_start..].len(), entry_len); + // Get current position - this is where we'll write + let offset = blob.size().await; // Append item to blob blob.append(buf).await?; trace!(blob = section, offset, "appended item"); - Ok((offset, item_len)) + Ok((offset, item_len as u32)) } /// Retrieves an item from `Journal` at a given `section` and `offset`. 
@@ -586,21 +494,15 @@ impl Journal { /// - An invalid `offset` for a given section (that is, an offset that doesn't correspond to a /// previously appended item) will result in an error, with the specific type being /// undefined. - pub async fn get(&self, section: u64, offset: u32) -> Result { - self.prune_guard(section)?; - let blob = match self.blobs.get(&section) { - Some(blob) => blob, - None => return Err(Error::SectionOutOfRange(section)), - }; + pub async fn get(&self, section: u64, offset: u64) -> Result { + let blob = self + .manager + .get(section)? + .ok_or(Error::SectionOutOfRange(section))?; // Perform a multi-op read. - let (_, _, item) = Self::read( - self.cfg.compression.is_some(), - &self.cfg.codec_config, - blob, - offset, - ) - .await?; + let (_, _, item) = + Self::read(self.compression.is_some(), &self.codec_config, blob, offset).await?; Ok(item) } @@ -608,11 +510,7 @@ impl Journal { /// /// Returns 0 if the section does not exist. pub async fn size(&self, section: u64) -> Result { - self.prune_guard(section)?; - match self.blobs.get(&section) { - Some(blob) => Ok(blob.size().await), - None => Ok(0), - } + self.manager.size(section).await } /// Rewinds the journal to the given `section` and `offset`, removing any data beyond it. /// /// # Warning /// /// * This operation is not guaranteed to survive restarts until sync is called. /// * This operation is not atomic, but it will always leave the journal in a consistent state /// in the event of failure since blobs are always removed in reverse order of section. - pub async fn rewind_to_offset(&mut self, section: u64, offset: u32) -> Result<(), Error> { - self.rewind(section, offset as u64 * ITEM_ALIGNMENT).await + pub async fn rewind_to_offset(&mut self, section: u64, offset: u64) -> Result<(), Error> { + self.manager.rewind(section, offset).await } /// Rewinds the journal to the given `section` and `size`. /// /// # Warning /// /// * This operation is not guaranteed to survive restarts until sync is called. /// * This operation is not atomic, but it will always leave the journal in a consistent state /// in the event of failure since blobs are always removed in reverse order of section. pub async fn rewind(&mut self, section: u64, size: u64) -> Result<(), Error> { - self.prune_guard(section)?; - - // Remove any sections beyond the given section - let trailing: Vec = self - .blobs - .range(( - std::ops::Bound::Excluded(section), - std::ops::Bound::Unbounded, - )) - .map(|(&section, _)| section) - .collect(); - for index in trailing.iter().rev() { - // Remove the underlying blob from storage. - let blob = self.blobs.remove(index).unwrap(); - - // Destroy the blob - drop(blob); - self.context - .remove(&self.cfg.partition, Some(&index.to_be_bytes())) - .await?; - debug!(section = index, "removed section"); - self.tracked.dec(); - } - - // If the section exists, truncate it to the given size - let blob = match self.blobs.get_mut(&section) { - Some(blob) => blob, - None => return Ok(()), - }; - let current = blob.size().await; - if size >= current { - return Ok(()); // Already smaller than or equal to target size - } - blob.resize(size).await?; - debug!( - section, - from = current, - to = size, - ?trailing, - "rewound journal" - ); - Ok(()) + self.manager.rewind(section, size).await } /// Rewinds the `section` to the given `size`. /// /// # Warning /// @@ -688,102 +545,49 @@ /// /// This operation is not guaranteed to survive restarts until sync is called.
@@ -688,102 +545,49 @@ impl<E: Storage + Metrics, V: Codec> Journal<E, V> {
    ///
    /// This operation is not guaranteed to survive restarts until sync is called.
    pub async fn rewind_section(&mut self, section: u64, size: u64) -> Result<(), Error> {
-        self.prune_guard(section)?;
-
-        // Get the blob at the given section
-        let blob = match self.blobs.get_mut(&section) {
-            Some(blob) => blob,
-            None => return Ok(()),
-        };
-
-        // Truncate the blob to the given size
-        let current = blob.size().await;
-        if size >= current {
-            return Ok(()); // Already smaller than or equal to target size
-        }
-        blob.resize(size).await?;
-        debug!(section, from = current, to = size, "rewound section");
-        Ok(())
+        self.manager.rewind_section(section, size).await
    }

    /// Ensures that all data in a given `section` is synced to the underlying store.
    ///
    /// If the `section` does not exist, no error will be returned.
    pub async fn sync(&self, section: u64) -> Result<(), Error> {
-        self.prune_guard(section)?;
-        let blob = match self.blobs.get(&section) {
-            Some(blob) => blob,
-            None => return Ok(()),
-        };
-        self.synced.inc();
-        blob.sync().await.map_err(Error::Runtime)
+        self.manager.sync(section).await
    }

    /// Syncs all open sections.
    pub async fn sync_all(&self) -> Result<(), Error> {
-        for blob in self.blobs.values() {
-            self.synced.inc();
-            blob.sync().await.map_err(Error::Runtime)?;
-        }
-        Ok(())
+        self.manager.sync_all().await
    }

    /// Prunes all `sections` less than `min`. Returns true if any sections were pruned.
    pub async fn prune(&mut self, min: u64) -> Result<bool, Error> {
-        // Prune any blobs that are smaller than the minimum
-        let mut pruned = false;
-        while let Some((&section, _)) = self.blobs.first_key_value() {
-            // Stop pruning if we reach the minimum
-            if section >= min {
-                break;
-            }
-
-            // Remove blob from journal
-            let blob = self.blobs.remove(&section).unwrap();
-            let size = blob.size().await;
-            drop(blob);
-
-            // Remove blob from storage
-            self.context
-                .remove(&self.cfg.partition, Some(&section.to_be_bytes()))
-                .await?;
-            pruned = true;
+        self.manager.prune(min).await
+    }

-            debug!(blob = section, size, "pruned blob");
-            self.tracked.dec();
-            self.pruned.inc();
-        }
+    /// Returns the number of the oldest section in the journal.
+    pub fn oldest_section(&self) -> Option<u64> {
+        self.manager.oldest_section()
+    }

-        if pruned {
-            self.oldest_retained_section = min;
-        }
+    /// Returns the number of the newest section in the journal.
+    pub fn newest_section(&self) -> Option<u64> {
+        self.manager.newest_section()
+    }

-        Ok(pruned)
+    /// Returns true if no sections exist.
+    pub fn is_empty(&self) -> bool {
+        self.manager.is_empty()
    }

-    /// Returns the number of the oldest section in the journal.
-    pub fn oldest_section(&self) -> Option<u64> {
-        self.blobs.first_key_value().map(|(section, _)| *section)
+    /// Returns the number of sections.
+    pub fn num_sections(&self) -> usize {
+        self.manager.num_sections()
    }

    /// Removes any underlying blobs created by the journal.
    pub async fn destroy(self) -> Result<(), Error> {
-        for (i, blob) in self.blobs.into_iter() {
-            let size = blob.size().await;
-            drop(blob);
-            debug!(blob = i, size, "destroyed blob");
-            self.context
-                .remove(&self.cfg.partition, Some(&i.to_be_bytes()))
-                .await?;
-        }
-        match self.context.remove(&self.cfg.partition, None).await {
-            Ok(()) => {}
-            Err(RError::PartitionMissing(_)) => {
-                // Partition already removed or never existed.
-            }
-            Err(err) => return Err(Error::Runtime(err)),
-        }
-        Ok(())
+        self.manager.destroy().await
    }
}
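The pruning path and the new inspection accessors (oldest_section, newest_section, is_empty, num_sections) all delegate to the section manager. A sketch of how they compose, under the same test-harness assumptions as the previous sketch:

    let executor = deterministic::Runner::default();
    executor.start(|context| async move {
        let cfg = Config {
            partition: "prune_example".to_string(),
            compression: None,
            codec_config: (),
            buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
            write_buffer: NZUsize!(1024),
        };
        let mut journal = Journal::init(context, cfg).await.unwrap();
        for section in 1u64..=5 {
            journal.append(section, section as i32).await.unwrap();
        }
        journal.sync_all().await.unwrap();
        assert_eq!(journal.oldest_section(), Some(1));
        assert_eq!(journal.newest_section(), Some(5));

        // prune(3) drops sections 1 and 2 and reports that something was pruned.
        assert!(journal.prune(3).await.unwrap());
        assert_eq!(journal.oldest_section(), Some(3));
        assert_eq!(journal.num_sections(), 3);
        assert!(!journal.is_empty());

        journal.destroy().await.unwrap();
    });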
@@ -791,14 +595,10 @@ impl<E: Storage + Metrics, V: Codec> Journal<E, V> {
mod tests {
    use super::*;
    use bytes::BufMut;
-    use commonware_cryptography::{Hasher, Sha256};
    use commonware_macros::test_traced;
-    use commonware_runtime::{
-        deterministic, Blob, Error as RError, Runner, Storage, DEFAULT_BLOB_VERSION,
-    };
-    use commonware_utils::{NZUsize, StableBuf};
+    use commonware_runtime::{deterministic, Runner};
+    use commonware_utils::NZUsize;
    use futures::{pin_mut, StreamExt};
-    use prometheus_client::registry::Metric;

    const PAGE_SIZE: NonZeroUsize = NZUsize!(1024);
    const PAGE_CACHE_SIZE: NonZeroUsize = NZUsize!(10);
@@ -1061,15 +861,9 @@
            journal.sync(section).await.expect("Failed to sync");
        }

-        // Verify initial oldest_retained_section is 0
-        assert_eq!(journal.oldest_retained_section, 0);
-
        // Prune sections < 3
        journal.prune(3).await.expect("Failed to prune");

-        // Verify oldest_retained_section is updated
-        assert_eq!(journal.oldest_retained_section, 3);
-
        // Test that accessing pruned sections returns the correct error

        // Test append on pruned section
@@ -1128,7 +922,6 @@

        // Prune more sections
        journal.prune(5).await.expect("Failed to prune");
-        assert_eq!(journal.oldest_retained_section, 5);

        // Verify sections 3 and 4 are now pruned
        match journal.get(3, 0).await {
@@ -1174,7 +967,6 @@
            }

            journal.prune(3).await.expect("Failed to prune");
-            assert_eq!(journal.oldest_retained_section, 3);
        }

-        // Second session: verify oldest_retained_section is reset
+        // Second session: verify pruned sections stay gone from storage
@@ -1183,10 +975,6 @@
            .await
            .expect("Failed to re-initialize journal");

-        // After restart, oldest_retained_section should be back to 0
-        // since it's not persisted
-        assert_eq!(journal.oldest_retained_section, 0);
-
        // But the actual sections 1 and 2 should be gone from storage
        // so get should return SectionOutOfRange, not AlreadyPrunedToSection
        match journal.get(1, 0).await {
@@ -1486,7 +1274,7 @@
    }

    #[test_traced]
-    fn test_journal_handling_unaligned_truncated_data() {
+    fn test_journal_truncation_recovery() {
        // Initialize the deterministic context
        let executor = deterministic::Runner::default();

@@ -1509,7 +1297,7 @@
        // Append 1 item to the first index
        journal.append(1, 1).await.expect("Failed to append data");

-        // Append multiple items to the second index (with unaligned values)
+        // Append multiple items to the second section
        let data_items = vec![(2u64, 2), (2u64, 3), (2u64, 4)];
        for (index, data) in &data_items {
            journal
@@ -1566,12 +1354,12 @@

        // Confirm blob is expected length
        // entry = 1 (varint for 4) + 4 (data) + 4 (checksum) = 9 bytes
-        // Item 2 ends at position 16 + 9 = 25
+        // Item 2 ends at position 9 + 9 = 18
        let (_, blob_size) = context
            .open(&cfg.partition, &2u64.to_be_bytes())
            .await
            .expect("Failed to open blob");
-        assert_eq!(blob_size, 25);
+        assert_eq!(blob_size, 18);

        // Attempt to replay journal after truncation
        let mut journal = Journal::init(context.clone(), cfg.clone())
@@ -1604,24 +1392,24 @@
        assert_eq!(items[2].1, data_items[1].1);

        // Append a new item to truncated partition
-        journal.append(2, 5).await.expect("Failed to append data");
+        let (offset, _) = journal.append(2, 5).await.expect("Failed to append data");
        journal.sync(2).await.expect("Failed to sync blob");

        // Get the new item
-        let item = journal.get(2, 2).await.expect("Failed to get item");
+        let item = journal.get(2, offset).await.expect("Failed to get item");
        assert_eq!(item, 5);

        // Drop
the journal (data already synced) drop(journal); // Confirm blob is expected length - // Items 1 and 2 at positions 0 and 16, item 3 (value 5) at position 32 - // Item 3 = 1 (varint) + 4 (data) + 4 (checksum) = 9 bytes, ends at 41 + // Items 1 and 2 at positions 0 and 9, item 3 (value 5) at position 18 + // Item 3 = 1 (varint) + 4 (data) + 4 (checksum) = 9 bytes, ends at 27 let (_, blob_size) = context .open(&cfg.partition, &2u64.to_be_bytes()) .await .expect("Failed to open blob"); - assert_eq!(blob_size, 41); + assert_eq!(blob_size, 27); // Re-initialize the journal to simulate a restart let journal = Journal::init(context.clone(), cfg.clone()) @@ -1657,138 +1445,6 @@ mod tests { }); } - #[test_traced] - fn test_journal_handling_aligned_truncated_data() { - // Initialize the deterministic context - let executor = deterministic::Runner::default(); - - // Start the test within the executor - executor.start(|context| async move { - // Create a journal configuration - let cfg = Config { - partition: "test_partition".into(), - compression: None, - codec_config: (), - buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), - write_buffer: NZUsize!(1024), - }; - - // Initialize the journal - let mut journal = Journal::init(context.clone(), cfg.clone()) - .await - .expect("Failed to initialize journal"); - - // Append 1 item to the first index - journal.append(1, 1).await.expect("Failed to append data"); - - // Append multiple items to the second index (with unaligned values) - let data_items = vec![(2u64, 2), (2u64, 3), (2u64, 4)]; - for (index, data) in &data_items { - journal - .append(*index, *data) - .await - .expect("Failed to append data"); - journal.sync(*index).await.expect("Failed to sync blob"); - } - - // Sync all sections and drop the journal - journal.sync_all().await.expect("Failed to sync"); - drop(journal); - - // Manually corrupt the end of the second blob - let (blob, blob_size) = context - .open(&cfg.partition, &2u64.to_be_bytes()) - .await - .expect("Failed to open blob"); - blob.resize(blob_size - 4) - .await - .expect("Failed to corrupt blob"); - blob.sync().await.expect("Failed to sync blob"); - - // Re-initialize the journal to simulate a restart - let mut journal = Journal::init(context.clone(), cfg.clone()) - .await - .expect("Failed to re-initialize journal"); - - // Attempt to replay the journal - let mut items = Vec::<(u64, u64)>::new(); - { - let stream = journal - .replay(0, 0, NZUsize!(1024)) - .await - .expect("unable to setup replay"); - pin_mut!(stream); - while let Some(result) = stream.next().await { - match result { - Ok((blob_index, _, _, item)) => items.push((blob_index, item)), - Err(err) => panic!("Failed to read item: {err}"), - } - } - } - - // Verify that only non-corrupted items were replayed - assert_eq!(items.len(), 3); - assert_eq!(items[0].0, 1); - assert_eq!(items[0].1, 1); - assert_eq!(items[1].0, data_items[0].0); - assert_eq!(items[1].1, data_items[0].1); - assert_eq!(items[2].0, data_items[1].0); - assert_eq!(items[2].1, data_items[1].1); - - // Append a new item to the truncated partition - journal.append(2, 5).await.expect("Failed to append data"); - journal.sync(2).await.expect("Failed to sync blob"); - - // Get the new item - let item = journal.get(2, 2).await.expect("Failed to get item"); - assert_eq!(item, 5); - - // Drop the journal (data already synced) - drop(journal); - - // Confirm blob is expected length - // entry = 1 (varint for 8) + 8 (u64 data) + 4 (checksum) = 13 bytes - // Items at positions 0, 16, 32; item 3 ends at 32 + 
13 = 45
-        let (_, blob_size) = context
-            .open(&cfg.partition, &2u64.to_be_bytes())
-            .await
-            .expect("Failed to open blob");
-        assert_eq!(blob_size, 45);
-
-        // Attempt to replay journal after truncation
-        let journal = Journal::init(context, cfg)
-            .await
-            .expect("Failed to re-initialize journal");
-
-        // Attempt to replay the journal
-        let mut items = Vec::<(u64, u64)>::new();
-        {
-            let stream = journal
-                .replay(0, 0, NZUsize!(1024))
-                .await
-                .expect("unable to setup replay");
-            pin_mut!(stream);
-            while let Some(result) = stream.next().await {
-                match result {
-                    Ok((blob_index, _, _, item)) => items.push((blob_index, item)),
-                    Err(err) => panic!("Failed to read item: {err}"),
-                }
-            }
-        }
-
-        // Verify that only non-corrupted items were replayed
-        assert_eq!(items.len(), 4);
-        assert_eq!(items[0].0, 1);
-        assert_eq!(items[0].1, 1);
-        assert_eq!(items[1].0, data_items[0].0);
-        assert_eq!(items[1].1, data_items[0].1);
-        assert_eq!(items[2].0, data_items[1].0);
-        assert_eq!(items[2].1, data_items[1].1);
-        assert_eq!(items[3].0, 2);
-        assert_eq!(items[3].1, 5);
-        });
-    }
-
    #[test_traced]
    fn test_journal_handling_extra_data() {
        // Initialize the deterministic context
@@ -1858,137 +1514,6 @@
        });
    }

-    // Define `MockBlob` that returns an offset length that should overflow
-    #[derive(Clone)]
-    struct MockBlob {}
-
-    impl Blob for MockBlob {
-        async fn read_at(
-            &self,
-            buf: impl Into<StableBuf> + Send,
-            _offset: u64,
-        ) -> Result<StableBuf, RError> {
-            Ok(buf.into())
-        }
-
-        async fn write_at(
-            &self,
-            _buf: impl Into<StableBuf> + Send,
-            _offset: u64,
-        ) -> Result<(), RError> {
-            Ok(())
-        }
-
-        async fn resize(&self, _len: u64) -> Result<(), RError> {
-            Ok(())
-        }
-
-        async fn sync(&self) -> Result<(), RError> {
-            Ok(())
-        }
-    }
-
-    // Define `MockStorage` that returns `MockBlob`
-    #[derive(Clone)]
-    struct MockStorage {
-        len: u64,
-    }
-
-    impl Storage for MockStorage {
-        type Blob = MockBlob;
-
-        async fn open_versioned(
-            &self,
-            _partition: &str,
-            _name: &[u8],
-            versions: std::ops::RangeInclusive<u16>,
-        ) -> Result<(MockBlob, u64, u16), RError> {
-            assert!(versions.contains(&DEFAULT_BLOB_VERSION));
-            Ok((MockBlob {}, self.len, DEFAULT_BLOB_VERSION))
-        }
-
-        async fn remove(&self, _partition: &str, _name: Option<&[u8]>) -> Result<(), RError> {
-            Ok(())
-        }
-
-        async fn scan(&self, _partition: &str) -> Result<Vec<Vec<u8>>, RError> {
-            Ok(vec![])
-        }
-    }
-
-    impl Metrics for MockStorage {
-        fn with_label(&self, _: &str) -> Self {
-            self.clone()
-        }
-
-        fn label(&self) -> String {
-            String::new()
-        }
-
-        fn register<N: Into<String>, H: Into<String>>(&self, _: N, _: H, _: impl Metric) {}
-
-        fn encode(&self) -> String {
-            String::new()
-        }
-    }
-
-    // Define the `INDEX_ALIGNMENT` again explicitly to ensure we catch any accidental
-    // changes to the value
-    const INDEX_ALIGNMENT: u64 = 16;
-
-    #[test_traced]
-    fn test_journal_large_offset() {
-        // Initialize the deterministic context
-        let executor = deterministic::Runner::default();
-        executor.start(|_| async move {
-            // Create journal
-            let cfg = Config {
-                partition: "partition".to_string(),
-                compression: None,
-                codec_config: (),
-                buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
-                write_buffer: NZUsize!(1024),
-            };
-            let context = MockStorage {
-                len: u32::MAX as u64 * INDEX_ALIGNMENT, // can store up to u32::MAX at the last offset
-            };
-            let mut journal = Journal::init(context, cfg).await.unwrap();
-
-            // Append data
-            let data = 1;
-            let (result, _) = journal
-                .append(1, data)
-                .await
-                .expect("Failed to append data");
-            assert_eq!(result, u32::MAX);
-        });
-    }
-
-    #[test_traced]
-    
fn test_journal_offset_overflow() { - // Initialize the deterministic context - let executor = deterministic::Runner::default(); - executor.start(|_| async move { - // Create journal - let cfg = Config { - partition: "partition".to_string(), - compression: None, - codec_config: (), - buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), - write_buffer: NZUsize!(1024), - }; - let context = MockStorage { - len: u32::MAX as u64 * INDEX_ALIGNMENT + 1, - }; - let mut journal = Journal::init(context, cfg).await.unwrap(); - - // Append data - let data = 1; - let result = journal.append(1, data).await; - assert!(matches!(result, Err(Error::OffsetOverflow))); - }); - } - #[test_traced] fn test_journal_rewind() { // Initialize the deterministic context @@ -2099,52 +1624,258 @@ mod tests { }); } - /// Protect against accidental changes to the journal disk format. #[test_traced] - fn test_journal_conformance() { - // Initialize the deterministic context + fn test_journal_rewind_many_sections() { let executor = deterministic::Runner::default(); - - // Start the test within the executor executor.start(|context| async move { - // Create a journal configuration let cfg = Config { - partition: "test_partition".into(), + partition: "test_partition".to_string(), compression: None, codec_config: (), buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), write_buffer: NZUsize!(1024), }; + let mut journal = Journal::init(context.clone(), cfg.clone()).await.unwrap(); - // Initialize the journal - let mut journal = Journal::init(context.clone(), cfg.clone()) + // Create sections 1-10 with data + for section in 1u64..=10 { + journal.append(section, section as i32).await.unwrap(); + } + journal.sync_all().await.unwrap(); + + // Verify all sections exist + for section in 1u64..=10 { + let size = journal.size(section).await.unwrap(); + assert!(size > 0, "section {section} should have data"); + } + + // Rewind to section 5 (should remove sections 6-10) + journal + .rewind(5, journal.size(5).await.unwrap()) .await - .expect("Failed to initialize journal"); + .unwrap(); - // Append 100 items to the journal - for i in 0..100 { - journal.append(1, i).await.expect("Failed to append data"); + // Verify sections 1-5 still exist with correct data + for section in 1u64..=5 { + let size = journal.size(section).await.unwrap(); + assert!(size > 0, "section {section} should still have data"); } - journal.sync(1).await.expect("Failed to sync blob"); - // Drop the journal (data already synced) + // Verify sections 6-10 are removed (size should be 0) + for section in 6u64..=10 { + let size = journal.size(section).await.unwrap(); + assert_eq!(size, 0, "section {section} should be removed"); + } + + // Verify data integrity via replay + { + let stream = journal.replay(0, 0, NZUsize!(1024)).await.unwrap(); + pin_mut!(stream); + let mut items = Vec::new(); + while let Some(result) = stream.next().await { + let (section, _, _, item) = result.unwrap(); + items.push((section, item)); + } + assert_eq!(items.len(), 5); + for (i, (section, item)) in items.iter().enumerate() { + assert_eq!(*section, (i + 1) as u64); + assert_eq!(*item, (i + 1) as i32); + } + } + + journal.destroy().await.unwrap(); + }); + } + + #[test_traced] + fn test_journal_rewind_partial_truncation() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let cfg = Config { + partition: "test_partition".to_string(), + compression: None, + codec_config: (), + buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + write_buffer: 
NZUsize!(1024), + }; + let mut journal = Journal::init(context.clone(), cfg.clone()).await.unwrap(); + + // Append 5 items and record sizes after each + let mut sizes = Vec::new(); + for i in 0..5 { + journal.append(1, i).await.unwrap(); + journal.sync(1).await.unwrap(); + sizes.push(journal.size(1).await.unwrap()); + } + + // Rewind to keep only first 3 items + let target_size = sizes[2]; + journal.rewind(1, target_size).await.unwrap(); + + // Verify size is correct + let new_size = journal.size(1).await.unwrap(); + assert_eq!(new_size, target_size); + + // Verify first 3 items via replay + { + let stream = journal.replay(0, 0, NZUsize!(1024)).await.unwrap(); + pin_mut!(stream); + let mut items = Vec::new(); + while let Some(result) = stream.next().await { + let (_, _, _, item) = result.unwrap(); + items.push(item); + } + assert_eq!(items.len(), 3); + for (i, item) in items.iter().enumerate() { + assert_eq!(*item, i as i32); + } + } + + journal.destroy().await.unwrap(); + }); + } + + #[test_traced] + fn test_journal_rewind_nonexistent_target() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let cfg = Config { + partition: "test_partition".to_string(), + compression: None, + codec_config: (), + buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + write_buffer: NZUsize!(1024), + }; + let mut journal = Journal::init(context.clone(), cfg.clone()).await.unwrap(); + + // Create sections 5, 6, 7 (skip 1-4) + for section in 5u64..=7 { + journal.append(section, section as i32).await.unwrap(); + } + journal.sync_all().await.unwrap(); + + // Rewind to section 3 (doesn't exist) + journal.rewind(3, 0).await.unwrap(); + + // Verify sections 5, 6, 7 are removed + for section in 5u64..=7 { + let size = journal.size(section).await.unwrap(); + assert_eq!(size, 0, "section {section} should be removed"); + } + + // Verify replay returns nothing + { + let stream = journal.replay(0, 0, NZUsize!(1024)).await.unwrap(); + pin_mut!(stream); + let items: Vec<_> = stream.collect().await; + assert!(items.is_empty()); + } + + journal.destroy().await.unwrap(); + }); + } + + #[test_traced] + fn test_journal_rewind_persistence() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let cfg = Config { + partition: "test_partition".to_string(), + compression: None, + codec_config: (), + buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + write_buffer: NZUsize!(1024), + }; + + // Create sections 1-5 with data + let mut journal = Journal::init(context.clone(), cfg.clone()).await.unwrap(); + for section in 1u64..=5 { + journal.append(section, section as i32).await.unwrap(); + } + journal.sync_all().await.unwrap(); + + // Rewind to section 2 + let size = journal.size(2).await.unwrap(); + journal.rewind(2, size).await.unwrap(); + journal.sync_all().await.unwrap(); drop(journal); - // Hash blob contents - let (blob, size) = context - .open(&cfg.partition, &1u64.to_be_bytes()) + // Re-init and verify only sections 1-2 exist + let journal = Journal::<_, i32>::init(context.clone(), cfg.clone()) .await - .expect("Failed to open blob"); - assert!(size > 0); - let buf = blob - .read_at(vec![0u8; size as usize], 0) - .await - .expect("Failed to read blob"); - let digest = Sha256::hash(buf.as_ref()); - assert_eq!( - hex(&digest), - "f55bf27a59118603466fcf6a507ab012eea4cb2d6bdd06ce8f515513729af847", - ); + .unwrap(); + + // Verify sections 1-2 have data + for section in 1u64..=2 { + let size = journal.size(section).await.unwrap(); + 
assert!(size > 0, "section {section} should have data after restart"); + } + + // Verify sections 3-5 are gone + for section in 3u64..=5 { + let size = journal.size(section).await.unwrap(); + assert_eq!(size, 0, "section {section} should be gone after restart"); + } + + // Verify data integrity via replay + { + let stream = journal.replay(0, 0, NZUsize!(1024)).await.unwrap(); + pin_mut!(stream); + let mut items = Vec::new(); + while let Some(result) = stream.next().await { + let (section, _, _, item) = result.unwrap(); + items.push((section, item)); + } + assert_eq!(items.len(), 2); + assert_eq!(items[0], (1, 1)); + assert_eq!(items[1], (2, 2)); + } + + journal.destroy().await.unwrap(); + }); + } + + #[test_traced] + fn test_journal_rewind_to_zero_removes_all_newer() { + let executor = deterministic::Runner::default(); + executor.start(|context| async move { + let cfg = Config { + partition: "test_partition".to_string(), + compression: None, + codec_config: (), + buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE), + write_buffer: NZUsize!(1024), + }; + let mut journal = Journal::init(context.clone(), cfg.clone()).await.unwrap(); + + // Create sections 1, 2, 3 + for section in 1u64..=3 { + journal.append(section, section as i32).await.unwrap(); + } + journal.sync_all().await.unwrap(); + + // Rewind section 1 to size 0 + journal.rewind(1, 0).await.unwrap(); + + // Verify section 1 exists but is empty + let size = journal.size(1).await.unwrap(); + assert_eq!(size, 0, "section 1 should be empty"); + + // Verify sections 2, 3 are completely removed + for section in 2u64..=3 { + let size = journal.size(section).await.unwrap(); + assert_eq!(size, 0, "section {section} should be removed"); + } + + // Verify replay returns nothing + { + let stream = journal.replay(0, 0, NZUsize!(1024)).await.unwrap(); + pin_mut!(stream); + let items: Vec<_> = stream.collect().await; + assert!(items.is_empty()); + } + + journal.destroy().await.unwrap(); }); } }
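For completeness, the recovery pattern the new tests lean on: re-initialize the journal after a restart and replay every surviving entry to rebuild in-memory state. A sketch mirroring the replay loop used throughout this test module (partition name hypothetical; harness and Config as in the tests):

    let executor = deterministic::Runner::default();
    executor.start(|context| async move {
        let cfg = Config {
            partition: "replay_example".to_string(),
            compression: None,
            codec_config: (),
            buffer_pool: PoolRef::new(PAGE_SIZE, PAGE_CACHE_SIZE),
            write_buffer: NZUsize!(1024),
        };
        let mut journal = Journal::init(context.clone(), cfg.clone()).await.unwrap();
        journal.append(1, 7i32).await.unwrap();
        journal.sync_all().await.unwrap();
        drop(journal);

        // Re-open and replay everything from section 0, offset 0.
        let journal = Journal::<_, i32>::init(context, cfg).await.unwrap();
        {
            let stream = journal.replay(0, 0, NZUsize!(1024)).await.unwrap();
            pin_mut!(stream);
            let mut restored = Vec::new();
            while let Some(result) = stream.next().await {
                let (section, _offset, _len, item) = result.unwrap();
                restored.push((section, item));
            }
            assert_eq!(restored, vec![(1, 7)]);
        }
        journal.destroy().await.unwrap();
    });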