diff --git a/Cargo.lock b/Cargo.lock index a4545dc5b..9cbc81ae4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -128,9 +128,9 @@ checksum = "9d151e35f61089500b617991b791fc8bfd237ae50cd5950803758a179b41e67a" [[package]] name = "arrayvec" -version = "0.7.4" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "asn1-rs" @@ -187,6 +187,12 @@ dependencies = [ "serde_json", ] +[[package]] +name = "assert_matches" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b34d609dfbaf33d6889b2b7106d3ca345eacad44200913df5ba02bfd31d2ba9" + [[package]] name = "async-channel" version = "2.3.1" @@ -5395,11 +5401,13 @@ dependencies = [ name = "sbtc" version = "0.1.0" dependencies = [ + "assert_matches", "bitcoin", "bitcoincore-rpc", "bitcoincore-rpc-json", "clarity", "hex", + "more-asserts", "proptest", "rand", "secp256k1 0.29.0", diff --git a/Cargo.toml b/Cargo.toml index 2e6e44f7e..0e78d9506 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,6 +32,7 @@ wsts = { version = "13.0.1" } # Crates.io aquamarine = { version = "0.6.0", default-features = false } +assert_matches = { version = "1.5.0", default-features = false } aws-config = { version = "1.5.15", default-features = false, features = ["rustls", "rt-tokio"] } aws_lambda_events = { version = "0.16.0", default-features = false } aws-sdk-dynamodb = { version = "1.62.0", default-features = false } diff --git a/sbtc/Cargo.toml b/sbtc/Cargo.toml index aa767ce14..de80fee5d 100644 --- a/sbtc/Cargo.toml +++ b/sbtc/Cargo.toml @@ -31,4 +31,6 @@ stackslib = { workspace = true, optional = true } bitcoincore-rpc.workspace = true bitcoincore-rpc-json.workspace = true test-case.workspace = true -proptest.workspace = true \ No newline at end of file +proptest.workspace = true +assert_matches.workspace 
= true +more-asserts.workspace = true \ No newline at end of file diff --git a/sbtc/proptest-regressions/idpack/segmenters/mod.txt b/sbtc/proptest-regressions/idpack/segmenters/mod.txt new file mode 100644 index 000000000..f4a4a525e --- /dev/null +++ b/sbtc/proptest-regressions/idpack/segmenters/mod.txt @@ -0,0 +1,7 @@ +# Seeds for failure cases proptest has generated in the past. It is +# automatically read and these particular cases re-run before any +# novel cases are generated. +# +# It is recommended to check this file in to source control so that +# everyone who runs the test benefits from these saved cases. +cc 2124eacfb1b30c39b1671b87f5eb23ef92a8936aac77ea5f71aea418ff0f34a6 # shrinks to values = [0, 834, 955, 962] diff --git a/sbtc/src/idpack/codec/decoder.rs b/sbtc/src/idpack/codec/decoder.rs new file mode 100644 index 000000000..127820c42 --- /dev/null +++ b/sbtc/src/idpack/codec/decoder.rs @@ -0,0 +1,352 @@ +//! Segment decoding implementation with compression-optimized routines. +//! +//! This module handles the decoding of compressed integer segments using +//! bitmap encoding optimized for specific data patterns. The decoder implements +//! several efficiency techniques: +//! +//! - **Delta-offset optimization**: Offsets are delta-encoded for space savings +//! - **LEB128 variable-length encoding**: Minimizes space for numeric values +//! - **Robust error handling**: Guards against malformed input without panicking +//! +//! ## Safety Considerations +//! +//! The decoder implements multiple safety checks to handle potentially +//! malicious inputs: +//! +//! - Validates allocation sizes to prevent excessive memory usage +//! - Handles integer overflow with checked arithmetic +//! - Properly handles truncated or incomplete data +//! - Enforces semantic constraints on segment relationships +//! +//! ## Format +//! +//! Each segment is encoded as: +//! +//! 1. Offset value (LEB128-encoded, delta compressed after first segment) +//! 2. 
Bitmap length (LEB128-encoded) +//! 3. Bitmap bytes (1 bit per value) + +use std::io::{Cursor, Read}; + +use crate::idpack::{Segment, Segments}; +use crate::leb128::ReadLeb128; + +use super::{Decodable, DecodeError}; + +/// Implements decoding from bytes into a collection of optimally encoded +/// segments. +/// +/// Handles empty input gracefully by returning an empty segments collection. +/// For non-empty input, processes each segment sequentially with delta-offset +/// decoding between segments. +impl Decodable for Segments { + /// Decodes a byte array into a Segments collection. + /// + /// This function processes the entire byte array sequentially, decoding + /// each segment and ensuring proper semantic relationships between them. + /// + /// ## Parameters + /// + /// * `bytes` - A slice of the encoded bytes to decode + /// + /// ## Returns + /// + /// * `Ok(Segments)` - Successfully decoded segments + /// * `Err(DecodeError)` - If any error occurs during decoding + /// + /// ## Implementation Notes + /// - Returns empty segments for empty input (valid edge case) + /// - Ensures complete consumption of input bytes + /// - Maintains offset ordering constraints between segments + fn decode(bytes: &[u8]) -> Result { + let mut segments = Segments::default(); + + if bytes.is_empty() { + return Ok(segments); + } + + let mut cursor = Cursor::new(bytes); + + let mut prev_max_value = 0; // Tracks previous segment's max value for delta decoding + + // Process segments until we've consumed all input bytes + while cursor.position() < bytes.len() as u64 { + // Read next segment with position-aware offset handling + let segment = read_segment_into(&mut cursor, prev_max_value)?; + + // Update state for next segment + prev_max_value = segment.max(); // Use max value for delta encoding + segments.try_push(segment)?; + } + + // Ensure we've consumed exactly the right amount of data + if cursor.position() as usize != bytes.len() { + return 
Err(DecodeError::UnexpectedEndOfData); + } + + Ok(segments) + } +} + +/// Reads a single segment from the cursor with delta-offset optimization. +/// +/// This function decodes a single segment from the current cursor position, +/// applying delta-offset decoding relative to the provided previous maximum +/// value. It handles the complete segment decoding process: +/// 1. Reading and decoding the offset value +/// 2. Reading the payload length and validating allocation size +/// 3. Reading and processing the bitmap payload +/// +/// ## Parameters +/// * `cursor` - Mutable cursor positioned at the start of a segment +/// * `prev_max_value` - Previous segment's maximum value (0 for first segment) +/// +/// ## Returns +/// * `Ok(Segment)` - Successfully decoded segment +/// * `Err(DecodeError)` - If any error occurs during decoding +/// +/// ## Errors +/// * `ArithmeticOverflow` - If offset calculation would overflow +/// * `ByteAllocationLimit` - If payload length exceeds allocation limits +/// * `IO` - For any I/O errors during reading +/// * `Leb128Error` - For LEB128 decoding errors +pub fn read_segment_into( + cursor: &mut Cursor<&[u8]>, + prev_max_value: u64, +) -> Result { + // Read LEB128-encoded offset value + let offset = cursor + .read_leb128()? 
+ .checked_add(prev_max_value) + .ok_or(DecodeError::ArithmeticOverflow)?; + + // Read the payload length + let payload_length = cursor.read_leb128()?; + + // Safety check to prevent excessive allocation + if payload_length > crate::idpack::ALLOC_BYTES_LIMIT as u64 { + return Err(DecodeError::ByteAllocationLimit(payload_length)); + } + + // Read the payload bytes + let mut payload_bytes = vec![0u8; payload_length as usize]; + cursor + .read_exact(&mut payload_bytes) + .map_err(DecodeError::IO)?; + + // Initialize segment with offset + let mut segment = Segment::new_with_offset(offset); + + // Decode the bitmap payload into the segment + decode_bitmap(offset, &payload_bytes, &mut segment)?; + + // Return the completed segment + Ok(segment) +} + +/// Decodes bitmap data into values and inserts them into a segment. +/// +/// This function processes each bit in the bitmap to reconstruct the original +/// values: +/// 1. For each bit position that is set to 1 in the bitmap +/// 2. Calculate the corresponding value: offset + position + 1 +/// 3. Insert the value into the segment +/// +/// ## Parameters +/// * `offset` - Base value for the segment +/// * `bitmap` - Byte slice containing the bitmap data +/// * `segment` - Mutable reference to the segment for storing values +/// +/// ## Returns +/// * `Ok(())` - If bitmap was successfully decoded +/// * `Err(DecodeError)` - If any error occurs during decoding +/// +/// ## Errors +/// * `SegmentError` - If any validation error occurs while inserting values +/// into the segment +/// +/// ## Implementation Notes +/// This function doesn't perform additional allocation beyond what's already +/// allocated in the segment. All integer calculations are performed using plain +/// addition which won't overflow given the constraints on offset and bitmap +/// size enforced by earlier validation. 
+fn decode_bitmap(offset: u64, bitmap: &[u8], segment: &mut Segment) -> Result<(), DecodeError> { + // Process each bit in the bitmap to reconstruct original values. + // 1. Iterate over each byte in the bitmap + // 2. Iterate over each bit in the byte + // 3. Calculate the value from the bit position and add it to the results. + for (byte_idx, &byte) in bitmap.iter().enumerate() { + for bit_idx in 0..8 { + // Check if this bit is set + if byte & (1 << bit_idx) != 0 { + // Calculate position within the bitmap + let position = byte_idx * 8 + bit_idx; + + // Convert bitmap position to value: + // - Add offset (base of the segment) + // - Add position + 1 (converting 0-based bit to 1-based value) + let value = offset + (position as u64) + 1; + segment.try_insert(value)?; + } + } + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use assert_matches::assert_matches; + use test_case::test_case; + + use crate::idpack::{Decodable, DecodeError, Segments}; + + /// Test specific error cases with crafted invalid inputs + #[test_case(&[0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF]; "overflow in LEB128")] + #[test_case(&[1, 0xFF, 0xFF, 0xFF, 0xFF]; "excessive allocation")] + #[test_case(&[1, 1]; "incomplete segment - missing bitmap data")] + #[test_case(&[1]; "incomplete segment - missing length")] + #[test_case(&[]; "empty input")] + fn test_specific_invalid_inputs(bytes: &[u8]) { + // Empty input is actually valid and should produce empty segments + if bytes.is_empty() { + Segments::decode(bytes).expect("empty input should not fail"); + return; + } + + // For all other cases, we expect either: + // 1. A proper error (most likely) + // 2. 
Successful decoding (if the invalid input happens to be valid) + Segments::decode(bytes).expect_err("invalid input should fail decoding"); + } + + /// Test scenarios with valid first segment but invalid second segment + #[test_case( + &[10, 2, 0b00000011, 0xFF, 0xFF, 0xFF, 0xFF]; + "valid first segment + LEB128 overflow in second" + )] + #[test_case( + &[10, 2, 0b00000011, 11, 0xFF, 0xFF]; + "valid first segment + excessive allocation in second" + )] + #[test_case( + &[10, 2, 0b00000011, 11, 1]; + "valid first segment + truncated second segment" + )] + #[test_case( + &[10, 1, 0b00000011, 11]; + "valid first segment + incomplete second segment (missing length)" + )] + fn test_partial_valid_data(bytes: &[u8]) { + // Parse the first segment to verify it's valid + let mut cursor = std::io::Cursor::new(bytes); + super::read_segment_into(&mut cursor, 0).expect("first segment should be valid"); + + // Attempting to decode the entire byte sequence should fail + let full_result = Segments::decode(bytes); + assert!( + full_result.is_err(), + "decoding full sequence with invalid second segment should fail, got: {full_result:?}" + ); + } + + /// Test handling of valid first segment followed by garbage data + #[test] + fn test_valid_segment_with_trailing_garbage() { + // Start with a valid segment (with 1-byte bitmap) + let mut bytes = vec![10, 1, 0b00000011]; + + // Append random garbage data that doesn't form a valid second segment + bytes.extend_from_slice(&[0xAA, 0xBB, 0xCC, 0xDD]); + + // Attempting to decode should fail with an appropriate error + let result = Segments::decode(&bytes); + assert!( + result.is_err(), + "decoding with trailing garbage should fail, got: {result:?}" + ); + + // We can also check that partial decoding works correctly + let mut cursor = std::io::Cursor::new(bytes.as_slice()); + let segment = + super::read_segment_into(&mut cursor, 0).expect("should decode first valid segment"); + + assert_eq!(segment.offset(), 10, "correct offset should be 
decoded"); + assert_eq!( + segment.payload_values().len(), + 2, + "correct number of values should be decoded" + ); + } + + /// Test handling of multiple well-formed but semantically invalid segments + #[test] + fn test_multiple_invalid_segment_relationships() { + // Create sequence with valid structure but invalid semantic relationships: + // 1. First segment: [100, 110, 120] + // 2. Second segment: [90, 95] (invalid: offset < previous max) + // 3. Third segment: [150, 200] (valid relationship with second, but overall sequence invalid) + + let mut invalid_bytes = Vec::new(); + + // First segment (valid) + invalid_bytes.extend_from_slice(&[100, 3, 0b00000111]); + + // Second segment (invalid relationship with first) + invalid_bytes.extend_from_slice(&[90, 2, 0b00000011]); + + // Third segment (valid relationship with second, but overall sequence invalid) + invalid_bytes.extend_from_slice(&[55, 2, 0b00000011]); + + // Decoding should return an error + Segments::decode(&invalid_bytes) + .expect_err("multiple segment relationship violations should fail decoding"); + } + + /// Test handling of semantically invalid but structurally correct data + #[test] + fn test_semantically_invalid_data() { + // Craft a payload with invalid segment relationships + // This example creates segments where a later segment has an offset + // smaller than a previous segment's max value + + // First segment: offset=100, values=[100, 110, 120] + // Second segment: offset=50 (invalid - should be > 120) + + // This would be encoded as: + // - First segment: offset=100 (absolute), length=3 bytes, bitmap=[...] 
+ // - Second segment: offset=50 (delta from 120 would be -70, which is invalid) + + // We'll manually construct this invalid encoding + let mut invalid_bytes = Vec::new(); + + // First segment (valid) + invalid_bytes.extend_from_slice(&[100, 3, 0b00000111]); // Simple encoding + + // Second segment (invalid relationship) + invalid_bytes.extend_from_slice(&[50, 2, 0b00000011]); // Invalid delta + + // Decoding should return an error + Segments::decode(&invalid_bytes) + .expect_err("semantically invalid data should fail decoding"); + } + + /// Test to verify overflow handling in segment offset calculation + #[test] + fn test_overflow_handling() { + // Create a cursor with minimal content for offset delta, length, and bitmap + let mut cursor = std::io::Cursor::new([2u8, 1, 0].as_slice()); + + // Set previous max value just below u64::MAX + let prev_max = u64::MAX - 1; + + // Attempt to decode a segment, which should fail with arithmetic overflow + let result = super::read_segment_into(&mut cursor, prev_max); + + assert_matches!( + result, + Err(DecodeError::ArithmeticOverflow), + "expected arithmetic overflow when offset calculation exceeds u64::MAX" + ); + } +} diff --git a/sbtc/src/idpack/codec/encoder.rs b/sbtc/src/idpack/codec/encoder.rs new file mode 100644 index 000000000..078cefc8f --- /dev/null +++ b/sbtc/src/idpack/codec/encoder.rs @@ -0,0 +1,244 @@ +//! Segment encoding implementation with efficient compression. + +use crate::idpack::{Segment, Segments}; +use crate::leb128::Leb128; + +use super::Encodable; + +/// Implementation of encoding for segment collections with delta-optimization. +/// +/// Encodes multiple segments sequentially with optimizations: +/// - Delta-encoding offsets between adjacent segments +/// - Bitmap-based payload compression for efficient storage +impl Encodable for Segments { + /// Encodes a collection of segments into a byte vector. + /// + /// ## Format + /// Each segment is encoded as: + /// 1. 
Segment offset (LEB128): + /// - For first segment: absolute offset + /// - For subsequent segments: delta from previous segment's maximum value + /// 2. Payload length (LEB128): Number of bytes in the bitmap + /// 3. Bitmap payload: Bits set where values exist + /// + /// ## Returns + /// * `Vec` - Encoded byte vector, or empty vector if segments collection is empty + fn encode(&self) -> Vec { + let mut result = Vec::new(); + + // Return empty bytes for empty segments + if self.is_empty() { + return result; + } + + // Track the previous segment's max value for delta encoding of offsets. + let mut last_segment_max_value = 0; + + for segment in self.iter() { + // Calculate offset to encode: absolute for first segment, delta from + // the previous segment's max value for subsequent segments. Helps to + // reduce the encoded offset size when there are multiple segments. + let actual_offset = segment.offset().saturating_sub(last_segment_max_value); // Delta encoding for savings + + // Encode segment using the bitmap encoder + let mut payload_bytes = encode_bitmap(segment); + + // Write the segment offset, determined above + Leb128::encode_into(actual_offset, &mut result); + + // Write the payload length header + let payload_length = payload_bytes.len() as u64; + Leb128::encode_into(payload_length, &mut result); + + // Append the encoded segment payload + result.append(&mut payload_bytes); + + // Update the previous segment's max value for delta encoding + last_segment_max_value = segment.max(); + } + + result + } +} + +/// Encodes a single segment using bitmap encoding. 
+/// +/// This function creates a bitmap representation of the segment values, where: +/// - Each bit position corresponds to a value relative to the segment offset +/// - Bit at position N represents whether value (offset+N+1) exists in the segment +/// +/// ## Parameters +/// * `segment` - Reference to the segment to encode +/// +/// ## Returns +/// * `Vec` - Encoded bitmap bytes +/// +/// ## Algorithm +/// 1. Calculate required bitmap size based on range of values +/// 2. Allocate zeroed buffer of appropriate size +/// 3. For each value in the segment: +/// a. Calculate its position relative to offset +/// b. Set the corresponding bit in the bitmap +fn encode_bitmap(segment: &Segment) -> Vec { + // Calculate bitmap size requirements + let range = segment.range(); + let bytes_needed = range.div_ceil(8); + + // Allocate bitmap array filled with zeros + let mut bitmap = vec![0u8; bytes_needed as usize]; + + // Populate the bitmap by setting bits for each value (excluding the + // offset). + for &value in segment.payload_values() { + // Convert from value to bit position: + // 1. Subtract offset to get relative position + // 2. Subtract 1 more because bit 0 represents (offset+1) + // + // SAFETY: The following subtractions are safe because: + // 1. The Segment type works explicitly with unsigned integers, + // 2. and segment.values() returns values in the segment excluding the + // offset, hence all values from segment.values() are > offset and ≥ 0, + // guaranteed by the invariant of the Segment type, + // 2. Therefore, the following is always ≥ 0: + let relative_pos: u64 = value - segment.offset() - 1; + + // Calculate byte and bit index within the bitmap + let byte_index = relative_pos / 8; + let bit_index = relative_pos % 8; + + // Set the corresponding bit in the bitmap + // + // SAFETY: The index access is safe because: + // 1. 
bytes_needed is calculated based on the range() of values in the + // segment, so byte_index is always in range [0, bytes_needed-1] + // 2. And bitmap is sized exactly to bytes_needed + // + // SAFETY: The bit shift operation is safe because: + // 1. bit_index = relative_pos % 8 is always in range [0, 7] + // 2. Shifting by 0-7 bits is safe for u8 (which has 8 bits) + bitmap[byte_index as usize] |= 1 << bit_index; + } + + bitmap +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::idpack::{Segment, Segments}; + use more_asserts::assert_le; + use test_case::test_case; + + /// Test encoding of segments with various value patterns + #[test_case(&[10], 0, &[10, 0]; "offset only")] + #[test_case(&[10, 11, 12], 2, &[10, 1, 0b00000011]; "sequential values")] + #[test_case(&[10, 11, 18], 8, &[10, 1, 0b10000001]; "sparse values")] + #[test_case(&[10, 11, 18, 26], 16, &[10, 2, 0b10000001, 0b10000000]; "multiple bytes")] + #[test_case(&[0, 8, 16, 24, 32], 32, &[0, 4, 0b10000000, 0b10000000, 0b10000000, 0b10000000]; "byte boundaries")] + #[test_case(&[0, 255], 255, &[0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0b01000000]; "maximum first byte")] + #[test_case(&[42], 0, &[42, 0]; "custom offset only")] + fn test_segment_encoding( + values: &[u64], + expected_range: u64, + expected_encoded: &[u8], + ) -> Result<(), Box> { + // Create a segment from the test values + let mut segment = Segment::new_with_offset(values[0]); + for &value in &values[1..] 
{ + segment.try_insert(value)?; + } + + // Verify the expected range calculation + assert_eq!( + segment.range(), + expected_range, + "segment range calculation should match expected" + ); + + // Create a segments collection and encode + let mut segments = Segments::default(); + segments.try_push(segment.clone())?; + let encoded = segments.encode(); + + // Verify the encoded output matches expectations + assert_eq!( + encoded, expected_encoded, + "encoded output should match expected bytes" + ); + + // Double-check bitmap encoding specifically + if values.len() > 1 { + let bitmap = encode_bitmap(&segment); + + // Verify bitmap contents + for &value in segment.payload_values() { + let relative_pos = value - segment.offset() - 1; + let byte_index = (relative_pos / 8) as usize; + let bit_index = (relative_pos % 8) as usize; + + assert_le!( + byte_index, + bitmap.len() - 1, + "bitmap should have enough bytes for value {value}" + ); + + assert!( + (bitmap[byte_index] & (1 << bit_index)) != 0, + "expected bit set for value {value} at byte {byte_index} bit {bit_index}" + ); + } + } + + Ok(()) + } + + /// Test delta encoding between segments + #[test_case( + &[(10, &[11, 12]), (20, &[21, 22])], + &[10, 1, 0b00000011, 8, 1, 0b00000011]; + "simple delta" + )] + #[test_case( + &[(10, &[11, 12]), (100, &[101]), (200, &[201, 202, 203])], + &[10, 1, 0b00000011, 88, 1, 0b00000001, 99, 1, 0b00000111]; + "multi segment delta" + )] + fn test_segments_delta_encoding( + segment_specs: &[(u64, &[u64])], + expected_encoded: &[u8], + ) -> Result<(), Box<dyn std::error::Error>> { + // Create segments with specified offsets and values + let mut segments = Segments::default(); + + for (offset, values) in segment_specs { + let mut segment = Segment::new_with_offset(*offset); + for &value in *values { + segment.try_insert(value)?; + } + segments.try_push(segment)?; + } + + // Encode the segments + let encoded = segments.encode(); + + // Verify against expected encoding + assert_eq!( + encoded, expected_encoded, + 
"delta encoding should match expected bytes" + ); + + Ok(()) + } + + /// Test encoding of empty segments collection + #[test] + fn test_empty_segments() { + let segments = Segments::default(); + let encoded = segments.encode(); + assert_eq!( + encoded.len(), + 0, + "empty segments should encode to empty bytes" + ); + } +} diff --git a/sbtc/src/idpack/codec/mod.rs b/sbtc/src/idpack/codec/mod.rs new file mode 100644 index 000000000..9e6448c03 --- /dev/null +++ b/sbtc/src/idpack/codec/mod.rs @@ -0,0 +1,77 @@ +//! Segment encoding/decoding implementations with compression-optimized +//! routines. +//! +//! This module handles the compressed encoding and decoding of integer segments +//! using bitmap encoding optimized for specific data patterns. The codec +//! implements several efficiency techniques: +//! +//! - **Delta-offset optimization**: Offsets are delta-encoded for space savings +//! - **LEB128 variable-length encoding**: Minimizes space for numeric values +//! +//! ## Safety Considerations +//! +//! The decoder implements multiple safety checks to handle potentially +//! malicious or corrupt inputs: +//! +//! - Validates allocation sizes to prevent excessive memory usage when decoding +//! - Handles integer overflow with checked arithmetic +//! - Properly handles truncated or incomplete data +//! - Enforces semantic constraints on segment relationships +//! +//! ## Format +//! +//! Each segment is encoded as: +//! +//! 1. Offset value (LEB128-encoded, delta compressed after first segment) +//! 2. Bitmap length (LEB128-encoded) +//! 3. Bitmap bytes (1 bit per value) + +use super::segment; + +mod decoder; +mod encoder; + +/// Trait for types that can be encoded to bytes. +pub trait Encodable { + /// Encodes an instance into a byte vector. + fn encode(&self) -> Vec; +} + +/// Trait for types that can be decoded from bytes. +pub trait Decodable: Sized { + /// Decodes an instance from bytes. 
+ fn decode(bytes: &[u8]) -> Result<Self, DecodeError>; + } + +/// Detailed errors that can occur during segment decoding. +/// These errors provide specific diagnostics for compression format issues. +#[derive(Debug, thiserror::Error)] +pub enum DecodeError { + /// Error decoding LEB128-encoded value + #[error("error decoding LEB128 value: {0}")] + Leb128(#[from] crate::leb128::Error), + + /// I/O error during decoding + #[error("io error: {0}")] + IO(#[from] std::io::Error), + + /// Buffer ended unexpectedly during decoding + #[error("unexpected end of data")] + UnexpectedEndOfData, + + /// Error adding decoded values to a segment + #[error("error decoding segment values: {0}")] + Segment(#[from] segment::SegmentError), + + /// Error adding decoded segments to a collection (i.e. overlapping segments) + #[error("error decoding segments: {0}")] + Segments(#[from] crate::idpack::segments::SegmentsError), + + /// Numeric overflow during decoding calculations + #[error("arithmetic overflow")] + ArithmeticOverflow, + + /// Total allocation size exceeds safety limit + #[error("byte allocation limit exceeded: {0}")] + ByteAllocationLimit(u64), +} diff --git a/sbtc/src/idpack/mod.rs b/sbtc/src/idpack/mod.rs new file mode 100644 index 000000000..36f7bd087 --- /dev/null +++ b/sbtc/src/idpack/mod.rs @@ -0,0 +1,65 @@ +//! # IDPack: Integer Set Compression Encoding +//! +//! `idpack` is an integer compression module designed to achieve byte +//! savings through automatic segmentation and multiple bitmaps. +//! +//! ## Usage Example +//! +//! ``` +//! use sbtc::idpack::{BitmapSegmenter, Segmenter, Encodable}; +//! +//! // Compress a sequence of integers with maximum efficiency +//! let values = vec![1, 2, 3, 50, 51, 52, 1000, 1001]; +//! +//! // Segment the values with automatic encoding selection +//! let segments = BitmapSegmenter.package(&values).unwrap(); +//! +//! // Encode to binary representation +//! let encoded = segments.encode(); +//! +//! 
println!("Compressed {} integers into {} bytes", values.len(), encoded.len()); +//! ``` +//! +//! ## Safety Considerations +//! +//! This library implements safeguards against memory exhaustion attacks that +//! could occur when decoding malicious inputs: +//! +//! * Input validation for semantic correctness (packaging) +//! * Safe bitmap allocation limits (decoding) +//! * Protection against excessive delta ranges (segmenting) +//! +//! ## Architecture +//! +//! * **Segmenters**: Divide integer sequences into optimally-sized segments +//! * **Segments**: Manage collections of un-encoded segments +//! * **Segment**: Represents a single packaged integer range +//! * **Codec**: Low-level encoding/decoding + +mod codec; +mod segment; +mod segmenters; +mod segments; + +#[cfg(test)] +mod tests; + +pub use segment::Segment; +pub use segment::SegmentError; + +pub use segments::Segments; +pub use segments::SegmentsError; + +pub use segmenters::BitmapSegmenter; +pub use segmenters::Segmenter; +pub use segmenters::SegmenterError; + +pub use codec::Decodable; +pub use codec::DecodeError; +pub use codec::Encodable; + +/// Maximum allocation limit in bytes (1MB) for a single bitmap payload for +/// preventing memory allocation attacks while allowing sufficient space for +/// optimal compression operations. This limit has no effect on the number of +/// segments or number of values to be decoded. +pub const ALLOC_BYTES_LIMIT: u32 = 1 << 20; // 1MB = 2^20 bytes diff --git a/sbtc/src/idpack/segment.rs b/sbtc/src/idpack/segment.rs new file mode 100644 index 000000000..5cd1979c8 --- /dev/null +++ b/sbtc/src/idpack/segment.rs @@ -0,0 +1,254 @@ +//! Segment encoding for withdrawal IDs with guaranteed invariants. +//! +//! # Safety Invariants +//! +//! A `Segment` maintains critical invariants at all times: +//! - **Never empty**: Always contains at least one value (the offset) +//! - **Always sorted**: Values are in strictly ascending order +//! 
- **No duplicates**: Each value appears exactly once +//! +//! These invariants are enforced by the API and enable optimized encoding and +//! safe access without bounds checking in critical paths. + +use std::fmt::Debug; + +/// Error types that can occur when working with segments. +#[derive(Debug, Clone, Copy, PartialEq, Eq, thiserror::Error)] +pub enum SegmentError { + /// Values must be in strictly ascending order. + /// Provides the value that violated the ordering constraint. + #[error("Value {0} is out of order (must be inserted in strictly ascending order)")] + UnsortedValue(u64), + + /// The input contains duplicate values. + /// Duplicate elimination is crucial for maximum compression. + #[error("The input contains duplicate values")] + DuplicateValue(u64), +} + +/// Represents a segment of integer values encoded with a specific method. +/// Facilitates pattern-based optimal compression. +/// +/// # Safety Considerations +/// - Contains at least one value (offset) at all times +/// - Values are always sorted in strictly ascending order +/// - Duplicate values are not allowed (inserting one returns an error) +#[derive(Clone)] +pub struct Segment { + values: Vec<u64>, +} + +impl Segment { + /// Creates a new segment with the specified encoding and initial offset + /// value. The offset is crucial for compression as it establishes the base + /// value for the segment. + pub fn new_with_offset(offset: u64) -> Self { + Self { values: vec![offset] } + } + + /// Returns the offset (first value) of the segment. + pub fn offset(&self) -> u64 { + // SAFETY: `values` is never empty due to struct invariants + self.values[0] + } + + /// Inserts a value into the segment, requiring that values are sorted. + /// Attempting to insert a value equal to the current maximum fails with + /// a `DuplicateValue` error. 
+ /// + /// ## Errors + /// - Unsorted values (`UnsortedInput`) + pub fn try_insert(&mut self, value: u64) -> Result<(), SegmentError> { + // SAFETY: `values` is never empty due to struct invariants + let last_value = self.max(); + + // Validate that the new value is greater than the last value (sorted) + if value < last_value { + return Err(SegmentError::UnsortedValue(value)); + } + + // If the value already exists, return early (no duplicates allowed) + if value == last_value { + return Err(SegmentError::DuplicateValue(value)); + } + + // Add the value to the segment + self.values.push(value); + + Ok(()) + } + + /// Gets a slice of all values in the segment, including the offset. + pub fn as_slice(&self) -> &[u64] { + &self.values + } + + /// Gets a slice of all values in the segment, excluding the offset. + /// Returns an empty slice if there are no values beyond the offset. + pub fn payload_values(&self) -> &[u64] { + // SAFETY: `values` is never empty due to struct invariants + &self.values[1..] + } + + /// Returns the number of values in the segment (including offset). + #[allow(clippy::len_without_is_empty)] + pub fn len(&self) -> usize { + self.values.len() + } + + /// Returns span of the segment (maximum value - offset). + pub fn range(&self) -> u64 { + self.max() - self.offset() + } + + /// Returns the greatest value in the segment. + pub fn max(&self) -> u64 { + // SAFETY: `values` is never empty due to struct invariants + self.values[self.values.len() - 1] + } +} + +/// String representation for segments: `Segment(value1,value2,...)`. +/// Useful for debugging during compression optimization. +impl std::fmt::Display for Segment { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Segment(")?; + for (i, value) in self.values.iter().enumerate() { + if i > 0 { + write!(f, ",")?; + } + write!(f, "{}", value)?; + } + write!(f, ")") + } +} + +/// Debug representation for segments, matching the Display format. 
+impl std::fmt::Debug for Segment { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Display::fmt(&self, f) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use assert_matches::assert_matches; + use test_case::test_case; + + /// Test segment creation with different offsets + #[test_case(0; "zero offset")] + #[test_case(10; "small offset")] + #[test_case(u64::MAX; "large offset")] + fn test_new_segment(offset: u64) { + let segment = Segment::new_with_offset(offset); + + // Verify offset is stored correctly + assert_eq!(segment.offset(), offset); + + // Verify invariant: segment is never empty + assert!(!segment.as_slice().is_empty()); + assert_eq!(segment.len(), 1); + + // Verify max equals offset when only offset exists + assert_eq!(segment.max(), offset); + + // Verify range is zero when only offset exists + assert_eq!(segment.range(), 0); + } + + /// Test successful insertion in ascending order + #[test] + fn test_ordered_insertion() -> Result<(), SegmentError> { + let mut segment = Segment::new_with_offset(10); + + // Insert values in strictly ascending order + segment.try_insert(11)?; + segment.try_insert(12)?; + segment.try_insert(15)?; + segment.try_insert(22)?; + + // Verify all values are stored + assert_eq!(segment.len(), 5); + assert_eq!(segment.as_slice(), &[10, 11, 12, 15, 22]); + assert_eq!(segment.payload_values(), &[11, 12, 15, 22]); + + // Verify max and range + assert_eq!(segment.offset(), 10); + assert_eq!(segment.max(), 22); + assert_eq!(segment.range(), 12); // 22 - 10 + + Ok(()) + } + + #[test_case(&[10, 10] => Err(SegmentError::DuplicateValue(10)); "duplicate offsets")] + #[test_case(&[10, 11, 11] => Err(SegmentError::DuplicateValue(11)); "duplicate values")] + fn test_duplicate_value_error(values: &[u64]) -> Result<(), SegmentError> { + let mut segment = Segment::new_with_offset(values[0]); + + // Insert duplicates + for &value in &values[1..] 
{ + segment.try_insert(value)?; + } + Ok(()) + } + + /// Test insertion of out-of-order values + #[test_case(10, 9; "value less than offset")] + #[test_case(10, 0; "zero value with non-zero offset")] + fn test_out_of_order_insertion(offset: u64, value: u64) { + let mut segment = Segment::new_with_offset(offset); + + // Try inserting an out-of-order value + let result = segment.try_insert(value); + + // Verify appropriate error is returned + assert!(result.is_err()); + assert_matches!(result, Err(SegmentError::UnsortedValue(v)) if v == value); + + // Verify segment wasn't modified + assert_eq!(segment.len(), 1); + assert_eq!(segment.offset(), offset); + } + + /// Test insertion after values are already present + #[test] + fn test_intermediate_insertion_error() -> Result<(), SegmentError> { + let mut segment = Segment::new_with_offset(10); + + // Add values + segment.try_insert(15)?; + segment.try_insert(20)?; + + // Try inserting value between existing values (should fail) + segment + .try_insert(14) + .expect_err("expected insertion error"); + + // Verify value > max still works + segment.try_insert(25)?; + + // Verify segment state + assert_eq!(segment.as_slice(), &[10, 15, 20, 25]); + + Ok(()) + } + + /// Test segment with large ranges and edge values + #[test_case(0, u64::MAX; "full range")] + #[test_case(u64::MAX / 2, u64::MAX; "upper half")] + #[test_case(0, 1; "minimum range")] + fn test_range_calculation(offset: u64, max: u64) -> Result<(), SegmentError> { + let mut segment = Segment::new_with_offset(offset); + + // Insert maximum value + segment.try_insert(max)?; + + // Verify range calculation + assert_eq!(segment.offset(), offset); + assert_eq!(segment.range(), max - offset); + assert_eq!(segment.max(), max); + + Ok(()) + } +} diff --git a/sbtc/src/idpack/segmenters/bitmap.rs b/sbtc/src/idpack/segmenters/bitmap.rs new file mode 100644 index 000000000..a5bc59509 --- /dev/null +++ b/sbtc/src/idpack/segmenters/bitmap.rs @@ -0,0 +1,455 @@ +use crate::{ + 
idpack::{Segment, Segments}, + leb128::Leb128, +}; + +use super::{Segmenter, SegmenterError}; + +/// Bitmap cost calculation result for compression optimization. +/// +/// Contains the calculated byte costs for both segmentation options: +/// 1. splitting at the current position, or +/// 2. continuing the current segment. +#[derive(Debug)] +struct BitmapCosts { + /// Total bytes required if we split at current position + /// Includes current segment bytes plus new segment overhead and payload + bytes_if_split: usize, + + /// Total bytes required if we continue current segment to include next value + /// Includes overhead plus expanded bitmap payload + bytes_if_combined: usize, +} + +impl BitmapCosts { + /// Determines if splitting produces a smaller byte size + /// + /// This core decision function compares exact byte costs to ensure + /// optimal segmentation. + /// + /// ## Returns + /// * `true` if splitting saves bytes compared to continuing the current segment, + /// * `false` otherwise + fn should_split(&self) -> bool { + // Simple core decision: split when it saves bytes + self.bytes_if_split < self.bytes_if_combined + } + + /// Calculates bitmap costs for both splitting and continuing scenarios. + /// + /// Performs precise byte-level analysis to determine optimal segmentation: + /// * Calculates exact bitmap size using ceiling division + /// * Includes accurate LEB128 overhead costs based on value sizes + /// + /// ## Parameters + /// * `offset` - The current segment's offset value (first value) + /// * `prev` - The previous value in the sequence + /// * `next` - The next value being considered for inclusion + /// + /// ## Returns + /// A [`BitmapCosts`] instance containing the byte costs for both options. 
+ fn calculate(offset: u64, prev: u64, next: u64) -> Self { + // Calculate current sizes + let current_payload = (prev - offset).div_ceil(8); + let current_length_header = Leb128::calculate_size(current_payload); + + // Calculate combined sizes + let combined_payload = (next - offset).div_ceil(8); + let combined_length_header = Leb128::calculate_size(combined_payload); + let bytes_if_combined = combined_length_header + combined_payload as usize; + + // Calculate split sizes + let split_length_header = 1; // Will always be one byte for only an offset + let split_offset = Leb128::calculate_size(next.saturating_sub(prev)); + let bytes_if_split = current_length_header + + current_payload as usize + + split_offset + + split_length_header as usize; + + // Return the precise byte costs for compression decision + Self { + bytes_if_split, + bytes_if_combined, + } + } +} + +/// A bitmap segmenter that optimizes for byte savings using direct size comparison +/// +/// This segmenter analyzes each potential split point with byte-level precision +/// to achieve optimal compression for integer sequences. +pub struct BitmapSegmenter; + +impl Segmenter for BitmapSegmenter { + /// Creates a new `Segments` instance with optimal boundaries + /// + /// This main entry point for bitmap segmentation divides values into segments + /// at exactly the points that optimize compression. 
+ /// + /// ## Parameters + /// * `values` - The sorted sequence of values to segment + /// + /// ## Returns + /// A `Result` containing either the optimally segmented values or an error + fn package(&self, values: &[u64]) -> Result { + // If no values, return empty segments + if values.is_empty() { + return Ok(Segments::default()); + } + + // Ensure input is sorted and unique for bitmap segmentation + if !Self::is_unique_sorted(values) { + return Err(SegmenterError::InvalidSequence); + } + + // Find optimal segment boundaries based on byte savings + let boundaries = self.find_segment_boundaries(values); + + // Create segments using identified boundaries + let segments = self.create_segments_from_boundaries(values, &boundaries)?; + + Ok(segments) + } + + /// Estimates the total packaged and encoded size in bytes. + /// + /// ## Parameters + /// * `values` - The sorted sequence of values to estimate size for + /// + /// ## Returns + /// * `Ok(size)` - The estimated size in bytes that would be used when + /// encoding + /// * `Err(error)` - If input validation fails (empty or unsorted input) + /// + /// ## Notes + /// This currently uses the full packaging process, but an improvement + /// would be to calculate boundaries and use them to estimate the size + /// without creating (allocating) the segments. 
+ fn estimate_size(&self, values: &[u64]) -> Result { + if values.is_empty() { + return Ok(0); + } + + // Generate optimally segmented values using our boundary detection algorithm + let segments = self.package(values)?; + + // Track the previous segment offset for delta encoding + let mut prev_max = 0; + + // Calculate the precise byte size with optimal compression + let encoded_size = segments.iter().fold(0, |total_bytes, segment| { + // Calculate LEB128-encoded delta offset from the previous segment's max value + let delta = segment.offset().saturating_sub(prev_max); + let offset_size = Leb128::calculate_size(delta); + + // Calculate bitmap size requirements + let bytes_needed = segment.range().div_ceil(8); + + // Calculate the byte-length header size for the bitmap + let length_header_size = Leb128::calculate_size(bytes_needed); + + // Update previous offset for next iteration + prev_max = segment.max(); + + // Calculate the total size for the segment + let segment_size = offset_size + length_header_size + bytes_needed as usize; + + // Add the new segment size to the accumulator + total_bytes + segment_size + }); + + Ok(encoded_size) + } +} + +impl BitmapSegmenter { + /// Finds optimal segment boundaries by directly comparing byte costs + /// + /// This core algorithm analyzes each potential split point to optimize + /// compression by comparing byte costs for splitting vs. continuing. + /// + /// These byte-perfect decisions ensure optimal compression by creating + /// segments exactly where they save bytes. 
+ /// + /// ## Parameters + /// * `values` - The sorted sequence of values to segment + /// + /// ## Returns + /// A vector of boundary indices representing optimal segment divisions + fn find_segment_boundaries(&self, values: &[u64]) -> Vec { + let mut boundaries = vec![0]; // Always include start index + + // Handle empty and single value sequences + if values.len() <= 1 { + boundaries.push(values.len()); + return boundaries; + } + + // Track the first value in current segment (used as segment offset) + // This affects bitmap size calculations. + // SAFETY: we just ensured that `values` is not empty + let mut current_offset = values[0]; + + // Iterate over pairs of previous and next values to calculate byte costs + // and determine optimal split points + for (pos, window) in values.windows(2).enumerate() { + let [prev, next] = *window else { + // This branch is unreachable with windows(2), but is needed + // for the compiler to understand the pattern match. + continue; + }; + + // Calculate bitmap costs for splitting vs. combining + let bitmap_costs = BitmapCosts::calculate(current_offset, prev, next); + + // Determine if splitting here maximizes compression + let should_split = bitmap_costs.should_split(); + + if should_split { + // If we're splitting, then the next position is a start boundary + // for the next segment + boundaries.push(pos + 1); + + // `next` is the new segment's offset + current_offset = next; + } + } + + // Always include end boundary + boundaries.push(values.len()); + + boundaries + } + + /// Creates bitmap segments based on the identified boundaries + /// + /// Converts logical segment boundaries into actual encoded segments. 
+ /// + /// ## Parameters + /// * `values` - The sorted sequence of original values + /// * `boundaries` - The optimal boundary indices determined by analysis + /// + /// ## Returns + /// * Ok(Segments) + /// * Err(SegmenterError::InvalidBoundaries) - If boundaries are invalid + /// * Err(SegmenterError::Segment) - If segment manipulation fails (i.e. unsorted values) + /// * Err(SegmenterError::Segments) - If segments manipulation fails (i.e. overlapping segments) + fn create_segments_from_boundaries( + &self, + values: &[u64], + boundaries: &[usize], + ) -> Result { + let mut segments = Segments::default(); + + // Create a segment for each pair of boundaries + for window in boundaries.windows(2) { + let [start_idx, end_idx] = *window else { + // This branch is unreachable with windows(2), but is needed + // for the compiler to understand the pattern match. + continue; + }; + + // SAFETY: `start_idx` and `end_idx` are always valid range values + // as the boundaries themselves are derived from index positions + // within the `values` slice (where the last `end_idx` is always + // `values.len()` and thus valid for the exclusive range). + let slice = &values[start_idx..end_idx]; + let Some(offset) = slice.first() else { + return Err(SegmenterError::InvalidBoundaries); + }; + + // Create segment using offset + let mut segment = Segment::new_with_offset(*offset); + + // Use iterator to add remaining values without indexing + for &value in slice.iter().skip(1) { + segment.try_insert(value)?; + } + + segments.try_push(segment)?; + } + + Ok(segments) + } + + /// Checks if a slice is sorted in ascending order and contains only unique + /// values. + /// + /// Bitmap segmentation requires sorted input as it relies on gaps between + /// consecutive values. 
+ /// + /// ## Parameters + /// * `values` - The sequence to check for sorting + /// + /// ## Returns + /// * `true` if values are sorted in ascending order and doesn't contain + /// duplicates, + /// * `false` otherwise + #[inline] + fn is_unique_sorted(values: &[u64]) -> bool { + values.is_empty() || values.windows(2).all(|w| w[0] < w[1]) + } +} + +#[cfg(test)] +mod tests { + use crate::idpack::Encodable; + + use super::*; + use assert_matches::assert_matches; + use proptest::prelude::*; + use test_case::test_case; + + /// Tests validation error handling + #[test] + fn test_validation_errors() { + // Unsorted input + assert_matches!( + BitmapSegmenter.package(&[5, 3, 1]), + Err(SegmenterError::InvalidSequence) + ); + + // Duplicate values + assert_matches!( + BitmapSegmenter.package(&[5, 5, 10]), + Err(SegmenterError::InvalidSequence) + ); + } + + /// Tests the boundary detection with direct byte savings calculations + #[test_case(&[10], &[0, 1]; "single value")] + #[test_case(&[10, 11, 12, 13, 14], &[0, 5]; "small sequential - no splits")] + #[test_case(&[10, 20, 30, 40, 50], &[0, 5]; "evenly spaced - no splits")] + #[test_case(&[10, 11, 12, 1000, 1001, 1002], &[0, 3, 6]; "clear gap with byte savings")] + #[test_case(&[1, 1000000], &[0, 1, 2]; "extreme gap forces split")] + #[test_case(&[1, 2, 3, 100000, 100001], &[0, 3, 5]; "large gap with min size respected")] + #[test_case(&[10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109], &[0, 10, 20]; "larger sequence with multiple splits")] + fn test_byte_saving_boundary_detection(values: &[u64], expected_boundaries: &[usize]) { + let boundaries = BitmapSegmenter.find_segment_boundaries(values); + assert_eq!( + boundaries, expected_boundaries, + "unexpected boundaries for values: {values:?}" + ); + } + + /// Tests precise break-even gap detection + #[test_case(&[1, 1000000], 2; "extreme gap creates 2 segments")] + #[test_case(&[1, 2, 3, 4, 5], 1; "small dense sequence stays as 1 
segment")] + #[test_case(&[1, 2, 50000, 50001], 2; "gap creates 2 segments")] + #[test_case(&[10, 10 + 16], 1; "gap of 16 with 1-byte delta - no split")] + #[test_case(&[10, 10 + 17], 2; "gap of 17 with 1-byte delta - split")] + #[test_case(&[10000, 10000 + 24], 2; "gap of 24 with 2-byte delta - split")] + #[test_case(&[10000, 10000 + 25], 2; "gap of 25 with 2-byte delta - split")] + #[test_case(&[1, 1_000_000, 1_000_001], 2; "single value followed by large gap")] + #[test_case(&[1, 2, 1_000_000, 1_000_001], 2; "multiple values followed by large gap")] + #[test_case(&[1, 1_000_000, 1_000_001, 2_000_000], 3; "multiple large gaps")] + #[test_case(&[1, 1_000, 10_000, 10_001, 100_000], 4; "multiple varied gaps")] + #[test_case(&[1, 11, 21, 31, 41, 51, 61, 71, 81, 91], 1; "10 values spread over 100 range")] + #[test_case(&[1, 2, 3, 10_000, 20_000, 30_000], 4; "large range with few values")] + fn test_split_calculations(values: &[u64], expected_segments: usize) { + let result = BitmapSegmenter.package(values).unwrap(); + assert_eq!( + result.len(), + expected_segments, + "failed to correctly split values: {values:?}" + ); + } + + /// Test the estimate_size method against actual encoding size + #[test_case(&[]; "empty input")] + #[test_case(&[5]; "single value")] + #[test_case(&[10, 11, 12]; "sequential values")] + #[test_case(&[10, 20, 30]; "spaced values")] + #[test_case(&[10, 11, 50]; "values with gap")] + #[test_case(&[1, 100, 1000, 10000]; "large gaps")] + #[test_case(&[1, 2, 3, 20, 21, 22, 50, 51, 52]; "multiple segments")] + #[test_case(&[0]; "zero")] + #[test_case(&[u64::MAX]; "u64::max")] + #[test_case(&[0, u64::MAX]; "full range")] + #[test_case(&[0, 1, 2, 3, 4, 5, 6, 7, 8]; "byte boundary")] + fn test_size_estimation_accuracy(values: &[u64]) -> Result<(), Box> { + // Skip empty check for this test to avoid early return + if values.is_empty() { + return Ok(()); + } + + // Create a bitmap segmenter to test + let segmenter = BitmapSegmenter; + + // Get estimated 
size + let estimated_size = segmenter.estimate_size(values)?; + + // Get actual size by packaging and encoding + let segments = segmenter.package(values)?; + let encoded = segments.encode(); + let actual_size = encoded.len(); + + // Verify estimate matches actual size + assert_eq!( + estimated_size, actual_size, + "estimated size {estimated_size} should match actual encoded size {actual_size}", + ); + + Ok(()) + } + + /// Test error handling for invalid inputs + #[test_case(&[5, 3, 10] => Err(SegmenterError::InvalidSequence); "unsorted input")] + #[test_case(&[1, 2, 2, 3] => Err(SegmenterError::InvalidSequence); "duplicate values")] + fn test_estimate_size_invalid_inputs(values: &[u64]) -> Result { + BitmapSegmenter.estimate_size(values) + } + + /// Test estimate consistency across multiple calls + #[test] + fn test_estimate_consistency() -> Result<(), SegmenterError> { + let segmenter = BitmapSegmenter; + let values = &[10, 20, 30, 40, 50, 100, 200]; + + // Multiple calls should return the same estimate + let first_estimate = segmenter.estimate_size(values)?; + let second_estimate = segmenter.estimate_size(values)?; + let third_estimate = segmenter.estimate_size(values)?; + + assert_eq!(first_estimate, second_estimate); + assert_eq!(second_estimate, third_estimate); + + Ok(()) + } + + // Add property-based testing for broader input coverage + proptest! 
{ + /// Property-based test for size estimation accuracy across randomized inputs + #[test] + fn prop_size_estimation_accuracy( + // Generate sorted u64 vectors with reasonable size constraints + values in prop::collection::vec(0..50_000_u64, 0..100) + .prop_map(|mut v| { + v.sort_unstable(); + v.dedup(); // Remove duplicates for valid input + v + }) + ) { + // Skip empty vectors (already tested explicitly) + prop_assume!(!values.is_empty()); + + let segmenter = BitmapSegmenter; + + // This could potentially fail, which proptest will handle + let estimated_size = segmenter.estimate_size(&values)?; + + let segments = segmenter.package(&values)?; + let encoded = segments.encode(); + let actual_size = encoded.len(); + + // The key property being tested + prop_assert_eq!( + estimated_size, + actual_size, + "estimated size {} should match actual encoded size {}", + estimated_size, + actual_size + ); + } + } +} diff --git a/sbtc/src/idpack/segmenters/mod.rs b/sbtc/src/idpack/segmenters/mod.rs new file mode 100644 index 000000000..0d15d98b5 --- /dev/null +++ b/sbtc/src/idpack/segmenters/mod.rs @@ -0,0 +1,49 @@ +mod bitmap; + +use super::segments::Segments; +use super::{segment, segments}; + +pub use bitmap::BitmapSegmenter; + +/// Errors which can occur during the adaptive segmentation process. +#[derive(Debug, PartialEq, thiserror::Error)] +pub enum SegmenterError { + /// The input values are not sorted or contain duplicates. + #[error("input must be sorted and contain unique values")] + InvalidSequence, + + /// An error was returned by the segment module. + #[error(transparent)] + Segment(#[from] segment::SegmentError), + + /// An error was returned by the segments module. + #[error(transparent)] + Segments(#[from] segments::SegmentsError), + + /// The segmenter encountered invalid boundaries. + #[error("the segmenter encountered invalid boundaries")] + InvalidBoundaries, +} + +/// Trait for segmenting integer values into optimal segments. 
pub trait Segmenter {
    /// Segments the input values into a series of segments.
    ///
    /// ## Parameters
    /// * `values` - The input values to segment.
    ///
    /// ## Returns
    /// A [`Segments`] collection containing the input values.
    fn package(&self, values: &[u64]) -> Result<Segments, SegmenterError>;

    /// Estimates the total packaged and encoded size in bytes.
    ///
    /// ## Parameters
    /// * `values` - The sequence of values to estimate size for
    ///
    /// ## Returns
    /// * `Ok(size)` - The estimated size in bytes that would be used when
    ///   encoding
    /// * `Err(error)` - Upon encountering an error during estimation
    fn estimate_size(&self, values: &[u64]) -> Result<usize, SegmenterError>;
}
diff --git a/sbtc/src/idpack/segments.rs b/sbtc/src/idpack/segments.rs
new file mode 100644
index 000000000..e8ad9116d
--- /dev/null
+++ b/sbtc/src/idpack/segments.rs
@@ -0,0 +1,190 @@
use super::Segment;

/// Errors that can occur when working with segment collections.
#[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)]
pub enum SegmentsError {
    /// Segments must be added in ascending order (no overlaps permitted).
    #[error("New segment offset {offset} must be greater than previous segment's maximum value {prev_max}")]
    OverlappingSegments {
        /// The offset of the segment being added
        offset: u64,
        /// The maximum value from the previous segment
        prev_max: u64,
    },
}

/// Collection of segment objects representing segmented integer data.
#[derive(Debug, Default, Clone)]
pub struct Segments(Vec<Segment>);

impl Segments {
    /// Returns an iterator over all values across all segments.
    ///
    /// Values are returned in segment-order, with each segment's values
    /// returned in ascending order.
    pub fn values(&self) -> impl Iterator<Item = u64> + '_ {
        self.0
            .iter()
            .flat_map(|segment| segment.as_slice().iter().copied())
    }

    /// Pushes a new segment to the end of the inner segments list, validating
    /// proper ordering with existing segments.
    ///
    /// To maintain correct segment ordering for compression and decoding, each
    /// new segment must have an offset greater than the maximum value of the
    /// previous segment.
    ///
    /// # Returns
    /// * `Ok(())` - If the segment was successfully added
    /// * `Err(SegmentsError::OverlappingSegments)` - If the segment violates
    ///   ordering constraints
    pub fn try_push(&mut self, segment: Segment) -> Result<(), SegmentsError> {
        // Check if there are existing segments
        if let Some(last_segment) = self.0.last() {
            // Get the maximum value from the last segment
            let prev_max = last_segment.max();

            // Validate that new segment's offset is greater than previous max
            if segment.offset() <= prev_max {
                return Err(SegmentsError::OverlappingSegments {
                    offset: segment.offset(),
                    prev_max,
                });
            }
        }

        // Validation passed, add the segment
        self.0.push(segment);

        Ok(())
    }

    /// Returns the number of segments in the collection.
    pub fn len(&self) -> usize {
        self.0.len()
    }

    /// Returns `true` if there are no segments in the collection.
    pub fn is_empty(&self) -> bool {
        self.0.is_empty()
    }

    /// Returns an iterator over the inner segments.
    ///
    /// Provides non-consuming access to segments, useful for
    /// analysis operations that don't modify the collection.
    pub fn iter(&self) -> impl Iterator<Item = &Segment> {
        self.0.iter()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use test_case::test_case;

    /// Helper method to create a new `Segment` from a slice of values.
    fn segment(values: &[u64]) -> Segment {
        assert!(!values.is_empty(), "segment values cannot be empty");

        let mut segment = Segment::new_with_offset(values[0]);
        for &value in &values[1..]
{ + segment.try_insert(value).unwrap(); + } + segment + } + + /// Test basic Segments construction and accessors + #[test] + fn test_new_segments() { + // Empty segments + let segments = Segments::default(); + assert!(segments.is_empty()); + assert_eq!(segments.len(), 0); + assert_eq!(segments.iter().count(), 0); + assert_eq!(segments.values().count(), 0); + + // Create with segments + let mut segment1 = Segment::new_with_offset(10); + segment1.try_insert(15).unwrap(); + + let mut segments = Segments::default(); + segments.try_push(segment1).expect("failed to push segment"); + assert!(!segments.is_empty()); + assert_eq!(segments.len(), 1); + assert_eq!(segments.iter().count(), 1); + assert_eq!(segments.values().count(), 2); // Contains offset and value + } + + /// Test segment ordering validation during push operations + #[test_case( + &[segment(&[10, 15, 20]), segment(&[30, 35, 40])] => Ok(()); + "valid ascending segments" + )] + #[test_case( + &[segment(&[10]), segment(&[20]), segment(&[30])] => Ok(()); + "offset-only segments" + )] + #[test_case( + &[segment(&[0, 1]), segment(&[2, 3]), segment(&[4, 5])] => Ok(()); + "minimal spacing" + )] + #[test_case( + &[segment(&[10]), segment(&[10])] => Err(SegmentsError::OverlappingSegments { + offset: 10, + prev_max: 10, + }); + "overlapping offset-only segments" + )] + #[test_case( + &[segment(&[10, 15, 20]), segment(&[20, 25])] => Err(SegmentsError::OverlappingSegments { + offset: 20, + prev_max: 20, + }); + "second offset equals first max" + )] + #[test_case( + &[segment(&[10, 15, 20]), segment(&[19, 25])] => Err(SegmentsError::OverlappingSegments { + offset: 19, + prev_max: 20, + }); + "second offset less than first max" + )] + #[test_case( + &[segment(&[10, 15, 20]), segment(&[5, 25])] => Err(SegmentsError::OverlappingSegments { + offset: 5, + prev_max: 20, + }); + "second offset less than first offset" + )] + fn test_segments_push(segments_to_push: &[Segment]) -> Result<(), SegmentsError> { + let mut segments = 
Segments::default(); + for segment in segments_to_push { + segments.try_push(segment.clone())?; + } + Ok(()) + } + + /// Test values() iterator functionality + #[test] + fn test_values_iterator() -> Result<(), SegmentsError> { + let mut segments = Segments::default(); + + // Add three segments with specific values + let seg1 = segment(&[10, 11, 12]); + segments.try_push(seg1)?; + + let seg2 = segment(&[20, 21]); + segments.try_push(seg2)?; + + let seg3 = segment(&[30, 35, 40]); + segments.try_push(seg3)?; + + // Collect all values and verify + let all_values = segments.values().collect::>(); + assert_eq!(all_values, vec![10, 11, 12, 20, 21, 30, 35, 40]); + + Ok(()) + } +} diff --git a/sbtc/src/idpack/tests.rs b/sbtc/src/idpack/tests.rs new file mode 100644 index 000000000..1ebef1bb5 --- /dev/null +++ b/sbtc/src/idpack/tests.rs @@ -0,0 +1,183 @@ +//! Round-trip testing for verifying that the entire compression pipeline +//! (package -> encode -> decode) works correctly for a wide variety of input +//! data patterns. + +use crate::idpack::{ + codec::{Decodable, Encodable}, + segmenters::{BitmapSegmenter, Segmenter}, + Segments, +}; +use proptest::prelude::*; +use std::collections::BTreeSet; +use test_case::test_case; + +/// Maximum value for generated IDs to keep tests reasonable +const MAX_ID_VALUE: u64 = 10_000_000; + +/// Maximum gap between consecutive values in sparse sequences +const MAX_SPARSE_GAP: u64 = 1_000_000; + +// Main property test suite for round-trip testing of segmentation and encoding +proptest! 
{ + #[test] + fn test_roundtrip_dense_sequences(values in prop::collection::vec(1..10000u64, 1..1000)) { + let sorted_unique = to_sorted_unique(&values); + prop_assume!(!sorted_unique.is_empty()); + roundtrip_test(&sorted_unique).expect("round-trip failed"); + } + + #[test] + fn test_roundtrip_sparse_sequences( + base in 1..1000u64, + increments in prop::collection::vec(1..MAX_SPARSE_GAP, 1..100) + ) { + // Create a sparse sequence with large gaps + let mut values = Vec::with_capacity(increments.len()); + let mut current = base; + + for inc in increments { + current += inc; + if current <= MAX_ID_VALUE { + values.push(current); + } + } + + prop_assume!(!values.is_empty()); + roundtrip_test(&values).expect("round-trip failed"); + } + + #[test] + fn test_roundtrip_mixed_density( + dense_runs in prop::collection::vec((1..100u64, 1..20usize), 1..10), + gaps in prop::collection::vec(1..10000u64, 1..10) + ) { + // Create sequences with mixed density patterns + let mut values = Vec::new(); + let mut current = 1u64; + + for (i, (step, count)) in dense_runs.into_iter().enumerate() { + // Add a gap before each dense run (except first) + if i > 0 && i - 1 < gaps.len() { + current += gaps[i - 1]; + } + + // Add a dense run of values + for _ in 0..count { + if current <= MAX_ID_VALUE { + values.push(current); + current += step; + } + } + } + + let sorted_unique = to_sorted_unique(&values); + prop_assume!(!sorted_unique.is_empty()); + roundtrip_test(&sorted_unique).expect("round-trip failed"); + } + + #[test] + fn test_roundtrip_edge_values( + small_values in prop::collection::vec(1..100u64, 1..50), + large_values in prop::collection::vec((MAX_ID_VALUE - 10000)..MAX_ID_VALUE, 1..50) + ) { + // Combine small and large values + let mut values = small_values; + values.extend(large_values); + + let sorted_unique = to_sorted_unique(&values); + prop_assume!(!sorted_unique.is_empty()); + roundtrip_test(&sorted_unique).expect("round-trip failed"); + } +} + +/// Helper function to ensure 
test data is sorted and unique +fn to_sorted_unique(values: &[u64]) -> Vec { + let mut set = BTreeSet::new(); + set.extend(values); + set.into_iter().collect() +} + +/// Performs the full round-trip test: package -> encode -> decode -> compare +fn roundtrip_test(values: &[u64]) -> Result<(), String> { + // Skip empty sets (handled by prop_assume in the test functions) + if values.is_empty() { + return Ok(()); + } + + // Step 1: Package the values into segments + let segmenter = BitmapSegmenter; + let segments = segmenter.package(values).expect("segmentation failed"); + + // Step 2: Encode the segments to bytes + let encoded_bytes = segments.encode(); + + // Step 3: Decode the bytes back to segments + let decoded_segments = Segments::decode(&encoded_bytes).expect("decoding failed"); + + // Step 4: Extract values from decoded segments + let decoded_values = decoded_segments.values().collect::>(); + + // Step 5: Compare original and decoded lengths + if values.len() != decoded_values.len() { + return Err(format!( + "mismatched lengths: original={}, decoded={}", + values.len(), + decoded_values.len() + )); + } + + // Step 6: Compare original and decoded values (in order) + // Note: we don't use a simple equals just so that we can provide a more + // detailed error message in case of a mismatch. 
+ for (idx, (original, decoded)) in values.iter().zip(decoded_values.iter()).enumerate() { + if original != decoded { + return Err(format!( + "mismatch at index {}: original={}, decoded={}", + idx, original, decoded + )); + } + } + + Ok(()) +} + +#[test_case(&[1, 2, 3, 1000, 1001, 1002]; "dense clusters with gap")] +#[test_case(&[10, 20, 30, 10000, 10001]; "varying step sizes")] +#[test_case(&[1, u64::MAX / 2, u64::MAX / 2 + 1]; "near maximum values")] +#[test_case(&[1, 2, 3, 4, 5, 6, 7, 8]; "sequential values")] +#[test_case(&[1000, 10000, 100000, 1000000]; "logarithmic spacing")] +fn test_specific_patterns(values: &[u64]) { + roundtrip_test(&values).expect("round-trip failed"); +} + +#[test] +fn test_large_sequence() { + // Test with a large sequence to stress-test memory and performance + let mut values = Vec::with_capacity(10_000); + for i in 1..10_000 { + values.push(i); + } + + // Introduce some random larger gaps + values.push(20_000); + values.push(20_001); + values.push(50_000); + + roundtrip_test(&values).expect("round-trip failed"); +} + +#[test] +fn test_boundary_values() { + // Test with values that might stress LEB128 encoding boundaries + let mut values = Vec::new(); + + // Powers of 2 minus/plus small values to test encoding boundaries + for i in 0..8 { + let base = 1u64 << (7 * i); + values.push(base - 1); + values.push(base); + values.push(base + 1); + } + + roundtrip_test(&values).expect("round-trip failed"); +} diff --git a/sbtc/src/lib.rs b/sbtc/src/lib.rs index 96ddcf1c6..95dc63843 100644 --- a/sbtc/src/lib.rs +++ b/sbtc/src/lib.rs @@ -10,6 +10,7 @@ use bitcoin::XOnlyPublicKey; pub mod deposits; pub mod error; pub mod events; +pub mod idpack; pub mod leb128; #[cfg(feature = "webhooks")] diff --git a/supply-chain/config.toml b/supply-chain/config.toml index bc25a7ed7..a01777b19 100644 --- a/supply-chain/config.toml +++ b/supply-chain/config.toml @@ -66,10 +66,6 @@ criteria = "safe-to-deploy" version = "0.3.8" criteria = "safe-to-deploy" 
-[[exemptions.arrayvec]] -version = "0.7.4" -criteria = "safe-to-deploy" - [[exemptions.asn1-rs]] version = "0.6.2" criteria = "safe-to-deploy" diff --git a/supply-chain/imports.lock b/supply-chain/imports.lock index 4a28549f1..c9d989cae 100644 --- a/supply-chain/imports.lock +++ b/supply-chain/imports.lock @@ -467,6 +467,25 @@ criteria = "safe-to-deploy" version = "0.1.0" notes = "No unsafe usage or ambient capabilities, sane build script" +[[audits.google.audits.arrayvec]] +who = "Lukasz Anforowicz " +criteria = "safe-to-deploy" +version = "0.7.6" +notes = ''' +Grepped for `-i cipher`, `-i crypto`, `'\bfs\b'`, `'\bnet\b'` and there were +no hits, except for some `net` usage in tests. + +The crate has quite a few bits of `unsafe` Rust. The audit comments can be +found in https://chromium-review.googlesource.com/c/chromium/src/+/6187726/2 +''' +aggregated-from = "https://chromium.googlesource.com/chromium/src/+/main/third_party/rust/chromium_crates_io/supply-chain/audits.toml?format=TEXT" + +[[audits.google.audits.assert_matches]] +who = "ChromeOS" +criteria = "safe-to-run" +version = "1.5.0" +aggregated-from = "https://chromium.googlesource.com/chromiumos/third_party/rust_crates/+/refs/heads/main/cargo-vet/audits.toml?format=TEXT" + [[audits.google.audits.async-stream]] who = "Tyler Mandry " criteria = "safe-to-deploy"