-
Notifications
You must be signed in to change notification settings - Fork 178
[cryptography/bloomfilter] generic Hasher and optimizations #2729
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
805b5e2
46b7c42
2e17353
e81e11a
73d90c2
a2cadf8
0b8b08b
04ae331
e23979c
5589649
2cb54e5
b4ea449
58cb70d
ccf34ce
6063bc5
d74c06f
adae91c
1d84f4a
c94b14a
700db56
f69fc9c
e131f6b
52888a8
a0e0a3c
e049669
98191f4
f130dde
5eeb208
9999706
b6adff7
9c83383
39df4d2
9994592
cd4768f
1107f16
36a2889
87836e8
dfecdb2
574e3f7
67780ad
cba14c0
d052b41
eb675f6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,6 @@ | ||
| use criterion::criterion_main; | ||
|
|
||
| mod contains; | ||
| mod insert; | ||
|
|
||
| criterion_main!(insert::benches, contains::benches); |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,79 @@ | ||
| use commonware_cryptography::BloomFilter; | ||
| use commonware_utils::{NZUsize, NZU8}; | ||
| use criterion::{criterion_group, BenchmarkId, Criterion, Throughput}; | ||
| use rand::{rngs::StdRng, RngCore, SeedableRng}; | ||
| use std::collections::HashSet; | ||
|
|
||
| fn benchmark_contains(c: &mut Criterion, name: &str, query_inserted: bool) { | ||
| let mut group = c.benchmark_group(format!("bloomfilter/{name}")); | ||
|
|
||
| let filter_bits = [1 << 10, 1 << 14, 1 << 17, 1 << 20]; // 1024, 16384, 131072, 1048576 | ||
| let hashers = [3, 7, 10]; | ||
| let item_size = 32; | ||
| let num_items = 1_000; | ||
|
|
||
| let mut rng = StdRng::seed_from_u64(42); | ||
|
|
||
| for &bits in &filter_bits { | ||
| for &k in &hashers { | ||
| let mut bf = BloomFilter::new(NZU8!(k), NZUsize!(bits)); | ||
| let mut set = HashSet::new(); | ||
|
|
||
| // Insert items | ||
| let inserted: Vec<_> = (0..num_items) | ||
| .map(|_| { | ||
| let mut item = vec![0u8; item_size]; | ||
| rng.fill_bytes(&mut item); | ||
| bf.insert(&item); | ||
| set.insert(item.clone()); | ||
| item | ||
| }) | ||
| .collect(); | ||
|
|
||
| // Items to query: inserted ones or guaranteed non-inserted ones | ||
| let items = if query_inserted { | ||
| inserted | ||
| } else { | ||
| let mut items = Vec::with_capacity(num_items); | ||
| while items.len() < num_items { | ||
| let mut item = vec![0u8; item_size]; | ||
| rng.fill_bytes(&mut item); | ||
| if !set.contains(&item) { | ||
| items.push(item); | ||
| } | ||
| } | ||
| items | ||
| }; | ||
|
|
||
| group.throughput(Throughput::Elements(1)); | ||
| group.bench_with_input( | ||
| BenchmarkId::new(format!("bits={bits}"), format!("k={k}")), | ||
| &items, | ||
| |b, items| { | ||
| let mut idx = 0; | ||
| b.iter(|| { | ||
| let result = bf.contains(&items[idx]); | ||
| idx = (idx + 1) % items.len(); | ||
| result | ||
| }); | ||
| }, | ||
| ); | ||
| } | ||
| } | ||
|
|
||
| group.finish(); | ||
| } | ||
|
|
||
| fn benchmark_contains_positive(c: &mut Criterion) { | ||
| benchmark_contains(c, "contains_positive", true); | ||
| } | ||
|
|
||
| fn benchmark_contains_negative(c: &mut Criterion) { | ||
| benchmark_contains(c, "contains_negative", false); | ||
| } | ||
|
|
||
| criterion_group!( | ||
| benches, | ||
| benchmark_contains_positive, | ||
| benchmark_contains_negative | ||
| ); |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,38 @@ | ||
| use commonware_cryptography::BloomFilter; | ||
| use commonware_utils::{NZUsize, NZU8}; | ||
| use criterion::{criterion_group, BenchmarkId, Criterion, Throughput}; | ||
| use rand::{rngs::StdRng, RngCore, SeedableRng}; | ||
|
|
||
| fn benchmark_insert(c: &mut Criterion) { | ||
| let mut group = c.benchmark_group("bloomfilter/insert"); | ||
|
|
||
| let filter_bits = [1 << 10, 1 << 14, 1 << 17, 1 << 20]; // 1024, 16384, 131072, 1048576 | ||
| let hashers = [3, 7, 10]; | ||
| let item_size = 32; | ||
|
|
||
| let mut rng = StdRng::seed_from_u64(42); | ||
|
|
||
| for &bits in &filter_bits { | ||
| for &k in &hashers { | ||
| let mut bf = BloomFilter::new(NZU8!(k), NZUsize!(bits)); | ||
|
|
||
| let mut item = vec![0u8; item_size]; | ||
| rng.fill_bytes(&mut item); | ||
|
|
||
| group.throughput(Throughput::Elements(1)); | ||
| group.bench_with_input( | ||
| BenchmarkId::new(format!("bits={bits}"), format!("k={k}")), | ||
| &item, | ||
| |b, item| { | ||
| b.iter(|| { | ||
| bf.insert(item); | ||
| }); | ||
| }, | ||
| ); | ||
| } | ||
| } | ||
|
|
||
| group.finish(); | ||
| } | ||
|
|
||
| criterion_group!(benches, benchmark_insert); |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,11 +13,8 @@ use commonware_codec::{ | |
| use commonware_utils::bitmap::BitMap; | ||
| use core::num::{NonZeroU64, NonZeroU8, NonZeroUsize}; | ||
|
|
||
| /// The length of a half of a [Digest]. | ||
| const HALF_DIGEST_LEN: usize = 16; | ||
|
|
||
| /// The length of a full [Digest]. | ||
| const FULL_DIGEST_LEN: usize = Digest::SIZE; | ||
| /// The length of a [Digest] in bytes. | ||
| const DIGEST_LEN: usize = Digest::SIZE; | ||
|
|
||
| /// A [Bloom Filter](https://en.wikipedia.org/wiki/Bloom_filter). | ||
| /// | ||
|
|
@@ -31,38 +28,43 @@ pub struct BloomFilter { | |
| } | ||
|
|
||
| impl BloomFilter { | ||
| const _ASSERT_DIGEST_AT_LEAST_16_BYTES: () = assert!( | ||
| DIGEST_LEN >= 16, | ||
| "digest must be at least 128 bits (16 bytes)" | ||
| ); | ||
|
|
||
| /// Creates a new [BloomFilter] with `hashers` hash functions and `bits` bits. | ||
| /// | ||
| /// The number of bits will be rounded up to the next power of 2. | ||
| pub fn new(hashers: NonZeroU8, bits: NonZeroUsize) -> Self { | ||
| let bits = bits.get().next_power_of_two(); | ||
|
||
| Self { | ||
| hashers: hashers.get(), | ||
| bits: BitMap::zeroes(bits.get() as u64), | ||
| bits: BitMap::zeroes(bits as u64), | ||
| } | ||
| } | ||
cursor[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| /// Generate `hashers` bit indices for a given item. | ||
| fn indices(&self, item: &[u8], bits: u64) -> impl Iterator<Item = u64> { | ||
| // Extract two 128-bit hash values from the SHA256 digest of the item | ||
| fn indices(&self, item: &[u8]) -> impl Iterator<Item = u64> { | ||
| #[allow(path_statements)] | ||
| Self::_ASSERT_DIGEST_AT_LEAST_16_BYTES; | ||
|
|
||
| // Extract two 64-bit hash values from the SHA256 digest of the item | ||
| let digest = Sha256::hash(item); | ||
andresilva marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| let mut h1_bytes = [0u8; HALF_DIGEST_LEN]; | ||
| h1_bytes.copy_from_slice(&digest[0..HALF_DIGEST_LEN]); | ||
| let h1 = u128::from_be_bytes(h1_bytes); | ||
| let mut h2_bytes = [0u8; HALF_DIGEST_LEN]; | ||
| h2_bytes.copy_from_slice(&digest[HALF_DIGEST_LEN..FULL_DIGEST_LEN]); | ||
| let h2 = u128::from_be_bytes(h2_bytes); | ||
| let h1 = u64::from_be_bytes(digest[0..8].try_into().unwrap()); | ||
| let h2 = u64::from_be_bytes(digest[8..16].try_into().unwrap()); | ||
|
||
|
|
||
| // Generate `hashers` hashes using the Kirsch-Mitzenmacher optimization: | ||
| // | ||
| // `h_i(x) = (h1(x) + i * h2(x)) mod m` | ||
| let hashers = self.hashers as u128; | ||
| let bits = bits as u128; | ||
| (0..hashers) | ||
| .map(move |hasher| h1.wrapping_add(hasher.wrapping_mul(h2)) % bits) | ||
| .map(|index| index as u64) | ||
| let hashers = self.hashers as u64; | ||
| let mask = self.bits.len() - 1; | ||
| (0..hashers).map(move |hasher| h1.wrapping_add(hasher.wrapping_mul(h2)) & mask) | ||
andresilva marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| /// Inserts an item into the [BloomFilter]. | ||
| pub fn insert(&mut self, item: &[u8]) { | ||
| let indices = self.indices(item, self.bits.len()); | ||
| let indices = self.indices(item); | ||
| for index in indices { | ||
| self.bits.set(index, true); | ||
| } | ||
|
|
@@ -72,7 +74,7 @@ impl BloomFilter { | |
| /// | ||
| /// Returns `true` if the item is probably in the set, and `false` if it is definitely not. | ||
| pub fn contains(&self, item: &[u8]) -> bool { | ||
| let indices = self.indices(item, self.bits.len()); | ||
| let indices = self.indices(item); | ||
| for index in indices { | ||
| if !self.bits.get(index) { | ||
| return false; | ||
|
|
@@ -190,11 +192,11 @@ mod tests { | |
|
|
||
| #[test] | ||
| fn test_codec_roundtrip() { | ||
| let mut bf = BloomFilter::new(NZU8!(5), NZUsize!(100)); | ||
| let mut bf = BloomFilter::new(NZU8!(5), NZUsize!(128)); | ||
| bf.insert(b"test1"); | ||
| bf.insert(b"test2"); | ||
|
|
||
| let cfg = (NZU8!(5), NZU64!(100)); | ||
| let cfg = (NZU8!(5), NZU64!(128)); | ||
|
|
||
| let encoded = bf.encode(); | ||
| let decoded = BloomFilter::decode_cfg(encoded, &cfg).unwrap(); | ||
|
|
@@ -213,12 +215,12 @@ mod tests { | |
|
|
||
| #[test] | ||
| fn test_codec_with_invalid_hashers() { | ||
| let mut bf = BloomFilter::new(NZU8!(5), NZUsize!(100)); | ||
| let mut bf = BloomFilter::new(NZU8!(5), NZUsize!(128)); | ||
| bf.insert(b"test1"); | ||
| let encoded = bf.encode(); | ||
|
|
||
| // Too large | ||
| let cfg = (NZU8!(10), NZU64!(100)); | ||
| let cfg = (NZU8!(10), NZU64!(128)); | ||
| let decoded = BloomFilter::decode_cfg(encoded.clone(), &cfg); | ||
| assert!(matches!( | ||
| decoded, | ||
|
|
@@ -229,7 +231,7 @@ mod tests { | |
| )); | ||
|
|
||
| // Too small | ||
| let cfg = (NZU8!(4), NZU64!(100)); | ||
| let cfg = (NZU8!(4), NZU64!(128)); | ||
| let decoded = BloomFilter::decode_cfg(encoded, &cfg); | ||
| assert!(matches!( | ||
| decoded, | ||
|
|
@@ -242,16 +244,16 @@ mod tests { | |
|
|
||
| #[test] | ||
| fn test_codec_with_invalid_bits() { | ||
| let mut bf = BloomFilter::new(NZU8!(5), NZUsize!(100)); | ||
| let mut bf = BloomFilter::new(NZU8!(5), NZUsize!(128)); | ||
| bf.insert(b"test1"); | ||
| let encoded = bf.encode(); | ||
|
|
||
| // Wrong bit count | ||
| let cfg = (NZU8!(5), NZU64!(99)); | ||
| let cfg = (NZU8!(5), NZU64!(64)); | ||
| let result = BloomFilter::decode_cfg(encoded.clone(), &cfg); | ||
| assert!(matches!(result, Err(CodecError::InvalidLength(100)))); | ||
| assert!(matches!(result, Err(CodecError::InvalidLength(128)))); | ||
|
|
||
| let cfg = (NZU8!(5), NZU64!(101)); | ||
| let cfg = (NZU8!(5), NZU64!(256)); | ||
| let result = BloomFilter::decode_cfg(encoded, &cfg); | ||
| assert!(matches!( | ||
| result, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Wonder if this is overkill
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If this is not true then we'll panic when slicing the digest (`digest[0..8]` and `digest[8..16]`).
IMO it makes sense to verify this at compile time.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
sounds good