@@ -253,44 +253,53 @@ fn mergeIntersection(
     return out_idx;
 }
 
-// TODO: The pow2 requirement is quite inefficient: explore ideas posted in
-// https://databasearchitects.blogspot.com/2019/07/cuckoo-filters-with-arbitrarily-sized.html
-// (rocksdb even-odd scheme from comments looks interesting).
+fn parity(integer: anytype) enum(u1) { even, odd } {
+    return @enumFromInt(integer & 1);
+}
+
 pub const CuckooFilter = struct {
-    /// len must be a power of 2.
-    ///
-    /// ### Pathological case with buckets.len power of 2
-    ///
-    /// - `BucketIndex(alias_0)` -> `bucket_1`, `BucketIndex(alias_0).alternate()` -> `bucket_2`
-    /// - `BucketIndex(alias_1)` -> `bucket_1`, `BucketIndex(alias_1).alternate()` -> `bucket_2`
-    ///
-    /// Our alternate mappings hold and `contains()` will not return false negatives.
-    ///
-    /// ### Pathological case with buckets.len NOT power of 2:
-    ///
-    /// - `BucketIndex(alias_0)` -> `bucket_1`, `BucketIndex(alias_0).alternate()` -> `bucket_3`
-    /// - `BucketIndex(alias_1)` -> `bucket_2`, `BucketIndex(alias_1).alternate()` -> `bucket_4`
-    ///
-    /// Our alternate mappings do not hold and `contains()` can return false negatives. This is not
-    /// acceptable as the entire point of an AMQ datastructure is the presence of false positives
-    /// but not false negatives.
     buckets: []Bucket,
 
     pub const Fingerprint = enum(u8) {
         none = std.math.maxInt(u8),
         _,
 
-        pub fn hash(fingerprint: Fingerprint) u32 {
-            return @truncate(std.hash.Murmur2_64.hash(&.{@intFromEnum(fingerprint)}));
+        const precomputed_odd_hashes = blk: {
+            var table: [255]u32 = undefined;
+
+            for (&table, 0..) |*h, index| {
+                h.* = @truncate(std.hash.Murmur2_64.hash(&.{index}) | 1);
+            }
+
+            break :blk table;
+        };
+
+        pub fn oddHash(fingerprint: Fingerprint) u32 {
+            assert(fingerprint != .none);
+            return precomputed_odd_hashes[@intFromEnum(fingerprint)];
         }
     };
+
     pub const Bucket = [4]Fingerprint;
     pub const BucketIndex = enum(u32) {
         _,
 
-        pub fn alternate(index: BucketIndex, fingerprint: Fingerprint) BucketIndex {
+        pub fn alternate(index: BucketIndex, fingerprint: Fingerprint, len: u32) BucketIndex {
+            assert(@intFromEnum(index) < len);
             assert(fingerprint != .none);
-            return @enumFromInt(@intFromEnum(index) ^ fingerprint.hash());
+
+            const signed_index: i64 = @intFromEnum(index);
+            const odd_hash: i64 = fingerprint.oddHash();
+
+            const unbounded = switch (parity(signed_index)) {
+                .even => signed_index + odd_hash,
+                .odd => signed_index - odd_hash,
+            };
+            const bounded: u32 = @intCast(@mod(unbounded, len));
+
+            assert(parity(signed_index) != parity(bounded));
+
+            return @enumFromInt(bounded);
         }
     };
 
@@ -299,41 +308,46 @@ pub const CuckooFilter = struct {
         index_1: BucketIndex,
         index_2: BucketIndex,
 
-        pub fn initFromTrigram(trigram: Trigram) Triplet {
+        pub fn initFromTrigram(trigram: Trigram, len: u32) Triplet {
             const split: packed struct {
                 fingerprint: Fingerprint,
                 padding: u24,
-                index_1: BucketIndex,
+                index_1: u32,
             } = @bitCast(std.hash.Murmur2_64.hash(&trigram));
 
+            const index_1: BucketIndex = @enumFromInt(split.index_1 % len);
+
             const fingerprint: Fingerprint = if (split.fingerprint == .none)
-                @enumFromInt(0)
+                @enumFromInt(1)
             else
                 split.fingerprint;
 
             const triplet: Triplet = .{
                 .fingerprint = fingerprint,
-                .index_1 = split.index_1,
-                .index_2 = split.index_1.alternate(fingerprint),
+                .index_1 = index_1,
+                .index_2 = index_1.alternate(fingerprint, len),
             };
-            assert(triplet.index_2.alternate(fingerprint) == triplet.index_1);
+            assert(triplet.index_2.alternate(fingerprint, len) == index_1);
 
             return triplet;
         }
     };
 
+    pub fn init(buckets: []Bucket) CuckooFilter {
+        assert(parity(buckets.len) == .even);
+        return .{ .buckets = buckets };
+    }
+
     pub fn reset(filter: CuckooFilter) void {
-        @memset(filter.buckets, [1]Fingerprint{.none} ** 4);
+        @memset(filter.buckets, [1]Fingerprint{.none} ** @typeInfo(Bucket).array.len);
     }
 
-    pub fn capacityForCount(count: usize) error{Overflow}!usize {
-        const fill_rate = 0.95;
-        return try std.math.ceilPowerOfTwo(usize, @intFromFloat(@ceil(@as(f32, @floatFromInt(count)) / fill_rate)));
+    pub fn capacityForCount(count: u32) error{Overflow}!u32 {
+        return count + (count & 1);
     }
 
-    // Use a hash (fnv) for randomness.
     pub fn append(filter: CuckooFilter, random: std.Random, trigram: Trigram) error{EvictionFailed}!void {
-        const triplet: Triplet = .initFromTrigram(trigram);
+        const triplet: Triplet = .initFromTrigram(trigram, @intCast(filter.buckets.len));
 
         if (filter.appendToBucket(triplet.index_1, triplet.fingerprint) or
             filter.appendToBucket(triplet.index_2, triplet.fingerprint))
@@ -345,7 +359,7 @@ pub const CuckooFilter = struct {
         var index = if (random.boolean()) triplet.index_1 else triplet.index_2;
         for (0..500) |_| {
             fingerprint = filter.swapFromBucket(random, index, fingerprint);
-            index = index.alternate(fingerprint);
+            index = index.alternate(fingerprint, @intCast(filter.buckets.len));
 
             if (filter.appendToBucket(index, fingerprint)) {
                 return;
@@ -356,8 +370,7 @@ pub const CuckooFilter = struct {
     }
 
     fn bucketAt(filter: CuckooFilter, index: BucketIndex) *Bucket {
-        assert(std.math.isPowerOfTwo(filter.buckets.len));
-        return &filter.buckets[@intFromEnum(index) & (filter.buckets.len - 1)];
+        return &filter.buckets[@intFromEnum(index)];
     }
 
     fn appendToBucket(filter: CuckooFilter, index: BucketIndex, fingerprint: Fingerprint) bool {
@@ -382,6 +395,7 @@ pub const CuckooFilter = struct {
     ) Fingerprint {
         assert(fingerprint != .none);
 
+        comptime assert(@typeInfo(Bucket).array.len == 4);
         const target = &filter.bucketAt(index)[random.int(u2)];
 
         const old_fingerprint = target.*;
@@ -393,7 +407,7 @@ pub const CuckooFilter = struct {
     }
 
     pub fn contains(filter: CuckooFilter, trigram: Trigram) bool {
-        const triplet: Triplet = .initFromTrigram(trigram);
+        const triplet: Triplet = .initFromTrigram(trigram, @intCast(filter.buckets.len));
 
         return filter.containsInBucket(triplet.index_1, triplet.fingerprint) or
             filter.containsInBucket(triplet.index_2, triplet.fingerprint);
@@ -417,16 +431,15 @@ pub const CuckooFilter = struct {
 test CuckooFilter {
     const allocator = std.testing.allocator;
 
-    const element_count = 486;
+    const element_count = 499;
     const filter_size = comptime CuckooFilter.capacityForCount(element_count) catch unreachable;
-    try std.testing.expectEqual(512, filter_size);
 
     var entries: std.AutoArrayHashMapUnmanaged(Trigram, void) = .empty;
     defer entries.deinit(allocator);
     try entries.ensureTotalCapacity(allocator, element_count);
 
     var buckets: [filter_size]CuckooFilter.Bucket = undefined;
-    var filter: CuckooFilter = .{ .buckets = &buckets };
+    var filter: CuckooFilter = .init(&buckets);
     var filter_prng: std.Random.DefaultPrng = .init(42);
 
     for (0..2_500) |gen_prng_seed| {
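Note on the scheme (not part of the commit): the diff replaces the XOR-based alternate bucket, which only round-trips when buckets.len is a power of two, with the even/odd scheme hinted at in the removed TODO: an odd hash is added to an even index and subtracted from an odd one, modulo an even bucket count. A minimal standalone Zig sketch of why that mapping is its own inverse; the alternate helper and the constants in the test below are illustrative only and stand in for BucketIndex.alternate and Fingerprint.oddHash:

const std = @import("std");

// Even index: move forward by an odd hash; odd index: move back by it.
// With an even table length the result always has the opposite parity.
fn alternate(index: u32, odd_hash: u32, len: u32) u32 {
    const signed_index: i64 = index;
    const signed_hash: i64 = odd_hash;
    const signed_len: i64 = len;
    const unbounded = if (index % 2 == 0)
        signed_index + signed_hash
    else
        signed_index - signed_hash;
    return @intCast(@mod(unbounded, signed_len));
}

test "even/odd alternate mapping round-trips for any even length" {
    const len: u32 = 6; // even, but deliberately not a power of two
    const odd_hash: u32 = 13; // stands in for the fingerprint's odd hash
    var index: u32 = 0;
    while (index < len) : (index += 1) {
        const other = alternate(index, odd_hash, len);
        // The two candidate buckets always have opposite parity ...
        try std.testing.expect(index % 2 != other % 2);
        // ... and applying the mapping twice returns the original index,
        // so either bucket can recover the other from the fingerprint alone.
        try std.testing.expectEqual(index, alternate(other, odd_hash, len));
    }
}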