Skip to content

Commit a41568a

Browse files
committed
Even CuckooFilter
1 parent a19cec6 commit a41568a

File tree

1 file changed

+56
-43
lines changed

1 file changed

+56
-43
lines changed

src/TrigramStore.zig

Lines changed: 56 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -253,44 +253,53 @@ fn mergeIntersection(
253253
return out_idx;
254254
}
255255

256-
// TODO: The pow2 requirement is quite inefficient: explore ideas posted in
257-
// https://databasearchitects.blogspot.com/2019/07/cuckoo-filters-with-arbitrarily-sized.html
258-
// (rocksdb even-odd scheme from comments looks interesting).
256+
/// Classifies `integer` as even or odd by looking at its lowest bit.
///
/// The tag values mirror that bit: `even` is 0 and `odd` is 1.
fn parity(integer: anytype) enum(u1) { even, odd } {
    const lowest_bit = integer & 1;
    return @enumFromInt(lowest_bit);
}
259+
259260
pub const CuckooFilter = struct {
260-
/// len must be a power of 2.
261-
///
262-
/// ### Pathological case with buckets.len power of 2
263-
///
264-
/// - `BucketIndex(alias_0)` -> `bucket_1`, `BucketIndex(alias_0).alternate()` -> `bucket_2`
265-
/// - `BucketIndex(alias_1)` -> `bucket_1`, `BucketIndex(alias_1).alternate()` -> `bucket_2`
266-
///
267-
/// Our alternate mappings hold and `contains()` will not return false negatives.
268-
///
269-
/// ### Pathological case with buckets.len NOT power of 2:
270-
///
271-
/// - `BucketIndex(alias_0)` -> `bucket_1`, `BucketIndex(alias_0).alternate()` -> `bucket_3`
272-
/// - `BucketIndex(alias_1)` -> `bucket_2`, `BucketIndex(alias_1).alternate()` -> `bucket_4`
273-
///
274-
/// Our alternate mappings do not hold and `contains()` can return false negatives. This is not
275-
/// acceptable as the entire point of an AMQ datastructure is the presence of false positives
276-
/// but not false negatives.
277261
buckets: []Bucket,
278262

279263
pub const Fingerprint = enum(u8) {
280264
none = std.math.maxInt(u8),
281265
_,
282266

283-
pub fn hash(fingerprint: Fingerprint) u32 {
284-
return @truncate(std.hash.Murmur2_64.hash(&.{@intFromEnum(fingerprint)}));
267+
/// Lookup table mapping each fingerprint value in `0..255` to a 32-bit hash
/// whose lowest bit is forced to 1, so every entry is odd.
///
/// Each entry is the Murmur2-64 hash of the single byte `index`, OR-ed with 1
/// and truncated to `u32`. The table has no entry for value 255.
const precomputed_odd_hashes = blk: {
    var table: [255]u32 = undefined;

    for (0..table.len) |index| {
        const hash = std.hash.Murmur2_64.hash(&.{index});
        // Set the low bit before truncating; truncation keeps the low 32 bits.
        table[index] = @truncate(hash | 1);
    }

    break :blk table;
};
276+
277+
/// Returns the precomputed odd hash for `fingerprint`.
///
/// Asserts that `fingerprint` is not `.none`.
pub fn oddHash(fingerprint: Fingerprint) u32 {
    assert(fingerprint != .none);
    const table_index: u8 = @intFromEnum(fingerprint);
    return precomputed_odd_hashes[table_index];
}
286281
};
282+
287283
pub const Bucket = [4]Fingerprint;
288284
pub const BucketIndex = enum(u32) {
289285
_,
290286

291-
pub fn alternate(index: BucketIndex, fingerprint: Fingerprint) BucketIndex {
287+
/// Returns the other candidate bucket for `fingerprint`, given one of them.
///
/// The fingerprint's odd hash is added to an even `index` and subtracted from
/// an odd `index`, then the result is wrapped into `[0, len)`. The result is
/// asserted to have the opposite parity of `index`.
///
/// Asserts `index < len` and `fingerprint != .none`.
/// NOTE(review): the parity flip appears to rely on `len` being even; confirm
/// every caller passes an even bucket count.
pub fn alternate(index: BucketIndex, fingerprint: Fingerprint, len: u32) BucketIndex {
    assert(@intFromEnum(index) < len);
    assert(fingerprint != .none);

    // Widen to i64 so a u32 index plus or minus a u32 hash stays in range.
    const current: i64 = @intFromEnum(index);
    const step: i64 = fingerprint.oddHash();
    const current_parity = parity(current);

    const signed_step = if (current_parity == .even) step else -step;
    const wrapped: u32 = @intCast(@mod(current + signed_step, len));

    assert(current_parity != parity(wrapped));

    return @enumFromInt(wrapped);
}
295304
};
296305

@@ -299,41 +308,46 @@ pub const CuckooFilter = struct {
299308
index_1: BucketIndex,
300309
index_2: BucketIndex,
301310

302-
pub fn initFromTrigram(trigram: Trigram) Triplet {
311+
/// Derives the fingerprint and both candidate bucket indices for `trigram`
/// in a filter with `len` buckets.
///
/// The 64-bit Murmur2 hash of the trigram is split into an 8-bit fingerprint,
/// 24 unused bits, and a 32-bit value that is reduced modulo `len` to give
/// `index_1`. `index_2` is the alternate of `index_1` for that fingerprint.
/// A fingerprint equal to `.none` is replaced by the value 1.
///
/// Asserts that taking the alternate of `index_2` leads back to `index_1`.
pub fn initFromTrigram(trigram: Trigram, len: u32) Triplet {
    const HashParts = packed struct {
        fingerprint: Fingerprint,
        padding: u24,
        index_1: u32,
    };
    const parts: HashParts = @bitCast(std.hash.Murmur2_64.hash(&trigram));

    // `.none` marks an empty slot, so a real entry must never carry it.
    const fingerprint: Fingerprint = if (parts.fingerprint != .none)
        parts.fingerprint
    else
        @enumFromInt(1);

    const primary: BucketIndex = @enumFromInt(parts.index_1 % len);
    const secondary = primary.alternate(fingerprint, len);

    assert(secondary.alternate(fingerprint, len) == primary);

    return .{
        .fingerprint = fingerprint,
        .index_1 = primary,
        .index_2 = secondary,
    };
}
323334
};
324335

336+
/// Wraps `buckets` in a `CuckooFilter` without copying or clearing them.
///
/// Asserts that `buckets.len` is even.
pub fn init(buckets: []Bucket) CuckooFilter {
    const bucket_count = buckets.len;
    assert(parity(bucket_count) == .even);

    return CuckooFilter{ .buckets = buckets };
}
340+
325341
/// Empties the filter by setting every slot of every bucket to `.none`.
pub fn reset(filter: CuckooFilter) void {
    const slots_per_bucket = @typeInfo(Bucket).array.len;
    const empty_bucket: Bucket = [1]Fingerprint{.none} ** slots_per_bucket;

    @memset(filter.buckets, empty_bucket);
}
328344

329-
pub fn capacityForCount(count: usize) error{Overflow}!usize {
330-
const fill_rate = 0.95;
331-
return try std.math.ceilPowerOfTwo(usize, @intFromFloat(@ceil(@as(f32, @floatFromInt(count)) / fill_rate)));
345+
/// Returns `count` rounded up to the nearest even number.
///
/// Even inputs are returned unchanged, including 0.
///
/// Returns `error.Overflow` when the rounded value does not fit in `u32`,
/// which happens only for `count == maxInt(u32)`.
pub fn capacityForCount(count: u32) error{Overflow}!u32 {
    // `count & 1` is 1 exactly when `count` is odd. Use checked addition so
    // the declared error is actually reported instead of trapping.
    return std.math.add(u32, count, count & 1);
}
333348

334-
// Use a hash (fnv) for randomness.
335349
pub fn append(filter: CuckooFilter, random: std.Random, trigram: Trigram) error{EvictionFailed}!void {
336-
const triplet: Triplet = .initFromTrigram(trigram);
350+
const triplet: Triplet = .initFromTrigram(trigram, @intCast(filter.buckets.len));
337351

338352
if (filter.appendToBucket(triplet.index_1, triplet.fingerprint) or
339353
filter.appendToBucket(triplet.index_2, triplet.fingerprint))
@@ -345,7 +359,7 @@ pub const CuckooFilter = struct {
345359
var index = if (random.boolean()) triplet.index_1 else triplet.index_2;
346360
for (0..500) |_| {
347361
fingerprint = filter.swapFromBucket(random, index, fingerprint);
348-
index = index.alternate(fingerprint);
362+
index = index.alternate(fingerprint, @intCast(filter.buckets.len));
349363

350364
if (filter.appendToBucket(index, fingerprint)) {
351365
return;
@@ -356,8 +370,7 @@ pub const CuckooFilter = struct {
356370
}
357371

358372
/// Returns a pointer to the bucket at `index`.
///
/// The index is used as-is, with no masking or wrapping, so it must already
/// be a valid position in `filter.buckets`.
fn bucketAt(filter: CuckooFilter, index: BucketIndex) *Bucket {
    const position: u32 = @intFromEnum(index);
    return &filter.buckets[position];
}
362375

363376
fn appendToBucket(filter: CuckooFilter, index: BucketIndex, fingerprint: Fingerprint) bool {
@@ -382,6 +395,7 @@ pub const CuckooFilter = struct {
382395
) Fingerprint {
383396
assert(fingerprint != .none);
384397

398+
comptime assert(@typeInfo(Bucket).array.len == 4);
385399
const target = &filter.bucketAt(index)[random.int(u2)];
386400

387401
const old_fingerprint = target.*;
@@ -393,7 +407,7 @@ pub const CuckooFilter = struct {
393407
}
394408

395409
pub fn contains(filter: CuckooFilter, trigram: Trigram) bool {
396-
const triplet: Triplet = .initFromTrigram(trigram);
410+
const triplet: Triplet = .initFromTrigram(trigram, @intCast(filter.buckets.len));
397411

398412
return filter.containsInBucket(triplet.index_1, triplet.fingerprint) or
399413
filter.containsInBucket(triplet.index_2, triplet.fingerprint);
@@ -417,16 +431,15 @@ pub const CuckooFilter = struct {
417431
test CuckooFilter {
418432
const allocator = std.testing.allocator;
419433

420-
const element_count = 486;
434+
const element_count = 499;
421435
const filter_size = comptime CuckooFilter.capacityForCount(element_count) catch unreachable;
422-
try std.testing.expectEqual(512, filter_size);
423436

424437
var entries: std.AutoArrayHashMapUnmanaged(Trigram, void) = .empty;
425438
defer entries.deinit(allocator);
426439
try entries.ensureTotalCapacity(allocator, element_count);
427440

428441
var buckets: [filter_size]CuckooFilter.Bucket = undefined;
429-
var filter: CuckooFilter = .{ .buckets = &buckets };
442+
var filter: CuckooFilter = .init(&buckets);
430443
var filter_prng: std.Random.DefaultPrng = .init(42);
431444

432445
for (0..2_500) |gen_prng_seed| {

0 commit comments

Comments
 (0)