-
Notifications
You must be signed in to change notification settings - Fork 3
/
example.zig
294 lines (250 loc) · 12.7 KB
/
example.zig
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
const std = @import("std");
const hasher = std.hash.Fnv1a_64;
const cuckoo = @import("./src/cuckoofilter.zig");
fn fingerprint8(x: []const u8) u8 {
return x[0];
}
fn fingerprint32(x: []const u8) u32 {
// Just a sample strategy, not suitable for all types
// of input. Imagine if you were adding images to the
// filter: all fingerprints would be the same because
// most formats have a standard header. In that case
// you want to make sure to use the actual graphical
// data to pluck your fingerprint from.
return @bytesToSlice(u32, x[0..@sizeOf(u32)])[0];
}
pub fn main() !void {
// Assume we want to keep track of max 1 Million items.
const universe_size = 1000000;
// Let's use Filter8, a filter with 1 byte long
// fingerprints and a 3% max *false positive* error rate.
// Note: Cuckoo filters cannot produce false negatives.
// Error % information:
_ = cuckoo.Filter8.MaxError;
// ╚═> 3.125e-02 (~0.03, i.e. 3%)
_ = cuckoo.Filter16.MaxError;
// ╚═> 1.22070312e-04 (~0.0001, i.e. 0.01%)
_ = cuckoo.Filter32.MaxError;
// ╚═> 9.31322574e-10 (~0.000000001, i.e. 0.0000001%)
// First let's calculate how big the filter has to be:
const memsize = comptime cuckoo.Filter8.size_for(universe_size);
// The value of memsize has to be a power of two and it
// is *strongly* recommended to keep the fill rate of a
// filter under 80%. size_for() will pad the number for
// you automatically and then round up to the closest
// power of 2. size_for_exactly() will not apply any
// padding before rounding up.
// Use capacity() to know how many items a slice of memory
// can store for the given filter type.
_ = cuckoo.Filter8.capacity(memsize); // => 2097152
// Note: this function will return the theoretical maximum
// capacity, without subtracting any padding. It's smart
// to adjust your expectations to match how much memory
// you have to allocate anyway, but don't get too greedy.
// I say `theoretical` because an overfilled filter will
// start refusing inserts with a TooFull error.
// This is how you allocate static memory for the filter:
var memory: [memsize]u8 align(cuckoo.Filter8.Align) = undefined;
// Note: the filter benefits from a specific alignment
// (which differs from type to type) so you must specify it
// when allocating memory. Failing to do so will result in
// a comptime error.
// Instantiating a filter
var cf8 = try cuckoo.Filter8.init(memory[0..]);
//
// FILTER USAGE
//
const banana_h = hasher.hash("banana");
const banana_fp = fingerprint8("banana");
const apple_h = hasher.hash("apple");
const apple_fp = fingerprint8("apple");
_ = try cf8.maybe_contains(banana_h, banana_fp); // => false
_ = try cf8.count(); // => 0
try cf8.add(banana_h, banana_fp);
_ = try cf8.maybe_contains(banana_h, banana_fp); // => true
_ = try cf8.maybe_contains(apple_h, apple_fp); // => false
_ = try cf8.count(); // => 1
try cf8.remove(banana_h, banana_fp);
_ = try cf8.maybe_contains(banana_h, banana_fp); // => false
_ = try cf8.count(); // => 0
// The filter can also be used with dynamic memory.
// It's up to you to manage that via an allocator.
const example_allocator = std.heap.c_allocator;
// Don't forget to free the memory afterwards.
const memsize32 = comptime cuckoo.Filter32.size_for_exactly(64);
var dyn_memory = try example_allocator.alignedAlloc(u8, cuckoo.Filter32.Align, memsize32);
defer example_allocator.free(dyn_memory);
var dyn_cf32 = try example_allocator.create(cuckoo.Filter32);
defer example_allocator.destroy(dyn_cf32);
dyn_cf32.* = try cuckoo.Filter32.init(dyn_memory);
// When restoring a persisted filter, you should only persist the individual fields
// as, for example, .buckets is a slice that points to `dyn_memory` which would be
// invalid upon restore (wrong pointer) and just a waste of space when stored.
// Upon loading, to reconnect the filter and its `dyn_memory`, use bytesToBuckets.
// Here's an example (which is not necessary to make this script work, as we just created
// the entire filter):
//
// dyn_cf32.buckets = cuckoo.Filter32.bytesToBuckets(dyn_memory);
//
// USAGE FAILURE SCENARIOS
//
// 1. Adding too many colliding items (because of bad entropy or
// because you are adding multiple copies of the same item)
const pear_h = hasher.hash("pear");
const pear_fp = fingerprint32("pear");
try dyn_cf32.add(pear_h, pear_fp);
try dyn_cf32.add(pear_h, pear_fp);
try dyn_cf32.add(pear_h, pear_fp);
try dyn_cf32.add(pear_h, pear_fp);
try dyn_cf32.add(pear_h, pear_fp);
// No more space for items with equal hash and fp,
// next insert will fail.
dyn_cf32.add(pear_h, pear_fp) catch |err| switch (err) {
error.TooFull => std.debug.warn("yep, too full\n"),
else => unreachable,
};
// Other inserts that don't collide can still succeed
const orange_h = hasher.hash("orange");
const orange_fp = fingerprint32("orange");
try dyn_cf32.add(orange_h, orange_fp);
// 2. You can only delete elements that were inserted before.
// Trying to delete a non-existing item has a chance of
// breaking the filter (makes false negatives possible).
// Deleting a non-existing item can either cause the
// deletion of another colliding item or fail to find
// a matching fingerprint in the filter. In the second
// case the filter locks down and returns an error for
// all operations, as it is now impossible to know what
// the correct state would be.
dyn_cf32.remove(0, 0) catch |err| switch (err) {
error.Broken => std.debug.warn(".remove, broken\n"),
};
_ = dyn_cf32.is_broken(); // => true
dyn_cf32.add(orange_fp, orange_fp) catch |err| switch (err) {
error.Broken => std.debug.warn(".add, broken\n"),
error.TooFull => {},
};
if (dyn_cf32.count()) |_| {
std.debug.warn(".count, works\n"); // won't be printed
} else |err| switch (err) {
error.Broken => std.debug.warn(".count, broken\n")
}
// Since searching does not mutate the filter, if the item
// is found, no error is returned:
_ = try dyn_cf32.maybe_contains(orange_h, orange_fp); // => true
// But if an item is not found, we don't know if it was wrongly
// deleted or not, so the filter has to return an error in order
// to ensure that it does not return a false negative response.
if (dyn_cf32.maybe_contains(0, 0)) |_| {
std.debug.warn(".maybe_contains, works\n"); // won't be printed
} else |err| switch (err) {
error.Broken => std.debug.warn(".maybe_contains, broken\n")
}
// You should *NEVER* get into that situation. If you do, it's
// a programming error. If your program runs in an environment
// where a request that involves the filter might be repeated
// (e.g. web servers), mark each request by a unique ID and
// keep some kind of commit log to ensure you don't run the
// same request twice, as it's semantically wrong to expect
// idempotence from Cuckoo filter commands.
// 3. Other small errors could be trying to pass to init memory
// with the wrong alignment or a wrong buffer size. Try to
// use the provided functions (i.e. size_for, size_for_exactly)
// to always have your buffers be the right size. You can
// also use those functions to reason about your data and even
// opt not to use a filter if the tradeoff is not worth it.
if (cuckoo.Filter8.init(memory[1..13])) |_| {
std.debug.warn(".init, works\n"); // won't be printed
} else |err| switch (err) {
error.BadLength => std.debug.warn(".init failed, use .size_for()!\n")
}
//
// FIXING TOO FULL
//
// Filter8 and Filter16 have 4 element-wide buckets,
// while Filter32 has 2 element-wide buckets.
// Each fingerprint has two buckets that can be used to
// house it. This means that you can have, respectively,
// up to 8 (F8, F16) and 4 (F32) collisions/copies before both
// buckets fill completely and you get TooFull. In practice,
// you get an extra chance because of how filters work internally.
// There's a special slot that houses a single fingerprint that
// could not find space in one of its 2 candidate slots.
// The problem is that once that "safety" slot is filled, the
// filter becomes much more succeptible to collisions and is forced
// to return TooFull when in fact it could try to make space.
// If you are also deleting elements from the filter, and
// not just adding them, this is what you can do to try and
// recover from that situation.
// Returns true if the safety slot is occupied.
var bad_situation = dyn_cf32.is_toofull();
// Note that you might not have ever received a TooFull error for
// this function to return true. In our previous example with
// dyn_cf32, it took us 5 insertions to obtain a TooFull error.
// This function would return true after 4.
// Try to fix the situation:
if (bad_situation) {
dyn_cf32.fix_toofull() catch |err| switch (err) {
error.Broken => {},
error.TooFull => {},
};
}
// With this function you can only fix TooFull, not Broken.
// If fix_toofull returns TooFull, it means that it failed.
// In practice you will need to free more elements before
// being able to fix the situation, but in theory calling
// the function multiple times might eventually fix the
// situation (i.e. it can make progress each call).
// That said, going back to practical usage, you are probably
// in a problematic situation when it gets to that point.
// To ensure you never have to deal with these problems,
// make sure you:
// (1) Never overfill/undersize a filter.
// (2) Get entropy right for the fingerprinting function.
//
// A trick to get (2) right is to pluck it not out of the
// original element, but out of hash2(element). Just make sure
// you use a different hasing function, independent from the
// fitst one, otherwise you're going to still end up with too
// little entropy, and be aware of the increased computational
// cost. Secondary hashing might be worth it for semi-strucutred
// data where you might find it hard to know if you're plucking
// "variable" data or part of the structure (e.g. JSON data),
// since the latter is bound to have lower entropy.
//
// PRNG Stuff
//
// Cuckoo Filters need a random number generator to decide which
// fingerprint to evict when a given bucket pair is full. This
// library provides a default implementation that uses the Zig
// standard library's Xoroshiro implementation, seeded by default to 42.
// If your application has short-lived sessions, a static seed won't be
// good enough, as it will basically result in giving out the same
// number over and over again, similarly to what is shown in that one
// dilbert strip. To fix that use seed_default_prng:
var buf: [8]u8 = undefined;
try std.crypto.randomBytes(buf[0..]);
const seed = std.mem.readIntSliceLittle(u64, buf[0..8]);
cuckoo.seed_default_prng(seed);
// Additionally, you might also want to provide your own PRNG
// implementation, either because you have specific needs (CSPRNG) or
// because you might want to make the filter fully deterministic and thus
// need to be able to persist and restore the PRNG's state.
// You can customize the PRNG of each filter by providing an appropriate
// function pointer:
dyn_cf32.rand_fn = DilbertRandom;
// From now on `dyn_cf32` will stop using the default implementation
// (shared by default by all filters) and will instead only use the
// provided function. If you use this functionality, *make sure to
// set the function pointer again when loading the filter from disk*.
// If you're fine with the default implementation and want more control
// than just seeding, use .get_default_prng_state() and
// .set_default_prng_state(), but beware that you are modifying a
// "singleton" struct used by all filters. If you are in a multi-threaded
// context this might cause problems if you are executing
// .add / .delete / .fix_toofull and altering the prng singleton at the
// same time. In that case you will have to customize .rand_fn
}
fn DilbertRandom() u1 {
return 1;
}