-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #60 from outbrain/cms
Count-min-sketch with an extension
- Loading branch information
Showing
5 changed files
with
166 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
from __future__ import annotations | ||
|
||
import sys | ||
from collections import Counter | ||
|
||
import numpy as np | ||
from numba import njit | ||
from numba import prange | ||
|
||
|
||
@njit
def cms_hash(x, seed, width):
    """
    Return the bucket index for ``x`` under the given ``seed``.

    The built-in hash of ``x`` is truncated to 32 bits, offset by ``seed``
    and reduced modulo ``width``.

    NOTE(review): the seed only shifts one underlying hash value, so the
    per-seed hashes are not independent of each other -- confirm that this
    is acceptable for the sketch's collision behaviour.
    """
    offset_hash = np.uint32(hash(x)) + seed
    return offset_hash % width
|
||
class CountMinSketch:
    """
    Count-min sketch over a ``depth x width`` counter matrix, with the
    per-row counter updates compiled by Numba.

    Alongside the counters the sketch keeps ``tmp_vals``, a bounded set of
    values that were added, which ``stream_hist_update`` uses to build a
    histogram of estimated counts.
    """

    # Limits applied to ``tmp_vals`` in ``add``: number of elements, and the
    # set's shallow ``sys.getsizeof`` footprint expressed in kilobytes.
    _SAMPLE_MAX_ITEMS = 10 ** 5
    _SAMPLE_MAX_KB = 100.0

    def __init__(self, depth=6, width=2**22, M=None):
        """
        :param depth: number of rows; each row gets its own random hash seed.
        :param width: number of counters per row.
        :param M: optional pre-existing counter matrix; when omitted, a zeroed
            ``int32`` matrix of shape ``(depth, width)`` is allocated.
        :raises ValueError: if ``M`` is given but is not 2-D or is smaller
            than ``(depth, width)`` in either dimension.
        """
        if M is not None:
            # The compiled ``_add`` writes M[i, location] for i < depth and
            # location < width. Numba does not bounds-check array indexing by
            # default, so an undersized matrix has to be rejected up front.
            shape = np.shape(M)
            if len(shape) != 2 or shape[0] < depth or shape[1] < width:
                raise ValueError(
                    f'M must be a 2-D matrix of at least shape ({depth}, {width}), got {shape}',
                )

        self.depth = depth
        self.width = width
        # One seed per row, drawn fresh for every instance.
        self.hash_seeds = np.array(np.random.randint(low=0, high=2**31 - 1, size=depth), dtype=np.uint32)
        self.M = np.zeros((depth, width), dtype=np.int32) if M is None else M
        self.tmp_vals = set()

    @staticmethod
    @njit
    def _add(M, x, depth, width, hash_seeds, delta=1):
        # Bump, in every row, the counter that ``x`` hashes to.
        # NOTE: the decorator is a plain ``@njit`` (no ``parallel=True``), in
        # which case Numba runs ``prange`` like an ordinary ``range``.
        for i in prange(depth):
            location = cms_hash(x, hash_seeds[i], width)
            M[i, location] += delta

    def add(self, x, delta=1):
        """
        Increase the count of ``x`` by ``delta``.

        ``x`` is also remembered in ``tmp_vals`` while the sample limits
        allow it. The two limits are combined with ``or``: the sample keeps
        growing as long as either one still holds, and stops growing only
        once both are exceeded.
        """
        sample = self.tmp_vals
        below_item_cap = len(sample) < self._SAMPLE_MAX_ITEMS
        if below_item_cap or sys.getsizeof(sample) / (10 ** 3) < self._SAMPLE_MAX_KB:
            sample.add(x)
        CountMinSketch._add(self.M, x, self.depth, self.width, self.hash_seeds, delta)

    def batch_add(self, lst, delta=1):
        """Call ``add`` with the same ``delta`` for every element of ``lst``."""
        for x in lst:
            self.add(x, delta)

    def query(self, x):
        """Return the estimated count of ``x``: the minimum of its counters across all rows."""
        return min(
            self.M[i][cms_hash(x, self.hash_seeds[i], self.width)]
            for i in range(self.depth)
        )

    def get_matrix(self):
        """Return the underlying counter matrix (the same object, not a copy)."""
        return self.M

    def stream_hist_update(self):
        """
        A bit hacky way to aggregate cms results.

        Returns a ``Counter`` mapping each estimated count to the number of
        remembered values in ``tmp_vals`` that currently have that estimate.
        """
        return Counter(self.query(x) for x in self.tmp_vals)
|
||
|
||
if __name__ == '__main__':
    from collections import Counter

    # Small demo: feed a repeated stream into a sketch and print the
    # estimated counts next to the exact ones.
    n_rows = 8
    n_cols = 2**22
    sketch = CountMinSketch(n_rows, n_cols)

    stream = [1, 1, 2, 3, 3, 3, 4, 5, 2] * 1000
    sketch.batch_add(stream)

    # Estimated frequencies, one per line.
    for value in (3, 1, 2, 4, 5):
        print(sketch.query(value))

    # Exact counts for comparison, then the histogram of estimates.
    print(Counter(stream))
    print(sketch.stream_hist_update())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
from __future__ import annotations | ||
|
||
import sys | ||
import unittest | ||
|
||
import numpy as np | ||
|
||
from outrank.algorithms.sketches.counting_cms import cms_hash | ||
from outrank.algorithms.sketches.counting_cms import CountMinSketch | ||
|
||
|
||
class TestCountMinSketch(unittest.TestCase):

    def setUp(self):
        # Each test gets a fresh sketch; the width is deliberately small to
        # keep the counter matrix cheap to allocate.
        self.depth = 6
        self.width = 2**10
        self.cms = CountMinSketch(self.depth, self.width)

    def test_init(self):
        # Constructor arguments must be reflected in the instance state.
        sketch = self.cms
        self.assertEqual(sketch.depth, self.depth)
        self.assertEqual(sketch.width, self.width)
        self.assertEqual(sketch.M.shape, (self.depth, self.width))
        self.assertEqual(len(sketch.hash_seeds), self.depth)
        self.assertIsInstance(sketch.tmp_vals, set)

    def test_add_and_query_single_element(self):
        element = 'test_element'
        self.cms.add(element)
        # Collisions can only inflate the estimate, so 1 is a lower bound.
        estimate = self.cms.query(element)
        self.assertGreaterEqual(estimate, 1)

    def test_add_and_query_multiple_elements(self):
        elements = ['foo', 'bar', 'baz', 'qux', 'quux']
        for elem in elements:
            self.cms.add(elem)

        # Every element was added once, so each estimate is at least 1.
        for elem in elements:
            estimate = self.cms.query(elem)
            self.assertGreaterEqual(estimate, 1)

    def test_batch_add_and_query(self):
        elements = ['foo', 'bar', 'baz'] * 10
        self.cms.batch_add(elements)

        # Each distinct element appears ten times in the batch.
        for elem in set(elements):
            estimate = self.cms.query(elem)
            self.assertGreaterEqual(estimate, 10)

    def test_stream_hist_update(self):
        for value in ('foo', 'foo', 'bar'):
            self.cms.add(value)

        hist = self.cms.stream_hist_update()

        # Exact counts cannot be asserted: the sketch is probabilistic and may
        # overcount, but it is never expected to undercount. Each queried
        # estimate must therefore show up in the histogram at least once.
        foo_estimate = self.cms.query('foo')
        bar_estimate = self.cms.query('bar')
        self.assertGreaterEqual(hist[foo_estimate], 1)
        self.assertGreaterEqual(hist[bar_estimate], 1)

    def test_overflow_protection(self):
        # Add one more distinct value than the sample is supposed to hold and
        # check that both its length and its memory footprint stay bounded.
        for i in range(100001):
            self.cms.add(f'element{i}')

        sample = self.cms.tmp_vals
        self.assertLessEqual(len(sample), 100000)
        self.assertLessEqual(sys.getsizeof(sample) / (10 ** 3), 4200.0)

    def test_hash_uniformity(self):
        # Basic sanity check of how the hash spreads 1000 consecutive integers
        # over the buckets for a single seed.
        seeds = np.array(np.random.randint(low=0, high=2**31 - 1, size=self.depth), dtype=np.uint32)
        first_seed = seeds[0]
        hashes = [cms_hash(i, first_seed, self.width) for i in range(1000)]
        # Only a few collisions are tolerated for this sample size.
        unique_hashes = len(set(hashes))
        self.assertGreater(unique_hashes, 900)