From db4e8950671fb033427e9d235c5dd957712a92a6 Mon Sep 17 00:00:00 2001 From: Josep Pon Farreny Date: Wed, 26 Oct 2022 18:47:14 +0200 Subject: [PATCH 1/7] feat(cms): Added support for 64 bit cells --- bounter/count_min_sketch.py | 40 ++++++++++++++----- cbounter/cms64_conservative.h | 32 +++++++++++++++ cbounter/cms_cmodule.c | 7 +++- cbounter/{cms_common.c => cms_common.h} | 3 +- ...{cms_conservative.c => cms_conservative.h} | 7 +++- cbounter/cms_log1024.c | 2 +- cbounter/cms_log8.c | 2 +- 7 files changed, 78 insertions(+), 15 deletions(-) create mode 100644 cbounter/cms64_conservative.h rename cbounter/{cms_common.c => cms_common.h} (99%) rename cbounter/{cms_conservative.c => cms_conservative.h} (85%) diff --git a/bounter/count_min_sketch.py b/bounter/count_min_sketch.py index 6641c32..84f7a33 100644 --- a/bounter/count_min_sketch.py +++ b/bounter/count_min_sketch.py @@ -7,9 +7,15 @@ # This code is distributed under the terms and conditions # from the MIT License (MIT). +import enum import bounter_cmsc as cmsc +class CellSize(enum.Enum): + BITS_32 = 32 + BITS_64 = 64 + + class CountMinSketch(object): """ Data structure used to estimate frequencies of elements in massive data sets with fixed memory footprint. @@ -47,7 +53,13 @@ class CountMinSketch(object): counting as the collision bias will already be minimal. """ - def __init__(self, size_mb=64, width=None, depth=None, log_counting=None): + def __init__( + self, + size_mb=64, + cell_size=CellSize.BITS_32, + width=None, + depth=None, + log_counting=None,): """ Initialize the Count-Min Sketch structure with the given parameters @@ -69,18 +81,18 @@ def __init__(self, size_mb=64, width=None, depth=None, log_counting=None): - 8: 1B, value approximation error ~30% for values larger than 16 """ - cell_size = CountMinSketch.cell_size(log_counting) - self.cell_size_v = cell_size + cell_bytes = CountMinSketch.cell_size(cell_size, log_counting) + self.cell_size_v = cell_bytes if size_mb is None or not isinstance(size_mb, int): raise ValueError("size_mb must be an integer representing the maximum size of the structure in MB") if width is None and depth is None: - self.width = 1 << (size_mb * (2 ** 20) // (cell_size * 8 * 2)).bit_length() - self.depth = (size_mb * (2 ** 20)) // (self.width * cell_size) + self.width = 1 << (size_mb * (2 ** 20) // (cell_bytes * 8 * 2)).bit_length() + self.depth = (size_mb * (2 ** 20)) // (self.width * cell_bytes) elif width is None: self.depth = depth - avail_width = (size_mb * (2 ** 20)) // (depth * cell_size) + avail_width = (size_mb * (2 ** 20)) // (depth * cell_bytes) self.width = 1 << (avail_width.bit_length() - 1) if not self.width: raise ValueError("Requested depth is too large for maximum memory size.") @@ -88,7 +100,7 @@ def __init__(self, size_mb=64, width=None, depth=None, log_counting=None): if width != 1 << (width.bit_length() - 1): raise ValueError("Requested width must be a power of 2.") self.width = width - self.depth = (size_mb * (2 ** 20)) // (width * cell_size) + self.depth = (size_mb * (2 ** 20)) // (width * cell_bytes) if not self.depth: raise ValueError("Requested width is too large for maximum memory size.") else: @@ -101,8 +113,10 @@ def __init__(self, size_mb=64, width=None, depth=None, log_counting=None): self.cms = cmsc.CMS_Log8(width=self.width, depth=self.depth) elif log_counting == 1024: self.cms = cmsc.CMS_Log1024(width=self.width, depth=self.depth) - elif log_counting is None: + elif cell_size == CellSize.BITS_32: self.cms = cmsc.CMS_Conservative(width=self.width, depth=self.depth) + elif cell_size == CellSize.BITS_64: + self.cms = cmsc.CMS64_Conservative(width=self.width, depth=self.depth) else: raise ValueError("Unsupported parameter log_counting=%s. Use None, 8, or 1024." % log_counting) @@ -110,12 +124,18 @@ def __init__(self, size_mb=64, width=None, depth=None, log_counting=None): self.increment = self.cms.increment @staticmethod - def cell_size(log_counting=None): + def cell_size(cell_size: CellSize, log_counting=None): if log_counting == 8: return 1 if log_counting == 1024: return 2 - return 4 + if cell_size == CellSize.BITS_32: + return 4 + if cell_size == CellSize.BITS_64: + return 8 + raise ValueError( + "cell_size must be one of [BITS32, BITS64] and log_counting one of [None, 8, 1024]" + ) @staticmethod def table_size(width, depth=4, log_counting=None): diff --git a/cbounter/cms64_conservative.h b/cbounter/cms64_conservative.h new file mode 100644 index 0000000..11ca3e0 --- /dev/null +++ b/cbounter/cms64_conservative.h @@ -0,0 +1,32 @@ +//----------------------------------------------------------------------------- +// Author: Josep Pon Farreny +// Copyright (C) 2017 Rare Technologies +// +// This code is distributed under the terms and conditions +// from the MIT License (MIT). + +#ifndef _CMS64_CONSERVATIE_H_ +#define _CMS64_CONSERVATIE_H_ + +#define CMS_TYPE CMS64_Conservative +#define CMS_TYPE_STRING "CMS64_Conservative" +#define CMS_CELL_TYPE uint64_t + +#include "cms_common.h" + +static inline int CMS_VARIANT(should_inc)(CMS_CELL_TYPE value) +{ + return 1; +} + +static inline long long CMS_VARIANT(decode)(CMS_CELL_TYPE value) +{ + return value; +} + +static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value) (CMS_CELL_TYPE v1, CMS_CELL_TYPE v2, uint32_t merge_seed) +{ + return v1 + v2; +} + +#endif /* _CMS64_CONSERVATIE_H_ */ \ No newline at end of file diff --git a/cbounter/cms_cmodule.c b/cbounter/cms_cmodule.c index db1ef7a..ba39123 100644 --- a/cbounter/cms_cmodule.c +++ b/cbounter/cms_cmodule.c @@ -18,7 +18,8 @@ static inline uint32_t rand_32b() return r; } -#include "cms_conservative.c" +#include "cms_conservative.h" +#include "cms64_conservative.h" #include "cms_log8.c" #include "cms_log1024.c" #include @@ -50,6 +51,7 @@ PyMODINIT_FUNC initbounter_cmsc(void) { PyObject* m; if (PyType_Ready(&CMS_ConservativeType) < 0 + || PyType_Ready(&CMS64_ConservativeType) < 0 || PyType_Ready(&CMS_Log8Type) < 0 || PyType_Ready(&CMS_Log1024Type) < 0) { @@ -76,6 +78,9 @@ PyMODINIT_FUNC initbounter_cmsc(void) Py_INCREF(&CMS_ConservativeType); PyModule_AddObject(m, "CMS_Conservative", (PyObject *)&CMS_ConservativeType); + Py_INCREF(&CMS64_ConservativeType); + PyModule_AddObject(m, "CMS64_Conservative", (PyObject *)&CMS64_ConservativeType); + srand(time(NULL)); Py_INCREF(&CMS_Log8Type); diff --git a/cbounter/cms_common.c b/cbounter/cms_common.h similarity index 99% rename from cbounter/cms_common.c rename to cbounter/cms_common.h index e58d757..2ed9f71 100644 --- a/cbounter/cms_common.c +++ b/cbounter/cms_common.h @@ -136,8 +136,9 @@ CMS_VARIANT(_increment_obj)(CMS_TYPE *self, char *data, Py_ssize_t dataLength, l } CMS_CELL_TYPE result = min_value; - for (; increment > 0; increment--) + for (; increment > 0; increment--) { result += CMS_VARIANT(should_inc)(result); + } if (result > min_value) { diff --git a/cbounter/cms_conservative.c b/cbounter/cms_conservative.h similarity index 85% rename from cbounter/cms_conservative.c rename to cbounter/cms_conservative.h index d2a2253..92f9e66 100644 --- a/cbounter/cms_conservative.c +++ b/cbounter/cms_conservative.h @@ -5,11 +5,14 @@ // This code is distributed under the terms and conditions // from the MIT License (MIT). +#ifndef _CMS_CONSERVATIE_H_ +#define _CMS_CONSERVATIE_H_ + #define CMS_TYPE CMS_Conservative #define CMS_TYPE_STRING "CMS_Conservative" #define CMS_CELL_TYPE uint32_t -#include "cms_common.c" +#include "cms_common.h" static inline int CMS_VARIANT(should_inc)(CMS_CELL_TYPE value) { @@ -25,3 +28,5 @@ static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value) (CMS_CELL_TYPE v1, CMS_CEL { return v1 + v2; } + +#endif /* _CMS_CONSERVATIE_H_ */ \ No newline at end of file diff --git a/cbounter/cms_log1024.c b/cbounter/cms_log1024.c index e6f7929..23ac059 100644 --- a/cbounter/cms_log1024.c +++ b/cbounter/cms_log1024.c @@ -9,7 +9,7 @@ #define CMS_TYPE_STRING "CMS_Log1024" #define CMS_CELL_TYPE uint16_t -#include "cms_common.c" +#include "cms_common.h" static inline int CMS_VARIANT(should_inc)(CMS_CELL_TYPE value) { diff --git a/cbounter/cms_log8.c b/cbounter/cms_log8.c index 33c47fa..c74fb25 100644 --- a/cbounter/cms_log8.c +++ b/cbounter/cms_log8.c @@ -9,7 +9,7 @@ #define CMS_TYPE_STRING "CMS_Log8" #define CMS_CELL_TYPE uint8_t -#include "cms_common.c" +#include "cms_common.h" static inline int CMS_VARIANT(should_inc)(CMS_CELL_TYPE value) { From 5b531df4d090ae563044df59aba8b3538dda61f7 Mon Sep 17 00:00:00 2001 From: Josep Pon Farreny Date: Wed, 2 Nov 2022 10:08:57 +0100 Subject: [PATCH 2/7] Added missing doc on new cell_size parameter --- bounter/count_min_sketch.py | 67 ++++++++++++++++++++++++------------- 1 file changed, 43 insertions(+), 24 deletions(-) diff --git a/bounter/count_min_sketch.py b/bounter/count_min_sketch.py index 84f7a33..4cc5f19 100644 --- a/bounter/count_min_sketch.py +++ b/bounter/count_min_sketch.py @@ -54,12 +54,13 @@ class CountMinSketch(object): """ def __init__( - self, - size_mb=64, - cell_size=CellSize.BITS_32, - width=None, - depth=None, - log_counting=None,): + self, + size_mb=64, + cell_size=CellSize.BITS_32, + width=None, + depth=None, + log_counting=None, + ): """ Initialize the Count-Min Sketch structure with the given parameters @@ -75,8 +76,9 @@ def __init__( If width is not provided, the algorithm chooses the maximum width to fill the available size. The more, the better, should be very large, preferably in the same order of magnitude as the cardinality of the counted set. + cell_size (CellSize): Size of the cells when `log_counting` is None. log_counting (int): Use logarithmic approximate counter value for reduced bucket size: - - None (default): 4B, no counter error + - None (default): 4B or 8B according to `cell_size`, no counter error - 1024: 2B, value approximation error ~2% for values larger than 2048 - 8: 1B, value approximation error ~30% for values larger than 16 """ @@ -85,24 +87,30 @@ def __init__( self.cell_size_v = cell_bytes if size_mb is None or not isinstance(size_mb, int): - raise ValueError("size_mb must be an integer representing the maximum size of the structure in MB") + raise ValueError( + "size_mb must be an integer representing the maximum size of the structure in MB" + ) if width is None and depth is None: - self.width = 1 << (size_mb * (2 ** 20) // (cell_bytes * 8 * 2)).bit_length() - self.depth = (size_mb * (2 ** 20)) // (self.width * cell_bytes) + self.width = 1 << (size_mb * (2**20) // (cell_bytes * 8 * 2)).bit_length() + self.depth = (size_mb * (2**20)) // (self.width * cell_bytes) elif width is None: self.depth = depth - avail_width = (size_mb * (2 ** 20)) // (depth * cell_bytes) + avail_width = (size_mb * (2**20)) // (depth * cell_bytes) self.width = 1 << (avail_width.bit_length() - 1) if not self.width: - raise ValueError("Requested depth is too large for maximum memory size.") + raise ValueError( + "Requested depth is too large for maximum memory size." + ) elif depth is None: if width != 1 << (width.bit_length() - 1): raise ValueError("Requested width must be a power of 2.") self.width = width - self.depth = (size_mb * (2 ** 20)) // (width * cell_bytes) + self.depth = (size_mb * (2**20)) // (width * cell_bytes) if not self.depth: - raise ValueError("Requested width is too large for maximum memory size.") + raise ValueError( + "Requested width is too large for maximum memory size." + ) else: if width != 1 << (width.bit_length() - 1): raise ValueError("Requested width must be a power of 2.") @@ -113,12 +121,20 @@ def __init__( self.cms = cmsc.CMS_Log8(width=self.width, depth=self.depth) elif log_counting == 1024: self.cms = cmsc.CMS_Log1024(width=self.width, depth=self.depth) - elif cell_size == CellSize.BITS_32: - self.cms = cmsc.CMS_Conservative(width=self.width, depth=self.depth) - elif cell_size == CellSize.BITS_64: - self.cms = cmsc.CMS64_Conservative(width=self.width, depth=self.depth) + elif log_counting is None: + if cell_size == CellSize.BITS_32: + self.cms = cmsc.CMS_Conservative(width=self.width, depth=self.depth) + elif cell_size == CellSize.BITS_64: + self.cms = cmsc.CMS64_Conservative(width=self.width, depth=self.depth) + else: + raise ValueError( + f"Unsupported parameter cell_size={cell_size}." + " Use CellSize.BITS_32 or CellSize.BITS_64." + ) else: - raise ValueError("Unsupported parameter log_counting=%s. Use None, 8, or 1024." % log_counting) + raise ValueError( + f"Unsupported parameter log_counting={log_counting}. Use None, 8, or 1024." + ) # optimize calls by directly binding to C implementation self.increment = self.cms.increment @@ -129,10 +145,11 @@ def cell_size(cell_size: CellSize, log_counting=None): return 1 if log_counting == 1024: return 2 - if cell_size == CellSize.BITS_32: - return 4 - if cell_size == CellSize.BITS_64: - return 8 + if log_counting is None: + if cell_size == CellSize.BITS_32: + return 4 + if cell_size == CellSize.BITS_64: + return 8 raise ValueError( "cell_size must be one of [BITS32, BITS64] and log_counting one of [None, 8, 1024]" ) @@ -212,4 +229,6 @@ def __init__(self): super(CardinalityEstimator, self).__init__(width=1, depth=1) def __getitem__(self, key): - raise NotImplementedError("Individual item counting is not supported for cardinality estimator!") + raise NotImplementedError( + "Individual item counting is not supported for cardinality estimator!" + ) From 9785d7bfd37b2dcf51e21a9c17fe21b95e89c87b Mon Sep 17 00:00:00 2001 From: Josep Pon Farreny Date: Wed, 2 Nov 2022 11:08:50 +0100 Subject: [PATCH 3/7] Added undef to remove redefined warning --- cbounter/cms64_conservative.h | 6 +++++- cbounter/cms_conservative.h | 6 +++++- cbounter/cms_log1024.c | 11 ++++++++--- cbounter/cms_log8.c | 11 ++++++++--- 4 files changed, 26 insertions(+), 8 deletions(-) diff --git a/cbounter/cms64_conservative.h b/cbounter/cms64_conservative.h index 11ca3e0..5447cdf 100644 --- a/cbounter/cms64_conservative.h +++ b/cbounter/cms64_conservative.h @@ -24,9 +24,13 @@ static inline long long CMS_VARIANT(decode)(CMS_CELL_TYPE value) return value; } -static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value) (CMS_CELL_TYPE v1, CMS_CELL_TYPE v2, uint32_t merge_seed) +static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value)(CMS_CELL_TYPE v1, CMS_CELL_TYPE v2, uint32_t merge_seed) { return v1 + v2; } +#undef CMS_TYPE +#undef CMS_TYPE_STRING +#undef CMS_CELL_TYPE + #endif /* _CMS64_CONSERVATIE_H_ */ \ No newline at end of file diff --git a/cbounter/cms_conservative.h b/cbounter/cms_conservative.h index 92f9e66..de21232 100644 --- a/cbounter/cms_conservative.h +++ b/cbounter/cms_conservative.h @@ -24,9 +24,13 @@ static inline long long CMS_VARIANT(decode)(CMS_CELL_TYPE value) return value; } -static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value) (CMS_CELL_TYPE v1, CMS_CELL_TYPE v2, uint32_t merge_seed) +static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value)(CMS_CELL_TYPE v1, CMS_CELL_TYPE v2, uint32_t merge_seed) { return v1 + v2; } +#undef CMS_TYPE +#undef CMS_TYPE_STRING +#undef CMS_CELL_TYPE + #endif /* _CMS_CONSERVATIE_H_ */ \ No newline at end of file diff --git a/cbounter/cms_log1024.c b/cbounter/cms_log1024.c index 23ac059..d4e456f 100644 --- a/cbounter/cms_log1024.c +++ b/cbounter/cms_log1024.c @@ -17,7 +17,8 @@ static inline int CMS_VARIANT(should_inc)(CMS_CELL_TYPE value) { uint8_t shift = 33 - (value >> 10); uint32_t mask = 0xFFFFFFFF >> shift; - if (mask & rand_32b()) return 0; + if (mask & rand_32b()) + return 0; } return 1; } @@ -30,7 +31,7 @@ static inline long long CMS_VARIANT(decode)(CMS_CELL_TYPE value) return (1024 + (value & 1023)) << ((value >> 10) - 1); } -static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value) (CMS_CELL_TYPE v1, CMS_CELL_TYPE v2, uint32_t merge_seed) +static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value)(CMS_CELL_TYPE v1, CMS_CELL_TYPE v2, uint32_t merge_seed) { long long decoded = CMS_VARIANT(decode)(v1); decoded += CMS_VARIANT(decode)(v2); @@ -55,8 +56,12 @@ static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value) (CMS_CELL_TYPE v1, CMS_CEL uint32_t mask = 0xFFFFFFFF >> shift; uint32_t r; - MurmurHash3_x86_32 ((void *) &decoded, 8, merge_seed, (void *) &r); + MurmurHash3_x86_32((void *)&decoded, 8, merge_seed, (void *)&r); uint32_t remainder = mask & decoded; return (log_result << 10) + (h & 1023) + ((mask & r) < remainder); } + +#undef CMS_TYPE +#undef CMS_TYPE_STRING +#undef CMS_CELL_TYPE \ No newline at end of file diff --git a/cbounter/cms_log8.c b/cbounter/cms_log8.c index c74fb25..be9d247 100644 --- a/cbounter/cms_log8.c +++ b/cbounter/cms_log8.c @@ -17,7 +17,8 @@ static inline int CMS_VARIANT(should_inc)(CMS_CELL_TYPE value) { uint8_t shift = 33 - (value >> 3); uint32_t mask = 0xFFFFFFFF >> shift; - if (mask & rand_32b()) return 0; + if (mask & rand_32b()) + return 0; } return 1; } @@ -32,7 +33,7 @@ static inline long long CMS_VARIANT(decode)(CMS_CELL_TYPE value) #include -static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value) (CMS_CELL_TYPE v1, CMS_CELL_TYPE v2, uint32_t merge_seed) +static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value)(CMS_CELL_TYPE v1, CMS_CELL_TYPE v2, uint32_t merge_seed) { long long decoded = CMS_VARIANT(decode)(v1); decoded += CMS_VARIANT(decode)(v2); @@ -57,8 +58,12 @@ static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value) (CMS_CELL_TYPE v1, CMS_CEL uint32_t mask = 0xFFFFFFFF >> shift; uint32_t r; - MurmurHash3_x86_32 ((void *) &decoded, 8, merge_seed, (void *) &r); + MurmurHash3_x86_32((void *)&decoded, 8, merge_seed, (void *)&r); uint32_t remainder = mask & decoded; return (log_result << 3) + (h & 7) + ((mask & r) < remainder); } + +#undef CMS_TYPE +#undef CMS_TYPE_STRING +#undef CMS_CELL_TYPE \ No newline at end of file From 46836a1737c6c3e491c2a1f4146e5617c732b79e Mon Sep 17 00:00:00 2001 From: Josep Pon Farreny Date: Wed, 2 Nov 2022 11:15:04 +0100 Subject: [PATCH 4/7] Removed f-strings in favor of good old ".format". The project CI runs on 3.4 and 3.5 which do not support f-strings. --- bounter/count_min_sketch.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/bounter/count_min_sketch.py b/bounter/count_min_sketch.py index 4cc5f19..3a434a6 100644 --- a/bounter/count_min_sketch.py +++ b/bounter/count_min_sketch.py @@ -128,12 +128,15 @@ def __init__( self.cms = cmsc.CMS64_Conservative(width=self.width, depth=self.depth) else: raise ValueError( - f"Unsupported parameter cell_size={cell_size}." - " Use CellSize.BITS_32 or CellSize.BITS_64." + "Unsupported parameter cell_size={}. Use CellSize.BITS_32 or CellSize.BITS_64.".format( + cell_size + ) ) else: raise ValueError( - f"Unsupported parameter log_counting={log_counting}. Use None, 8, or 1024." + "Unsupported parameter log_counting={}. Use None, 8, or 1024.".format( + log_counting + ) ) # optimize calls by directly binding to C implementation From 930d7f2d889ef881815fa45d98fc9faa05a5345e Mon Sep 17 00:00:00 2001 From: Josep Pon Farreny Date: Wed, 2 Nov 2022 11:23:35 +0100 Subject: [PATCH 5/7] Removed typing syntax (not supported in 2.7) --- bounter/count_min_sketch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bounter/count_min_sketch.py b/bounter/count_min_sketch.py index 3a434a6..e3dd443 100644 --- a/bounter/count_min_sketch.py +++ b/bounter/count_min_sketch.py @@ -143,7 +143,7 @@ def __init__( self.increment = self.cms.increment @staticmethod - def cell_size(cell_size: CellSize, log_counting=None): + def cell_size(cell_size, log_counting=None): if log_counting == 8: return 1 if log_counting == 1024: From ed61741313e4b92cf4ff78aa21312261e7eb3bd2 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Thu, 3 Nov 2022 14:39:52 +0900 Subject: [PATCH 6/7] Update .travis.yml --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 2153e12..59ba7df 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,7 +4,7 @@ python: - "3.4" - "3.5" - "3.6" - - "3.7" + - "3.7.13" install: - pip install -r requirements.txt From 3930655e3a4838fed8f0bc0c6dbdc0178b522f5e Mon Sep 17 00:00:00 2001 From: Josep Pon Farreny Date: Thu, 3 Nov 2022 15:56:56 +0100 Subject: [PATCH 7/7] fix(cms): Changed .format to % as requested --- bounter/count_min_sketch.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/bounter/count_min_sketch.py b/bounter/count_min_sketch.py index e3dd443..307ab1b 100644 --- a/bounter/count_min_sketch.py +++ b/bounter/count_min_sketch.py @@ -34,9 +34,9 @@ class CountMinSketch(object): To calculate memory footprint: ( width * depth * cell_size ) + HLL size Cell size is - - 4B for default counting - - 2B for log1024 counting - - 1B for log8 counting + - 4B for default counting + - 2B for log1024 counting + - 1B for log8 counting HLL size is 64 KB Memory usage example: width 2^25 (33 554 432), depth 8, log1024 (2B) has 2^(25 + 3 + 1) + 64 KB = 512.06 MB @@ -56,10 +56,10 @@ class CountMinSketch(object): def __init__( self, size_mb=64, - cell_size=CellSize.BITS_32, width=None, depth=None, log_counting=None, + cell_size=CellSize.BITS_32, ): """ Initialize the Count-Min Sketch structure with the given parameters @@ -76,11 +76,11 @@ def __init__( If width is not provided, the algorithm chooses the maximum width to fill the available size. The more, the better, should be very large, preferably in the same order of magnitude as the cardinality of the counted set. - cell_size (CellSize): Size of the cells when `log_counting` is None. log_counting (int): Use logarithmic approximate counter value for reduced bucket size: - None (default): 4B or 8B according to `cell_size`, no counter error - 1024: 2B, value approximation error ~2% for values larger than 2048 - 8: 1B, value approximation error ~30% for values larger than 16 + cell_size (CellSize): Size of the cells when `log_counting` is None. """ cell_bytes = CountMinSketch.cell_size(cell_size, log_counting) @@ -128,15 +128,13 @@ def __init__( self.cms = cmsc.CMS64_Conservative(width=self.width, depth=self.depth) else: raise ValueError( - "Unsupported parameter cell_size={}. Use CellSize.BITS_32 or CellSize.BITS_64.".format( - cell_size - ) + "Unsupported parameter cell_size=%s. Use CellSize.BITS_32 or CellSize.BITS_64." + % (cell_size) ) else: raise ValueError( - "Unsupported parameter log_counting={}. Use None, 8, or 1024.".format( - log_counting - ) + "Unsupported parameter log_counting=%s. Use None, 8, or 1024." + % (log_counting) ) # optimize calls by directly binding to C implementation