Skip to content

Commit

Permalink
Merge pull request #53 from jponf/master
Browse files Browse the repository at this point in the history
Add 64bit Min Sketch
  • Loading branch information
mpenkov authored Nov 4, 2022
2 parents 21aeda1 + 3930655 commit b34ab50
Show file tree
Hide file tree
Showing 8 changed files with 133 additions and 32 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ python:
- "3.4"
- "3.5"
- "3.6"
- "3.7"
- "3.7.13"

install:
- pip install -r requirements.txt
Expand Down
78 changes: 59 additions & 19 deletions bounter/count_min_sketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,15 @@
# This code is distributed under the terms and conditions
# from the MIT License (MIT).

import enum
import bounter_cmsc as cmsc


class CellSize(enum.Enum):
    """Width of one counter cell when logarithmic counting is not used.

    Each member's value is the cell width in bits. The width is only
    consulted when ``log_counting`` is None; the log8 and log1024 variants
    use fixed 1-byte and 2-byte cells respectively.
    """

    # 32-bit cells (4 bytes each); the default for CountMinSketch.
    BITS_32 = 32
    # 64-bit cells (8 bytes each).
    BITS_64 = 64


class CountMinSketch(object):
"""
Data structure used to estimate frequencies of elements in massive data sets with fixed memory footprint.
Expand All @@ -28,9 +34,9 @@ class CountMinSketch(object):
To calculate memory footprint:
( width * depth * cell_size ) + HLL size
Cell size is
- 4B for default counting
- 2B for log1024 counting
- 1B for log8 counting
- 4B (or 8B when cell_size is CellSize.BITS_64) for default counting
- 2B for log1024 counting
- 1B for log8 counting
HLL size is 64 KB
Memory usage example:
width 2^25 (33 554 432), depth 8, log1024 (2B) has 2^(25 + 3 + 1) + 64 KB = 512.06 MB
Expand All @@ -47,7 +53,14 @@ class CountMinSketch(object):
counting as the collision bias will already be minimal.
"""

def __init__(self, size_mb=64, width=None, depth=None, log_counting=None):
def __init__(
self,
size_mb=64,
width=None,
depth=None,
log_counting=None,
cell_size=CellSize.BITS_32,
):
"""
Initialize the Count-Min Sketch structure with the given parameters
Expand All @@ -64,33 +77,40 @@ def __init__(self, size_mb=64, width=None, depth=None, log_counting=None):
The more, the better, should be very large, preferably in the same order of magnitude as the cardinality
of the counted set.
log_counting (int): Use logarithmic approximate counter value for reduced bucket size:
- None (default): 4B, no counter error
- None (default): 4B or 8B according to `cell_size`, no counter error
- 1024: 2B, value approximation error ~2% for values larger than 2048
- 8: 1B, value approximation error ~30% for values larger than 16
cell_size (CellSize): Size of the cells when `log_counting` is None.
"""

cell_size = CountMinSketch.cell_size(log_counting)
self.cell_size_v = cell_size
cell_bytes = CountMinSketch.cell_size(cell_size, log_counting)
self.cell_size_v = cell_bytes

if size_mb is None or not isinstance(size_mb, int):
raise ValueError("size_mb must be an integer representing the maximum size of the structure in MB")
raise ValueError(
"size_mb must be an integer representing the maximum size of the structure in MB"
)

if width is None and depth is None:
self.width = 1 << (size_mb * (2 ** 20) // (cell_size * 8 * 2)).bit_length()
self.depth = (size_mb * (2 ** 20)) // (self.width * cell_size)
self.width = 1 << (size_mb * (2**20) // (cell_bytes * 8 * 2)).bit_length()
self.depth = (size_mb * (2**20)) // (self.width * cell_bytes)
elif width is None:
self.depth = depth
avail_width = (size_mb * (2 ** 20)) // (depth * cell_size)
avail_width = (size_mb * (2**20)) // (depth * cell_bytes)
self.width = 1 << (avail_width.bit_length() - 1)
if not self.width:
raise ValueError("Requested depth is too large for maximum memory size.")
raise ValueError(
"Requested depth is too large for maximum memory size."
)
elif depth is None:
if width != 1 << (width.bit_length() - 1):
raise ValueError("Requested width must be a power of 2.")
self.width = width
self.depth = (size_mb * (2 ** 20)) // (width * cell_size)
self.depth = (size_mb * (2**20)) // (width * cell_bytes)
if not self.depth:
raise ValueError("Requested width is too large for maximum memory size.")
raise ValueError(
"Requested width is too large for maximum memory size."
)
else:
if width != 1 << (width.bit_length() - 1):
raise ValueError("Requested width must be a power of 2.")
Expand All @@ -102,20 +122,38 @@ def __init__(self, size_mb=64, width=None, depth=None, log_counting=None):
elif log_counting == 1024:
self.cms = cmsc.CMS_Log1024(width=self.width, depth=self.depth)
elif log_counting is None:
self.cms = cmsc.CMS_Conservative(width=self.width, depth=self.depth)
if cell_size == CellSize.BITS_32:
self.cms = cmsc.CMS_Conservative(width=self.width, depth=self.depth)
elif cell_size == CellSize.BITS_64:
self.cms = cmsc.CMS64_Conservative(width=self.width, depth=self.depth)
else:
raise ValueError(
"Unsupported parameter cell_size=%s. Use CellSize.BITS_32 or CellSize.BITS_64."
% (cell_size)
)
else:
raise ValueError("Unsupported parameter log_counting=%s. Use None, 8, or 1024." % log_counting)
raise ValueError(
"Unsupported parameter log_counting=%s. Use None, 8, or 1024."
% (log_counting)
)

# optimize calls by directly binding to C implementation
self.increment = self.cms.increment

@staticmethod
def cell_size(log_counting=None):
def cell_size(cell_size, log_counting=None):
if log_counting == 8:
return 1
if log_counting == 1024:
return 2
return 4
if log_counting is None:
if cell_size == CellSize.BITS_32:
return 4
if cell_size == CellSize.BITS_64:
return 8
raise ValueError(
"cell_size must be one of [BITS32, BITS64] and log_counting one of [None, 8, 1024]"
)

@staticmethod
def table_size(width, depth=4, log_counting=None):
Expand Down Expand Up @@ -192,4 +230,6 @@ def __init__(self):
super(CardinalityEstimator, self).__init__(width=1, depth=1)

def __getitem__(self, key):
raise NotImplementedError("Individual item counting is not supported for cardinality estimator!")
raise NotImplementedError(
"Individual item counting is not supported for cardinality estimator!"
)
36 changes: 36 additions & 0 deletions cbounter/cms64_conservative.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
//-----------------------------------------------------------------------------
// Author: Josep Pon Farreny <jponfarreny@gmail.com>
// Copyright (C) 2017 Rare Technologies
//
// This code is distributed under the terms and conditions
// from the MIT License (MIT).

// CMS64_Conservative: Count-Min Sketch variant whose cells are uint64_t.
//
// The generic implementation lives in cms_common.h and is parameterised by
// the CMS_TYPE, CMS_TYPE_STRING and CMS_CELL_TYPE macros, which must be
// defined before that header is included.  This file defines them, includes
// the generic code, supplies the three per-variant hooks below, and finally
// undefines the macros again so that another variant can be instantiated in
// the same translation unit.

// The guard name deliberately avoids a leading underscore followed by an
// uppercase letter: such identifiers are reserved for the C implementation.
#ifndef CMS64_CONSERVATIVE_H
#define CMS64_CONSERVATIVE_H

#define CMS_TYPE CMS64_Conservative
#define CMS_TYPE_STRING "CMS64_Conservative"
#define CMS_CELL_TYPE uint64_t

#include "cms_common.h"

// Amount added to a cell for a single increment step.  The generic code adds
// this return value to the running cell value once per requested increment.
// Cells of this variant hold exact counts, so the step is always 1 (the log
// variants return 0 with some probability instead).
static inline int CMS_VARIANT(should_inc)(CMS_CELL_TYPE value)
{
    return 1;
}

// Convert a stored cell value to the count it represents.  Cells store the
// count itself, so no decoding is required.
// NOTE(review): the return type is long long, so a cell value above
// LLONG_MAX cannot be represented exactly - confirm callers never get there.
static inline long long CMS_VARIANT(decode)(CMS_CELL_TYPE value)
{
    return (long long)value;
}

// Combine two cell values when merging sketches.  Exact counts are simply
// summed (unsigned arithmetic, so an overflow wraps around); merge_seed is
// unused here and only matters to the log variants, which hash with it.
static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value)(CMS_CELL_TYPE v1, CMS_CELL_TYPE v2, uint32_t merge_seed)
{
    return v1 + v2;
}

#undef CMS_TYPE
#undef CMS_TYPE_STRING
#undef CMS_CELL_TYPE

#endif /* CMS64_CONSERVATIVE_H */
7 changes: 6 additions & 1 deletion cbounter/cms_cmodule.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ static inline uint32_t rand_32b()
return r;
}

#include "cms_conservative.c"
#include "cms_conservative.h"
#include "cms64_conservative.h"
#include "cms_log8.c"
#include "cms_log1024.c"
#include <time.h>
Expand Down Expand Up @@ -50,6 +51,7 @@ PyMODINIT_FUNC initbounter_cmsc(void)
{
PyObject* m;
if (PyType_Ready(&CMS_ConservativeType) < 0
|| PyType_Ready(&CMS64_ConservativeType) < 0
|| PyType_Ready(&CMS_Log8Type) < 0
|| PyType_Ready(&CMS_Log1024Type) < 0) {

Expand All @@ -76,6 +78,9 @@ PyMODINIT_FUNC initbounter_cmsc(void)
Py_INCREF(&CMS_ConservativeType);
PyModule_AddObject(m, "CMS_Conservative", (PyObject *)&CMS_ConservativeType);

Py_INCREF(&CMS64_ConservativeType);
PyModule_AddObject(m, "CMS64_Conservative", (PyObject *)&CMS64_ConservativeType);

srand(time(NULL));

Py_INCREF(&CMS_Log8Type);
Expand Down
3 changes: 2 additions & 1 deletion cbounter/cms_common.c → cbounter/cms_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,9 @@ CMS_VARIANT(_increment_obj)(CMS_TYPE *self, char *data, Py_ssize_t dataLength, l
}

CMS_CELL_TYPE result = min_value;
for (; increment > 0; increment--)
for (; increment > 0; increment--) {
result += CMS_VARIANT(should_inc)(result);
}

if (result > min_value)
{
Expand Down
13 changes: 11 additions & 2 deletions cbounter/cms_conservative.c → cbounter/cms_conservative.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,14 @@
// This code is distributed under the terms and conditions
// from the MIT License (MIT).

#ifndef _CMS_CONSERVATIE_H_
#define _CMS_CONSERVATIE_H_

#define CMS_TYPE CMS_Conservative
#define CMS_TYPE_STRING "CMS_Conservative"
#define CMS_CELL_TYPE uint32_t

#include "cms_common.c"
#include "cms_common.h"

static inline int CMS_VARIANT(should_inc)(CMS_CELL_TYPE value)
{
Expand All @@ -21,7 +24,13 @@ static inline long long CMS_VARIANT(decode)(CMS_CELL_TYPE value)
return value;
}

static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value) (CMS_CELL_TYPE v1, CMS_CELL_TYPE v2, uint32_t merge_seed)
static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value)(CMS_CELL_TYPE v1, CMS_CELL_TYPE v2, uint32_t merge_seed)
{
return v1 + v2;
}

#undef CMS_TYPE
#undef CMS_TYPE_STRING
#undef CMS_CELL_TYPE

#endif /* _CMS_CONSERVATIE_H_ */
13 changes: 9 additions & 4 deletions cbounter/cms_log1024.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,16 @@
#define CMS_TYPE_STRING "CMS_Log1024"
#define CMS_CELL_TYPE uint16_t

#include "cms_common.c"
#include "cms_common.h"

static inline int CMS_VARIANT(should_inc)(CMS_CELL_TYPE value)
{
if (value >= 2048)
{
uint8_t shift = 33 - (value >> 10);
uint32_t mask = 0xFFFFFFFF >> shift;
if (mask & rand_32b()) return 0;
if (mask & rand_32b())
return 0;
}
return 1;
}
Expand All @@ -30,7 +31,7 @@ static inline long long CMS_VARIANT(decode)(CMS_CELL_TYPE value)
return (1024 + (value & 1023)) << ((value >> 10) - 1);
}

static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value) (CMS_CELL_TYPE v1, CMS_CELL_TYPE v2, uint32_t merge_seed)
static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value)(CMS_CELL_TYPE v1, CMS_CELL_TYPE v2, uint32_t merge_seed)
{
long long decoded = CMS_VARIANT(decode)(v1);
decoded += CMS_VARIANT(decode)(v2);
Expand All @@ -55,8 +56,12 @@ static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value) (CMS_CELL_TYPE v1, CMS_CEL
uint32_t mask = 0xFFFFFFFF >> shift;

uint32_t r;
MurmurHash3_x86_32 ((void *) &decoded, 8, merge_seed, (void *) &r);
MurmurHash3_x86_32((void *)&decoded, 8, merge_seed, (void *)&r);
uint32_t remainder = mask & decoded;

return (log_result << 10) + (h & 1023) + ((mask & r) < remainder);
}

#undef CMS_TYPE
#undef CMS_TYPE_STRING
#undef CMS_CELL_TYPE
13 changes: 9 additions & 4 deletions cbounter/cms_log8.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,16 @@
#define CMS_TYPE_STRING "CMS_Log8"
#define CMS_CELL_TYPE uint8_t

#include "cms_common.c"
#include "cms_common.h"

static inline int CMS_VARIANT(should_inc)(CMS_CELL_TYPE value)
{
if (value >= 16)
{
uint8_t shift = 33 - (value >> 3);
uint32_t mask = 0xFFFFFFFF >> shift;
if (mask & rand_32b()) return 0;
if (mask & rand_32b())
return 0;
}
return 1;
}
Expand All @@ -32,7 +33,7 @@ static inline long long CMS_VARIANT(decode)(CMS_CELL_TYPE value)

#include <stdio.h>

static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value) (CMS_CELL_TYPE v1, CMS_CELL_TYPE v2, uint32_t merge_seed)
static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value)(CMS_CELL_TYPE v1, CMS_CELL_TYPE v2, uint32_t merge_seed)
{
long long decoded = CMS_VARIANT(decode)(v1);
decoded += CMS_VARIANT(decode)(v2);
Expand All @@ -57,8 +58,12 @@ static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value) (CMS_CELL_TYPE v1, CMS_CEL
uint32_t mask = 0xFFFFFFFF >> shift;

uint32_t r;
MurmurHash3_x86_32 ((void *) &decoded, 8, merge_seed, (void *) &r);
MurmurHash3_x86_32((void *)&decoded, 8, merge_seed, (void *)&r);
uint32_t remainder = mask & decoded;

return (log_result << 3) + (h & 7) + ((mask & r) < remainder);
}

#undef CMS_TYPE
#undef CMS_TYPE_STRING
#undef CMS_CELL_TYPE

0 comments on commit b34ab50

Please sign in to comment.