Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add 64bit Min Sketch #53

Merged
merged 7 commits into from
Nov 4, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ python:
- "3.4"
- "3.5"
- "3.6"
- "3.7"
- "3.7.13"

install:
- pip install -r requirements.txt
Expand Down
78 changes: 59 additions & 19 deletions bounter/count_min_sketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,15 @@
# This code is distributed under the terms and conditions
# from the MIT License (MIT).

import enum
import bounter_cmsc as cmsc


@enum.unique
class CellSize(enum.Enum):
    """Width of one counter cell when linear (non-logarithmic) counting is used.

    Each member's value is the cell width in bits. ``@enum.unique`` makes the
    class definition fail if two members are ever given the same width, so a
    new size cannot silently become an alias of an existing one.
    """

    BITS_32 = 32  # 4-byte cells
    BITS_64 = 64  # 8-byte cells


class CountMinSketch(object):
"""
Data structure used to estimate frequencies of elements in massive data sets with fixed memory footprint.
Expand All @@ -28,9 +34,9 @@ class CountMinSketch(object):
To calculate memory footprint:
( width * depth * cell_size ) + HLL size
Cell size is
- 4B for default counting
- 2B for log1024 counting
- 1B for log8 counting
- 4B for default counting (8B when `cell_size` is CellSize.BITS_64)
- 2B for log1024 counting
- 1B for log8 counting
HLL size is 64 KB
Memory usage example:
width 2^25 (33 554 432), depth 8, log1024 (2B) has 2^(25 + 3 + 1) + 64 KB = 512.06 MB
Expand All @@ -47,7 +53,14 @@ class CountMinSketch(object):
counting as the collision bias will already be minimal.
"""

def __init__(self, size_mb=64, width=None, depth=None, log_counting=None):
def __init__(
self,
size_mb=64,
width=None,
depth=None,
log_counting=None,
cell_size=CellSize.BITS_32,
):
"""
Initialize the Count-Min Sketch structure with the given parameters

Expand All @@ -64,33 +77,40 @@ def __init__(self, size_mb=64, width=None, depth=None, log_counting=None):
The more, the better, should be very large, preferably in the same order of magnitude as the cardinality
of the counted set.
log_counting (int): Use logarithmic approximate counter value for reduced bucket size:
- None (default): 4B, no counter error
- None (default): 4B or 8B according to `cell_size`, no counter error
- 1024: 2B, value approximation error ~2% for values larger than 2048
- 8: 1B, value approximation error ~30% for values larger than 16
cell_size (CellSize): Size of the cells when `log_counting` is None.
"""

cell_size = CountMinSketch.cell_size(log_counting)
self.cell_size_v = cell_size
cell_bytes = CountMinSketch.cell_size(cell_size, log_counting)
self.cell_size_v = cell_bytes

if size_mb is None or not isinstance(size_mb, int):
raise ValueError("size_mb must be an integer representing the maximum size of the structure in MB")
raise ValueError(
"size_mb must be an integer representing the maximum size of the structure in MB"
)

if width is None and depth is None:
self.width = 1 << (size_mb * (2 ** 20) // (cell_size * 8 * 2)).bit_length()
self.depth = (size_mb * (2 ** 20)) // (self.width * cell_size)
self.width = 1 << (size_mb * (2**20) // (cell_bytes * 8 * 2)).bit_length()
self.depth = (size_mb * (2**20)) // (self.width * cell_bytes)
elif width is None:
self.depth = depth
avail_width = (size_mb * (2 ** 20)) // (depth * cell_size)
avail_width = (size_mb * (2**20)) // (depth * cell_bytes)
self.width = 1 << (avail_width.bit_length() - 1)
if not self.width:
raise ValueError("Requested depth is too large for maximum memory size.")
raise ValueError(
"Requested depth is too large for maximum memory size."
)
elif depth is None:
if width != 1 << (width.bit_length() - 1):
raise ValueError("Requested width must be a power of 2.")
self.width = width
self.depth = (size_mb * (2 ** 20)) // (width * cell_size)
self.depth = (size_mb * (2**20)) // (width * cell_bytes)
if not self.depth:
raise ValueError("Requested width is too large for maximum memory size.")
raise ValueError(
"Requested width is too large for maximum memory size."
)
else:
if width != 1 << (width.bit_length() - 1):
raise ValueError("Requested width must be a power of 2.")
Expand All @@ -102,20 +122,38 @@ def __init__(self, size_mb=64, width=None, depth=None, log_counting=None):
elif log_counting == 1024:
self.cms = cmsc.CMS_Log1024(width=self.width, depth=self.depth)
elif log_counting is None:
self.cms = cmsc.CMS_Conservative(width=self.width, depth=self.depth)
if cell_size == CellSize.BITS_32:
self.cms = cmsc.CMS_Conservative(width=self.width, depth=self.depth)
elif cell_size == CellSize.BITS_64:
self.cms = cmsc.CMS64_Conservative(width=self.width, depth=self.depth)
else:
raise ValueError(
"Unsupported parameter cell_size=%s. Use CellSize.BITS_32 or CellSize.BITS_64."
% (cell_size)
)
else:
raise ValueError("Unsupported parameter log_counting=%s. Use None, 8, or 1024." % log_counting)
raise ValueError(
"Unsupported parameter log_counting=%s. Use None, 8, or 1024."
% (log_counting)
)

# optimize calls by directly binding to C implementation
self.increment = self.cms.increment

@staticmethod
def cell_size(cell_size, log_counting=None):
    """Return the size in bytes of a single counter cell.

    Args:
        cell_size (CellSize): Cell width used for linear counting. Only
            consulted when `log_counting` is None.
        log_counting (int): Logarithmic counting mode: None, 8 or 1024.

    Returns:
        int: 1 for log8, 2 for log1024, otherwise 4 (CellSize.BITS_32)
        or 8 (CellSize.BITS_64).

    Raises:
        ValueError: If the combination of `cell_size` and `log_counting`
            is not one of the supported ones.
    """
    # NOTE(review): the first positional parameter used to be `log_counting`.
    # A caller that still passes it positionally, e.g. cell_size(8), now ends
    # up in the ValueError below -- verify the remaining callers (table_size).
    if log_counting == 8:
        return 1
    if log_counting == 1024:
        return 2
    if log_counting is None:
        if cell_size == CellSize.BITS_32:
            return 4
        if cell_size == CellSize.BITS_64:
            return 8
    # Use the real member names so the message can be acted on directly.
    raise ValueError(
        "cell_size must be one of [CellSize.BITS_32, CellSize.BITS_64] "
        "and log_counting one of [None, 8, 1024]"
    )

@staticmethod
def table_size(width, depth=4, log_counting=None):
Expand Down Expand Up @@ -192,4 +230,6 @@ def __init__(self):
super(CardinalityEstimator, self).__init__(width=1, depth=1)

def __getitem__(self, key):
raise NotImplementedError("Individual item counting is not supported for cardinality estimator!")
raise NotImplementedError(
"Individual item counting is not supported for cardinality estimator!"
)
36 changes: 36 additions & 0 deletions cbounter/cms64_conservative.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
//-----------------------------------------------------------------------------
// Author: Josep Pon Farreny <jponfarreny@gmail.com>
// Copyright (C) 2017 Rare Technologies
//
// This code is distributed under the terms and conditions
// from the MIT License (MIT).

// Include guard. The name avoids a leading underscore followed by an
// uppercase letter, because identifiers of that form are reserved for the
// C implementation.
#ifndef CMS64_CONSERVATIVE_H_
#define CMS64_CONSERVATIVE_H_

// Parameters for the generic sketch code in cms_common.h. They must be
// defined before the include and are removed again at the end of this header
// so that another variant can define its own values.
#define CMS_TYPE CMS64_Conservative
#define CMS_TYPE_STRING "CMS64_Conservative"
#define CMS_CELL_TYPE uint64_t

// NOTE(review): CMS_VARIANT is not defined in this header; presumably
// cms_common.h provides it and derives per-variant names from CMS_TYPE.
#include "cms_common.h"

// Amount added to a cell for one increment step. Linear counting is exact,
// so every step counts: the result is always 1, whatever the current value.
static inline int CMS_VARIANT(should_inc)(CMS_CELL_TYPE value)
{
    return 1;
}

// Converts a stored cell into the count it represents. Cells hold the raw
// count, so the value is returned unchanged.
// NOTE(review): the cell is an unsigned 64-bit value while the return type is
// long long; a cell above LLONG_MAX does not fit in the return type.
static inline long long CMS_VARIANT(decode)(CMS_CELL_TYPE value)
{
    return value;
}

// Combines two cells into one by plain addition. `merge_seed` is not used by
// this variant. The sum wraps around on overflow, as unsigned arithmetic
// does in C.
static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value)(CMS_CELL_TYPE v1, CMS_CELL_TYPE v2, uint32_t merge_seed)
{
    return v1 + v2;
}

#undef CMS_TYPE
#undef CMS_TYPE_STRING
#undef CMS_CELL_TYPE

#endif /* CMS64_CONSERVATIVE_H_ */
7 changes: 6 additions & 1 deletion cbounter/cms_cmodule.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ static inline uint32_t rand_32b()
return r;
}

#include "cms_conservative.c"
#include "cms_conservative.h"
#include "cms64_conservative.h"
#include "cms_log8.c"
#include "cms_log1024.c"
#include <time.h>
Expand Down Expand Up @@ -50,6 +51,7 @@ PyMODINIT_FUNC initbounter_cmsc(void)
{
PyObject* m;
if (PyType_Ready(&CMS_ConservativeType) < 0
|| PyType_Ready(&CMS64_ConservativeType) < 0
|| PyType_Ready(&CMS_Log8Type) < 0
|| PyType_Ready(&CMS_Log1024Type) < 0) {

Expand All @@ -76,6 +78,9 @@ PyMODINIT_FUNC initbounter_cmsc(void)
Py_INCREF(&CMS_ConservativeType);
PyModule_AddObject(m, "CMS_Conservative", (PyObject *)&CMS_ConservativeType);

Py_INCREF(&CMS64_ConservativeType);
PyModule_AddObject(m, "CMS64_Conservative", (PyObject *)&CMS64_ConservativeType);

srand(time(NULL));

Py_INCREF(&CMS_Log8Type);
Expand Down
3 changes: 2 additions & 1 deletion cbounter/cms_common.c → cbounter/cms_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,9 @@ CMS_VARIANT(_increment_obj)(CMS_TYPE *self, char *data, Py_ssize_t dataLength, l
}

CMS_CELL_TYPE result = min_value;
for (; increment > 0; increment--)
for (; increment > 0; increment--) {
result += CMS_VARIANT(should_inc)(result);
}

if (result > min_value)
{
Expand Down
13 changes: 11 additions & 2 deletions cbounter/cms_conservative.c → cbounter/cms_conservative.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,14 @@
// This code is distributed under the terms and conditions
// from the MIT License (MIT).

#ifndef _CMS_CONSERVATIE_H_
#define _CMS_CONSERVATIE_H_

#define CMS_TYPE CMS_Conservative
#define CMS_TYPE_STRING "CMS_Conservative"
#define CMS_CELL_TYPE uint32_t

#include "cms_common.c"
#include "cms_common.h"

static inline int CMS_VARIANT(should_inc)(CMS_CELL_TYPE value)
{
Expand All @@ -21,7 +24,13 @@ static inline long long CMS_VARIANT(decode)(CMS_CELL_TYPE value)
return value;
}

static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value) (CMS_CELL_TYPE v1, CMS_CELL_TYPE v2, uint32_t merge_seed)
static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value)(CMS_CELL_TYPE v1, CMS_CELL_TYPE v2, uint32_t merge_seed)
{
return v1 + v2;
}

#undef CMS_TYPE
#undef CMS_TYPE_STRING
#undef CMS_CELL_TYPE

#endif /* _CMS_CONSERVATIE_H_ */
13 changes: 9 additions & 4 deletions cbounter/cms_log1024.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,16 @@
#define CMS_TYPE_STRING "CMS_Log1024"
#define CMS_CELL_TYPE uint16_t

#include "cms_common.c"
#include "cms_common.h"

static inline int CMS_VARIANT(should_inc)(CMS_CELL_TYPE value)
{
if (value >= 2048)
{
uint8_t shift = 33 - (value >> 10);
uint32_t mask = 0xFFFFFFFF >> shift;
if (mask & rand_32b()) return 0;
if (mask & rand_32b())
return 0;
}
return 1;
}
Expand All @@ -30,7 +31,7 @@ static inline long long CMS_VARIANT(decode)(CMS_CELL_TYPE value)
return (1024 + (value & 1023)) << ((value >> 10) - 1);
}

static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value) (CMS_CELL_TYPE v1, CMS_CELL_TYPE v2, uint32_t merge_seed)
static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value)(CMS_CELL_TYPE v1, CMS_CELL_TYPE v2, uint32_t merge_seed)
{
long long decoded = CMS_VARIANT(decode)(v1);
decoded += CMS_VARIANT(decode)(v2);
Expand All @@ -55,8 +56,12 @@ static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value) (CMS_CELL_TYPE v1, CMS_CEL
uint32_t mask = 0xFFFFFFFF >> shift;

uint32_t r;
MurmurHash3_x86_32 ((void *) &decoded, 8, merge_seed, (void *) &r);
MurmurHash3_x86_32((void *)&decoded, 8, merge_seed, (void *)&r);
uint32_t remainder = mask & decoded;

return (log_result << 10) + (h & 1023) + ((mask & r) < remainder);
}

#undef CMS_TYPE
#undef CMS_TYPE_STRING
#undef CMS_CELL_TYPE
13 changes: 9 additions & 4 deletions cbounter/cms_log8.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,16 @@
#define CMS_TYPE_STRING "CMS_Log8"
#define CMS_CELL_TYPE uint8_t

#include "cms_common.c"
#include "cms_common.h"

static inline int CMS_VARIANT(should_inc)(CMS_CELL_TYPE value)
{
if (value >= 16)
{
uint8_t shift = 33 - (value >> 3);
uint32_t mask = 0xFFFFFFFF >> shift;
if (mask & rand_32b()) return 0;
if (mask & rand_32b())
return 0;
}
return 1;
}
Expand All @@ -32,7 +33,7 @@ static inline long long CMS_VARIANT(decode)(CMS_CELL_TYPE value)

#include <stdio.h>

static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value) (CMS_CELL_TYPE v1, CMS_CELL_TYPE v2, uint32_t merge_seed)
static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value)(CMS_CELL_TYPE v1, CMS_CELL_TYPE v2, uint32_t merge_seed)
{
long long decoded = CMS_VARIANT(decode)(v1);
decoded += CMS_VARIANT(decode)(v2);
Expand All @@ -57,8 +58,12 @@ static inline CMS_CELL_TYPE CMS_VARIANT(_merge_value) (CMS_CELL_TYPE v1, CMS_CEL
uint32_t mask = 0xFFFFFFFF >> shift;

uint32_t r;
MurmurHash3_x86_32 ((void *) &decoded, 8, merge_seed, (void *) &r);
MurmurHash3_x86_32((void *)&decoded, 8, merge_seed, (void *)&r);
uint32_t remainder = mask & decoded;

return (log_result << 3) + (h & 7) + ((mask & r) < remainder);
}

#undef CMS_TYPE
#undef CMS_TYPE_STRING
#undef CMS_CELL_TYPE