Skip to content

Commit

Permalink
Merge pull request dib-lab#738 from ged-lab/tests/hll_coverage
Browse files Browse the repository at this point in the history
HLL coverage improvements
  • Loading branch information
mr-c committed Feb 25, 2015
2 parents 3ed54b8 + 9b81a5a commit 4ffae39
Show file tree
Hide file tree
Showing 8 changed files with 399 additions and 39 deletions.
7 changes: 7 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,13 @@
* sandbox/unique-kmers.py: add call to 'info' with 'hll' in the
algorithms list.

2015-02-24 Luiz Irber <irberlui@msu.edu>

* khmer/_khmermodule.cc: expose HLL internals as read-only attributes.
* lib/hllcounter.{cc,hh}: simplify error checking, add getters for HLL.
* tests/test_hll.py: add test cases for increasing coverage, also fix
some of the previous ones using the new HLL read-only attributes.

2015-02-24 Luiz Irber <irberlui@msu.edu>

* khmer/_khmermodule.cc: Fix coding style violations.
Expand Down
14 changes: 14 additions & 0 deletions khmer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,20 @@ def __new__(cls, k, starting_size, n_tables):


class HLLCounter(_HLLCounter):
"""
A HyperLogLog counter is a probabilistic data structure specialized on
cardinality estimation.
There is a precision/memory consumption trade-off: error rate determines
how much memory is consumed.
# Creating a new HLLCounter:
>>> khmer.HLLCounter(error_rate, ksize)
where the default values are:
- error_rate: 0.01
- ksize: 20
"""

def __len__(self):
return self.estimate_cardinality()
151 changes: 139 additions & 12 deletions khmer/_khmermodule.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4339,24 +4339,17 @@ static PyObject* khmer_hllcounter_new(PyTypeObject * type, PyObject * args,
self = (khmer_KHLLCounter_Object *)type->tp_alloc(type, 0);

if (self != NULL) {
double error_rate = 0;
WordLength ksize = 0;
double error_rate = 0.01;
WordLength ksize = 20;

if (!PyArg_ParseTuple(args, "db", &error_rate, &ksize)) {
if (!PyArg_ParseTuple(args, "|db", &error_rate, &ksize)) {
Py_DECREF(self);
return NULL;
}

if ((error_rate < 0) || (error_rate > 1.0)) {
Py_DECREF(self);
PyErr_SetString(PyExc_ValueError,
"Error rate should be between 0.0 and 1.0");
return NULL;
}

try {
self->hllcounter = new HLLCounter(error_rate, ksize);
} catch (khmer_exception &e) {
} catch (InvalidValue &e) {
Py_DECREF(self);
PyErr_SetString(PyExc_ValueError, e.what());
return NULL;
Expand Down Expand Up @@ -4455,6 +4448,108 @@ static PyObject * hllcounter_consume_fasta(khmer_KHLLCounter_Object * me,
return Py_BuildValue("IK", total_reads, n_consumed);
}

static
PyObject *
hllcounter_get_erate(khmer_KHLLCounter_Object * me)
{
return PyFloat_FromDouble(me->hllcounter->get_erate());
}

static
PyObject *
hllcounter_get_ksize(khmer_KHLLCounter_Object * me)
{
return PyLong_FromLong(me->hllcounter->get_ksize());
}

static
int
hllcounter_set_ksize(khmer_KHLLCounter_Object * me, PyObject *value,
void *closure)
{
if (value == NULL) {
PyErr_SetString(PyExc_TypeError, "Cannot delete attribute");
return -1;
}

long ksize = 0;
if (PyLong_Check(value)) {
ksize = PyLong_AsLong(value);
} else if (PyInt_Check(value)) {
ksize = PyInt_AsLong(value);
} else {
PyErr_SetString(PyExc_TypeError,
"Please use an integer value for k-mer size");
return -1;
}

if (ksize <= 0) {
PyErr_SetString(PyExc_ValueError, "Please set k-mer size to a value "
"greater than zero");
return -1;
}

try {
me->hllcounter->set_ksize(ksize);
} catch (ReadOnlyAttribute &e) {
PyErr_SetString(PyExc_AttributeError, e.what());
return -1;
}

return 0;
}

static
int
hllcounter_set_erate(khmer_KHLLCounter_Object * me, PyObject *value,
void *closure)
{
if (value == NULL) {
PyErr_SetString(PyExc_TypeError, "Cannot delete attribute");
return -1;
}

if (!PyFloat_Check(value)) {
PyErr_SetString(PyExc_TypeError,
"Please use a float value for k-mer size");
return -1;
}

double erate = PyFloat_AsDouble(value);
try {
me->hllcounter->set_erate(erate);
} catch (InvalidValue &e) {
PyErr_SetString(PyExc_ValueError, e.what());
return -1;
} catch (ReadOnlyAttribute &e) {
PyErr_SetString(PyExc_AttributeError, e.what());
return -1;
}

return 0;
}

static
PyObject *
hllcounter_getalpha(khmer_KHLLCounter_Object * me)
{
return PyFloat_FromDouble(me->hllcounter->get_alpha());
}

static
PyObject *
hllcounter_getcounters(khmer_KHLLCounter_Object * me)
{
std::vector<int> counters = me->hllcounter->get_M();

PyObject * x = PyList_New(counters.size());
for (size_t i = 0; i < counters.size(); i++) {
PyList_SET_ITEM(x, i, PyLong_FromLong(counters[i]));
}

return x;
}

static PyMethodDef khmer_hllcounter_methods[] = {
{
"add", (PyCFunction)hllcounter_add,
Expand All @@ -4480,6 +4575,38 @@ static PyMethodDef khmer_hllcounter_methods[] = {
{NULL} /* Sentinel */
};

static PyGetSetDef khmer_hllcounter_getseters[] = {
{
(char *)"alpha",
(getter)hllcounter_getalpha, NULL,
(char *)"alpha constant for this HLL counter.",
NULL
},
{
(char *)"error_rate",
(getter)hllcounter_get_erate, (setter)hllcounter_set_erate,
(char *)"Error rate for this HLL counter. "
"Can be changed prior to first counting, but becomes read-only after "
"that (raising AttributeError)",
NULL
},
{
(char *)"ksize",
(getter)hllcounter_get_ksize, (setter)hllcounter_set_ksize,
(char *)"k-mer size for this HLL counter."
"Can be changed prior to first counting, but becomes read-only after "
"that (raising AttributeError)",
NULL
},
{
(char *)"counters",
(getter)hllcounter_getcounters, NULL,
(char *)"Read-only internal counters.",
NULL
},
{NULL} /* Sentinel */
};

static PyTypeObject khmer_KHLLCounter_Type = {
PyVarObject_HEAD_INIT(NULL, 0)
"_khmer.KHLLCounter", /* tp_name */
Expand Down Expand Up @@ -4510,7 +4637,7 @@ static PyTypeObject khmer_KHLLCounter_Type = {
0, /* tp_iternext */
khmer_hllcounter_methods, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
khmer_hllcounter_getseters, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
Expand Down
58 changes: 44 additions & 14 deletions lib/hllcounter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,20 +32,16 @@ std::map<int, std::vector<double> > rawEstimateData;
std::map<int, std::vector<double> > biasData;


double get_alpha(const int p)
double calc_alpha(const int p)
{
if ((p < 4) or (p > 16)) {
double valid_lower_bound = 1.04 / std::sqrt(std::pow(static_cast<float>(2),
17));
double valid_upper_bound = 1.04 / std::sqrt(std::pow(static_cast<float>(2), 3));

std::stringstream message;
if (p < 4) {
message << "Max error should be smaller than " << valid_upper_bound;
} else {
message << "Min error should be greater than " << valid_lower_bound;
}
throw khmer_exception(message.str().c_str());
if (p < 4) {
// ceil(log2((1.04 / x) ^ 2)) = 4, solve for x
throw InvalidValue("Please set error rate to a value "
"smaller than 0.367696");
} else if (p > 16) {
// ceil(log2((1.04 / x) ^ 2)) = 16, solve for x
throw InvalidValue("Please set error rate to a value "
"greater than 0.0040624");
}

/*
Expand Down Expand Up @@ -237,6 +233,10 @@ int get_rho(HashIntoType w, int max_width)

HLLCounter::HLLCounter(double error_rate, WordLength ksize)
{
if (error_rate < 0) {
throw InvalidValue("Please set error rate to a value "
"greater than zero");
}
int p = ceil(log2(pow(1.04 / error_rate, 2)));
this->init(p, ksize);
}
Expand All @@ -248,7 +248,7 @@ HLLCounter::HLLCounter(int p, WordLength ksize)

void HLLCounter::init(int p, WordLength ksize)
{
this->alpha = get_alpha(p);
this->alpha = calc_alpha(p);
this->p = p;
this->_ksize = ksize;
this->m = 1 << p;
Expand All @@ -259,6 +259,36 @@ void HLLCounter::init(int p, WordLength ksize)
init_bias_data();
}

double HLLCounter::get_erate()
{
return 1.04 / sqrt(this->m);
}

void HLLCounter::set_erate(double error_rate)
{
if (count(this->M.begin(), this->M.end(), 0) != this->m) {
throw ReadOnlyAttribute("You can only change error rate prior to "
"first counting");
}

if (error_rate < 0) {
throw InvalidValue("Please set error rate to a value "
"greater than zero");
}
int p = ceil(log2(pow(1.04 / error_rate, 2)));
this->init(p, this->_ksize);
}

void HLLCounter::set_ksize(WordLength new_ksize)
{
if (count(this->M.begin(), this->M.end(), 0) != this->m) {
throw ReadOnlyAttribute("You can only change k-mer size prior to "
"first counting");
}

this->init(this->p, new_ksize);
}

double HLLCounter::_Ep()
{
double sum = accumulate(this->M.begin(), this->M.end(), 0.0, ep_sum);
Expand Down
24 changes: 24 additions & 0 deletions lib/hllcounter.hh
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,30 @@ public:
HashIntoType estimate_cardinality();
void merge(HLLCounter &);
virtual ~HLLCounter() {}

double get_alpha()
{
return alpha;
}
int get_p()
{
return p;
}
int get_m()
{
return m;
}
void set_ksize(WordLength new_ksize);
int get_ksize()
{
return _ksize;
}
std::vector<int> get_M()
{
return M;
}
double get_erate();
void set_erate(double new_erate);
private:
double _Ep();
double alpha;
Expand Down
25 changes: 25 additions & 0 deletions lib/khmer_exception.hh
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,31 @@ public:
StreamReadError(const char * msg) : khmer_file_exception(msg) {}
};


///
// An exception for invalid arguments to functions
//

class InvalidValue : public khmer_exception
{
public:
explicit InvalidValue(const char * msg) : khmer_exception(msg) { }
explicit InvalidValue(const std::string& msg)
: khmer_exception(msg) { }
};

///
// An exception for trying to change a read-only attributes
//

class ReadOnlyAttribute : public khmer_exception
{
public:
explicit ReadOnlyAttribute(const char * msg) : khmer_exception(msg) { }
explicit ReadOnlyAttribute(const std::string& msg)
: khmer_exception(msg) { }
};

}

#endif // KHMER_EXCEPTION_HH
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def check_for_openmp():
BUILD_DEPENDS = []
BUILD_DEPENDS.extend(path_join("lib", bn + ".hh") for bn in [
"khmer", "kmer_hash", "hashtable", "counting", "hashbits", "labelhash",
"hllcounter"])
"hllcounter", "khmer_exception", "read_aligner", "subset", "read_parsers"])

SOURCES = ["khmer/_khmermodule.cc"]
SOURCES.extend(path_join("lib", bn + ".cc") for bn in [
Expand Down
Loading

0 comments on commit 4ffae39

Please sign in to comment.