Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HLL coverage improvements #738

Merged
merged 12 commits into from
Feb 25, 2015
7 changes: 7 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
2015-02-24 Luiz Irber <irberlui@msu.edu>

* khmer/_khmermodule.cc: expose HLL internals as read-only attributes.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

newline please

* lib/hllcounter.{cc,hh}: simplify error checking, add getters for HLL.
* tests/test_hll.py: add test cases for increasing coverage, also fix
some of the previous ones using the new HLL read-only attributes.

2015-02-24 Luiz Irber <irberlui@msu.edu>

* khmer/_khmermodule.cc: Fix coding style violations.
Expand Down
14 changes: 14 additions & 0 deletions khmer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,20 @@ def __new__(cls, k, starting_size, n_tables):


class HLLCounter(_HLLCounter):
"""
A HyperLogLog counter is a probabilistic data structure specialized on
cardinality estimation.
There is a precision/memory consumption trade-off: error rate determines
how much memory is consumed.

# Creating a new HLLCounter:

>>> khmer.HLLCounter(error_rate, ksize)

where the default values are:
- error_rate: 0.01
- ksize: 20
"""

def __len__(self):
return self.estimate_cardinality()
151 changes: 139 additions & 12 deletions khmer/_khmermodule.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4339,24 +4339,17 @@ static PyObject* khmer_hllcounter_new(PyTypeObject * type, PyObject * args,
self = (khmer_KHLLCounter_Object *)type->tp_alloc(type, 0);

if (self != NULL) {
double error_rate = 0;
WordLength ksize = 0;
double error_rate = 0.01;
WordLength ksize = 20;

if (!PyArg_ParseTuple(args, "db", &error_rate, &ksize)) {
if (!PyArg_ParseTuple(args, "|db", &error_rate, &ksize)) {
Py_DECREF(self);
return NULL;
}

if ((error_rate < 0) || (error_rate > 1.0)) {
Py_DECREF(self);
PyErr_SetString(PyExc_ValueError,
"Error rate should be between 0.0 and 1.0");
return NULL;
}

try {
self->hllcounter = new HLLCounter(error_rate, ksize);
} catch (khmer_exception &e) {
} catch (InvalidValue &e) {
Py_DECREF(self);
PyErr_SetString(PyExc_ValueError, e.what());
return NULL;
Expand Down Expand Up @@ -4455,6 +4448,108 @@ static PyObject * hllcounter_consume_fasta(khmer_KHLLCounter_Object * me,
return Py_BuildValue("IK", total_reads, n_consumed);
}

static
PyObject *
hllcounter_get_erate(khmer_KHLLCounter_Object * me)
{
return PyFloat_FromDouble(me->hllcounter->get_erate());
}

static
PyObject *
hllcounter_get_ksize(khmer_KHLLCounter_Object * me)
{
return PyLong_FromLong(me->hllcounter->get_ksize());
}

static
int
hllcounter_set_ksize(khmer_KHLLCounter_Object * me, PyObject *value,
void *closure)
{
if (value == NULL) {
PyErr_SetString(PyExc_TypeError, "Cannot delete attribute");
return -1;
}

long ksize = 0;
if (PyLong_Check(value)) {
ksize = PyLong_AsLong(value);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No code coverage for line 4477

} else if (PyInt_Check(value)) {
ksize = PyInt_AsLong(value);
} else {
PyErr_SetString(PyExc_TypeError,
"Please use an integer value for k-mer size");
return -1;
}

if (ksize <= 0) {
PyErr_SetString(PyExc_ValueError, "Please set k-mer size to a value "
"greater than zero");
return -1;
}

try {
me->hllcounter->set_ksize(ksize);
} catch (ReadOnlyAttribute &e) {
PyErr_SetString(PyExc_AttributeError, e.what());
return -1;
}

return 0;
}

static
int
hllcounter_set_erate(khmer_KHLLCounter_Object * me, PyObject *value,
void *closure)
{
if (value == NULL) {
PyErr_SetString(PyExc_TypeError, "Cannot delete attribute");
return -1;
}

if (!PyFloat_Check(value)) {
PyErr_SetString(PyExc_TypeError,
"Please use a float value for k-mer size");
return -1;
}

double erate = PyFloat_AsDouble(value);
try {
me->hllcounter->set_erate(erate);
} catch (InvalidValue &e) {
PyErr_SetString(PyExc_ValueError, e.what());
return -1;
} catch (ReadOnlyAttribute &e) {
PyErr_SetString(PyExc_AttributeError, e.what());
return -1;
}

return 0;
}

static
PyObject *
hllcounter_getalpha(khmer_KHLLCounter_Object * me)
{
return PyFloat_FromDouble(me->hllcounter->get_alpha());
}

static
PyObject *
hllcounter_getcounters(khmer_KHLLCounter_Object * me)
{
std::vector<int> counters = me->hllcounter->get_M();

PyObject * x = PyList_New(counters.size());
for (size_t i = 0; i < counters.size(); i++) {
PyList_SET_ITEM(x, i, PyLong_FromLong(counters[i]));
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was really tempted to use the buffer interface, but I think it is better to do it when implementing #714

}

return x;
}

static PyMethodDef khmer_hllcounter_methods[] = {
{
"add", (PyCFunction)hllcounter_add,
Expand All @@ -4480,6 +4575,38 @@ static PyMethodDef khmer_hllcounter_methods[] = {
{NULL} /* Sentinel */
};

static PyGetSetDef khmer_hllcounter_getseters[] = {
{
(char *)"alpha",
(getter)hllcounter_getalpha, NULL,
(char *)"alpha constant for this HLL counter.",
NULL
},
{
(char *)"error_rate",
(getter)hllcounter_get_erate, (setter)hllcounter_set_erate,
(char *)"Error rate for this HLL counter. "
"Can be changed prior to first counting, but becomes read-only after "
"that (raising AttributeError)",
NULL
},
{
(char *)"ksize",
(getter)hllcounter_get_ksize, (setter)hllcounter_set_ksize,
(char *)"k-mer size for this HLL counter."
"Can be changed prior to first counting, but becomes read-only after "
"that (raising AttributeError)",
NULL
},
{
(char *)"counters",
(getter)hllcounter_getcounters, NULL,
(char *)"Read-only internal counters.",
NULL
},
{NULL} /* Sentinel */
};
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

khmer/_khmermodule.cc:4611:1: warning: deprecated conversion from string constant to ‘char*’ [-Wwrite-strings] ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, I don't really get where is it coming from. Is it from the sentinel?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I get multiple warnings for the same line number so I'm guessing it is all the conversions within the code block.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I get multiple warnings for the same line number so I'm guessing it is all
the conversions within the code block.

On Tue, Feb 24, 2015, 19:06 Luiz Irber notifications@github.com wrote:

In khmer/_khmermodule.cc
#738 (comment):

  • {
  •    (char const *)"ksize",
    
  •    (getter)hllcounter_get_ksize, (setter)hllcounter_set_ksize,
    
  •    (char const *)"k-mer size for this HLL counter."
    
  •    "Can be changed prior to first counting, but becomes read-only after "
    
  •    "that (raising AttributeError)",
    
  •    NULL
    
  • },
  • {
  •    (char const *)"counters",
    
  •    (getter)hllcounter_getcounters, NULL,
    
  •    (char const *)"Read-only internal counters.",
    
  •    NULL
    
  • },
  • {NULL} /* Sentinel */
    +};

So, I don't really get where is it coming from. Is it from the sentinel?


Reply to this email directly or view it on GitHub
https://github.com/ged-lab/khmer/pull/738/files#r25306902.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Found it, shouldn't have used const.


static PyTypeObject khmer_KHLLCounter_Type = {
PyVarObject_HEAD_INIT(NULL, 0)
"_khmer.KHLLCounter", /* tp_name */
Expand Down Expand Up @@ -4510,7 +4637,7 @@ static PyTypeObject khmer_KHLLCounter_Type = {
0, /* tp_iternext */
khmer_hllcounter_methods, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
khmer_hllcounter_getseters, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
Expand Down
58 changes: 44 additions & 14 deletions lib/hllcounter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,20 +32,16 @@ std::map<int, std::vector<double> > rawEstimateData;
std::map<int, std::vector<double> > biasData;


double get_alpha(const int p)
double calc_alpha(const int p)
{
if ((p < 4) or (p > 16)) {
double valid_lower_bound = 1.04 / std::sqrt(std::pow(static_cast<float>(2),
17));
double valid_upper_bound = 1.04 / std::sqrt(std::pow(static_cast<float>(2), 3));

std::stringstream message;
if (p < 4) {
message << "Max error should be smaller than " << valid_upper_bound;
} else {
message << "Min error should be greater than " << valid_lower_bound;
}
throw khmer_exception(message.str().c_str());
if (p < 4) {
// ceil(log2((1.04 / x) ^ 2)) = 4, solve for x
throw InvalidValue("Please set error rate to a value "
"smaller than 0.367696");
} else if (p > 16) {
// ceil(log2((1.04 / x) ^ 2)) = 16, solve for x
throw InvalidValue("Please set error rate to a value "
"greater than 0.0040624");
}

/*
Expand Down Expand Up @@ -237,6 +233,10 @@ int get_rho(HashIntoType w, int max_width)

HLLCounter::HLLCounter(double error_rate, WordLength ksize)
{
if (error_rate < 0) {
throw InvalidValue("Please set error rate to a value "
"greater than zero");
}
int p = ceil(log2(pow(1.04 / error_rate, 2)));
this->init(p, ksize);
}
Expand All @@ -248,7 +248,7 @@ HLLCounter::HLLCounter(int p, WordLength ksize)

void HLLCounter::init(int p, WordLength ksize)
{
this->alpha = get_alpha(p);
this->alpha = calc_alpha(p);
this->p = p;
this->_ksize = ksize;
this->m = 1 << p;
Expand All @@ -259,6 +259,36 @@ void HLLCounter::init(int p, WordLength ksize)
init_bias_data();
}

double HLLCounter::get_erate()
{
return 1.04 / sqrt(this->m);
}

void HLLCounter::set_erate(double error_rate)
{
if (count(this->M.begin(), this->M.end(), 0) != this->m) {
throw ReadOnlyAttribute("You can only change error rate prior to "
"first counting");
}

if (error_rate < 0) {
throw InvalidValue("Please set error rate to a value "
"greater than zero");
}
int p = ceil(log2(pow(1.04 / error_rate, 2)));
this->init(p, this->_ksize);
}

void HLLCounter::set_ksize(WordLength new_ksize)
{
if (count(this->M.begin(), this->M.end(), 0) != this->m) {
throw ReadOnlyAttribute("You can only change k-mer size prior to "
"first counting");
}

this->init(this->p, new_ksize);
}

double HLLCounter::_Ep()
{
double sum = accumulate(this->M.begin(), this->M.end(), 0.0, ep_sum);
Expand Down
24 changes: 24 additions & 0 deletions lib/hllcounter.hh
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,30 @@ public:
HashIntoType estimate_cardinality();
void merge(HLLCounter &);
virtual ~HLLCounter() {}

double get_alpha()
{
return alpha;
}
int get_p()
{
return p;
}
int get_m()
{
return m;
}
void set_ksize(WordLength new_ksize);
int get_ksize()
{
return _ksize;
}
std::vector<int> get_M()
{
return M;
}
double get_erate();
void set_erate(double new_erate);
private:
double _Ep();
double alpha;
Expand Down
25 changes: 25 additions & 0 deletions lib/khmer_exception.hh
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,31 @@ public:
StreamReadError(const char * msg) : khmer_file_exception(msg) {}
};


///
// An exception for invalid arguments to functions
//

class InvalidValue : public khmer_exception
{
public:
explicit InvalidValue(const char * msg) : khmer_exception(msg) { }
explicit InvalidValue(const std::string& msg)
: khmer_exception(msg) { }
};

///
// An exception for trying to change a read-only attributes
//

class ReadOnlyAttribute : public khmer_exception
{
public:
explicit ReadOnlyAttribute(const char * msg) : khmer_exception(msg) { }
explicit ReadOnlyAttribute(const std::string& msg)
: khmer_exception(msg) { }
};

}

#endif // KHMER_EXCEPTION_HH
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def check_for_openmp():
BUILD_DEPENDS = []
BUILD_DEPENDS.extend(path_join("lib", bn + ".hh") for bn in [
"khmer", "kmer_hash", "hashtable", "counting", "hashbits", "labelhash",
"hllcounter"])
"hllcounter", "khmer_exception", "read_aligner", "subset", "read_parsers"])

SOURCES = ["khmer/_khmermodule.cc"]
SOURCES.extend(path_join("lib", bn + ".cc") for bn in [
Expand Down
Loading