diff --git a/ChangeLog b/ChangeLog index 8d4c8d361c..4f09736039 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,24 @@ +2016-11-15 Titus Brown * khmer/__init__.py,tests/test_functions.py: Fix get_n_primes_near_x to diff --git a/Makefile b/Makefile index 118b396d37..b8b6207755 100644 --- a/Makefile +++ b/Makefile @@ -37,7 +37,7 @@ # `SHELL=bash` Will break Titus's laptop, so don't use BASH-isms like # `[[` conditional expressions. -CPPSOURCES=$(wildcard lib/*.cc lib/*.hh khmer/_khmer.cc) setup.py +CPPSOURCES=$(wildcard lib/*.cc lib/*.hh khmer/_khmer.cc khmer/*.hh) setup.py PYSOURCES=$(filter-out khmer/_version.py, \ $(wildcard khmer/*.py scripts/*.py oxli/*.py) ) SOURCES=$(PYSOURCES) $(CPPSOURCES) setup.py diff --git a/examples/c++-api/count-demo.cc b/examples/c++-api/count-demo.cc index 3b83e85637..be3a85749c 100644 --- a/examples/c++-api/count-demo.cc +++ b/examples/c++-api/count-demo.cc @@ -4,7 +4,7 @@ #include #include #include "khmer.hh" -#include "counting.hh" +#include "hashtable.hh" using namespace khmer; @@ -22,7 +22,7 @@ int main() std::vector tablesize; tablesize.push_back(pow(4, ksize)); - CountingHash ktable(ksize, tablesize); + Counttable ktable(ksize, tablesize); ktable.consume_string("ATGGCGATGGCAAGTAGGACCCAGATGGACCAAAG"); diff --git a/khmer/__init__.py b/khmer/__init__.py index 8e398172a8..5788c2660d 100644 --- a/khmer/__init__.py +++ b/khmer/__init__.py @@ -41,8 +41,10 @@ import json from khmer._khmer import Countgraph as _Countgraph +from khmer._khmer import Counttable as _Counttable from khmer._khmer import GraphLabels as _GraphLabels from khmer._khmer import Nodegraph as _Nodegraph +from khmer._khmer import Nodetable as _Nodetable from khmer._khmer import HLLCounter as _HLLCounter from khmer._khmer import ReadAligner as _ReadAligner from khmer._khmer import LinearAssembler @@ -278,6 +280,15 @@ def __new__(cls, k, starting_size, n_tables): return countgraph +class Counttable(_Counttable): + + def __new__(cls, k, starting_size, n_tables): + 
primes = get_n_primes_near_x(n_tables, starting_size) + counttable = _Counttable.__new__(cls, k, primes) + counttable.primes = primes + return counttable + + class GraphLabels(_GraphLabels): def __new__(cls, k, starting_size, n_tables): @@ -306,6 +317,15 @@ def __new__(cls, k, starting_size, n_tables): return nodegraph +class Nodetable(_Nodetable): + + def __new__(cls, k, starting_size, n_tables): + primes = get_n_primes_near_x(n_tables, starting_size) + nodetable = _Nodetable.__new__(cls, k, primes) + nodetable.primes = primes + return nodetable + + class HLLCounter(_HLLCounter): """HyperLogLog counter. diff --git a/khmer/_cpy_counttable.hh b/khmer/_cpy_counttable.hh new file mode 100644 index 0000000000..429e95e5e5 --- /dev/null +++ b/khmer/_cpy_counttable.hh @@ -0,0 +1,128 @@ +/* +This file is part of khmer, https://github.com/dib-lab/khmer/, and is +Copyright (C) 2010-2015, Michigan State University. +Copyright (C) 2015-2016, The Regents of the University of California. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the Michigan State University nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +LICENSE (END) + +Contact: khmer-project@idyll.org +*/ + +typedef struct { + khmer_KHashtable_Object khashtable; + Counttable * counttable; +} khmer_KCounttable_Object; + +static PyObject* khmer_counttable_new(PyTypeObject * type, PyObject * args, + PyObject * kwds); + +static PyTypeObject khmer_KCounttable_Type +CPYCHECKER_TYPE_OBJECT_FOR_TYPEDEF("khmer_KCounttable_Object") += { + PyVarObject_HEAD_INIT(NULL, 0) /* init & ob_size */ + "_khmer.Counttable", /* tp_name */ + sizeof(khmer_KCounttable_Object), /* tp_basicsize */ + 0, /* tp_itemsize */ + 0, /*tp_dealloc*/ + 0, /*tp_print*/ + 0, /*tp_getattr*/ + 0, /*tp_setattr*/ + 0, /*tp_compare*/ + 0, /*tp_repr*/ + 0, /*tp_as_number*/ + 0, /*tp_as_sequence*/ + 0, /*tp_as_mapping*/ + 0, /*tp_hash */ + 0, /*tp_call*/ + 0, /*tp_str*/ + 0, /*tp_getattro*/ + 0, /*tp_setattro*/ + 0, /*tp_as_buffer*/ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/ + "counttable object", /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + 0, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + khmer_counttable_new, /* tp_new */ +}; + + +// +// khmer_counttable_new +// + +static PyObject* khmer_counttable_new(PyTypeObject * type, PyObject * args, + PyObject * kwds) +{ 
+ khmer_KCounttable_Object * self; + + self = (khmer_KCounttable_Object *)type->tp_alloc(type, 0); + + if (self != NULL) { + WordLength k = 0; + PyListObject * sizes_list_o = NULL; + + if (!PyArg_ParseTuple(args, "bO!", &k, &PyList_Type, &sizes_list_o)) { + Py_DECREF(self); + return NULL; + } + + std::vector sizes; + if (!convert_Pytablesizes_to_vector(sizes_list_o, sizes)) { + Py_DECREF(self); + return NULL; + } + + try { + self->counttable = new Counttable(k, sizes); + } catch (std::bad_alloc &e) { + Py_DECREF(self); + return PyErr_NoMemory(); + } + self->khashtable.hashtable = + dynamic_cast(self->counttable); + } + + return (PyObject *) self; +} diff --git a/khmer/_cpy_hashgraph.hh b/khmer/_cpy_hashgraph.hh new file mode 100644 index 0000000000..3ac1173ec1 --- /dev/null +++ b/khmer/_cpy_hashgraph.hh @@ -0,0 +1,1587 @@ +/* +This file is part of khmer, https://github.com/dib-lab/khmer/, and is +Copyright (C) 2010-2015, Michigan State University. +Copyright (C) 2015-2016, The Regents of the University of California. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the Michigan State University nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +LICENSE (END) + +Contact: khmer-project@idyll.org +*/ + +static PyTypeObject khmer_KHashgraph_Type +CPYCHECKER_TYPE_OBJECT_FOR_TYPEDEF("khmer_KHashgraph_Object") += { + PyVarObject_HEAD_INIT(NULL, 0) /* init & ob_size */ + "_khmer.Hashgraph", /* tp_name */ + sizeof(khmer_KHashgraph_Object), /* tp_basicsize */ + 0, /* tp_itemsize */ + 0, /*tp_dealloc*/ + 0, /*tp_print*/ + 0, /*tp_getattr*/ + 0, /*tp_setattr*/ + 0, /*tp_compare*/ + 0, /*tp_repr*/ + 0, /*tp_as_number*/ + 0, /*tp_as_sequence*/ + 0, /*tp_as_mapping*/ + 0, /*tp_hash */ + 0, /*tp_call*/ + 0, /*tp_str*/ + 0, /*tp_getattro*/ + 0, /*tp_setattro*/ + 0, /*tp_as_buffer*/ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/ + "hashgraph object" /* tp_doc */ +}; + +// +// Method definitions +// + +static +PyObject * +hashgraph_find_high_degree_nodes(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + const char * long_str; + + if (!PyArg_ParseTuple(args, "s", &long_str)) { + return NULL; + } + + if (strlen(long_str) < hashgraph->ksize()) { + PyErr_SetString(PyExc_ValueError, + "string length must >= the hashgraph k-mer size"); + return NULL; + } + + SeenSet * hashes = new SeenSet; + hashgraph->find_high_degree_nodes(long_str, *hashes); + + khmer_HashSet_Object * o; + o = create_HashSet_Object(hashes, hashgraph->ksize()); + + return (PyObject *) o; +} + +static +PyObject * +hashgraph_neighbors(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph 
* hashgraph = me->hashgraph; + PyObject * val_obj; + + if (!PyArg_ParseTuple(args, "O", &val_obj)) { + return NULL; + } + + Kmer start_kmer; + if (!ht_convert_PyObject_to_Kmer(val_obj, start_kmer, hashgraph)) { + return NULL; + } + + KmerQueue node_q; + Traverser traverser(hashgraph); + + traverser.traverse(start_kmer, node_q); + + PyObject * x = PyList_New(node_q.size()); + if (x == NULL) { + return NULL; + } + + unsigned int i; + PyObject * value = nullptr; + for (i = 0; node_q.size() > 0; i++) { + const HashIntoType h = node_q.front(); + node_q.pop(); + convert_HashIntoType_to_PyObject(h, &value); + PyList_SET_ITEM(x, i, value); + } + + return x; +} + +static +PyObject * +hashgraph_traverse_linear_path(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + PyObject * val_o; + khmer_KNodegraph_Object * nodegraph_o = NULL; + khmer_HashSet_Object * hdn_o = NULL; + + if (!PyArg_ParseTuple(args, "OO!O!", &val_o, + &khmer_HashSet_Type, &hdn_o, + &khmer_KNodegraph_Type, &nodegraph_o)) { + return NULL; + } + Kmer start_kmer; + if (!ht_convert_PyObject_to_Kmer(val_o, start_kmer, hashgraph)) { + return NULL; + } + + SeenSet * adj = new SeenSet; + SeenSet * visited = new SeenSet; + unsigned int size = hashgraph->traverse_linear_path(start_kmer, + *adj, *visited, + *nodegraph_o->nodegraph, + *hdn_o->hashes); + + khmer_HashSet_Object * adj_o = create_HashSet_Object(adj, + hashgraph->ksize()); + khmer_HashSet_Object * visited_o = create_HashSet_Object(visited, + hashgraph->ksize()); + + PyObject * ret = Py_BuildValue("kOO", (unsigned long) size, + (PyObject *) adj_o, (PyObject *) visited_o); + Py_DECREF(adj_o); + Py_DECREF(visited_o); + + return ret; +} + +static +PyObject * +hashgraph_assemble_linear_path(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + PyObject * val_o; + khmer_KNodegraph_Object * nodegraph_o = NULL; + Nodegraph * stop_bf = NULL; + + if (!PyArg_ParseTuple(args, "O|O!", 
&val_o, + &khmer_KNodegraph_Type, &nodegraph_o)) { + return NULL; + } + + Kmer start_kmer; + if (!ht_convert_PyObject_to_Kmer(val_o, start_kmer, hashgraph)) { + return NULL; + } + + if (nodegraph_o) { + stop_bf = nodegraph_o->nodegraph; + } + LinearAssembler assembler(hashgraph); + + std::string contig = assembler.assemble(start_kmer, stop_bf); + + PyObject * ret = Py_BuildValue("s", contig.c_str()); + + return ret; +} + +static +PyObject * +hashgraph_n_tags(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + if (!PyArg_ParseTuple(args, "")) { + return NULL; + } + + return PyLong_FromSize_t(hashgraph->n_tags()); +} + +static +PyObject * +hashgraph_print_stop_tags(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + const char * filename = NULL; + + if (!PyArg_ParseTuple(args, "s", &filename)) { + return NULL; + } + + hashgraph->print_stop_tags(filename); + + Py_RETURN_NONE; +} + +static +PyObject * +hashgraph_print_tagset(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + const char * filename = NULL; + + if (!PyArg_ParseTuple(args, "s", &filename)) { + return NULL; + } + + hashgraph->print_tagset(filename); + + Py_RETURN_NONE; +} + +static +PyObject * +hashgraph_load_stop_tags(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + const char * filename = NULL; + PyObject * clear_tags_o = NULL; + + if (!PyArg_ParseTuple(args, "s|O", &filename, &clear_tags_o)) { + return NULL; + } + + bool clear_tags = true; + if (clear_tags_o && !PyObject_IsTrue(clear_tags_o)) { + clear_tags = false; + } + + + try { + hashgraph->load_stop_tags(filename, clear_tags); + } catch (khmer_file_exception &e) { + PyErr_SetString(PyExc_OSError, e.what()); + return NULL; + } + + Py_RETURN_NONE; +} + + +static +PyObject * +hashgraph_save_stop_tags(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph * hashgraph = 
me->hashgraph; + + const char * filename = NULL; + + if (!PyArg_ParseTuple(args, "s", &filename)) { + return NULL; + } + + try { + hashgraph->save_stop_tags(filename); + } catch (khmer_file_exception &e) { + PyErr_SetString(PyExc_OSError, e.what()); + return NULL; + } + + Py_RETURN_NONE; +} + +static PyObject * hashgraph_repartition_largest_partition( + khmer_KHashgraph_Object * me, + PyObject * args); + +static +PyObject * +hashgraph_calc_connected_graph_size(khmer_KHashgraph_Object * me, + PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + const char * _kmer; + unsigned int max_size = 0; + PyObject * break_on_circum_o = NULL; + if (!PyArg_ParseTuple(args, "s|IO", &_kmer, &max_size, &break_on_circum_o)) { + return NULL; + } + + bool break_on_circum = false; + if (break_on_circum_o && PyObject_IsTrue(break_on_circum_o)) { + break_on_circum = true; + } + + unsigned long long size = 0; + Kmer start_kmer = hashgraph->build_kmer(_kmer); + + Py_BEGIN_ALLOW_THREADS + KmerSet keeper; + hashgraph->calc_connected_graph_size(start_kmer, size, keeper, max_size, + break_on_circum); + Py_END_ALLOW_THREADS + + return PyLong_FromUnsignedLongLong(size); +} + +static +PyObject * +hashgraph_kmer_degree(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + const char * kmer_s = NULL; + + if (!PyArg_ParseTuple(args, "s", &kmer_s)) { + return NULL; + } + + return PyLong_FromLong(hashgraph->kmer_degree(kmer_s)); +} + +static +PyObject * +hashgraph_trim_on_stoptags(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + const char * seq = NULL; + + if (!PyArg_ParseTuple(args, "s", &seq)) { + return NULL; + } + + size_t trim_at; + Py_BEGIN_ALLOW_THREADS + + trim_at = hashgraph->trim_on_stoptags(seq); + + Py_END_ALLOW_THREADS; + + PyObject * trim_seq = PyUnicode_FromStringAndSize(seq, trim_at); + if (trim_seq == NULL) { + return NULL; + } + PyObject * ret = Py_BuildValue("Ok", trim_seq, (unsigned 
long) trim_at); + Py_DECREF(trim_seq); + + return ret; +} + +static +PyObject * +hashgraph_do_subset_partition(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + PyObject * start_kmer_obj; + PyObject * end_kmer_obj; + HashIntoType start_kmer, end_kmer; + PyObject * break_on_stop_tags_o = NULL; + PyObject * stop_big_traversals_o = NULL; + + if (!PyArg_ParseTuple(args, "|OOOO", &start_kmer_obj, &end_kmer_obj, + &break_on_stop_tags_o, + &stop_big_traversals_o)) { + return NULL; + } + if (!ht_convert_PyObject_to_HashIntoType(start_kmer_obj, start_kmer, + hashgraph)) { + return NULL; + } + if (!ht_convert_PyObject_to_HashIntoType(end_kmer_obj, end_kmer, + hashgraph)) { + return NULL; + } + + bool break_on_stop_tags = false; + if (break_on_stop_tags_o && PyObject_IsTrue(break_on_stop_tags_o)) { + break_on_stop_tags = true; + } + bool stop_big_traversals = false; + if (stop_big_traversals_o && PyObject_IsTrue(stop_big_traversals_o)) { + stop_big_traversals = true; + } + + SubsetPartition * subset_p = NULL; + try { + Py_BEGIN_ALLOW_THREADS + subset_p = new SubsetPartition(hashgraph); + subset_p->do_partition(start_kmer, end_kmer, break_on_stop_tags, + stop_big_traversals); + Py_END_ALLOW_THREADS + } catch (std::bad_alloc &e) { + return PyErr_NoMemory(); + } + + khmer_KSubsetPartition_Object * subset_obj = (khmer_KSubsetPartition_Object *)\ + PyObject_New(khmer_KSubsetPartition_Object, &khmer_KSubsetPartition_Type); + + if (subset_obj == NULL) { + delete subset_p; + return NULL; + } + + subset_obj->subset = subset_p; + + return (PyObject *) subset_obj; +} + + +static +PyObject * +hashgraph_merge_subset(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + khmer_KSubsetPartition_Object * subset_obj = NULL; + if (!PyArg_ParseTuple(args, "O!", &khmer_KSubsetPartition_Type, + &subset_obj)) { + return NULL; + } + + SubsetPartition * subset_p = subset_obj->subset; + + 
hashgraph->partition->merge(subset_p); + + Py_RETURN_NONE; +} + +static +PyObject * +hashgraph_merge_from_disk(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + const char * filename = NULL; + if (!PyArg_ParseTuple(args, "s", &filename)) { + return NULL; + } + + try { + hashgraph->partition->merge_from_disk(filename); + } catch (khmer_file_exception &e) { + PyErr_SetString(PyExc_OSError, e.what()); + return NULL; + } + + Py_RETURN_NONE; +} + +static +PyObject * +hashgraph_consume_fasta_and_tag_with_reads_parser(khmer_KHashgraph_Object * me, + PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + python::khmer_ReadParser_Object * rparser_obj = NULL; + + if (!PyArg_ParseTuple( args, "O!", &python::khmer_ReadParser_Type, + &rparser_obj)) { + return NULL; + } + + read_parsers:: IParser * rparser = rparser_obj-> parser; + + // call the C++ function, and trap signals => Python + const char *value_exception = NULL; + const char *file_exception = NULL; + unsigned long long n_consumed = 0; + unsigned int total_reads = 0; + std::string exc_string; + + Py_BEGIN_ALLOW_THREADS + try { + hashgraph->consume_fasta_and_tag(rparser, total_reads, n_consumed); + } catch (khmer_file_exception &exc) { + exc_string = exc.what(); + file_exception = exc_string.c_str(); + } catch (khmer_value_exception &exc) { + exc_string = exc.what(); + value_exception = exc_string.c_str(); + } + Py_END_ALLOW_THREADS + + if (file_exception != NULL) { + PyErr_SetString(PyExc_OSError, file_exception); + return NULL; + } + if (value_exception != NULL) { + PyErr_SetString(PyExc_ValueError, value_exception); + } + + return Py_BuildValue("IK", total_reads, n_consumed); +} + +static +PyObject * +hashgraph_consume_partitioned_fasta(khmer_KHashgraph_Object * me, + PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + const char * filename; + + if (!PyArg_ParseTuple(args, "s", &filename)) { + return NULL; + } + + // call the C++ function, and trap 
signals => Python + + unsigned long long n_consumed; + unsigned int total_reads; + + try { + hashgraph->consume_partitioned_fasta(filename, total_reads, n_consumed); + } catch (khmer_file_exception &exc) { + PyErr_SetString(PyExc_OSError, exc.what()); + return NULL; + } catch (khmer_value_exception &exc) { + PyErr_SetString(PyExc_ValueError, exc.what()); + return NULL; + } + + return Py_BuildValue("IK", total_reads, n_consumed); +} + +static +PyObject * +hashgraph_find_all_tags(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + const char * kmer_s = NULL; + + if (!PyArg_ParseTuple(args, "s", &kmer_s)) { + return NULL; + } + + if (strlen(kmer_s) != hashgraph->ksize()) { + PyErr_SetString( PyExc_ValueError, + "k-mer size must equal the k-mer size of the presence table"); + return NULL; + } + + pre_partition_info * ppi = NULL; + + Kmer kmer = hashgraph->build_kmer(kmer_s); + + Py_BEGIN_ALLOW_THREADS + + try { + ppi = new pre_partition_info(kmer); + } catch (std::bad_alloc &e) { + return PyErr_NoMemory(); + } + hashgraph->partition->find_all_tags(kmer, ppi->tagged_kmers, + hashgraph->all_tags); + hashgraph->add_kmer_to_tags(kmer); + + Py_END_ALLOW_THREADS + + khmer_PrePartitionInfo_Object * ppi_obj = (khmer_PrePartitionInfo_Object *) \ + PyObject_New(khmer_PrePartitionInfo_Object, &khmer_PrePartitionInfo_Type); + + ppi_obj->PrePartitionInfo = ppi; + + return (PyObject*)ppi_obj; +} + +static +PyObject * +hashgraph_assign_partition_id(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + khmer_PrePartitionInfo_Object * ppi_obj; + if (!PyArg_ParseTuple(args, "O!", &khmer_PrePartitionInfo_Type, &ppi_obj)) { + return NULL; + } + + pre_partition_info * ppi; + ppi = ppi_obj->PrePartitionInfo; + + PartitionID p; + p = hashgraph->partition->assign_partition_id(ppi->kmer, + ppi->tagged_kmers); + + return PyLong_FromLong(p); +} + +static +PyObject * +hashgraph_add_tag(khmer_KHashgraph_Object * 
me, PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + const char * kmer_s = NULL; + if (!PyArg_ParseTuple(args, "s", &kmer_s)) { + return NULL; + } + + HashIntoType kmer = hashgraph->hash_dna(kmer_s); + hashgraph->add_tag(kmer); + + Py_RETURN_NONE; +} + +static +PyObject * +hashgraph_add_stop_tag(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + const char * kmer_s = NULL; + if (!PyArg_ParseTuple(args, "s", &kmer_s)) { + return NULL; + } + + HashIntoType kmer = hashgraph->hash_dna(kmer_s); + hashgraph->add_stop_tag(kmer); + + Py_RETURN_NONE; +} + +static +PyObject * +hashgraph_get_stop_tags(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + if (!PyArg_ParseTuple(args, "")) { + return NULL; + } + + SeenSet::const_iterator si; + + PyObject * x = PyList_New(hashgraph->stop_tags.size()); + unsigned long long i = 0; + for (si = hashgraph->stop_tags.begin(); si != hashgraph->stop_tags.end(); + ++si) { + std::string s = hashgraph->unhash_dna(*si); + PyList_SET_ITEM(x, i, Py_BuildValue("s", s.c_str())); + i++; + } + + return x; +} + +static +PyObject * +hashgraph_get_tagset(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + if (!PyArg_ParseTuple(args, "")) { + return NULL; + } + + SeenSet::const_iterator si; + + PyObject * x = PyList_New(hashgraph->all_tags.size()); + unsigned long long i = 0; + for (si = hashgraph->all_tags.begin(); si != hashgraph->all_tags.end(); + ++si) { + std::string s = hashgraph->unhash_dna(*si); + PyList_SET_ITEM(x, i, Py_BuildValue("s", s.c_str())); + i++; + } + + return x; +} + +static +PyObject * +hashgraph_output_partitions(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + const char * filename = NULL; + const char * output = NULL; + PyObject * output_unassigned_o = NULL; + + if (!PyArg_ParseTuple(args, "ss|O", &filename, &output, + 
&output_unassigned_o)) { + return NULL; + } + + bool output_unassigned = false; + if (output_unassigned_o != NULL && PyObject_IsTrue(output_unassigned_o)) { + output_unassigned = true; + } + + size_t n_partitions = 0; + + try { + SubsetPartition * subset_p = hashgraph->partition; + n_partitions = subset_p->output_partitioned_file(filename, + output, + output_unassigned); + } catch (khmer_file_exception &e) { + PyErr_SetString(PyExc_OSError, e.what()); + return NULL; + } catch (khmer_value_exception &exc) { + PyErr_SetString(PyExc_ValueError, exc.what()); + return NULL; + } + + return PyLong_FromLong(n_partitions); +} + +static +PyObject * +hashgraph_save_partitionmap(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + const char * filename = NULL; + + if (!PyArg_ParseTuple(args, "s", &filename)) { + return NULL; + } + + try { + hashgraph->partition->save_partitionmap(filename); + } catch (khmer_file_exception &e) { + PyErr_SetString(PyExc_OSError, e.what()); + return NULL; + } + + Py_RETURN_NONE; +} + +static +PyObject * +hashgraph_load_partitionmap(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + const char * filename = NULL; + + if (!PyArg_ParseTuple(args, "s", &filename)) { + return NULL; + } + + try { + hashgraph->partition->load_partitionmap(filename); + } catch (khmer_file_exception &e) { + PyErr_SetString(PyExc_OSError, e.what()); + return NULL; + } + + Py_RETURN_NONE; +} + +static +PyObject * +hashgraph__validate_partitionmap(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + if (!PyArg_ParseTuple(args, "")) { + return NULL; + } + + hashgraph->partition->_validate_pmap(); + + Py_RETURN_NONE; +} + +static +PyObject * +hashgraph_count_partitions(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + if (!PyArg_ParseTuple(args, "")) { + return NULL; + } + + size_t n_partitions = 0, 
n_unassigned = 0; + hashgraph->partition->count_partitions(n_partitions, n_unassigned); + + return Py_BuildValue("nn", (Py_ssize_t) n_partitions, + (Py_ssize_t) n_unassigned); +} + +static +PyObject * +hashgraph_subset_count_partitions(khmer_KHashgraph_Object * me, PyObject * args) +{ + khmer_KSubsetPartition_Object * subset_obj = NULL; + + if (!PyArg_ParseTuple(args, "O!", &khmer_KSubsetPartition_Type, + &subset_obj)) { + return NULL; + } + + + size_t n_partitions = 0, n_unassigned = 0; + subset_obj->subset->count_partitions(n_partitions, n_unassigned); + + return Py_BuildValue("nn", (Py_ssize_t) n_partitions, + (Py_ssize_t) n_unassigned); +} + +static +PyObject * +hashgraph_subset_partition_size_distribution(khmer_KHashgraph_Object * me, + PyObject * args) +{ + khmer_KSubsetPartition_Object * subset_obj = NULL; + if (!PyArg_ParseTuple(args, "O!", &khmer_KSubsetPartition_Type, + &subset_obj)) { + return NULL; + } + + SubsetPartition * subset_p = subset_obj->subset; + + PartitionCountDistribution d; + + unsigned int n_unassigned = 0; + subset_p->partition_size_distribution(d, n_unassigned); + + PyObject * x = PyList_New(d.size()); + if (x == NULL) { + return NULL; + } + PartitionCountDistribution::iterator di; + + unsigned int i; + for (i = 0, di = d.begin(); di != d.end(); ++di, i++) { + PyObject * value = Py_BuildValue("KK", di->first, di->second); + if (value == NULL) { + Py_DECREF(x); + return NULL; + } + PyList_SET_ITEM(x, i, value); + } + if (!(i == d.size())) { + throw khmer_exception(); + } + + PyObject * returnValue = Py_BuildValue("NI", x, n_unassigned); + if (returnValue == NULL) { + Py_DECREF(x); + return NULL; + } + return returnValue; +} + +static +PyObject * +hashgraph_load_tagset(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + const char * filename = NULL; + PyObject * clear_tags_o = NULL; + + if (!PyArg_ParseTuple(args, "s|O", &filename, &clear_tags_o)) { + return NULL; + } + + bool clear_tags = true; 
+ if (clear_tags_o && !PyObject_IsTrue(clear_tags_o)) { + clear_tags = false; + } + + try { + hashgraph->load_tagset(filename, clear_tags); + } catch (khmer_file_exception &e) { + PyErr_SetString(PyExc_OSError, e.what()); + return NULL; + } + + Py_RETURN_NONE; +} + +static +PyObject * +hashgraph_save_tagset(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + const char * filename = NULL; + + if (!PyArg_ParseTuple(args, "s", &filename)) { + return NULL; + } + + try { + hashgraph->save_tagset(filename); + } catch (khmer_file_exception &e) { + PyErr_SetString(PyExc_OSError, e.what()); + return NULL; + } + + Py_RETURN_NONE; +} + +static +PyObject * +hashgraph_save_subset_partitionmap(khmer_KHashgraph_Object * me, + PyObject * args) +{ + const char * filename = NULL; + khmer_KSubsetPartition_Object * subset_obj = NULL; + + if (!PyArg_ParseTuple(args, "O!s", &khmer_KSubsetPartition_Type, + &subset_obj, &filename)) { + return NULL; + } + + SubsetPartition * subset_p = subset_obj->subset; + + Py_BEGIN_ALLOW_THREADS + + try { + subset_p->save_partitionmap(filename); + } catch (khmer_file_exception &e) { + PyErr_SetString(PyExc_OSError, e.what()); + return NULL; + } + + Py_END_ALLOW_THREADS + + Py_RETURN_NONE; +} + +static +PyObject * +hashgraph_load_subset_partitionmap(khmer_KHashgraph_Object * me, + PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + const char * filename = NULL; + + if (!PyArg_ParseTuple(args, "s", &filename)) { + return NULL; + } + + SubsetPartition * subset_p; + try { + subset_p = new SubsetPartition(hashgraph); + } catch (std::bad_alloc &e) { + return PyErr_NoMemory(); + } + + const char *file_exception = NULL; + + std::string exc_string ; + Py_BEGIN_ALLOW_THREADS + try { + subset_p->load_partitionmap(filename); + } catch (khmer_file_exception &exc) { + exc_string = exc.what(); + file_exception = exc_string.c_str(); + } + Py_END_ALLOW_THREADS + + if (file_exception != NULL) { + 
PyErr_SetString(PyExc_OSError, file_exception); + delete subset_p; + return NULL; + } + + khmer_KSubsetPartition_Object * subset_obj = (khmer_KSubsetPartition_Object *)\ + PyObject_New(khmer_KSubsetPartition_Object, &khmer_KSubsetPartition_Type); + + if (subset_obj == NULL) { + delete subset_p; + return NULL; + } + + subset_obj->subset = subset_p; + + return (PyObject *) subset_obj; +} + +static +PyObject * +hashgraph__set_tag_density(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + unsigned int d; + if (!PyArg_ParseTuple(args, "I", &d)) { + return NULL; + } + + hashgraph->_set_tag_density(d); + + Py_RETURN_NONE; +} + +static +PyObject * +hashgraph__get_tag_density(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + if (!PyArg_ParseTuple(args, "")) { + return NULL; + } + + unsigned int d = hashgraph->_get_tag_density(); + + return PyLong_FromLong(d); +} + +static +PyObject * +hashgraph__validate_subset_partitionmap(khmer_KHashgraph_Object * me, + PyObject * args) +{ + khmer_KSubsetPartition_Object * subset_obj = NULL; + + if (!PyArg_ParseTuple(args, "O!", &khmer_KSubsetPartition_Type, + &subset_obj)) { + return NULL; + } + + SubsetPartition * subset_p = subset_obj->subset; + + subset_p->_validate_pmap(); + + Py_RETURN_NONE; +} + +static +PyObject * +hashgraph_set_partition_id(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + const char * kmer = NULL; + PartitionID p = 0; + + if (!PyArg_ParseTuple(args, "sI", &kmer, &p)) { + return NULL; + } + + hashgraph->partition->set_partition_id(kmer, p); + + Py_RETURN_NONE; +} + +static +PyObject * +hashgraph_join_partitions(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + PartitionID p1 = 0, p2 = 0; + + if (!PyArg_ParseTuple(args, "II", &p1, &p2)) { + return NULL; + } + + p1 = hashgraph->partition->join_partitions(p1, p2); + + return 
PyLong_FromLong(p1); +} + +static +PyObject * +hashgraph_get_partition_id(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + const char * kmer = NULL; + + if (!PyArg_ParseTuple(args, "s", &kmer)) { + return NULL; + } + + PartitionID partition_id; + partition_id = hashgraph->partition->get_partition_id(kmer); + + return PyLong_FromLong(partition_id); +} + +static +PyObject * +hashgraph_divide_tags_into_subsets(khmer_KHashgraph_Object * me, + PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + unsigned int subset_size = 0; + + if (!PyArg_ParseTuple(args, "I", &subset_size)) { + return NULL; + } + + SeenSet * divvy = new SeenSet; + hashgraph->divide_tags_into_subsets(subset_size, *divvy); + + PyObject * x = (PyObject *) create_HashSet_Object(divvy, + hashgraph->ksize()); + return x; +} + +static +PyObject * +hashgraph_count_kmers_within_radius(khmer_KHashgraph_Object * me, + PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + const char * kmer = NULL; + unsigned int radius = 0; + unsigned int max_count = 0; + + if (!PyArg_ParseTuple(args, "sI|I", &kmer, &radius, &max_count)) { + return NULL; + } + + unsigned int n; + + Py_BEGIN_ALLOW_THREADS + Kmer start_kmer = hashgraph->build_kmer(kmer); + KmerSet seen; + n = hashgraph->traverse_from_kmer(start_kmer, radius, + seen, max_count); + + Py_END_ALLOW_THREADS + + return PyLong_FromUnsignedLong(n); +} + +static +PyObject * +hashgraph_extract_unique_paths(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + const char * sequence = NULL; + unsigned int min_length = 0; + float min_unique_f = 0; + if (!PyArg_ParseTuple(args, "sIf", &sequence, &min_length, &min_unique_f)) { + return NULL; + } + + std::vector results; + hashgraph->extract_unique_paths(sequence, min_length, min_unique_f, results); + + PyObject * x = PyList_New(results.size()); + if (x == NULL) { + return NULL; + } + + for (unsigned int i = 0; i < 
results.size(); i++) { + PyList_SET_ITEM(x, i, PyUnicode_FromString(results[i].c_str())); + } + + return x; +} + +static +PyObject * +hashgraph_consume_and_tag(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + const char * seq; + + if (!PyArg_ParseTuple(args, "s", &seq)) { + return NULL; + } + + // call the C++ function, and trap signals => Python + + unsigned long long n_consumed = 0; + + // @CTB needs to normalize + hashgraph->consume_sequence_and_tag(seq, n_consumed); + + return Py_BuildValue("K", n_consumed); +} + +static +PyObject * +hashgraph_get_tags_and_positions(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + const char * seq; + + if (!PyArg_ParseTuple(args, "s", &seq)) { + return NULL; + } + + // call the C++ function, and trap signals => Python + + std::vector posns; + std::vector tags; + + unsigned int pos = 1; + KmerIterator kmers(seq, hashgraph->ksize()); + + while (!kmers.done()) { + HashIntoType kmer = kmers.next(); + if (set_contains(hashgraph->all_tags, kmer)) { + posns.push_back(pos); + tags.push_back(kmer); + } + pos++; + } + + PyObject * tag = nullptr; + PyObject * posns_list = PyList_New(posns.size()); + for (size_t i = 0; i < posns.size(); i++) { + convert_HashIntoType_to_PyObject(tags[i], &tag); + PyObject * tup = Py_BuildValue("IO", posns[i], tag); + PyList_SET_ITEM(posns_list, i, tup); + } + + return posns_list; +} + +static +PyObject * +hashgraph_find_all_tags_list(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + const char * kmer_s = NULL; + + if (!PyArg_ParseTuple(args, "s", &kmer_s)) { + return NULL; + } + + if (strlen(kmer_s) != hashgraph->ksize()) { + PyErr_SetString(PyExc_ValueError, + "k-mer length must equal the counting table k-mer size"); + return NULL; + } + + SeenSet * tags = new SeenSet; + + Kmer start_kmer = hashgraph->build_kmer(kmer_s); + + Py_BEGIN_ALLOW_THREADS + + 
hashgraph->partition->find_all_tags(start_kmer, *tags, + hashgraph->all_tags); + + Py_END_ALLOW_THREADS + + PyObject * x = (PyObject *) create_HashSet_Object(tags, + hashgraph->ksize()); + return x; +} + +static +PyObject * +hashgraph_consume_fasta_and_tag(khmer_KHashgraph_Object * me, PyObject * args) +{ + Hashgraph * hashgraph = me->hashgraph; + + const char * filename; + + if (!PyArg_ParseTuple(args, "s", &filename)) { + return NULL; + } + + // call the C++ function, and trap signals => Python + + unsigned long long n_consumed; + unsigned int total_reads; + + try { + hashgraph->consume_fasta_and_tag(filename, total_reads, n_consumed); + } catch (khmer_file_exception &exc) { + PyErr_SetString(PyExc_OSError, exc.what()); + return NULL; + } catch (khmer_value_exception &exc) { + PyErr_SetString(PyExc_ValueError, exc.what()); + return NULL; + } + + return Py_BuildValue("IK", total_reads, n_consumed); +} + +static PyMethodDef khmer_hashgraph_methods[] = { + // + // graph/traversal functionality + // + + { + "neighbors", + (PyCFunction)hashgraph_neighbors, METH_VARARGS, + "Get a list of neighbor nodes for this k-mer.", + }, + { + "calc_connected_graph_size", + (PyCFunction)hashgraph_calc_connected_graph_size, METH_VARARGS, "" + }, + { + "kmer_degree", + (PyCFunction)hashgraph_kmer_degree, METH_VARARGS, + "Calculate the number of immediate neighbors this k-mer has in " + "the graph." + }, + { + "count_kmers_within_radius", + (PyCFunction)hashgraph_count_kmers_within_radius, METH_VARARGS, + "Calculate the number of neighbors with given radius in the graph." 
+ }, + { + "find_high_degree_nodes", + (PyCFunction)hashgraph_find_high_degree_nodes, METH_VARARGS, + "Examine the given sequence for degree > 2 nodes and add to " + "list; used in graph contraction.", + }, + { + "traverse_linear_path", + (PyCFunction)hashgraph_traverse_linear_path, METH_VARARGS, + "Traverse the path through the graph starting with the given " + "k-mer and avoiding high-degree nodes, finding (and returning) " + "traversed k-mers and any encountered high-degree nodes.", + }, + { + "assemble_linear_path", + (PyCFunction)hashgraph_assemble_linear_path, METH_VARARGS, + "Assemble a purely linear path starting with the given " + "k-mer, returning traversed k-mers and any encountered high-degree " + "nodes.", + }, + + // + // tagging / sparse graph functionality + // + + { + "consume_and_tag", + (PyCFunction)hashgraph_consume_and_tag, METH_VARARGS, + "Consume a sequence and tag it." + }, + { + "get_tags_and_positions", + (PyCFunction)hashgraph_get_tags_and_positions, METH_VARARGS, + "Retrieve tags and their positions in a sequence." + }, + { + "find_all_tags_list", + (PyCFunction)hashgraph_find_all_tags_list, METH_VARARGS, + "Find all tags within range of the given k-mer, return as list" + }, + { + "consume_fasta_and_tag", + (PyCFunction)hashgraph_consume_fasta_and_tag, METH_VARARGS, + "Consume all sequences in a FASTA/FASTQ file and tag the resulting " + "graph." + }, + { + "extract_unique_paths", + (PyCFunction)hashgraph_extract_unique_paths, METH_VARARGS, + "@CTB remove." + }, + { + "print_tagset", + (PyCFunction)hashgraph_print_tagset, METH_VARARGS, + "Print out all of the tags." + }, + { + "add_tag", + (PyCFunction)hashgraph_add_tag, METH_VARARGS, + "Add a k-mer to the tagset." + }, + { + "get_tagset", + (PyCFunction)hashgraph_get_tagset, METH_VARARGS, + "Get all tagged k-mers as DNA strings." + }, + { + "load_tagset", + (PyCFunction)hashgraph_load_tagset, METH_VARARGS, + "Load tags from a file." 
+ }, + { + "save_tagset", + (PyCFunction)hashgraph_save_tagset, METH_VARARGS, + "Save tags to a file." + }, + { + "n_tags", + (PyCFunction)hashgraph_n_tags, METH_VARARGS, + "Return the count of all tags." + }, + { + "divide_tags_into_subsets", + (PyCFunction)hashgraph_divide_tags_into_subsets, METH_VARARGS, + "Divide tags equally up into subsets of given size." + }, + { + "_get_tag_density", + (PyCFunction)hashgraph__get_tag_density, METH_VARARGS, + "Get the tagging density." + }, + { + "_set_tag_density", + (PyCFunction)hashgraph__set_tag_density, METH_VARARGS, + "Set the tagging density." + }, + + // + // partitioning + // + { + "do_subset_partition", + (PyCFunction)hashgraph_do_subset_partition, METH_VARARGS, + "Partition the graph starting from a given subset of tags." + }, + { + "find_all_tags", + (PyCFunction)hashgraph_find_all_tags, METH_VARARGS, + "Starting from the given k-mer, find all closely connected tags." + }, + { + "assign_partition_id", + (PyCFunction)hashgraph_assign_partition_id, METH_VARARGS, + "Assign a partition ID to a given tag." + }, + { + "output_partitions", + (PyCFunction)hashgraph_output_partitions, METH_VARARGS, + "Write out sequences in given filename to another file, annotating " + "with partition IDs." + }, + { + "load_partitionmap", + (PyCFunction)hashgraph_load_partitionmap, METH_VARARGS, + "Load a partitionmap for a given subset." + }, + { + "save_partitionmap", + (PyCFunction)hashgraph_save_partitionmap, METH_VARARGS, + "Save a partitionmap for the given subset." + }, + { + "_validate_partitionmap", + (PyCFunction)hashgraph__validate_partitionmap, METH_VARARGS, + "Run internal validation checks." 
+ }, + { + "consume_fasta_and_tag_with_reads_parser", + (PyCFunction)hashgraph_consume_fasta_and_tag_with_reads_parser, + METH_VARARGS, + "Count all k-mers using the given reads parser" + }, + { + "consume_partitioned_fasta", + (PyCFunction)hashgraph_consume_partitioned_fasta, METH_VARARGS, + "Count all k-mers in a given file" + }, + { + "merge_subset", + (PyCFunction)hashgraph_merge_subset, METH_VARARGS, + "Merge the given subset into this one." + }, + { + "merge_subset_from_disk", + (PyCFunction)hashgraph_merge_from_disk, METH_VARARGS, + "Merge the given subset (filename) into this one." + }, + { + "count_partitions", + (PyCFunction)hashgraph_count_partitions, METH_VARARGS, + "Count the number of partitions in the master partitionmap." + }, + { + "subset_count_partitions", + (PyCFunction)hashgraph_subset_count_partitions, METH_VARARGS, + "Count the number of partitions in this subset partitionmap." + }, + { + "subset_partition_size_distribution", + (PyCFunction)hashgraph_subset_partition_size_distribution, + METH_VARARGS, + "Get the size distribution of partitions in this subset." + }, + { + "save_subset_partitionmap", + (PyCFunction)hashgraph_save_subset_partitionmap, METH_VARARGS, + "Save the partition map for this subset." + }, + { + "load_subset_partitionmap", + (PyCFunction)hashgraph_load_subset_partitionmap, METH_VARARGS, + "Load the partition map for this subset." + }, + { + "_validate_subset_partitionmap", + (PyCFunction)hashgraph__validate_subset_partitionmap, METH_VARARGS, + "Run internal validation checks on this subset." + }, + { + "set_partition_id", + (PyCFunction)hashgraph_set_partition_id, METH_VARARGS, + "Set the partition ID for this tag." + }, + { + "join_partitions", + (PyCFunction)hashgraph_join_partitions, METH_VARARGS, + "Join the partitions of these two tags." + }, + { + "get_partition_id", + (PyCFunction)hashgraph_get_partition_id, METH_VARARGS, + "Get the partition ID of this tag." 
+ }, + { + "repartition_largest_partition", + (PyCFunction)hashgraph_repartition_largest_partition, METH_VARARGS, + "Repartition the largest partition (in the face of stop tags)." + }, + + // stop tags + { + "load_stop_tags", + (PyCFunction)hashgraph_load_stop_tags, METH_VARARGS, + "Load the set of stop tags." + }, + { + "save_stop_tags", + (PyCFunction)hashgraph_save_stop_tags, METH_VARARGS, + "Save the set of stop tags." + }, + { + "print_stop_tags", + (PyCFunction)hashgraph_print_stop_tags, METH_VARARGS, + "Print out the set of stop tags." + }, + { + "trim_on_stoptags", + (PyCFunction)hashgraph_trim_on_stoptags, METH_VARARGS, + "Trim the reads on the given stop tags." + }, + { + "add_stop_tag", + (PyCFunction)hashgraph_add_stop_tag, METH_VARARGS, + "Add this k-mer as a stop tag." + }, + { + "get_stop_tags", + (PyCFunction)hashgraph_get_stop_tags, METH_VARARGS, + "Return a DNA list of all of the stop tags." + }, + {NULL, NULL, 0, NULL} /* sentinel */ +}; diff --git a/khmer/_cpy_nodetable.hh b/khmer/_cpy_nodetable.hh new file mode 100644 index 0000000000..b4353df75c --- /dev/null +++ b/khmer/_cpy_nodetable.hh @@ -0,0 +1,131 @@ +/* +This file is part of khmer, https://github.com/dib-lab/khmer/, and is +Copyright (C) 2010-2015, Michigan State University. +Copyright (C) 2015-2016, The Regents of the University of California. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. 
+ + * Neither the name of the Michigan State University nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +LICENSE (END) + +Contact: khmer-project@idyll.org +*/ + +typedef struct { + khmer_KHashtable_Object khashtable; + Nodetable * nodetable; +} khmer_KNodetable_Object; + +static PyMethodDef khmer_nodetable_methods[] = { + {NULL, NULL, 0, NULL} /* sentinel */ +}; + +static PyObject* khmer_nodetable_new(PyTypeObject * type, PyObject * args, + PyObject * kwds); + +static PyTypeObject khmer_KNodetable_Type +CPYCHECKER_TYPE_OBJECT_FOR_TYPEDEF("khmer_KNodetable_Object") += { + PyVarObject_HEAD_INIT(NULL, 0) /* init & ob_size */ + "_khmer.Nodetable", /* tp_name */ + sizeof(khmer_KNodetable_Object), /* tp_basicsize */ + 0, /* tp_itemsize */ + 0, /*tp_dealloc*/ + 0, /*tp_print*/ + 0, /*tp_getattr*/ + 0, /*tp_setattr*/ + 0, /*tp_compare*/ + 0, /*tp_repr*/ + 0, /*tp_as_number*/ + 0, /*tp_as_sequence*/ + 0, /*tp_as_mapping*/ + 0, /*tp_hash */ + 0, /*tp_call*/ + 0, /*tp_str*/ + 0, /*tp_getattro*/ + 0, /*tp_setattro*/ + 0, /*tp_as_buffer*/ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/ + "nodetable object", /* 
tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + khmer_nodetable_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + khmer_nodetable_new, /* tp_new */ +}; + +// +// khmer_nodetable_new +// + +static PyObject* khmer_nodetable_new(PyTypeObject * type, PyObject * args, + PyObject * kwds) +{ + khmer_KNodetable_Object * self; + + self = (khmer_KNodetable_Object *)type->tp_alloc(type, 0); + + if (self != NULL) { + WordLength k = 0; + PyListObject * sizes_list_o = NULL; + + if (!PyArg_ParseTuple(args, "bO!", &k, &PyList_Type, &sizes_list_o)) { + Py_DECREF(self); + return NULL; + } + + std::vector sizes; + if (!convert_Pytablesizes_to_vector(sizes_list_o, sizes)) { + Py_DECREF(self); + return NULL; + } + + try { + self->nodetable = new Nodetable(k, sizes); + } catch (std::bad_alloc &e) { + Py_DECREF(self); + return PyErr_NoMemory(); + } + self->khashtable.hashtable = + dynamic_cast(self->nodetable); + } + + return (PyObject *) self; +} diff --git a/khmer/_khmer.cc b/khmer/_khmer.cc index d398155240..85ed899f1a 100644 --- a/khmer/_khmer.cc +++ b/khmer/_khmer.cc @@ -48,8 +48,7 @@ Contact: khmer-project@idyll.org #include "khmer.hh" #include "kmer_hash.hh" #include "hashtable.hh" -#include "hashbits.hh" -#include "counting.hh" +#include "hashgraph.hh" #include "assembler.hh" #include "read_aligner.hh" #include "labelhash.hh" @@ -139,9 +138,6 @@ static bool convert_PyLong_to_HashIntoType(PyObject * value, // Take a Python object and (try to) convert it to a HashIntoType. // Note: will set error condition and return false if cannot do. -// -// This method uses the _hash function directly, instead of taking a -// Hashtable object and using its hash_dna method. 
static bool convert_PyObject_to_HashIntoType(PyObject * value, HashIntoType& hashval, @@ -258,6 +254,33 @@ static bool ht_convert_PyObject_to_Kmer(PyObject * value, } +static bool convert_Pytablesizes_to_vector(PyListObject * sizes_list_o, + std::vector& sizes) +{ + Py_ssize_t sizes_list_o_length = PyList_GET_SIZE(sizes_list_o); + if (sizes_list_o_length < 1) { + PyErr_SetString(PyExc_ValueError, + "tablesizes needs to be one or more numbers"); + return false; + } + for (Py_ssize_t i = 0; i < sizes_list_o_length; i++) { + PyObject * size_o = PyList_GET_ITEM(sizes_list_o, i); + if (PyLong_Check(size_o)) { + sizes.push_back(PyLong_AsUnsignedLongLong(size_o)); + } else if (PyInt_Check(size_o)) { + sizes.push_back(PyInt_AsLong(size_o)); + } else if (PyFloat_Check(size_o)) { + sizes.push_back(PyFloat_AS_DOUBLE(size_o)); + } else { + PyErr_SetString(PyExc_TypeError, + "2nd argument must be a list of ints, longs, or floats"); + return false; + } + } + return true; +} + + /***********************************************************************/ // @@ -1339,9 +1362,14 @@ static khmer_HashSet_Object * create_HashSet_Object(SeenSet * h, WordLength k) typedef struct { PyObject_HEAD - Hashgraph * hashtable; + Hashtable * hashtable; } khmer_KHashtable_Object; +typedef struct { + khmer_KHashtable_Object khashtable; + Hashgraph * hashgraph; +} khmer_KHashgraph_Object; + typedef struct { PyObject_HEAD SubsetPartition * subset; @@ -1374,22 +1402,22 @@ static PyTypeObject khmer_KSubsetPartition_Type = { }; typedef struct { - khmer_KHashtable_Object khashtable; - Hashbits * hashbits; -} khmer_KHashbits_Object; + khmer_KHashgraph_Object khashgraph; + Nodegraph * nodegraph; +} khmer_KNodegraph_Object; -static void khmer_hashbits_dealloc(khmer_KHashbits_Object * obj); -static PyObject* khmer_hashbits_new(PyTypeObject * type, PyObject * args, +static void khmer_nodegraph_dealloc(khmer_KNodegraph_Object * obj); +static PyObject* khmer_nodegraph_new(PyTypeObject * type, PyObject * args, 
PyObject * kwds); static PyTypeObject khmer_KNodegraph_Type -CPYCHECKER_TYPE_OBJECT_FOR_TYPEDEF("khmer_KHashbits_Object") +CPYCHECKER_TYPE_OBJECT_FOR_TYPEDEF("khmer_KNodegraph_Object") = { PyVarObject_HEAD_INIT(NULL, 0) /* init & ob_size */ "_khmer.Nodegraph", /* tp_name */ - sizeof(khmer_KHashbits_Object), /* tp_basicsize */ + sizeof(khmer_KNodegraph_Object), /* tp_basicsize */ 0, /* tp_itemsize */ - (destructor)khmer_hashbits_dealloc, /*tp_dealloc*/ + (destructor)khmer_nodegraph_dealloc, /*tp_dealloc*/ 0, /*tp_print*/ 0, /*tp_getattr*/ 0, /*tp_setattr*/ @@ -1405,7 +1433,7 @@ CPYCHECKER_TYPE_OBJECT_FOR_TYPEDEF("khmer_KHashbits_Object") 0, /*tp_setattro*/ 0, /*tp_as_buffer*/ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/ - "hashbits object", /* tp_doc */ + "nodegraph object", /* tp_doc */ 0, /* tp_traverse */ 0, /* tp_clear */ 0, /* tp_richcompare */ @@ -1422,7 +1450,7 @@ CPYCHECKER_TYPE_OBJECT_FOR_TYPEDEF("khmer_KHashbits_Object") 0, /* tp_dictoffset */ 0, /* tp_init */ 0, /* tp_alloc */ - khmer_hashbits_new, /* tp_new */ + khmer_nodegraph_new, /* tp_new */ }; @@ -1430,7 +1458,7 @@ static PyObject * hashtable_ksize(khmer_KHashtable_Object * me, PyObject * args) { - Hashgraph * hashtable = me->hashtable; + Hashtable * hashtable = me->hashtable; if (!PyArg_ParseTuple(args, "")) { return NULL; @@ -1445,7 +1473,7 @@ static PyObject * hashtable_hash(khmer_KHashtable_Object * me, PyObject * args) { - Hashgraph * hashtable = me->hashtable; + Hashtable * hashtable = me->hashtable; char * kmer; if (!PyArg_ParseTuple(args, "s", &kmer)) { @@ -1467,7 +1495,7 @@ static PyObject * hashtable_reverse_hash(khmer_KHashtable_Object * me, PyObject * args) { - Hashgraph * hashtable = me->hashtable; + Hashtable * hashtable = me->hashtable; PyObject * val_o; HashIntoType val; @@ -1475,7 +1503,7 @@ hashtable_reverse_hash(khmer_KHashtable_Object * me, PyObject * args) return NULL; } - if (!convert_PyObject_to_HashIntoType(val_o, val, 0)) { + if 
(!ht_convert_PyObject_to_HashIntoType(val_o, val, hashtable)) { return NULL; } @@ -1486,7 +1514,7 @@ static PyObject * hashtable_n_occupied(khmer_KHashtable_Object * me, PyObject * args) { - Hashgraph * hashtable = me->hashtable; + Hashtable * hashtable = me->hashtable; if (!PyArg_ParseTuple(args, "")) { return NULL; @@ -1501,7 +1529,7 @@ static PyObject * hashtable_n_unique_kmers(khmer_KHashtable_Object * me, PyObject * args) { - Hashgraph * hashtable = me->hashtable; + Hashtable * hashtable = me->hashtable; uint64_t n = hashtable->n_unique_kmers(); @@ -1512,7 +1540,7 @@ static PyObject * hashtable_count(khmer_KHashtable_Object * me, PyObject * args) { - Hashgraph * hashtable = me->hashtable; + Hashtable * hashtable = me->hashtable; PyObject * v; if (!PyArg_ParseTuple(args, "O", &v)) { @@ -1534,7 +1562,7 @@ static PyObject * hashtable_consume_fasta(khmer_KHashtable_Object * me, PyObject * args) { - Hashgraph * hashtable = me->hashtable; + Hashtable * hashtable = me->hashtable; const char * filename; @@ -1563,7 +1591,7 @@ PyObject * hashtable_consume_fasta_with_reads_parser(khmer_KHashtable_Object * me, PyObject * args) { - Hashgraph * hashtable = me->hashtable; + Hashtable * hashtable = me->hashtable; PyObject * rparser_obj = NULL; @@ -1609,7 +1637,7 @@ static PyObject * hashtable_consume(khmer_KHashtable_Object * me, PyObject * args) { - Hashgraph * hashtable = me->hashtable; + Hashtable * hashtable = me->hashtable; const char * long_str; @@ -1633,7 +1661,7 @@ static PyObject * hashtable_get(khmer_KHashtable_Object * me, PyObject * args) { - Hashgraph * hashtable = me->hashtable; + Hashtable * hashtable = me->hashtable; PyObject * arg; @@ -1653,9 +1681,43 @@ hashtable_get(khmer_KHashtable_Object * me, PyObject * args) static PyObject * -hashtable_find_high_degree_nodes(khmer_KHashtable_Object * me, PyObject * args) +hashtable_set_use_bigcount(khmer_KHashtable_Object * me, PyObject * args) +{ + Hashtable * hashtable = me->hashtable; + + PyObject * x; + if 
(!PyArg_ParseTuple(args, "O", &x)) { + return NULL; + } + int setme = PyObject_IsTrue(x); + if (setme < 0) { + return NULL; + } + hashtable->set_use_bigcount((bool)setme); + + Py_RETURN_NONE; +} + +static +PyObject * +hashtable_get_use_bigcount(khmer_KHashtable_Object * me, PyObject * args) +{ + Hashtable * hashtable = me->hashtable; + + if (!PyArg_ParseTuple(args, "")) { + return NULL; + } + + bool val = hashtable->get_use_bigcount(); + + return PyBool_FromLong((int)val); +} + +static +PyObject * +hashtable_get_min_count(khmer_KHashtable_Object * me, PyObject * args) { - Hashgraph * hashtable = me->hashtable; + Hashtable * hashtable = me->hashtable; const char * long_str; @@ -1669,130 +1731,261 @@ hashtable_find_high_degree_nodes(khmer_KHashtable_Object * me, PyObject * args) return NULL; } - SeenSet * hashes = new SeenSet; - hashtable->find_high_degree_nodes(long_str, *hashes); - - khmer_HashSet_Object * o; - o = create_HashSet_Object(hashes, hashtable->ksize()); + BoundedCounterType c = hashtable->get_min_count(long_str); + unsigned int N = c; - return (PyObject *) o; + return PyLong_FromLong(N); } static PyObject * -hashtable_neighbors(khmer_KHashtable_Object * me, PyObject * args) +hashtable_get_max_count(khmer_KHashtable_Object * me, PyObject * args) { - Hashgraph * hashtable = me->hashtable; - PyObject * val_obj; + Hashtable * hashtable = me->hashtable; + + const char * long_str; - if (!PyArg_ParseTuple(args, "O", &val_obj)) { + if (!PyArg_ParseTuple(args, "s", &long_str)) { return NULL; } - Kmer start_kmer; - if (!ht_convert_PyObject_to_Kmer(val_obj, start_kmer, hashtable)) { + if (strlen(long_str) < hashtable->ksize()) { + PyErr_SetString(PyExc_ValueError, + "string length must >= the hashtable k-mer size"); return NULL; } - KmerQueue node_q; - Traverser traverser(hashtable); + BoundedCounterType c = hashtable->get_max_count(long_str); + unsigned int N = c; - traverser.traverse(start_kmer, node_q); + return PyLong_FromLong(N); +} - PyObject * x = 
PyList_New(node_q.size()); - if (x == NULL) { +static +PyObject * +hashtable_abundance_distribution_with_reads_parser(khmer_KHashtable_Object * me, + PyObject * args) +{ + Hashtable * hashtable = me->hashtable; + + khmer :: python :: khmer_ReadParser_Object * rparser_obj = NULL; + khmer_KNodegraph_Object *tracking_obj = NULL; + + if (!PyArg_ParseTuple(args, "O!O!", &python::khmer_ReadParser_Type, + &rparser_obj, &khmer_KNodegraph_Type, &tracking_obj)) { return NULL; } - unsigned int i; - PyObject * value = nullptr; - for (i = 0; node_q.size() > 0; i++) { - const HashIntoType h = node_q.front(); - node_q.pop(); - convert_HashIntoType_to_PyObject(h, &value); - PyList_SET_ITEM(x, i, value); + read_parsers::IParser *rparser = rparser_obj->parser; + Nodegraph *nodegraph = tracking_obj->nodegraph; + uint64_t *dist = NULL; + const char *value_exception = NULL; + const char *file_exception = NULL; + std::string exc_string; + + Py_BEGIN_ALLOW_THREADS + try { + dist = hashtable->abundance_distribution(rparser, nodegraph); + } catch (khmer_file_exception &exc) { + exc_string = exc.what(); + file_exception = exc_string.c_str(); + } catch (khmer_value_exception &exc) { + exc_string = exc.what(); + value_exception = exc_string.c_str(); + } + Py_END_ALLOW_THREADS + + if (file_exception != NULL) { + PyErr_SetString(PyExc_OSError, file_exception); + return NULL; + } + if (value_exception != NULL) { + PyErr_SetString(PyExc_ValueError, value_exception); + return NULL; } + PyObject * x = PyList_New(MAX_BIGCOUNT + 1); + if (x == NULL) { + delete[] dist; + return NULL; + } + for (int i = 0; i < MAX_BIGCOUNT + 1; i++) { + PyList_SET_ITEM(x, i, PyLong_FromUnsignedLongLong(dist[i])); + } + + delete[] dist; return x; } static PyObject * -hashtable_traverse_linear_path(khmer_KHashtable_Object * me, PyObject * args) +hashtable_trim_on_abundance(khmer_KHashtable_Object * me, PyObject * args) { - Hashgraph * hashtable = me->hashtable; + Hashtable * hashtable = me->hashtable; - PyObject * val_o; 
- khmer_KHashbits_Object * nodegraph_o = NULL; - khmer_HashSet_Object * hdn_o = NULL; + const char * seq = NULL; + unsigned int min_count_i = 0; - if (!PyArg_ParseTuple(args, "OO!O!", &val_o, - &khmer_HashSet_Type, &hdn_o, - &khmer_KNodegraph_Type, &nodegraph_o)) { + if (!PyArg_ParseTuple(args, "sI", &seq, &min_count_i)) { return NULL; } - Kmer start_kmer; - if (!ht_convert_PyObject_to_Kmer(val_o, start_kmer, hashtable)) { + + unsigned long trim_at; + Py_BEGIN_ALLOW_THREADS + + BoundedCounterType min_count = min_count_i; + + trim_at = hashtable->trim_on_abundance(seq, min_count); + + Py_END_ALLOW_THREADS; + + PyObject * trim_seq = PyUnicode_FromStringAndSize(seq, trim_at); + if (trim_seq == NULL) { + return NULL; + } + PyObject * ret = Py_BuildValue("Ok", trim_seq, trim_at); + Py_DECREF(trim_seq); + + return ret; +} + +static +PyObject * +hashtable_trim_below_abundance(khmer_KHashtable_Object * me, PyObject * args) +{ + Hashtable * hashtable = me->hashtable; + + const char * seq = NULL; + BoundedCounterType max_count_i = 0; + + if (!PyArg_ParseTuple(args, "sH", &seq, &max_count_i)) { return NULL; } - SeenSet * adj = new SeenSet; - SeenSet * visited = new SeenSet; - unsigned int size = hashtable->traverse_linear_path(start_kmer, - *adj, *visited, - *nodegraph_o->hashbits, - *hdn_o->hashes); + unsigned long trim_at; + Py_BEGIN_ALLOW_THREADS + + BoundedCounterType max_count = max_count_i; + + trim_at = hashtable->trim_below_abundance(seq, max_count); - khmer_HashSet_Object * adj_o = create_HashSet_Object(adj, - hashtable->ksize()); - khmer_HashSet_Object * visited_o = create_HashSet_Object(visited, - hashtable->ksize()); + Py_END_ALLOW_THREADS; - PyObject * ret = Py_BuildValue("kOO", (unsigned long) size, - (PyObject *) adj_o, (PyObject *) visited_o); - Py_DECREF(adj_o); - Py_DECREF(visited_o); + PyObject * trim_seq = PyUnicode_FromStringAndSize(seq, trim_at); + if (trim_seq == NULL) { + return NULL; + } + PyObject * ret = Py_BuildValue("Ok", trim_seq, trim_at); + 
Py_DECREF(trim_seq); return ret; } static PyObject * -hashtable_assemble_linear_path(khmer_KHashtable_Object * me, PyObject * args) +hashtable_find_spectral_error_positions(khmer_KHashtable_Object * me, + PyObject * args) { - Hashgraph * hashtable = me->hashtable; + Hashtable * hashtable = me->hashtable; - PyObject * val_o; - khmer_KHashbits_Object * nodegraph_o = NULL; - Hashbits * stop_bf = NULL; + const char * seq = NULL; + BoundedCounterType max_count = 0; // unsigned short int - if (!PyArg_ParseTuple(args, "O|O!", &val_o, - &khmer_KNodegraph_Type, &nodegraph_o)) { + if (!PyArg_ParseTuple(args, "sH", &seq, &max_count)) { return NULL; } - Kmer start_kmer; - if (!ht_convert_PyObject_to_Kmer(val_o, start_kmer, hashtable)) { + std::vector posns; + + try { + posns = hashtable->find_spectral_error_positions(seq, max_count); + } catch (khmer_exception &e) { + PyErr_SetString(PyExc_ValueError, e.what()); return NULL; } - if (nodegraph_o) { - stop_bf = nodegraph_o->hashbits; + Py_ssize_t posns_size = posns.size(); + + PyObject * x = PyList_New(posns_size); + if (x == NULL) { + return NULL; + } + for (Py_ssize_t i = 0; i < posns_size; i++) { + PyList_SET_ITEM(x, i, PyLong_FromLong(posns[i])); } - LinearAssembler assembler(hashtable); - std::string contig = assembler.assemble(start_kmer, stop_bf); + return x; +} + +static +PyObject * +hashtable_abundance_distribution(khmer_KHashtable_Object * me, PyObject * args) +{ + Hashtable * hashtable = me->hashtable; - PyObject * ret = Py_BuildValue("s", contig.c_str()); + const char * filename = NULL; + khmer_KNodegraph_Object * tracking_obj = NULL; + if (!PyArg_ParseTuple(args, "sO!", &filename, &khmer_KNodegraph_Type, + &tracking_obj)) { + return NULL; + } - return ret; + Nodegraph *nodegraph = tracking_obj->nodegraph; + uint64_t *dist = NULL; + const char *value_exception = NULL; + const char *file_exception = NULL; + std::string exc_string; + + Py_BEGIN_ALLOW_THREADS + try { + dist = hashtable->abundance_distribution(filename, 
nodegraph); + } catch (khmer_file_exception &exc) { + exc_string = exc.what(); + file_exception = exc_string.c_str(); + } catch (khmer_value_exception &exc) { + exc_string = exc.what(); + value_exception = exc_string.c_str(); + } + Py_END_ALLOW_THREADS + + if (file_exception != NULL) { + PyErr_SetString(PyExc_OSError, file_exception); + if (dist != NULL) { + delete []dist; + } + return NULL; + } + if (value_exception != NULL) { + PyErr_SetString(PyExc_ValueError, value_exception); + if (dist != NULL) { + delete []dist; + } + return NULL; + } + + PyObject * x = PyList_New(MAX_BIGCOUNT + 1); + if (x == NULL) { + if (dist != NULL) { + delete []dist; + } + return NULL; + } + for (int i = 0; i < MAX_BIGCOUNT + 1; i++) { + PyList_SET_ITEM(x, i, PyLong_FromUnsignedLongLong(dist[i])); + } + + if (dist != NULL) { + delete []dist; + } + + return x; } static PyObject * hashtable_load(khmer_KHashtable_Object * me, PyObject * args) { - Hashgraph * hashtable = me->hashtable; + Hashtable * hashtable = me->hashtable; const char * filename = NULL; @@ -1814,7 +2007,7 @@ static PyObject * hashtable_save(khmer_KHashtable_Object * me, PyObject * args) { - Hashgraph * hashtable = me->hashtable; + Hashtable * hashtable = me->hashtable; const char * filename = NULL; @@ -1836,7 +2029,7 @@ static PyObject * hashtable_get_hashsizes(khmer_KHashtable_Object * me, PyObject * args) { - Hashgraph * hashtable = me->hashtable; + Hashtable * hashtable = me->hashtable; if (!PyArg_ParseTuple(args, "")) { @@ -1855,1177 +2048,61 @@ hashtable_get_hashsizes(khmer_KHashtable_Object * me, PyObject * args) static PyObject * -hashtable_consume_and_tag(khmer_KHashtable_Object * me, PyObject * args) +hashtable_get_median_count(khmer_KHashtable_Object * me, PyObject * args) { - Hashgraph * hashtable = me->hashtable; + Hashtable * hashtable = me->hashtable; - const char * seq; + const char * long_str; - if (!PyArg_ParseTuple(args, "s", &seq)) { + if (!PyArg_ParseTuple(args, "s", &long_str)) { return NULL; } - // 
call the C++ function, and trap signals => Python + if (strlen(long_str) < hashtable->ksize()) { + PyErr_SetString(PyExc_ValueError, + "string length must >= the hashtable k-mer size"); + return NULL; + } - unsigned long long n_consumed = 0; + BoundedCounterType med = 0; + float average = 0, stddev = 0; - // @CTB needs to normalize - hashtable->consume_sequence_and_tag(seq, n_consumed); + hashtable->get_median_count(long_str, med, average, stddev); - return Py_BuildValue("K", n_consumed); + return Py_BuildValue("iff", med, average, stddev); } static PyObject * -hashtable_get_tags_and_positions(khmer_KHashtable_Object * me, PyObject * args) +hashtable_median_at_least(khmer_KHashtable_Object * me, PyObject * args) { - Hashgraph * hashtable = me->hashtable; + Hashtable * hashtable = me->hashtable; - const char * seq; + const char * long_str; + unsigned int cutoff; - if (!PyArg_ParseTuple(args, "s", &seq)) { + if (!PyArg_ParseTuple(args, "sI", &long_str, &cutoff)) { return NULL; } - // call the C++ function, and trap signals => Python - - std::vector posns; - std::vector tags; - - unsigned int pos = 1; - KmerIterator kmers(seq, hashtable->ksize()); - - while (!kmers.done()) { - HashIntoType kmer = kmers.next(); - if (set_contains(hashtable->all_tags, kmer)) { - posns.push_back(pos); - tags.push_back(kmer); - } - pos++; - } - - PyObject * tag = nullptr; - PyObject * posns_list = PyList_New(posns.size()); - for (size_t i = 0; i < posns.size(); i++) { - convert_HashIntoType_to_PyObject(tags[i], &tag); - PyObject * tup = Py_BuildValue("IO", posns[i], tag); - PyList_SET_ITEM(posns_list, i, tup); - } - - return posns_list; -} - -static -PyObject * -hashtable_find_all_tags_list(khmer_KHashtable_Object * me, PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - const char * kmer_s = NULL; - - if (!PyArg_ParseTuple(args, "s", &kmer_s)) { - return NULL; - } - - if (strlen(kmer_s) != hashtable->ksize()) { - PyErr_SetString(PyExc_ValueError, - "k-mer length must equal 
the counting table k-mer size"); - return NULL; - } - - SeenSet * tags = new SeenSet; - - Kmer start_kmer = hashtable->build_kmer(kmer_s); - - Py_BEGIN_ALLOW_THREADS - - hashtable->partition->find_all_tags(start_kmer, *tags, - hashtable->all_tags); - - Py_END_ALLOW_THREADS - - PyObject * x = (PyObject *) create_HashSet_Object(tags, - hashtable->ksize()); - return x; -} - -static -PyObject * -hashtable_consume_fasta_and_tag(khmer_KHashtable_Object * me, PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - const char * filename; - - if (!PyArg_ParseTuple(args, "s", &filename)) { - return NULL; - } - - // call the C++ function, and trap signals => Python - - unsigned long long n_consumed; - unsigned int total_reads; - - try { - hashtable->consume_fasta_and_tag(filename, total_reads, n_consumed); - } catch (khmer_file_exception &exc) { - PyErr_SetString(PyExc_OSError, exc.what()); - return NULL; - } catch (khmer_value_exception &exc) { - PyErr_SetString(PyExc_ValueError, exc.what()); - return NULL; - } - - return Py_BuildValue("IK", total_reads, n_consumed); -} - -static -PyObject * -hashtable_get_median_count(khmer_KHashtable_Object * me, PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - const char * long_str; - - if (!PyArg_ParseTuple(args, "s", &long_str)) { - return NULL; - } - - if (strlen(long_str) < hashtable->ksize()) { - PyErr_SetString(PyExc_ValueError, - "string length must >= the hashtable k-mer size"); - return NULL; - } - - BoundedCounterType med = 0; - float average = 0, stddev = 0; - - hashtable->get_median_count(long_str, med, average, stddev); - - return Py_BuildValue("iff", med, average, stddev); -} - -static -PyObject * -hashtable_median_at_least(khmer_KHashtable_Object * me, PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - const char * long_str; - unsigned int cutoff; - - if (!PyArg_ParseTuple(args, "sI", &long_str, &cutoff)) { - return NULL; - } - - if (strlen(long_str) < hashtable->ksize()) { - 
PyErr_SetString(PyExc_ValueError, - "string length must >= the hashtable k-mer size"); - return NULL; - } - - if (hashtable->median_at_least(long_str, cutoff)) { - Py_RETURN_TRUE; - } - Py_RETURN_FALSE; - -} - -static -PyObject * -hashtable_n_tags(khmer_KHashtable_Object * me, PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - if (!PyArg_ParseTuple(args, "")) { - return NULL; - } - - return PyLong_FromSize_t(hashtable->n_tags()); -} - -static -PyObject * -hashtable_print_stop_tags(khmer_KHashtable_Object * me, PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - const char * filename = NULL; - - if (!PyArg_ParseTuple(args, "s", &filename)) { - return NULL; - } - - hashtable->print_stop_tags(filename); - - Py_RETURN_NONE; -} - -static -PyObject * -hashtable_print_tagset(khmer_KHashtable_Object * me, PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - const char * filename = NULL; - - if (!PyArg_ParseTuple(args, "s", &filename)) { - return NULL; - } - - hashtable->print_tagset(filename); - - Py_RETURN_NONE; -} - -static -PyObject * -hashtable_load_stop_tags(khmer_KHashtable_Object * me, PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - const char * filename = NULL; - PyObject * clear_tags_o = NULL; - - if (!PyArg_ParseTuple(args, "s|O", &filename, &clear_tags_o)) { - return NULL; - } - - bool clear_tags = true; - if (clear_tags_o && !PyObject_IsTrue(clear_tags_o)) { - clear_tags = false; - } - - - try { - hashtable->load_stop_tags(filename, clear_tags); - } catch (khmer_file_exception &e) { - PyErr_SetString(PyExc_OSError, e.what()); - return NULL; - } - - Py_RETURN_NONE; -} - - -static -PyObject * -hashtable_save_stop_tags(khmer_KHashtable_Object * me, PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - const char * filename = NULL; - - if (!PyArg_ParseTuple(args, "s", &filename)) { - return NULL; - } - - try { - hashtable->save_stop_tags(filename); - } catch (khmer_file_exception &e) { - 
PyErr_SetString(PyExc_OSError, e.what()); - return NULL; - } - - Py_RETURN_NONE; -} - -static PyObject * hashtable_repartition_largest_partition( - khmer_KHashtable_Object * me, - PyObject * args); - -static -PyObject * -hashtable_calc_connected_graph_size(khmer_KHashtable_Object * me, - PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - const char * _kmer; - unsigned int max_size = 0; - PyObject * break_on_circum_o = NULL; - if (!PyArg_ParseTuple(args, "s|IO", &_kmer, &max_size, &break_on_circum_o)) { - return NULL; - } - - bool break_on_circum = false; - if (break_on_circum_o && PyObject_IsTrue(break_on_circum_o)) { - break_on_circum = true; - } - - unsigned long long size = 0; - Kmer start_kmer = hashtable->build_kmer(_kmer); - - Py_BEGIN_ALLOW_THREADS - KmerSet keeper; - hashtable->calc_connected_graph_size(start_kmer, size, keeper, max_size, - break_on_circum); - Py_END_ALLOW_THREADS - - return PyLong_FromUnsignedLongLong(size); -} - -static -PyObject * -hashtable_kmer_degree(khmer_KHashtable_Object * me, PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - const char * kmer_s = NULL; - - if (!PyArg_ParseTuple(args, "s", &kmer_s)) { - return NULL; - } - - return PyLong_FromLong(hashtable->kmer_degree(kmer_s)); -} - -static -PyObject * -hashtable_trim_on_stoptags(khmer_KHashtable_Object * me, PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - const char * seq = NULL; - - if (!PyArg_ParseTuple(args, "s", &seq)) { - return NULL; - } - - size_t trim_at; - Py_BEGIN_ALLOW_THREADS - - trim_at = hashtable->trim_on_stoptags(seq); - - Py_END_ALLOW_THREADS; - - PyObject * trim_seq = PyUnicode_FromStringAndSize(seq, trim_at); - if (trim_seq == NULL) { - return NULL; - } - PyObject * ret = Py_BuildValue("Ok", trim_seq, (unsigned long) trim_at); - Py_DECREF(trim_seq); - - return ret; -} - -static -PyObject * -hashtable_do_subset_partition(khmer_KHashtable_Object * me, PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - 
PyObject * start_kmer_obj; - PyObject * end_kmer_obj; - HashIntoType start_kmer, end_kmer; - PyObject * break_on_stop_tags_o = NULL; - PyObject * stop_big_traversals_o = NULL; - - if (!PyArg_ParseTuple(args, "|OOOO", &start_kmer_obj, &end_kmer_obj, - &break_on_stop_tags_o, - &stop_big_traversals_o)) { - return NULL; - } - if (!ht_convert_PyObject_to_HashIntoType(start_kmer_obj, start_kmer, - hashtable)) { - return NULL; - } - if (!ht_convert_PyObject_to_HashIntoType(end_kmer_obj, end_kmer, - hashtable)) { - return NULL; - } - - bool break_on_stop_tags = false; - if (break_on_stop_tags_o && PyObject_IsTrue(break_on_stop_tags_o)) { - break_on_stop_tags = true; - } - bool stop_big_traversals = false; - if (stop_big_traversals_o && PyObject_IsTrue(stop_big_traversals_o)) { - stop_big_traversals = true; - } - - SubsetPartition * subset_p = NULL; - try { - Py_BEGIN_ALLOW_THREADS - subset_p = new SubsetPartition(hashtable); - subset_p->do_partition(start_kmer, end_kmer, break_on_stop_tags, - stop_big_traversals); - Py_END_ALLOW_THREADS - } catch (std::bad_alloc &e) { - return PyErr_NoMemory(); - } - - khmer_KSubsetPartition_Object * subset_obj = (khmer_KSubsetPartition_Object *)\ - PyObject_New(khmer_KSubsetPartition_Object, &khmer_KSubsetPartition_Type); - - if (subset_obj == NULL) { - delete subset_p; - return NULL; - } - - subset_obj->subset = subset_p; - - return (PyObject *) subset_obj; -} - - -static -PyObject * -hashtable_merge_subset(khmer_KHashtable_Object * me, PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - khmer_KSubsetPartition_Object * subset_obj = NULL; - if (!PyArg_ParseTuple(args, "O!", &khmer_KSubsetPartition_Type, - &subset_obj)) { - return NULL; - } - - SubsetPartition * subset_p = subset_obj->subset; - - hashtable->partition->merge(subset_p); - - Py_RETURN_NONE; -} - -static -PyObject * -hashtable_merge_from_disk(khmer_KHashtable_Object * me, PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - const char * filename = 
NULL; - if (!PyArg_ParseTuple(args, "s", &filename)) { - return NULL; - } - - try { - hashtable->partition->merge_from_disk(filename); - } catch (khmer_file_exception &e) { - PyErr_SetString(PyExc_OSError, e.what()); - return NULL; - } - - Py_RETURN_NONE; -} - -static -PyObject * -hashtable_consume_fasta_and_tag_with_reads_parser(khmer_KHashtable_Object * me, - PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - python::khmer_ReadParser_Object * rparser_obj = NULL; - - if (!PyArg_ParseTuple( args, "O!", &python::khmer_ReadParser_Type, - &rparser_obj)) { - return NULL; - } - - read_parsers:: IParser * rparser = rparser_obj-> parser; - - // call the C++ function, and trap signals => Python - const char *value_exception = NULL; - const char *file_exception = NULL; - unsigned long long n_consumed = 0; - unsigned int total_reads = 0; - std::string exc_string; - - Py_BEGIN_ALLOW_THREADS - try { - hashtable->consume_fasta_and_tag(rparser, total_reads, n_consumed); - } catch (khmer_file_exception &exc) { - exc_string = exc.what(); - file_exception = exc_string.c_str(); - } catch (khmer_value_exception &exc) { - exc_string = exc.what(); - value_exception = exc_string.c_str(); - } - Py_END_ALLOW_THREADS - - if (file_exception != NULL) { - PyErr_SetString(PyExc_OSError, file_exception); - return NULL; - } - if (value_exception != NULL) { - PyErr_SetString(PyExc_ValueError, value_exception); - } - - return Py_BuildValue("IK", total_reads, n_consumed); -} - -static -PyObject * -hashtable_consume_partitioned_fasta(khmer_KHashtable_Object * me, - PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - const char * filename; - - if (!PyArg_ParseTuple(args, "s", &filename)) { - return NULL; - } - - // call the C++ function, and trap signals => Python - - unsigned long long n_consumed; - unsigned int total_reads; - - try { - hashtable->consume_partitioned_fasta(filename, total_reads, n_consumed); - } catch (khmer_file_exception &exc) { - 
PyErr_SetString(PyExc_OSError, exc.what()); - return NULL; - } catch (khmer_value_exception &exc) { - PyErr_SetString(PyExc_ValueError, exc.what()); - return NULL; - } - - return Py_BuildValue("IK", total_reads, n_consumed); -} - -static -PyObject * -hashtable_find_all_tags(khmer_KHashtable_Object * me, PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - const char * kmer_s = NULL; - - if (!PyArg_ParseTuple(args, "s", &kmer_s)) { - return NULL; - } - - if (strlen(kmer_s) != hashtable->ksize()) { - PyErr_SetString( PyExc_ValueError, - "k-mer size must equal the k-mer size of the presence table"); - return NULL; - } - - pre_partition_info * ppi = NULL; - - Kmer kmer = hashtable->build_kmer(kmer_s); - - Py_BEGIN_ALLOW_THREADS - - try { - ppi = new pre_partition_info(kmer); - } catch (std::bad_alloc &e) { - return PyErr_NoMemory(); - } - hashtable->partition->find_all_tags(kmer, ppi->tagged_kmers, - hashtable->all_tags); - hashtable->add_kmer_to_tags(kmer); - - Py_END_ALLOW_THREADS - - khmer_PrePartitionInfo_Object * ppi_obj = (khmer_PrePartitionInfo_Object *) \ - PyObject_New(khmer_PrePartitionInfo_Object, &khmer_PrePartitionInfo_Type); - - ppi_obj->PrePartitionInfo = ppi; - - return (PyObject*)ppi_obj; -} - -static -PyObject * -hashtable_assign_partition_id(khmer_KHashtable_Object * me, PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - khmer_PrePartitionInfo_Object * ppi_obj; - if (!PyArg_ParseTuple(args, "O!", &khmer_PrePartitionInfo_Type, &ppi_obj)) { - return NULL; - } - - pre_partition_info * ppi; - ppi = ppi_obj->PrePartitionInfo; - - PartitionID p; - p = hashtable->partition->assign_partition_id(ppi->kmer, - ppi->tagged_kmers); - - return PyLong_FromLong(p); -} - -static -PyObject * -hashtable_add_tag(khmer_KHashtable_Object * me, PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - const char * kmer_s = NULL; - if (!PyArg_ParseTuple(args, "s", &kmer_s)) { - return NULL; - } - - HashIntoType kmer = 
hashtable->hash_dna(kmer_s); - hashtable->add_tag(kmer); - - Py_RETURN_NONE; -} - -static -PyObject * -hashtable_add_stop_tag(khmer_KHashtable_Object * me, PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - const char * kmer_s = NULL; - if (!PyArg_ParseTuple(args, "s", &kmer_s)) { - return NULL; - } - - HashIntoType kmer = hashtable->hash_dna(kmer_s); - hashtable->add_stop_tag(kmer); - - Py_RETURN_NONE; -} - -static -PyObject * -hashtable_get_stop_tags(khmer_KHashtable_Object * me, PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - if (!PyArg_ParseTuple(args, "")) { - return NULL; - } - - SeenSet::const_iterator si; - - PyObject * x = PyList_New(hashtable->stop_tags.size()); - unsigned long long i = 0; - for (si = hashtable->stop_tags.begin(); si != hashtable->stop_tags.end(); - ++si) { - std::string s = hashtable->unhash_dna(*si); - PyList_SET_ITEM(x, i, Py_BuildValue("s", s.c_str())); - i++; - } - - return x; -} - -static -PyObject * -hashtable_get_tagset(khmer_KHashtable_Object * me, PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - if (!PyArg_ParseTuple(args, "")) { - return NULL; - } - - SeenSet::const_iterator si; - - PyObject * x = PyList_New(hashtable->all_tags.size()); - unsigned long long i = 0; - for (si = hashtable->all_tags.begin(); si != hashtable->all_tags.end(); - ++si) { - std::string s = hashtable->unhash_dna(*si); - PyList_SET_ITEM(x, i, Py_BuildValue("s", s.c_str())); - i++; - } - - return x; -} - -static -PyObject * -hashtable_output_partitions(khmer_KHashtable_Object * me, PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - const char * filename = NULL; - const char * output = NULL; - PyObject * output_unassigned_o = NULL; - - if (!PyArg_ParseTuple(args, "ss|O", &filename, &output, - &output_unassigned_o)) { - return NULL; - } - - bool output_unassigned = false; - if (output_unassigned_o != NULL && PyObject_IsTrue(output_unassigned_o)) { - output_unassigned = true; - } - - size_t 
n_partitions = 0; - - try { - SubsetPartition * subset_p = hashtable->partition; - n_partitions = subset_p->output_partitioned_file(filename, - output, - output_unassigned); - } catch (khmer_file_exception &e) { - PyErr_SetString(PyExc_OSError, e.what()); - return NULL; - } catch (khmer_value_exception &exc) { - PyErr_SetString(PyExc_ValueError, exc.what()); - return NULL; - } - - return PyLong_FromLong(n_partitions); -} - -static -PyObject * -hashtable_save_partitionmap(khmer_KHashtable_Object * me, PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - const char * filename = NULL; - - if (!PyArg_ParseTuple(args, "s", &filename)) { - return NULL; - } - - try { - hashtable->partition->save_partitionmap(filename); - } catch (khmer_file_exception &e) { - PyErr_SetString(PyExc_OSError, e.what()); - return NULL; - } - - Py_RETURN_NONE; -} - -static -PyObject * -hashtable_load_partitionmap(khmer_KHashtable_Object * me, PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - const char * filename = NULL; - - if (!PyArg_ParseTuple(args, "s", &filename)) { - return NULL; - } - - try { - hashtable->partition->load_partitionmap(filename); - } catch (khmer_file_exception &e) { - PyErr_SetString(PyExc_OSError, e.what()); - return NULL; - } - - Py_RETURN_NONE; -} - -static -PyObject * -hashtable__validate_partitionmap(khmer_KHashtable_Object * me, PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - if (!PyArg_ParseTuple(args, "")) { - return NULL; - } - - hashtable->partition->_validate_pmap(); - - Py_RETURN_NONE; -} - -static -PyObject * -hashtable_count_partitions(khmer_KHashtable_Object * me, PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - if (!PyArg_ParseTuple(args, "")) { - return NULL; - } - - size_t n_partitions = 0, n_unassigned = 0; - hashtable->partition->count_partitions(n_partitions, n_unassigned); - - return Py_BuildValue("nn", (Py_ssize_t) n_partitions, - (Py_ssize_t) n_unassigned); -} - -static -PyObject * 
-hashtable_subset_count_partitions(khmer_KHashtable_Object * me, PyObject * args) -{ - khmer_KSubsetPartition_Object * subset_obj = NULL; - - if (!PyArg_ParseTuple(args, "O!", &khmer_KSubsetPartition_Type, - &subset_obj)) { - return NULL; - } - - - size_t n_partitions = 0, n_unassigned = 0; - subset_obj->subset->count_partitions(n_partitions, n_unassigned); - - return Py_BuildValue("nn", (Py_ssize_t) n_partitions, - (Py_ssize_t) n_unassigned); -} - -static -PyObject * -hashtable_subset_partition_size_distribution(khmer_KHashtable_Object * me, - PyObject * args) -{ - khmer_KSubsetPartition_Object * subset_obj = NULL; - if (!PyArg_ParseTuple(args, "O!", &khmer_KSubsetPartition_Type, - &subset_obj)) { - return NULL; - } - - SubsetPartition * subset_p = subset_obj->subset; - - PartitionCountDistribution d; - - unsigned int n_unassigned = 0; - subset_p->partition_size_distribution(d, n_unassigned); - - PyObject * x = PyList_New(d.size()); - if (x == NULL) { - return NULL; - } - PartitionCountDistribution::iterator di; - - unsigned int i; - for (i = 0, di = d.begin(); di != d.end(); ++di, i++) { - PyObject * value = Py_BuildValue("KK", di->first, di->second); - if (value == NULL) { - Py_DECREF(x); - return NULL; - } - PyList_SET_ITEM(x, i, value); - } - if (!(i == d.size())) { - throw khmer_exception(); - } - - PyObject * returnValue = Py_BuildValue("NI", x, n_unassigned); - if (returnValue == NULL) { - Py_DECREF(x); - return NULL; - } - return returnValue; -} - -static -PyObject * -hashtable_load_tagset(khmer_KHashtable_Object * me, PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - const char * filename = NULL; - PyObject * clear_tags_o = NULL; - - if (!PyArg_ParseTuple(args, "s|O", &filename, &clear_tags_o)) { - return NULL; - } - - bool clear_tags = true; - if (clear_tags_o && !PyObject_IsTrue(clear_tags_o)) { - clear_tags = false; - } - - try { - hashtable->load_tagset(filename, clear_tags); - } catch (khmer_file_exception &e) { - 
PyErr_SetString(PyExc_OSError, e.what()); - return NULL; - } - - Py_RETURN_NONE; -} - -static -PyObject * -hashtable_save_tagset(khmer_KHashtable_Object * me, PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - const char * filename = NULL; - - if (!PyArg_ParseTuple(args, "s", &filename)) { - return NULL; - } - - try { - hashtable->save_tagset(filename); - } catch (khmer_file_exception &e) { - PyErr_SetString(PyExc_OSError, e.what()); - return NULL; - } - - Py_RETURN_NONE; -} - -static -PyObject * -hashtable_save_subset_partitionmap(khmer_KHashtable_Object * me, - PyObject * args) -{ - const char * filename = NULL; - khmer_KSubsetPartition_Object * subset_obj = NULL; - - if (!PyArg_ParseTuple(args, "O!s", &khmer_KSubsetPartition_Type, - &subset_obj, &filename)) { - return NULL; - } - - SubsetPartition * subset_p = subset_obj->subset; - - Py_BEGIN_ALLOW_THREADS - - try { - subset_p->save_partitionmap(filename); - } catch (khmer_file_exception &e) { - PyErr_SetString(PyExc_OSError, e.what()); - return NULL; - } - - Py_END_ALLOW_THREADS - - Py_RETURN_NONE; -} - -static -PyObject * -hashtable_load_subset_partitionmap(khmer_KHashtable_Object * me, - PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - const char * filename = NULL; - - if (!PyArg_ParseTuple(args, "s", &filename)) { - return NULL; - } - - SubsetPartition * subset_p; - try { - subset_p = new SubsetPartition(hashtable); - } catch (std::bad_alloc &e) { - return PyErr_NoMemory(); - } - - const char *file_exception = NULL; - - std::string exc_string ; - Py_BEGIN_ALLOW_THREADS - try { - subset_p->load_partitionmap(filename); - } catch (khmer_file_exception &exc) { - exc_string = exc.what(); - file_exception = exc_string.c_str(); - } - Py_END_ALLOW_THREADS - - if (file_exception != NULL) { - PyErr_SetString(PyExc_OSError, file_exception); - delete subset_p; - return NULL; - } - - khmer_KSubsetPartition_Object * subset_obj = (khmer_KSubsetPartition_Object *)\ - 
PyObject_New(khmer_KSubsetPartition_Object, &khmer_KSubsetPartition_Type); - - if (subset_obj == NULL) { - delete subset_p; - return NULL; - } - - subset_obj->subset = subset_p; - - return (PyObject *) subset_obj; -} - -static -PyObject * -hashtable__set_tag_density(khmer_KHashtable_Object * me, PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - unsigned int d; - if (!PyArg_ParseTuple(args, "I", &d)) { - return NULL; - } - - hashtable->_set_tag_density(d); - - Py_RETURN_NONE; -} - -static -PyObject * -hashtable__get_tag_density(khmer_KHashtable_Object * me, PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - if (!PyArg_ParseTuple(args, "")) { - return NULL; - } - - unsigned int d = hashtable->_get_tag_density(); - - return PyLong_FromLong(d); -} - -static -PyObject * -hashtable__validate_subset_partitionmap(khmer_KHashtable_Object * me, - PyObject * args) -{ - khmer_KSubsetPartition_Object * subset_obj = NULL; - - if (!PyArg_ParseTuple(args, "O!", &khmer_KSubsetPartition_Type, - &subset_obj)) { - return NULL; - } - - SubsetPartition * subset_p = subset_obj->subset; - - subset_p->_validate_pmap(); - - Py_RETURN_NONE; -} - -static -PyObject * -hashtable_set_partition_id(khmer_KHashtable_Object * me, PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - const char * kmer = NULL; - PartitionID p = 0; - - if (!PyArg_ParseTuple(args, "sI", &kmer, &p)) { - return NULL; - } - - hashtable->partition->set_partition_id(kmer, p); - - Py_RETURN_NONE; -} - -static -PyObject * -hashtable_join_partitions(khmer_KHashtable_Object * me, PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - PartitionID p1 = 0, p2 = 0; - - if (!PyArg_ParseTuple(args, "II", &p1, &p2)) { - return NULL; - } - - p1 = hashtable->partition->join_partitions(p1, p2); - - return PyLong_FromLong(p1); -} - -static -PyObject * -hashtable_get_partition_id(khmer_KHashtable_Object * me, PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - const char * kmer 
= NULL; - - if (!PyArg_ParseTuple(args, "s", &kmer)) { - return NULL; - } - - PartitionID partition_id; - partition_id = hashtable->partition->get_partition_id(kmer); - - return PyLong_FromLong(partition_id); -} - -static -PyObject * -hashtable_divide_tags_into_subsets(khmer_KHashtable_Object * me, - PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - unsigned int subset_size = 0; - - if (!PyArg_ParseTuple(args, "I", &subset_size)) { - return NULL; - } - - SeenSet * divvy = new SeenSet; - hashtable->divide_tags_into_subsets(subset_size, *divvy); - - PyObject * x = (PyObject *) create_HashSet_Object(divvy, - hashtable->ksize()); - return x; -} - -static -PyObject * -hashtable_count_kmers_within_radius(khmer_KHashtable_Object * me, - PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - const char * kmer = NULL; - unsigned int radius = 0; - unsigned int max_count = 0; - - if (!PyArg_ParseTuple(args, "sI|I", &kmer, &radius, &max_count)) { - return NULL; - } - - unsigned int n; - - Py_BEGIN_ALLOW_THREADS - Kmer start_kmer = hashtable->build_kmer(kmer); - KmerSet seen; - n = hashtable->traverse_from_kmer(start_kmer, radius, - seen, max_count); - - Py_END_ALLOW_THREADS - - return PyLong_FromUnsignedLong(n); -} - -static -PyObject * -hashtable_extract_unique_paths(khmer_KHashtable_Object * me, PyObject * args) -{ - Hashgraph * hashtable = me->hashtable; - - const char * sequence = NULL; - unsigned int min_length = 0; - float min_unique_f = 0; - if (!PyArg_ParseTuple(args, "sIf", &sequence, &min_length, &min_unique_f)) { - return NULL; - } - - std::vector results; - hashtable->extract_unique_paths(sequence, min_length, min_unique_f, results); - - PyObject * x = PyList_New(results.size()); - if (x == NULL) { + if (strlen(long_str) < hashtable->ksize()) { + PyErr_SetString(PyExc_ValueError, + "string length must >= the hashtable k-mer size"); return NULL; } - for (unsigned int i = 0; i < results.size(); i++) { - PyList_SET_ITEM(x, i, 
PyUnicode_FromString(results[i].c_str())); + if (hashtable->median_at_least(long_str, cutoff)) { + Py_RETURN_TRUE; } + Py_RETURN_FALSE; - return x; } - static PyObject * hashtable_get_kmers(khmer_KHashtable_Object * me, PyObject * args) { - Hashgraph * hashtable = me->hashtable; + Hashtable * hashtable = me->hashtable; const char * sequence; if (!PyArg_ParseTuple(args, "s", &sequence)) { @@ -3049,7 +2126,7 @@ static PyObject * hashtable_get_kmer_counts(khmer_KHashtable_Object * me, PyObject * args) { - Hashgraph * hashtable = me->hashtable; + Hashtable * hashtable = me->hashtable; const char * sequence; if (!PyArg_ParseTuple(args, "s", &sequence)) { @@ -3073,7 +2150,7 @@ static PyObject * hashtable_get_kmer_hashes(khmer_KHashtable_Object * me, PyObject * args) { - Hashgraph * hashtable = me->hashtable; + Hashtable * hashtable = me->hashtable; const char * sequence; if (!PyArg_ParseTuple(args, "s", &sequence)) { @@ -3099,7 +2176,7 @@ PyObject * hashtable_get_kmer_hashes_as_hashset(khmer_KHashtable_Object * me, PyObject * args) { - Hashgraph * hashtable = me->hashtable; + Hashtable * hashtable = me->hashtable; const char * sequence; if (!PyArg_ParseTuple(args, "s", &sequence)) { @@ -3214,106 +2291,60 @@ static PyMethodDef khmer_hashtable_methods[] = { "Retrieve an ordered list of the counts of all k-mers in the string." 
}, - // - // graph/traversal functionality - // - { - "neighbors", - (PyCFunction)hashtable_neighbors, METH_VARARGS, - "Get a list of neighbor nodes for this k-mer.", + "set_use_bigcount", + (PyCFunction)hashtable_set_use_bigcount, METH_VARARGS, + "Count past maximum binsize of hashtable (set to T/F)" }, { - "calc_connected_graph_size", - (PyCFunction)hashtable_calc_connected_graph_size, METH_VARARGS, "" + "get_use_bigcount", + (PyCFunction)hashtable_get_use_bigcount, METH_VARARGS, + "Get value of bigcount flag (T/F)" }, { - "kmer_degree", - (PyCFunction)hashtable_kmer_degree, METH_VARARGS, - "Calculate the number of immediate neighbors this k-mer has in " - "the graph." + "get_min_count", + (PyCFunction)hashtable_get_min_count, METH_VARARGS, + "Get the smallest count of all the k-mers in the string" }, { - "count_kmers_within_radius", - (PyCFunction)hashtable_count_kmers_within_radius, METH_VARARGS, - "Calculate the number of neighbors with given radius in the graph." + "get_max_count", + (PyCFunction)hashtable_get_max_count, METH_VARARGS, + "Get the largest count of all the k-mers in the string" }, - { - "find_high_degree_nodes", - (PyCFunction)hashtable_find_high_degree_nodes, METH_VARARGS, - "Examine the given sequence for degree > 2 nodes and add to " - "list; used in graph contraction.", + "trim_on_abundance", + (PyCFunction)hashtable_trim_on_abundance, METH_VARARGS, + "Trim string at first k-mer below the given abundance" }, { - "traverse_linear_path", - (PyCFunction)hashtable_traverse_linear_path, METH_VARARGS, - "Traverse the path through the graph starting with the given " - "k-mer and avoiding high-degree nodes, finding (and returning) " - "traversed k-mers and any encountered high-degree nodes.", + "trim_below_abundance", + (PyCFunction)hashtable_trim_below_abundance, METH_VARARGS, + "Trim string at first k-mer above the given abundance" }, { - "assemble_linear_path", - (PyCFunction)hashtable_assemble_linear_path, METH_VARARGS, - "Assemble a purely 
linear path starting with the given " - "k-mer, returning traversed k-mers and any encountered high-degree " - "nodes.", + "find_spectral_error_positions", + (PyCFunction)hashtable_find_spectral_error_positions, METH_VARARGS, + "Identify positions of low-abundance k-mers" + }, + { + "abundance_distribution", + (PyCFunction)hashtable_abundance_distribution, METH_VARARGS, + "Calculate the k-mer abundance distribution of the given file" }, - - // - // tagging / sparse graph functionality - // - - { "consume_and_tag", (PyCFunction)hashtable_consume_and_tag, METH_VARARGS, "Consume a sequence and tag it" }, - { "get_tags_and_positions", (PyCFunction)hashtable_get_tags_and_positions, METH_VARARGS, "Retrieve tags and their positions in a sequence." }, - { "find_all_tags_list", (PyCFunction)hashtable_find_all_tags_list, METH_VARARGS, "Find all tags within range of the given k-mer, return as list" }, - { "consume_fasta_and_tag", (PyCFunction)hashtable_consume_fasta_and_tag, METH_VARARGS, "Count all k-mers in a given file" }, - { "get_median_count", (PyCFunction)hashtable_get_median_count, METH_VARARGS, "Get the median, average, and stddev of the k-mer counts in the string" }, - { "median_at_least", (PyCFunction)hashtable_median_at_least, METH_VARARGS, "Return true if the median is at least the given cutoff" }, - { "extract_unique_paths", (PyCFunction)hashtable_extract_unique_paths, METH_VARARGS, "" }, - { "print_tagset", (PyCFunction)hashtable_print_tagset, METH_VARARGS, "" }, - { "add_tag", (PyCFunction)hashtable_add_tag, METH_VARARGS, "" }, - { "get_tagset", (PyCFunction)hashtable_get_tagset, METH_VARARGS, "" }, - { "load_tagset", (PyCFunction)hashtable_load_tagset, METH_VARARGS, "" }, - { "save_tagset", (PyCFunction)hashtable_save_tagset, METH_VARARGS, "" }, - { "n_tags", (PyCFunction)hashtable_n_tags, METH_VARARGS, "" }, - { "divide_tags_into_subsets", (PyCFunction)hashtable_divide_tags_into_subsets, METH_VARARGS, "" }, - { "_get_tag_density", 
(PyCFunction)hashtable__get_tag_density, METH_VARARGS, "" }, - { "_set_tag_density", (PyCFunction)hashtable__set_tag_density, METH_VARARGS, "" }, - - // partitioning - { "do_subset_partition", (PyCFunction)hashtable_do_subset_partition, METH_VARARGS, "" }, - { "find_all_tags", (PyCFunction)hashtable_find_all_tags, METH_VARARGS, "" }, - { "assign_partition_id", (PyCFunction)hashtable_assign_partition_id, METH_VARARGS, "" }, - { "output_partitions", (PyCFunction)hashtable_output_partitions, METH_VARARGS, "" }, - { "load_partitionmap", (PyCFunction)hashtable_load_partitionmap, METH_VARARGS, "" }, - { "save_partitionmap", (PyCFunction)hashtable_save_partitionmap, METH_VARARGS, "" }, - { "_validate_partitionmap", (PyCFunction)hashtable__validate_partitionmap, METH_VARARGS, "" }, { - "consume_fasta_and_tag_with_reads_parser", (PyCFunction)hashtable_consume_fasta_and_tag_with_reads_parser, - METH_VARARGS, "Count all k-mers using a given reads parser" + "abundance_distribution_with_reads_parser", + (PyCFunction)hashtable_abundance_distribution_with_reads_parser, + METH_VARARGS, + "Calculate the k-mer abundance distribution for a reads parser handle" + }, + { "get_median_count", + (PyCFunction)hashtable_get_median_count, METH_VARARGS, + "Get the median, average, and stddev of the k-mer counts in the string" + }, + { "median_at_least", + (PyCFunction)hashtable_median_at_least, METH_VARARGS, + "Return true if the median is at least the given cutoff" }, - { "consume_partitioned_fasta", (PyCFunction)hashtable_consume_partitioned_fasta, METH_VARARGS, "Count all k-mers in a given file" }, - { "merge_subset", (PyCFunction)hashtable_merge_subset, METH_VARARGS, "" }, - { "merge_subset_from_disk", (PyCFunction)hashtable_merge_from_disk, METH_VARARGS, "" }, - { "count_partitions", (PyCFunction)hashtable_count_partitions, METH_VARARGS, "" }, - { "subset_count_partitions", (PyCFunction)hashtable_subset_count_partitions, METH_VARARGS, "" }, - { "subset_partition_size_distribution", 
(PyCFunction)hashtable_subset_partition_size_distribution, METH_VARARGS, "" }, - { "save_subset_partitionmap", (PyCFunction)hashtable_save_subset_partitionmap, METH_VARARGS }, - { "load_subset_partitionmap", (PyCFunction)hashtable_load_subset_partitionmap, METH_VARARGS }, - { "_validate_subset_partitionmap", (PyCFunction)hashtable__validate_subset_partitionmap, METH_VARARGS, "" }, - { "set_partition_id", (PyCFunction)hashtable_set_partition_id, METH_VARARGS, "" }, - { "join_partitions", (PyCFunction)hashtable_join_partitions, METH_VARARGS, "" }, - { "get_partition_id", (PyCFunction)hashtable_get_partition_id, METH_VARARGS, "" }, - { "repartition_largest_partition", (PyCFunction)hashtable_repartition_largest_partition, METH_VARARGS, "" }, - - // stop tags - { "load_stop_tags", (PyCFunction)hashtable_load_stop_tags, METH_VARARGS, "" }, - { "save_stop_tags", (PyCFunction)hashtable_save_stop_tags, METH_VARARGS, "" }, - { "print_stop_tags", (PyCFunction)hashtable_print_stop_tags, METH_VARARGS, "" }, - { "trim_on_stoptags", (PyCFunction)hashtable_trim_on_stoptags, METH_VARARGS, "" }, - { "add_stop_tag", (PyCFunction)hashtable_add_stop_tag, METH_VARARGS, "" }, - { "get_stop_tags", (PyCFunction)hashtable_get_stop_tags, METH_VARARGS, "" }, {NULL, NULL, 0, NULL} /* sentinel */ }; @@ -3329,161 +2360,65 @@ CPYCHECKER_TYPE_OBJECT_FOR_TYPEDEF("khmer_KHashtable_Object") 0, /*tp_getattr*/ 0, /*tp_setattr*/ 0, /*tp_compare*/ - 0, /*tp_repr*/ - 0, /*tp_as_number*/ - 0, /*tp_as_sequence*/ - 0, /*tp_as_mapping*/ - 0, /*tp_hash */ - 0, /*tp_call*/ - 0, /*tp_str*/ - 0, /*tp_getattro*/ - 0, /*tp_setattro*/ - 0, /*tp_as_buffer*/ - Py_TPFLAGS_DEFAULT, /*tp_flags*/ - "base hashtable object", /* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - 0, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ - 0, /* tp_iter */ - 0, /* tp_iternext */ - khmer_hashtable_methods, /* tp_methods */ - 0, /* tp_members */ - 0, /* tp_getset */ - 0, /* tp_base */ - 0, /* tp_dict */ - 0, /* tp_descr_get */ - 
0, /* tp_descr_set */ - 0, /* tp_dictoffset */ - 0, /* tp_init */ - 0, /* tp_alloc */ - 0, /* tp_new */ -}; - -// -// KCountingHash object -// - -typedef struct { - khmer_KHashtable_Object khashtable; - CountingHash * counting; -} khmer_KCountingHash_Object; - -typedef struct { - PyObject_HEAD - ReadAligner * aligner; -} khmer_ReadAligner_Object; - -static void khmer_counting_dealloc(khmer_KCountingHash_Object * obj); - -static -PyObject * -count_trim_on_abundance(khmer_KCountingHash_Object * me, PyObject * args) -{ - CountingHash * counting = me->counting; - - const char * seq = NULL; - unsigned int min_count_i = 0; - - if (!PyArg_ParseTuple(args, "sI", &seq, &min_count_i)) { - return NULL; - } - - unsigned long trim_at; - Py_BEGIN_ALLOW_THREADS - - BoundedCounterType min_count = min_count_i; - - trim_at = counting->trim_on_abundance(seq, min_count); - - Py_END_ALLOW_THREADS; - - PyObject * trim_seq = PyUnicode_FromStringAndSize(seq, trim_at); - if (trim_seq == NULL) { - return NULL; - } - PyObject * ret = Py_BuildValue("Ok", trim_seq, trim_at); - Py_DECREF(trim_seq); - - return ret; -} - -static -PyObject * -count_trim_below_abundance(khmer_KCountingHash_Object * me, PyObject * args) -{ - CountingHash * counting = me->counting; - - const char * seq = NULL; - BoundedCounterType max_count_i = 0; - - if (!PyArg_ParseTuple(args, "sH", &seq, &max_count_i)) { - return NULL; - } - - unsigned long trim_at; - Py_BEGIN_ALLOW_THREADS - - BoundedCounterType max_count = max_count_i; - - trim_at = counting->trim_below_abundance(seq, max_count); - - Py_END_ALLOW_THREADS; - - PyObject * trim_seq = PyUnicode_FromStringAndSize(seq, trim_at); - if (trim_seq == NULL) { - return NULL; - } - PyObject * ret = Py_BuildValue("Ok", trim_seq, trim_at); - Py_DECREF(trim_seq); - - return ret; -} - -static -PyObject * -count_find_spectral_error_positions(khmer_KCountingHash_Object * me, - PyObject * args) -{ - CountingHash * counting = me->counting; - - const char * seq = NULL; - 
BoundedCounterType max_count = 0; // unsigned short int - - if (!PyArg_ParseTuple(args, "sH", &seq, &max_count)) { - return NULL; - } + 0, /*tp_repr*/ + 0, /*tp_as_number*/ + 0, /*tp_as_sequence*/ + 0, /*tp_as_mapping*/ + 0, /*tp_hash */ + 0, /*tp_call*/ + 0, /*tp_str*/ + 0, /*tp_getattro*/ + 0, /*tp_setattro*/ + 0, /*tp_as_buffer*/ + Py_TPFLAGS_DEFAULT, /*tp_flags*/ + "base hashtable object", /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + khmer_hashtable_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + 0, /* tp_new */ +}; - std::vector posns; +#include "_cpy_nodetable.hh" +#include "_cpy_counttable.hh" +#include "_cpy_hashgraph.hh" - try { - posns = counting->find_spectral_error_positions(seq, max_count); - } catch (khmer_exception &e) { - PyErr_SetString(PyExc_ValueError, e.what()); - return NULL; - } +// +// KCountgraph object +// - Py_ssize_t posns_size = posns.size(); +typedef struct { + khmer_KHashgraph_Object khashgraph; + Countgraph * countgraph; +} khmer_KCountgraph_Object; - PyObject * x = PyList_New(posns_size); - if (x == NULL) { - return NULL; - } - for (Py_ssize_t i = 0; i < posns_size; i++) { - PyList_SET_ITEM(x, i, PyLong_FromLong(posns[i])); - } +typedef struct { + PyObject_HEAD + ReadAligner * aligner; +} khmer_ReadAligner_Object; - return x; -} +static void khmer_countgraph_dealloc(khmer_KCountgraph_Object * obj); static PyObject * -count_get_raw_tables(khmer_KCountingHash_Object * self, PyObject * args) +count_get_raw_tables(khmer_KCountgraph_Object * self, PyObject * args) { - CountingHash * counting = self->counting; + Countgraph * countgraph = self->countgraph; - Byte ** table_ptrs = counting->get_raw_tables(); - std::vector sizes = counting->get_tablesizes(); + 
khmer::Byte ** table_ptrs = countgraph->get_raw_tables(); + std::vector sizes = countgraph->get_tablesizes(); PyObject * raw_tables = PyList_New(sizes.size()); for (unsigned int i=0; icounting; - - PyObject * x; - if (!PyArg_ParseTuple(args, "O", &x)) { - return NULL; - } - int setme = PyObject_IsTrue(x); - if (setme < 0) { - return NULL; - } - counting->set_use_bigcount((bool)setme); - - Py_RETURN_NONE; -} - -static -PyObject * -count_get_use_bigcount(khmer_KCountingHash_Object * me, PyObject * args) -{ - CountingHash * counting = me->counting; - - if (!PyArg_ParseTuple(args, "")) { - return NULL; - } - - bool val = counting->get_use_bigcount(); - - return PyBool_FromLong((int)val); -} - -static -PyObject * -count_get_min_count(khmer_KCountingHash_Object * me, PyObject * args) -{ - CountingHash * counting = me->counting; - - const char * long_str; - - if (!PyArg_ParseTuple(args, "s", &long_str)) { - return NULL; - } - - if (strlen(long_str) < counting->ksize()) { - PyErr_SetString(PyExc_ValueError, - "string length must >= the hashtable k-mer size"); - return NULL; - } - - BoundedCounterType c = counting->get_min_count(long_str); - unsigned int N = c; - - return PyLong_FromLong(N); -} - -static -PyObject * -count_get_max_count(khmer_KCountingHash_Object * me, PyObject * args) -{ - CountingHash * counting = me->counting; - - const char * long_str; - - if (!PyArg_ParseTuple(args, "s", &long_str)) { - return NULL; - } - - if (strlen(long_str) < counting->ksize()) { - PyErr_SetString(PyExc_ValueError, - "string length must >= the hashtable k-mer size"); - return NULL; - } - - BoundedCounterType c = counting->get_max_count(long_str); - unsigned int N = c; - - return PyLong_FromLong(N); -} - -static -PyObject * -count_abundance_distribution_with_reads_parser(khmer_KCountingHash_Object * me, - PyObject * args) -{ - CountingHash * counting = me->counting; - - khmer :: python :: khmer_ReadParser_Object * rparser_obj = NULL; - khmer_KHashbits_Object *tracking_obj = NULL; - 
- if (!PyArg_ParseTuple(args, "O!O!", &python::khmer_ReadParser_Type, - &rparser_obj, &khmer_KNodegraph_Type, &tracking_obj)) { - return NULL; - } - - read_parsers::IParser *rparser = rparser_obj->parser; - Hashbits *hashbits = tracking_obj->hashbits; - uint64_t *dist = NULL; - const char *value_exception = NULL; - const char *file_exception = NULL; - std::string exc_string; - - Py_BEGIN_ALLOW_THREADS - try { - dist = counting->abundance_distribution(rparser, hashbits); - } catch (khmer_file_exception &exc) { - exc_string = exc.what(); - file_exception = exc_string.c_str(); - } catch (khmer_value_exception &exc) { - exc_string = exc.what(); - value_exception = exc_string.c_str(); - } - Py_END_ALLOW_THREADS - - if (file_exception != NULL) { - PyErr_SetString(PyExc_OSError, file_exception); - return NULL; - } - if (value_exception != NULL) { - PyErr_SetString(PyExc_ValueError, value_exception); - return NULL; - } - - PyObject * x = PyList_New(MAX_BIGCOUNT + 1); - if (x == NULL) { - delete[] dist; - return NULL; - } - for (int i = 0; i < MAX_BIGCOUNT + 1; i++) { - PyList_SET_ITEM(x, i, PyLong_FromUnsignedLongLong(dist[i])); - } - - delete[] dist; - return x; -} - -static -PyObject * -count_abundance_distribution(khmer_KCountingHash_Object * me, PyObject * args) -{ - CountingHash * counting = me->counting; - - const char * filename = NULL; - khmer_KHashbits_Object * tracking_obj = NULL; - if (!PyArg_ParseTuple(args, "sO!", &filename, &khmer_KNodegraph_Type, - &tracking_obj)) { - return NULL; - } - - Hashbits *hashbits = tracking_obj->hashbits; - uint64_t *dist = NULL; - const char *value_exception = NULL; - const char *file_exception = NULL; - std::string exc_string; - - Py_BEGIN_ALLOW_THREADS - try { - dist = counting->abundance_distribution(filename, hashbits); - } catch (khmer_file_exception &exc) { - exc_string = exc.what(); - file_exception = exc_string.c_str(); - } catch (khmer_value_exception &exc) { - exc_string = exc.what(); - value_exception = 
exc_string.c_str(); - } - Py_END_ALLOW_THREADS - - if (file_exception != NULL) { - PyErr_SetString(PyExc_OSError, file_exception); - if (dist != NULL) { - delete []dist; - } - return NULL; - } - if (value_exception != NULL) { - PyErr_SetString(PyExc_ValueError, value_exception); - if (dist != NULL) { - delete []dist; - } - return NULL; - } - - PyObject * x = PyList_New(MAX_BIGCOUNT + 1); - if (x == NULL) { - if (dist != NULL) { - delete []dist; - } - return NULL; - } - for (int i = 0; i < MAX_BIGCOUNT + 1; i++) { - PyList_SET_ITEM(x, i, PyLong_FromUnsignedLongLong(dist[i])); - } - - if (dist != NULL) { - delete []dist; - } - - return x; -} - -static -PyObject * -count_do_subset_partition_with_abundance(khmer_KCountingHash_Object * me, +count_do_subset_partition_with_abundance(khmer_KCountgraph_Object * me, PyObject * args) { - CountingHash * counting = me->counting; + Countgraph * countgraph = me->countgraph; HashIntoType start_kmer = 0, end_kmer = 0; PyObject * break_on_stop_tags_o = NULL; @@ -3737,7 +2470,7 @@ count_do_subset_partition_with_abundance(khmer_KCountingHash_Object * me, SubsetPartition * subset_p = NULL; try { Py_BEGIN_ALLOW_THREADS - subset_p = new SubsetPartition(counting); + subset_p = new SubsetPartition(countgraph); subset_p->do_partition_with_abundance(start_kmer, end_kmer, min_count, max_count, break_on_stop_tags, @@ -3760,35 +2493,27 @@ count_do_subset_partition_with_abundance(khmer_KCountingHash_Object * me, return (PyObject *) subset_obj; } -static PyMethodDef khmer_counting_methods[] = { - { "set_use_bigcount", (PyCFunction)count_set_use_bigcount, METH_VARARGS, "" }, - { "get_use_bigcount", (PyCFunction)count_get_use_bigcount, METH_VARARGS, "" }, - { "get_min_count", (PyCFunction)count_get_min_count, METH_VARARGS, "Get the smallest count of all the k-mers in the string" }, - { "get_max_count", (PyCFunction)count_get_max_count, METH_VARARGS, "Get the largest count of all the k-mers in the string" }, - { "trim_on_abundance", 
(PyCFunction)count_trim_on_abundance, METH_VARARGS, "Trim on >= abundance" }, - { "trim_below_abundance", (PyCFunction)count_trim_below_abundance, METH_VARARGS, "Trim on >= abundance" }, - { "find_spectral_error_positions", (PyCFunction)count_find_spectral_error_positions, METH_VARARGS, "Identify positions of low-abundance k-mers" }, - { "abundance_distribution", (PyCFunction)count_abundance_distribution, METH_VARARGS, "" }, - { "abundance_distribution_with_reads_parser", (PyCFunction)count_abundance_distribution_with_reads_parser, METH_VARARGS, "" }, +static PyMethodDef khmer_countgraph_methods[] = { { - "get_raw_tables", (PyCFunction)count_get_raw_tables, - METH_VARARGS, "Get a list of the raw tables as memoryview objects" + "get_raw_tables", + (PyCFunction)count_get_raw_tables, METH_VARARGS, + "Get a list of the raw storage tables as memoryview objects." }, { "do_subset_partition_with_abundance", (PyCFunction)count_do_subset_partition_with_abundance, METH_VARARGS, "" }, {NULL, NULL, 0, NULL} /* sentinel */ }; -static PyObject* _new_counting_hash(PyTypeObject * type, PyObject * args, - PyObject * kwds); +static PyObject* khmer_countgraph_new(PyTypeObject * type, PyObject * args, + PyObject * kwds); static PyTypeObject khmer_KCountgraph_Type -CPYCHECKER_TYPE_OBJECT_FOR_TYPEDEF("khmer_KCountingHash_Object") +CPYCHECKER_TYPE_OBJECT_FOR_TYPEDEF("khmer_KCountgraph_Object") = { PyVarObject_HEAD_INIT(NULL, 0) /* init & ob_size */ "_khmer.Countgraph", /*tp_name*/ - sizeof(khmer_KCountingHash_Object), /*tp_basicsize*/ + sizeof(khmer_KCountgraph_Object), /*tp_basicsize*/ 0, /*tp_itemsize*/ - (destructor)khmer_counting_dealloc, /*tp_dealloc*/ + (destructor)khmer_countgraph_dealloc, /*tp_dealloc*/ 0, /*tp_print*/ 0, /*tp_getattr*/ 0, /*tp_setattr*/ @@ -3804,14 +2529,14 @@ CPYCHECKER_TYPE_OBJECT_FOR_TYPEDEF("khmer_KCountingHash_Object") 0, /*tp_setattro*/ 0, /*tp_as_buffer*/ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/ - "counting hash object", /* tp_doc */ + 
"countgraph hash object", /* tp_doc */ 0, /* tp_traverse */ 0, /* tp_clear */ 0, /* tp_richcompare */ 0, /* tp_weaklistoffset */ 0, /* tp_iter */ 0, /* tp_iternext */ - khmer_counting_methods, /* tp_methods */ + khmer_countgraph_methods, /* tp_methods */ 0, /* tp_members */ 0, /* tp_getset */ 0, /* tp_base */ @@ -3821,19 +2546,19 @@ CPYCHECKER_TYPE_OBJECT_FOR_TYPEDEF("khmer_KCountingHash_Object") 0, /* tp_dictoffset */ 0, /* tp_init */ 0, /* tp_alloc */ - _new_counting_hash, /* tp_new */ + khmer_countgraph_new, /* tp_new */ }; // -// _new_counting_hash +// khmer_countgraph_new // -static PyObject* _new_counting_hash(PyTypeObject * type, PyObject * args, - PyObject * kwds) +static PyObject* khmer_countgraph_new(PyTypeObject * type, PyObject * args, + PyObject * kwds) { - khmer_KCountingHash_Object * self; + khmer_KCountgraph_Object * self; - self = (khmer_KCountingHash_Object *)type->tp_alloc(type, 0); + self = (khmer_KCountgraph_Object *)type->tp_alloc(type, 0); if (self != NULL) { WordLength k = 0; @@ -3845,35 +2570,20 @@ static PyObject* _new_counting_hash(PyTypeObject * type, PyObject * args, } std::vector sizes; - Py_ssize_t sizes_list_o_length = PyList_GET_SIZE(sizes_list_o); - if (sizes_list_o_length == -1) { + if (!convert_Pytablesizes_to_vector(sizes_list_o, sizes)) { Py_DECREF(self); - PyErr_SetString(PyExc_ValueError, "error with hashtable primes!"); return NULL; } - for (Py_ssize_t i = 0; i < sizes_list_o_length; i++) { - PyObject * size_o = PyList_GET_ITEM(sizes_list_o, i); - if (PyLong_Check(size_o)) { - sizes.push_back(PyLong_AsUnsignedLongLong(size_o)); - } else if (PyInt_Check(size_o)) { - sizes.push_back(PyInt_AsLong(size_o)); - } else if (PyFloat_Check(size_o)) { - sizes.push_back(PyFloat_AS_DOUBLE(size_o)); - } else { - Py_DECREF(self); - PyErr_SetString(PyExc_TypeError, - "2nd argument must be a list of ints, longs, or floats"); - return NULL; - } - } try { - self->counting = new CountingHash(k, sizes); + self->countgraph = new Countgraph(k, 
sizes); } catch (std::bad_alloc &e) { Py_DECREF(self); return PyErr_NoMemory(); } - self->khashtable.hashtable = dynamic_cast(self->counting); + self->khashgraph.khashtable.hashtable = + dynamic_cast(self->countgraph); + self->khashgraph.hashgraph = dynamic_cast(self->countgraph); } return (PyObject *) self; @@ -3881,20 +2591,20 @@ static PyObject* _new_counting_hash(PyTypeObject * type, PyObject * args, static PyObject * -hashbits_update(khmer_KHashbits_Object * me, PyObject * args) +nodegraph_update(khmer_KNodegraph_Object * me, PyObject * args) { - Hashbits * hashbits = me->hashbits; - Hashbits * other; - khmer_KHashbits_Object * other_o; + Nodegraph * nodegraph = me->nodegraph; + Nodegraph * other; + khmer_KNodegraph_Object * other_o; if (!PyArg_ParseTuple(args, "O!", &khmer_KNodegraph_Type, &other_o)) { return NULL; } - other = other_o->hashbits; + other = other_o->nodegraph; try { - hashbits->update_from(*other); + nodegraph->update_from(*other); } catch (khmer_exception &e) { PyErr_SetString(PyExc_ValueError, e.what()); return NULL; @@ -3905,12 +2615,12 @@ hashbits_update(khmer_KHashbits_Object * me, PyObject * args) static PyObject * -hashbits_get_raw_tables(khmer_KHashbits_Object * self, PyObject * args) +nodegraph_get_raw_tables(khmer_KNodegraph_Object * self, PyObject * args) { - Hashbits * counting = self->hashbits; + Nodegraph * countgraph = self->nodegraph; - Byte ** table_ptrs = counting->get_raw_tables(); - std::vector sizes = counting->get_tablesizes(); + khmer::Byte ** table_ptrs = countgraph->get_raw_tables(); + std::vector sizes = countgraph->get_tablesizes(); PyObject * raw_tables = PyList_New(sizes.size()); for (unsigned int i=0; itp_alloc(type, 0); + khmer_KNodegraph_Object * self; + self = (khmer_KNodegraph_Object *)type->tp_alloc(type, 0); if (self != NULL) { WordLength k = 0; @@ -3964,30 +2674,20 @@ static PyObject* khmer_hashbits_new(PyTypeObject * type, PyObject * args, } std::vector sizes; - Py_ssize_t sizes_list_o_length = 
PyList_GET_SIZE(sizes_list_o); - for (Py_ssize_t i = 0; i < sizes_list_o_length; i++) { - PyObject * size_o = PyList_GET_ITEM(sizes_list_o, i); - if (PyLong_Check(size_o)) { - sizes.push_back(PyLong_AsUnsignedLongLong(size_o)); - } else if (PyInt_Check(size_o)) { - sizes.push_back(PyInt_AsLong(size_o)); - } else if (PyFloat_Check(size_o)) { - sizes.push_back(PyFloat_AS_DOUBLE(size_o)); - } else { - Py_DECREF(self); - PyErr_SetString(PyExc_TypeError, - "2nd argument must be a list of ints, longs, or floats"); - return NULL; - } + if (!convert_Pytablesizes_to_vector(sizes_list_o, sizes)) { + Py_DECREF(self); + return NULL; } try { - self->hashbits = new Hashbits(k, sizes); + self->nodegraph = new Nodegraph(k, sizes); } catch (std::bad_alloc &e) { Py_DECREF(self); return PyErr_NoMemory(); } - self->khashtable.hashtable = self->hashbits; + self->khashgraph.khashtable.hashtable = + dynamic_cast(self->nodegraph); + self->khashgraph.hashgraph = dynamic_cast(self->nodegraph); } return (PyObject *) self; } @@ -4118,14 +2818,14 @@ subset_partition_average_coverages(khmer_KSubsetPartition_Object * me, { SubsetPartition * subset_p = me->subset; - khmer_KCountingHash_Object * counting_o; + khmer_KCountgraph_Object * countgraph_o; - if (!PyArg_ParseTuple(args, "O!", &khmer_KCountgraph_Type, &counting_o)) { + if (!PyArg_ParseTuple(args, "O!", &khmer_KCountgraph_Type, &countgraph_o)) { return NULL; } PartitionCountMap cm; - subset_p->partition_average_coverages(cm, counting_o -> counting); + subset_p->partition_average_coverages(cm, countgraph_o -> countgraph); unsigned int i; PartitionCountMap::iterator mi; @@ -4203,20 +2903,20 @@ static PyObject * khmer_graphlabels_new(PyTypeObject *type, PyObject *args, self = (khmer_KGraphLabels_Object*)type->tp_alloc(type, 0); if (self != NULL) { - PyObject * hashtable_o; - khmer::Hashgraph * hashtable = NULL; // @CTB + PyObject * hashgraph_o; + khmer::Hashgraph * hashgraph = NULL; // @CTB - if (!PyArg_ParseTuple(args, "O", &hashtable_o)) { + 
if (!PyArg_ParseTuple(args, "O", &hashgraph_o)) { Py_DECREF(self); return NULL; } - if (PyObject_TypeCheck(hashtable_o, &khmer_KNodegraph_Type)) { - khmer_KHashbits_Object * kho = (khmer_KHashbits_Object *) hashtable_o; - hashtable = kho->hashbits; - } else if (PyObject_TypeCheck(hashtable_o, &khmer_KCountgraph_Type)) { - khmer_KCountingHash_Object * cho = (khmer_KCountingHash_Object *) hashtable_o; - hashtable = cho->counting; + if (PyObject_TypeCheck(hashgraph_o, &khmer_KNodegraph_Type)) { + khmer_KNodegraph_Object * kho = (khmer_KNodegraph_Object *) hashgraph_o; + hashgraph = kho->nodegraph; + } else if (PyObject_TypeCheck(hashgraph_o, &khmer_KCountgraph_Type)) { + khmer_KCountgraph_Object * cho = (khmer_KCountgraph_Object *) hashgraph_o; + hashgraph = cho->countgraph; } else { PyErr_SetString(PyExc_ValueError, "graph object must be a NodeGraph or CountGraph"); @@ -4225,7 +2925,7 @@ static PyObject * khmer_graphlabels_new(PyTypeObject *type, PyObject *args, } try { - self->labelhash = new LabelHash(hashtable); + self->labelhash = new LabelHash(hashgraph); } catch (std::bad_alloc &e) { Py_DECREF(self); return PyErr_NoMemory(); @@ -4549,8 +3249,8 @@ labelhash_assemble_labeled_path(khmer_KGraphLabels_Object * me, LabelHash* labelhash = me->labelhash; PyObject * val_o; - khmer_KHashbits_Object * nodegraph_o = NULL; - Hashbits * stop_bf = NULL; + khmer_KNodegraph_Object * nodegraph_o = NULL; + Nodegraph * stop_bf = NULL; if (!PyArg_ParseTuple(args, "O|O!", &val_o, &khmer_KNodegraph_Type, &nodegraph_o)) { @@ -4563,7 +3263,7 @@ labelhash_assemble_labeled_path(khmer_KGraphLabels_Object * me, } if (nodegraph_o) { - stop_bf = nodegraph_o->hashbits; + stop_bf = nodegraph_o->nodegraph; } SimpleLabeledAssembler assembler(labelhash); @@ -4685,18 +3385,18 @@ static PyTypeObject khmer_KGraphLabels_Type = { static PyObject * -hashtable_repartition_largest_partition(khmer_KHashtable_Object * me, +hashgraph_repartition_largest_partition(khmer_KHashgraph_Object * me, PyObject * 
args) { - Hashgraph * hashtable = me->hashtable; - khmer_KCountingHash_Object * counting_o = NULL; + Hashgraph * hashgraph = me->hashgraph; + khmer_KCountgraph_Object * countgraph_o = NULL; PyObject * subset_o = NULL; SubsetPartition * subset_p; unsigned int distance, threshold, frequency; if (!PyArg_ParseTuple(args, "OO!III", &subset_o, - &khmer_KCountgraph_Type, &counting_o, + &khmer_KCountgraph_Type, &countgraph_o, &distance, &threshold, &frequency)) { return NULL; } @@ -4704,15 +3404,15 @@ hashtable_repartition_largest_partition(khmer_KHashtable_Object * me, if (PyObject_TypeCheck(subset_o, &khmer_KSubsetPartition_Type)) { subset_p = ((khmer_KSubsetPartition_Object *) subset_o)->subset; } else { - subset_p = hashtable->partition; + subset_p = hashgraph->partition; } - CountingHash * counting = counting_o->counting; + Countgraph * countgraph = countgraph_o->countgraph; unsigned long next_largest; try { next_largest = subset_p->repartition_largest_partition(distance, - threshold, frequency, *counting); + threshold, frequency, *countgraph); } catch (khmer_exception &e) { PyErr_SetString(PyExc_RuntimeError, e.what()); return NULL; @@ -4876,7 +3576,7 @@ static PyObject* khmer_ReadAligner_new(PyTypeObject *type, PyObject * args, self = (khmer_ReadAligner_Object *)type->tp_alloc(type, 0); if (self != NULL) { - khmer_KCountingHash_Object * ch = NULL; + khmer_KCountgraph_Object * ch = NULL; unsigned short int trusted_cov_cutoff = 2; double bits_theta = 1; double scoring_matrix[] = { 0, 0, 0, 0 }; @@ -4901,7 +3601,7 @@ static PyObject* khmer_ReadAligner_new(PyTypeObject *type, PyObject * args, return NULL; } - self->aligner = new ReadAligner(ch->counting, trusted_cov_cutoff, + self->aligner = new ReadAligner(ch->countgraph, trusted_cov_cutoff, bits_theta, scoring_matrix, transitions); } @@ -4951,30 +3651,30 @@ static PyTypeObject khmer_ReadAlignerType = { }; // -// khmer_counting_dealloc -- clean up a counting hash object. 
+// khmer_countgraph_dealloc -- clean up a countgraph hash object. // -static void khmer_counting_dealloc(khmer_KCountingHash_Object * obj) +static void khmer_countgraph_dealloc(khmer_KCountgraph_Object * obj) { - delete obj->counting; - obj->counting = NULL; + delete obj->countgraph; + obj->countgraph = NULL; Py_TYPE(obj)->tp_free((PyObject*)obj); } // -// khmer_hashbits_dealloc -- clean up a hashbits object. +// khmer_nodegraph_dealloc -- clean up a nodegraph object. // -static void khmer_hashbits_dealloc(khmer_KHashbits_Object * obj) +static void khmer_nodegraph_dealloc(khmer_KNodegraph_Object * obj) { - delete obj->hashbits; - obj->hashbits = NULL; + delete obj->nodegraph; + obj->nodegraph = NULL; Py_TYPE(obj)->tp_free((PyObject*)obj); } // -// khmer_subset_dealloc -- clean up a hashbits object. +// khmer_subset_dealloc -- clean up a subset object. // static void khmer_subset_dealloc(khmer_KSubsetPartition_Object * obj) @@ -5377,20 +4077,20 @@ static PyObject * khmer_linearassembler_new(PyTypeObject *type, PyObject *args, self = (khmer_KLinearAssembler_Object*)type->tp_alloc(type, 0); if (self != NULL) { - PyObject * hashtable_o; - Hashgraph * hashtable = NULL; + PyObject * hashgraph_o; + Hashgraph * hashgraph = NULL; - if (!PyArg_ParseTuple(args, "O", &hashtable_o)) { + if (!PyArg_ParseTuple(args, "O", &hashgraph_o)) { Py_DECREF(self); return NULL; } - if (PyObject_TypeCheck(hashtable_o, &khmer_KNodegraph_Type)) { - khmer_KHashbits_Object * kho = (khmer_KHashbits_Object *) hashtable_o; - hashtable = kho->hashbits; - } else if (PyObject_TypeCheck(hashtable_o, &khmer_KCountgraph_Type)) { - khmer_KCountingHash_Object * cho = (khmer_KCountingHash_Object *) hashtable_o; - hashtable = cho->counting; + if (PyObject_TypeCheck(hashgraph_o, &khmer_KNodegraph_Type)) { + khmer_KNodegraph_Object * kho = (khmer_KNodegraph_Object *) hashgraph_o; + hashgraph = kho->nodegraph; + } else if (PyObject_TypeCheck(hashgraph_o, &khmer_KCountgraph_Type)) { + khmer_KCountgraph_Object * 
cho = (khmer_KCountgraph_Object *) hashgraph_o; + hashgraph = cho->countgraph; } else { PyErr_SetString(PyExc_ValueError, "graph object must be a NodeGraph or CountGraph"); @@ -5399,7 +4099,7 @@ static PyObject * khmer_linearassembler_new(PyTypeObject *type, PyObject *args, } try { - self->assembler = new LinearAssembler(hashtable); + self->assembler = new LinearAssembler(hashgraph); } catch (std::bad_alloc &e) { Py_DECREF(self); return PyErr_NoMemory(); @@ -5419,8 +4119,8 @@ linearassembler_assemble(khmer_KLinearAssembler_Object * me, LinearAssembler * assembler= me->assembler; PyObject * val_o; - khmer_KHashbits_Object * nodegraph_o = NULL; - Hashbits * stop_bf = NULL; + khmer_KNodegraph_Object * nodegraph_o = NULL; + Nodegraph * stop_bf = NULL; const char * dir_str = NULL; char dir = NULL; @@ -5444,7 +4144,7 @@ linearassembler_assemble(khmer_KLinearAssembler_Object * me, } if (nodegraph_o) { - stop_bf = nodegraph_o->hashbits; + stop_bf = nodegraph_o->nodegraph; } std::string contig; @@ -5579,8 +4279,8 @@ simplelabeledassembler_assemble(khmer_KSimpleLabeledAssembler_Object * me, SimpleLabeledAssembler * assembler = me->assembler; PyObject * val_o; - khmer_KHashbits_Object * nodegraph_o = NULL; - Hashbits * stop_bf = NULL; + khmer_KNodegraph_Object * nodegraph_o = NULL; + Nodegraph * stop_bf = NULL; const char *kwnames[] = {"seed_kmer", "stop_filter", NULL}; @@ -5598,7 +4298,7 @@ simplelabeledassembler_assemble(khmer_KSimpleLabeledAssembler_Object * me, } if (nodegraph_o) { - stop_bf = nodegraph_o->hashbits; + stop_bf = nodegraph_o->nodegraph; } std::vector contigs = assembler->assemble(start_kmer, stop_bf); @@ -5691,20 +4391,20 @@ static PyObject * khmer_junctioncountassembler_new(PyTypeObject *type, self = (khmer_KJunctionCountAssembler_Object*)type->tp_alloc(type, 0); if (self != NULL) { - PyObject * hashtable_o; - Hashgraph * hashtable = NULL; + PyObject * hashgraph_o; + Hashgraph * hashgraph = NULL; - if (!PyArg_ParseTuple(args, "O", &hashtable_o)) { + if 
(!PyArg_ParseTuple(args, "O", &hashgraph_o)) { Py_DECREF(self); return NULL; } - if (PyObject_TypeCheck(hashtable_o, &khmer_KNodegraph_Type)) { - khmer_KHashbits_Object * kho = (khmer_KHashbits_Object *) hashtable_o; - hashtable = kho->hashbits; - } else if (PyObject_TypeCheck(hashtable_o, &khmer_KCountgraph_Type)) { - khmer_KCountingHash_Object * cho = (khmer_KCountingHash_Object *) hashtable_o; - hashtable = cho->counting; + if (PyObject_TypeCheck(hashgraph_o, &khmer_KNodegraph_Type)) { + khmer_KNodegraph_Object * kho = (khmer_KNodegraph_Object *) hashgraph_o; + hashgraph = kho->nodegraph; + } else if (PyObject_TypeCheck(hashgraph_o, &khmer_KCountgraph_Type)) { + khmer_KCountgraph_Object * cho = (khmer_KCountgraph_Object *) hashgraph_o; + hashgraph = cho->countgraph; } else { PyErr_SetString(PyExc_ValueError, "graph object must be a NodeGraph or CountGraph"); @@ -5713,7 +4413,7 @@ static PyObject * khmer_junctioncountassembler_new(PyTypeObject *type, } try { - self->assembler = new JunctionCountAssembler(hashtable); + self->assembler = new JunctionCountAssembler(hashgraph); } catch (std::bad_alloc &e) { Py_DECREF(self); return PyErr_NoMemory(); @@ -5733,8 +4433,8 @@ junctioncountassembler_assemble(khmer_KJunctionCountAssembler_Object * me, JunctionCountAssembler * assembler = me->assembler; PyObject * val_o; - khmer_KHashbits_Object * nodegraph_o = NULL; - Hashbits * stop_bf = NULL; + khmer_KNodegraph_Object * nodegraph_o = NULL; + Nodegraph * stop_bf = NULL; const char *kwnames[] = {"seed_kmer", "stop_filter", NULL}; @@ -5751,7 +4451,7 @@ junctioncountassembler_assemble(khmer_KJunctionCountAssembler_Object * me, } if (nodegraph_o) { - stop_bf = nodegraph_o->hashbits; + stop_bf = nodegraph_o->nodegraph; } std::vector contigs = assembler->assemble(start_kmer, stop_bf); @@ -5779,7 +4479,7 @@ junctioncountassembler_consume(khmer_KJunctionCountAssembler_Object * me, if (strlen(long_str) < assembler->_ksize) { PyErr_SetString(PyExc_ValueError, - "string length must >= 
the hashtable k-mer size"); + "string length must >= the hashgraph k-mer size"); return NULL; } @@ -6041,7 +4741,23 @@ MOD_INIT(_khmer) return MOD_ERROR_VAL; } - khmer_KCountgraph_Type.tp_base = &khmer_KHashtable_Type; + khmer_KCounttable_Type.tp_base = &khmer_KHashtable_Type; + if (PyType_Ready(&khmer_KCounttable_Type) < 0) { + return MOD_ERROR_VAL; + } + + khmer_KNodetable_Type.tp_base = &khmer_KHashtable_Type; + if (PyType_Ready(&khmer_KNodetable_Type) < 0) { + return MOD_ERROR_VAL; + } + + khmer_KHashgraph_Type.tp_base = &khmer_KHashtable_Type; + khmer_KHashgraph_Type.tp_methods = khmer_hashgraph_methods; + if (PyType_Ready(&khmer_KHashgraph_Type) < 0) { + return MOD_ERROR_VAL; + } + + khmer_KCountgraph_Type.tp_base = &khmer_KHashgraph_Type; if (PyType_Ready(&khmer_KCountgraph_Type) < 0) { return MOD_ERROR_VAL; } @@ -6055,8 +4771,8 @@ MOD_INIT(_khmer) return MOD_ERROR_VAL; } - khmer_KNodegraph_Type.tp_base = &khmer_KHashtable_Type; - khmer_KNodegraph_Type.tp_methods = khmer_hashbits_methods; + khmer_KNodegraph_Type.tp_base = &khmer_KHashgraph_Type; + khmer_KNodegraph_Type.tp_methods = khmer_nodegraph_methods; if (PyType_Ready(&khmer_KNodegraph_Type) < 0) { return MOD_ERROR_VAL; } @@ -6119,6 +4835,18 @@ MOD_INIT(_khmer) return MOD_ERROR_VAL; } + Py_INCREF(&khmer_KCounttable_Type); + if (PyModule_AddObject( m, "Counttable", + (PyObject *)&khmer_KCounttable_Type ) < 0) { + return MOD_ERROR_VAL; + } + + Py_INCREF(&khmer_KNodetable_Type); + if (PyModule_AddObject( m, "Nodetable", + (PyObject *)&khmer_KNodetable_Type ) < 0) { + return MOD_ERROR_VAL; + } + Py_INCREF(&khmer_KCountgraph_Type); if (PyModule_AddObject( m, "Countgraph", (PyObject *)&khmer_KCountgraph_Type ) < 0) { diff --git a/lib/Makefile b/lib/Makefile index 60ddcad9cc..2dab174380 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -222,9 +222,8 @@ BZIP2_OBJS=$(addprefix $(BZIP2_DIR)/, $(BZIP2_OBJS_BASE)) #### oxli proper below here #### LIBKHMER_OBJS= \ - counting.o \ - hashbits.o \ hashtable.o \ + 
hashgraph.o \ hllcounter.o \ kmer_hash.o \ labelhash.o \ @@ -254,8 +253,6 @@ PRECLEAN_TARGS += libbz2clean endif KHMER_HEADERS= \ - counting.hh \ - hashbits.hh \ hashtable.hh \ khmer_exception.hh \ khmer.hh \ diff --git a/lib/assembler.cc b/lib/assembler.cc index 6f584cb6a5..d1bb07c548 100644 --- a/lib/assembler.cc +++ b/lib/assembler.cc @@ -337,7 +337,7 @@ JunctionCountAssembler::JunctionCountAssembler(Hashgraph * ht) : graph(ht), _ksize(ht->ksize()), traverser(ht), linear_asm(ht) { std::vector table_sizes = graph->get_tablesizes(); - junctions = new CountingHash(_ksize, table_sizes); + junctions = new Countgraph(_ksize, table_sizes); } diff --git a/lib/assembler.hh b/lib/assembler.hh index 467210fe30..5b8086553c 100644 --- a/lib/assembler.hh +++ b/lib/assembler.hh @@ -41,7 +41,7 @@ Contact: khmer-project@idyll.org #include "khmer.hh" #include "kmer_hash.hh" -#include "counting.hh" +#include "hashgraph.hh" #include "kmer_filters.hh" #include "traversal.hh" #include "labelhash.hh" @@ -149,7 +149,7 @@ public: class JunctionCountAssembler { LinearAssembler linear_asm; - CountingHash * junctions; + Countgraph * junctions; Traverser traverser; public: diff --git a/lib/counting.cc b/lib/counting.cc deleted file mode 100644 index fa99b43961..0000000000 --- a/lib/counting.cc +++ /dev/null @@ -1,282 +0,0 @@ -/* -This file is part of khmer, https://github.com/dib-lab/khmer/, and is -Copyright (C) 2010-2015, Michigan State University. -Copyright (C) 2015-2016, The Regents of the University of California. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - - * Neither the name of the Michigan State University nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written - permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-LICENSE (END) - -Contact: khmer-project@idyll.org -*/ - -#include -#include -#include -#include // IWYU pragma: keep - -#include "counting.hh" -#include "hashbits.hh" -#include "hashtable.hh" -#include "khmer_exception.hh" -#include "kmer_hash.hh" -#include "read_parsers.hh" -#include "zlib.h" - -using namespace std; -using namespace khmer; -using namespace khmer:: read_parsers; - -BoundedCounterType CountingHash::get_min_count(const std::string &s) -{ - KmerIterator kmers(s.c_str(), _ksize); - - BoundedCounterType min_count = MAX_KCOUNT; - - while(!kmers.done()) { - HashIntoType kmer = kmers.next(); - - BoundedCounterType count = this->get_count(kmer); - - if (this->get_count(kmer) < min_count) { - min_count = count; - } - } - return min_count; -} - -BoundedCounterType CountingHash::get_max_count(const std::string &s) -{ - KmerIterator kmers(s.c_str(), _ksize); - - BoundedCounterType max_count = 0; - - while(!kmers.done()) { - HashIntoType kmer = kmers.next(); - - BoundedCounterType count = this->get_count(kmer); - - if (count > max_count) { - max_count = count; - } - } - return max_count; -} - -uint64_t * -CountingHash::abundance_distribution( - read_parsers::IParser * parser, - Hashbits * tracking) -{ - uint64_t * dist = new uint64_t[MAX_BIGCOUNT + 1]; - uint64_t i; - - for (i = 0; i <= MAX_BIGCOUNT; i++) { - dist[i] = 0; - } - - Read read; - - string name; - string seq; - - // if not, could lead to overflow. 
- if (sizeof(BoundedCounterType) != 2) { - delete[] dist; - throw khmer_exception(); - } - - while(!parser->is_complete()) { - try { - read = parser->get_next_read(); - } catch (NoMoreReadsAvailable &exc) { - break; - } - seq = read.sequence; - - if (check_and_normalize_read(seq)) { - KmerIterator kmers(seq.c_str(), _ksize); - - while(!kmers.done()) { - HashIntoType kmer = kmers.next(); - - if (!tracking->get_count(kmer)) { - tracking->count(kmer); - - BoundedCounterType n = get_count(kmer); - dist[n]++; - } - } - - name.clear(); - seq.clear(); - } - } - return dist; -} - - -uint64_t * CountingHash::abundance_distribution( - std::string filename, - Hashbits * tracking) -{ - IParser* parser = IParser::get_parser(filename.c_str()); - - uint64_t * distribution = abundance_distribution(parser, tracking); - delete parser; - return distribution; -} - -unsigned long CountingHash::trim_on_abundance( - std::string seq, - BoundedCounterType min_abund) -const -{ - if (!check_and_normalize_read(seq)) { - return 0; - } - - KmerIterator kmers(seq.c_str(), _ksize); - - HashIntoType kmer; - - if (kmers.done()) { - return 0; - } - kmer = kmers.next(); - - if (kmers.done() || get_count(kmer) < min_abund) { - return 0; - } - - unsigned long i = _ksize; - while (!kmers.done()) { - kmer = kmers.next(); - - if (get_count(kmer) < min_abund) { - return i; - } - i++; - } - - return seq.length(); -} - -unsigned long CountingHash::trim_below_abundance( - std::string seq, - BoundedCounterType max_abund) -const -{ - if (!check_and_normalize_read(seq)) { - return 0; - } - - KmerIterator kmers(seq.c_str(), _ksize); - - HashIntoType kmer; - - if (kmers.done()) { - return 0; - } - kmer = kmers.next(); - - if (kmers.done() || get_count(kmer) > max_abund) { - return 0; - } - - unsigned long i = _ksize; - while (!kmers.done()) { - kmer = kmers.next(); - - if (get_count(kmer) > max_abund) { - return i; - } - i++; - } - - return seq.length(); -} - -std::vector 
CountingHash::find_spectral_error_positions( - std::string seq, - BoundedCounterType max_abund) -const -{ - std::vector posns; - if (!check_and_normalize_read(seq)) { - throw khmer_exception("invalid read"); - } - - KmerIterator kmers(seq.c_str(), _ksize); - - HashIntoType kmer = kmers.next(); - if (kmers.done()) { - return posns; - } - - // find the first trusted k-mer - while (!kmers.done()) { - if (get_count(kmer) > max_abund) { - break; - } - kmer = kmers.next(); - } - - if (kmers.done()) { - return posns; - } - - // did we bypass some erroneous k-mers? call the last one. - if (kmers.get_start_pos() > 0) { - // if we are well past the first k, forget the whole thing (!? @CTB) - if (kmers.get_start_pos() >= _ksize && 0) { - return posns; - } - posns.push_back(kmers.get_start_pos() - 1); - } - - while (!kmers.done()) { - kmer = kmers.next(); - if (get_count(kmer) <= max_abund) { // error! - posns.push_back(kmers.get_end_pos() - 1); - - // find next good - while (!kmers.done()) { - kmer = kmers.next(); - if (get_count(kmer) > max_abund) { // a good stretch again. - break; - } - } - } - } - - return posns; -} - -/* vim: set ft=cpp ts=8 sts=4 sw=4 et tw=79 */ diff --git a/lib/counting.hh b/lib/counting.hh deleted file mode 100644 index 9964aa8892..0000000000 --- a/lib/counting.hh +++ /dev/null @@ -1,84 +0,0 @@ -/* -This file is part of khmer, https://github.com/dib-lab/khmer/, and is -Copyright (C) 2010-2015, Michigan State University. -Copyright (C) 2015-2016, The Regents of the University of California. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - - * Neither the name of the Michigan State University nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written - permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-LICENSE (END) - -Contact: khmer-project@idyll.org -*/ -#ifndef COUNTING_HH -#define COUNTING_HH - -#include -#include -#include -#include -#include -#include -#include - -#include "hashtable.hh" -#include "khmer.hh" -#include "kmer_hash.hh" - -namespace khmer -{ -class Hashbits; - -using read_parsers::IParser; - -class CountingHash : public khmer::Hashgraph -{ -public: - explicit CountingHash(WordLength ksize, std::vector sizes) - : Hashgraph(ksize, new ByteStorage(sizes)) { } ; - - BoundedCounterType get_min_count(const std::string &s); - - BoundedCounterType get_max_count(const std::string &s); - - uint64_t * abundance_distribution(read_parsers::IParser * parser, - Hashbits * tracking); - uint64_t * abundance_distribution(std::string filename, - Hashbits * tracking); - - unsigned long trim_on_abundance(std::string seq, - BoundedCounterType min_abund) const; - unsigned long trim_below_abundance(std::string seq, - BoundedCounterType max_abund) const; - std::vector find_spectral_error_positions(std::string seq, - BoundedCounterType min_abund) const; -}; -} -#endif // COUNTING_HH - -// vim: set sts=2 sw=2: diff --git a/lib/hashbits.cc b/lib/hashbits.cc deleted file mode 100644 index 6bfcd1565c..0000000000 --- a/lib/hashbits.cc +++ /dev/null @@ -1,67 +0,0 @@ -/* -This file is part of khmer, https://github.com/dib-lab/khmer/, and is -Copyright (C) 2010-2015, Michigan State University. -Copyright (C) 2015-2016, The Regents of the University of California. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. 
- - * Neither the name of the Michigan State University nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written - permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -LICENSE (END) - -Contact: khmer-project@idyll.org -*/ -#include -#include // IWYU pragma: keep - -#include "hashbits.hh" -#include "hashtable.hh" -#include "khmer_exception.hh" -#include "read_parsers.hh" - -using namespace std; -using namespace khmer; -using namespace khmer:: read_parsers; - -void Hashbits::update_from(const Hashbits &otherBASE) -{ - if (_ksize != otherBASE._ksize) { - throw khmer_exception("both nodegraphs must have same k size"); - } - BitStorage * myself = dynamic_cast(this->store); - const BitStorage * other; - other = dynamic_cast(otherBASE.store); - - // if dynamic_cast worked, then the pointers will be not null. 
- if (myself && other) { - myself->update_from(*other); - } else { - throw khmer_exception("update_from failed with incompatible objects"); - } -} - -// vim: set sts=2 sw=2: diff --git a/lib/hashbits.hh b/lib/hashbits.hh deleted file mode 100644 index 34da52efb3..0000000000 --- a/lib/hashbits.hh +++ /dev/null @@ -1,70 +0,0 @@ -/* -This file is part of khmer, https://github.com/dib-lab/khmer/, and is -Copyright (C) 2010-2015, Michigan State University. -Copyright (C) 2015-2016, The Regents of the University of California. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - - * Neither the name of the Michigan State University nor the names - of its contributors may be used to endorse or promote products - derived from this software without specific prior written - permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -LICENSE (END) - -Contact: khmer-project@idyll.org -*/ -#ifndef HASHBITS_HH -#define HASHBITS_HH - -#include -#include -#include -#include - -#include "hashtable.hh" -#include "khmer.hh" -#include "kmer_hash.hh" - -namespace khmer -{ - -class CountingHash; -class LabelHash; - -class Hashbits : public Hashgraph -{ -public: - explicit Hashbits(WordLength ksize, std::vector sizes) - : Hashgraph(ksize, new BitStorage(sizes)) { } ; - - void update_from(const Hashbits &other); -}; -} - -#include "counting.hh" -#include "labelhash.hh" -#endif // HASHBITS_HH - -// vim: set sts=2 sw=2: diff --git a/lib/hashgraph.cc b/lib/hashgraph.cc new file mode 100644 index 0000000000..4e2eaa1e41 --- /dev/null +++ b/lib/hashgraph.cc @@ -0,0 +1,899 @@ +/* +This file is part of khmer, https://github.com/dib-lab/khmer/, and is +Copyright (C) 2016, The Regents of the University of California. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. 
+ + * Neither the name of the University of California nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +LICENSE (END) + +Contact: khmer-project@idyll.org +*/ +#include +#include +#include +#include +#include +#include +#include // IWYU pragma: keep +#include +#include + +#include "hashgraph.hh" +#include "khmer.hh" +#include "read_parsers.hh" + +using namespace std; +using namespace khmer; +using namespace khmer:: read_parsers; + +void Hashgraph::save_tagset(std::string outfilename) +{ + ofstream outfile(outfilename.c_str(), ios::binary); + const size_t tagset_size = n_tags(); + unsigned int save_ksize = _ksize; + + HashIntoType * buf = new HashIntoType[tagset_size]; + + outfile.write(SAVED_SIGNATURE, 4); + unsigned char version = SAVED_FORMAT_VERSION; + outfile.write((const char *) &version, 1); + + unsigned char ht_type = SAVED_TAGS; + outfile.write((const char *) &ht_type, 1); + + outfile.write((const char *) &save_ksize, sizeof(save_ksize)); + outfile.write((const char *) &tagset_size, sizeof(tagset_size)); + outfile.write((const char *) &_tag_density, sizeof(_tag_density)); + + unsigned int i = 0; + for 
(SeenSet::iterator pi = all_tags.begin(); pi != all_tags.end(); + ++pi, i++) { + buf[i] = *pi; + } + + outfile.write((const char *) buf, sizeof(HashIntoType) * tagset_size); + if (outfile.fail()) { + delete[] buf; + throw khmer_file_exception(strerror(errno)); + } + outfile.close(); + + delete[] buf; +} + +void Hashgraph::load_tagset(std::string infilename, bool clear_tags) +{ + ifstream infile; + + // configure ifstream to raise exceptions for everything. + infile.exceptions(std::ifstream::failbit | std::ifstream::badbit | + std::ifstream::eofbit); + + try { + infile.open(infilename.c_str(), ios::binary); + } catch (std::ifstream::failure &e) { + std::string err; + if (!(infile.is_open())) { + err = "Cannot open tagset file: " + infilename; + } else { + err = "Unknown error in opening file: " + infilename; + } + throw khmer_file_exception(err); + } catch (const std::exception &e) { + // Catching std::exception is a stopgap for + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66145 + std::string err = "Unknown error opening file: " + infilename + " " + + strerror(errno); + throw khmer_file_exception(err); + } + + if (clear_tags) { + all_tags.clear(); + } + + unsigned char version, ht_type; + unsigned int save_ksize = 0; + + size_t tagset_size = 0; + HashIntoType * buf = NULL; + + try { + char signature[4]; + infile.read(signature, 4); + infile.read((char *) &version, 1); + infile.read((char *) &ht_type, 1); + if (!(std::string(signature, 4) == SAVED_SIGNATURE)) { + std::ostringstream err; + err << "Incorrect file signature 0x"; + for(size_t i=0; i < 4; ++i) { + err << std::hex << (int) signature[i]; + } + err << " while reading tagset from " << infilename + << "; should be " << SAVED_SIGNATURE; + throw khmer_file_exception(err.str()); + } else if (!(version == SAVED_FORMAT_VERSION)) { + std::ostringstream err; + err << "Incorrect file format version " << (int) version + << " while reading tagset from " << infilename + << "; should be " << (int) 
SAVED_FORMAT_VERSION; + throw khmer_file_exception(err.str()); + } else if (!(ht_type == SAVED_TAGS)) { + std::ostringstream err; + err << "Incorrect file format type " << (int) ht_type + << " while reading tagset from " << infilename; + throw khmer_file_exception(err.str()); + } + + infile.read((char *) &save_ksize, sizeof(save_ksize)); + if (!(save_ksize == _ksize)) { + std::ostringstream err; + err << "Incorrect k-mer size " << save_ksize + << " while reading tagset from " << infilename; + throw khmer_file_exception(err.str()); + } + + infile.read((char *) &tagset_size, sizeof(tagset_size)); + infile.read((char *) &_tag_density, sizeof(_tag_density)); + + buf = new HashIntoType[tagset_size]; + + infile.read((char *) buf, sizeof(HashIntoType) * tagset_size); + + for (unsigned int i = 0; i < tagset_size; i++) { + all_tags.insert(buf[i]); + } + + delete[] buf; + } catch (std::ifstream::failure &e) { + std::string err = "Error reading data from: " + infilename; + if (buf != NULL) { + delete[] buf; + } + throw khmer_file_exception(err); + /* Yes, this is boneheaded. Unfortunately, there is a bug in gcc > 5 + * regarding the basic_ios::failure that makes it impossible to catch + * with more specificty. So, we catch *all* exceptions after trying to + * get the ifstream::failure, and assume it must have been the buggy one. 
+ * Unfortunately, this would also cause us to catch the + * khmer_file_exceptions thrown above, so we catch them again first and + * rethrow them :) If this is understandably irritating to you, please + * bother the gcc devs at: + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66145 + * + * See also: http://media4.giphy.com/media/3o6UBpHgaXFDNAuttm/giphy.gif + */ + } catch (khmer_file_exception &e) { + throw e; + } catch (const std::exception &e) { + std::string err = "Unknown error opening file: " + infilename + " " + + strerror(errno); + throw khmer_file_exception(err); + } +} + +void Hashgraph::consume_sequence_and_tag(const std::string& seq, + unsigned long long& n_consumed, + SeenSet * found_tags) +{ + bool kmer_tagged; + + KmerIterator kmers(seq.c_str(), _ksize); + HashIntoType kmer; + + unsigned int since = _tag_density / 2 + 1; + + while(!kmers.done()) { + kmer = kmers.next(); + bool is_new_kmer; + + // Set the bits for the kmer in the various hashtables, + // and report on whether or not they had already been set. + // This is probably better than first testing and then setting the bits, + // as a failed test essentially results in doing the same amount of work + // twice. 
+ if ((is_new_kmer = store->test_and_set_bits( kmer ))) { + ++n_consumed; + } + +#if (1) + if (is_new_kmer) { + ++since; + } else { + ACQUIRE_ALL_TAGS_SPIN_LOCK + kmer_tagged = set_contains(all_tags, kmer); + RELEASE_ALL_TAGS_SPIN_LOCK + if (kmer_tagged) { + since = 1; + if (found_tags) { + found_tags->insert(kmer); + } + } else { + ++since; + } + } +#else + if (!is_new_kmer && set_contains(all_tags, kmer)) { + since = 1; + if (found_tags) { + found_tags->insert(kmer); + } + } else { + since++; + } +#endif + + if (since >= _tag_density) { + ACQUIRE_ALL_TAGS_SPIN_LOCK + all_tags.insert(kmer); + RELEASE_ALL_TAGS_SPIN_LOCK + if (found_tags) { + found_tags->insert(kmer); + } + since = 1; + } + + } // iteration over kmers + + if (since >= _tag_density/2 - 1) { + ACQUIRE_ALL_TAGS_SPIN_LOCK + all_tags.insert(kmer); // insert the last k-mer, too. + RELEASE_ALL_TAGS_SPIN_LOCK + if (found_tags) { + found_tags->insert(kmer); + } + } +} + +// +// consume_fasta_and_tag: consume a FASTA file of reads, tagging reads every +// so often. +// + +// TODO? Inline in header. +void +Hashgraph:: +consume_fasta_and_tag( + std:: string const &filename, + unsigned int &total_reads, unsigned long long &n_consumed +) +{ + IParser * parser = + IParser::get_parser( filename ); + + consume_fasta_and_tag( + parser, + total_reads, n_consumed + ); + + delete parser; +} + +void +Hashgraph:: +consume_fasta_and_tag( + read_parsers:: IParser * parser, + unsigned int &total_reads, unsigned long long &n_consumed +) +{ + Read read; + + // TODO? Delete the following assignments. + total_reads = 0; + n_consumed = 0; + + // Iterate through the reads and consume their k-mers. 
+ while (!parser->is_complete( )) { + + try { + read = parser->get_next_read( ); + } catch (NoMoreReadsAvailable &e) { + // Bail out if this error is raised + break; + } + + if (check_and_normalize_read( read.sequence )) { + unsigned long long this_n_consumed = 0; + consume_sequence_and_tag( read.sequence, this_n_consumed ); + + __sync_add_and_fetch( &n_consumed, this_n_consumed ); + __sync_add_and_fetch( &total_reads, 1 ); + } + } // while reads left for parser + +} + +// +// divide_tags_into_subsets - take all of the tags in 'all_tags', and +// divide them into subsets (based on starting tag) of <= given size. +// + +void Hashgraph::divide_tags_into_subsets(unsigned int subset_size, + SeenSet& divvy) +{ + unsigned int i = 0; + + for (SeenSet::const_iterator si = all_tags.begin(); si != all_tags.end(); + ++si) { + if (i % subset_size == 0) { + divvy.insert(*si); + i = 0; + } + i++; + } +} + +// +// consume_partitioned_fasta: consume a FASTA file of reads +// + +void Hashgraph::consume_partitioned_fasta(const std::string &filename, + unsigned int &total_reads, + unsigned long long &n_consumed) +{ + total_reads = 0; + n_consumed = 0; + + IParser* parser = IParser::get_parser(filename.c_str()); + Read read; + + string seq = ""; + + // reset the master subset partition + delete partition; + partition = new SubsetPartition(this); + + // + // iterate through the FASTA file & consume the reads. + // + + while(!parser->is_complete()) { + try { + read = parser->get_next_read(); + } catch (NoMoreReadsAvailable &exc) { + break; + } + seq = read.sequence; + + if (check_and_normalize_read(seq)) { + // First, figure out what the partition is (if non-zero), and save that. + PartitionID p = _parse_partition_id(read.name); + + // Then consume the sequence + n_consumed += consume_string(seq); // @CTB why are we doing this? 
+ + // Next, compute the tag & set the partition, if nonzero + HashIntoType kmer = hash_dna(seq.c_str()); + all_tags.insert(kmer); + if (p > 0) { + partition->set_partition_id(kmer, p); + } + } + + // reset the sequence info, increment read number + total_reads++; + } + + delete parser; +} + +////////////////////////////////////////////////////////////////////// +// graph stuff + +void Hashgraph::calc_connected_graph_size(Kmer start, + unsigned long long& count, + KmerSet& keeper, + const unsigned long long threshold, + bool break_on_circum) +const +{ + const BoundedCounterType val = get_count(start); + + if (val == 0) { + return; + } + + KmerQueue node_q; + node_q.push(start); + + // Avoid high-circumference k-mers + Traverser traverser(this); + + KmerFilter filter = [&] (const Kmer& n) { + return break_on_circum && traverser.degree(n) > 4; + }; + traverser.push_filter(filter); + + while(!node_q.empty()) { + Kmer node = node_q.front(); + node_q.pop(); + + // have we already seen me? don't count; exit. + if (set_contains(keeper, node)) { + continue; + } + + // is this in stop_tags? + if (set_contains(stop_tags, node)) { + continue; + } + + // keep track of both seen kmers, and counts. + keeper.insert(node); + + count += 1; + + // are we past the threshold? truncate search. + if (threshold && count >= threshold) { + return; + } + + // otherwise, explore in all directions. 
+ traverser.traverse(node, node_q); + } +} + +unsigned int Hashgraph::kmer_degree(HashIntoType kmer_f, HashIntoType kmer_r) +{ + Traverser traverser(this); + Kmer node = build_kmer(kmer_f, kmer_r); + return traverser.degree(node); +} + +unsigned int Hashgraph::kmer_degree(const char * kmer_s) +{ + Traverser traverser(this); + Kmer node = build_kmer(kmer_s); + return traverser.degree(node); +} + +size_t Hashgraph::trim_on_stoptags(std::string seq) const +{ + if (!check_and_normalize_read(seq)) { + return 0; + } + + KmerIterator kmers(seq.c_str(), _ksize); + + size_t i = _ksize - 2; + while (!kmers.done()) { + HashIntoType kmer = kmers.next(); + if (set_contains(stop_tags, kmer)) { + return i; + } + i++; + } + + return seq.length(); +} + +unsigned int Hashgraph::traverse_from_kmer(Kmer start, + unsigned int radius, + KmerSet &keeper, + unsigned int max_count) +const +{ + + KmerQueue node_q; + std::queue breadth_q; + unsigned int cur_breadth = 0; + unsigned int total = 0; + unsigned int nfound = 0; + + KmerFilter filter = [&] (const Kmer& n) { + return set_contains(keeper, n); + }; + Traverser traverser(this, filter); + + node_q.push(start); + breadth_q.push(0); + + while(!node_q.empty()) { + Kmer node = node_q.front(); + node_q.pop(); + + unsigned int breadth = breadth_q.front(); + breadth_q.pop(); + + if (breadth > radius) { + break; + } + + if (max_count && total > max_count) { + break; + } + + if (set_contains(keeper, node)) { + continue; + } + + if (set_contains(stop_tags, node)) { + continue; + } + + // keep track of seen kmers + keeper.insert(node); + total++; + + if (!(breadth >= cur_breadth)) { // keep track of watermark, for debugging. 
+ throw khmer_exception(); + } + if (breadth > cur_breadth) { + cur_breadth = breadth; + } + + nfound = traverser.traverse_right(node, node_q); + for (unsigned int i = 0; i &results) +{ + if (seq.size() < min_length) { + return; + } + + float max_seen = 1.0 - min_unique_f; + + min_length = min_length - _ksize + 1; // adjust for k-mer size. + + KmerIterator kmers(seq.c_str(), _ksize); + + std::deque seen_queue; + unsigned int n_already_seen = 0; + unsigned int n_kmers = 0; + + // first, put together an array for presence/absence of the k-mer + // at each given position. + while (!kmers.done()) { + HashIntoType kmer = kmers.next(); + + if (get_count(kmer)) { + seen_queue.push_back(true); + n_already_seen++; + } else { + seen_queue.push_back(false); + } + n_kmers++; + } + + // next, run through this array with 'i'. + + unsigned int i = 0; + while (i < n_kmers - min_length) { + unsigned int seen_counter, j; + + // For each starting 'i', count the number of 'seen' k-mers in the + // given window. + + // yes, inefficient n^2 algorithm. sue me. + for (seen_counter = 0, j = 0; j < min_length; j++) { + if (seen_queue[i + j]) { + seen_counter++; + } + } + + // If the fraction seen is small enough to be interesting, suggesting + // that this, in fact, a "new" window -- extend until it isn't, and + // then extract. + + if (!(j == min_length)) { + throw khmer_exception(); + } + if ( ((float)seen_counter / (float) j) <= max_seen) { + unsigned int start = i; + + // extend the window until the end of the sequence... + while ((start + min_length) < n_kmers) { + if (seen_queue[start]) { + seen_counter--; + } + if (seen_queue[start + min_length]) { + seen_counter++; + } + start++; + + // ...or until we've seen too many of the k-mers. + if (((float)seen_counter / (float) min_length) > max_seen) { + break; + } + } + + // adjust for ending point. 
+ if (start + min_length == n_kmers) { // potentially decrement twice at end + if (((float)seen_counter / (float) min_length) > max_seen) { + start--; + } + start--; + } else { + start -= 2; + } + + // ...and now extract the relevant portion of the sequence, and adjust + // starting pos'n. + results.push_back(seq.substr(i, start + min_length + _ksize - i)); + + i = start + min_length + 1; + } else { + i++; + } + } +} + + +void Hashgraph::find_high_degree_nodes(const char * s, + SeenSet& high_degree_nodes) +const +{ + Traverser traverser(this); + KmerIterator kmers(s, _ksize); + + unsigned long n = 0; + while(!kmers.done()) { + n++; + if (n % 10000 == 0) { + std::cout << "... find_high_degree_nodes: " << n << "\n"; + std::cout << std::flush; + } + Kmer kmer = kmers.next(); + if ((traverser.degree(kmer)) > 2) { + high_degree_nodes.insert(kmer); + } + } +} + +unsigned int Hashgraph::traverse_linear_path(const Kmer seed_kmer, + SeenSet &adjacencies, + SeenSet &visited, Hashtable &bf, + SeenSet &high_degree_nodes) +const +{ + unsigned int size = 0; + + Traverser traverser(this); + + // if this k-mer is in the Bloom filter, truncate search. + // This prevents paths from being traversed in two directions. + if (bf.get_count(seed_kmer)) { + return 0; + } + + std::vector to_be_visited; + to_be_visited.push_back(seed_kmer); + + while (to_be_visited.size()) { + Kmer kmer = to_be_visited.back(); + to_be_visited.pop_back(); + + visited.insert(kmer); + size += 1; + + KmerQueue node_q; + traverser.traverse(kmer, node_q); + + while (node_q.size()) { + Kmer node = node_q.front(); + node_q.pop(); + + if (set_contains(high_degree_nodes, node)) { + // if there are any adjacent high degree nodes, record; + adjacencies.insert(node); + // also, add this to the stop Bloom filter. 
+ bf.count(kmer); + } else if (set_contains(visited, node)) { + // do nothing - already visited + ; + } else { + to_be_visited.push_back(node); + } + } + } + return size; +} + +void Nodegraph::update_from(const Nodegraph &otherBASE) +{ + if (_ksize != otherBASE._ksize) { + throw khmer_exception("both nodegraphs must have same k size"); + } + BitStorage * myself = dynamic_cast(this->store); + const BitStorage * other; + other = dynamic_cast(otherBASE.store); + + // if dynamic_cast worked, then the pointers will be not null. + if (myself && other) { + myself->update_from(*other); + } else { + throw khmer_exception("update_from failed with incompatible objects"); + } +} + +// vim: set sts=2 sw=2: diff --git a/lib/hashgraph.hh b/lib/hashgraph.hh new file mode 100644 index 0000000000..b9622ad398 --- /dev/null +++ b/lib/hashgraph.hh @@ -0,0 +1,266 @@ +/* +This file is part of khmer, https://github.com/dib-lab/khmer/, and is +Copyright (C) 2016, The Regents of the University of California. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the University of California nor the names + of its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +LICENSE (END) + +Contact: khmer-project@idyll.org +*/ +#ifndef HASHGRAPH_HH +#define HASHGRAPH_HH + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hashtable.hh" +#include "traversal.hh" +#include "subset.hh" + +namespace khmer +{ +namespace read_parsers +{ +struct IParser; +} // namespace read_parsers +} // namespace khmer + +#define MAX_KEEPER_SIZE int(1e6) + +#define next_f(kmer_f, ch) ((((kmer_f) << 2) & bitmask) | (twobit_repr(ch))) +#define next_r(kmer_r, ch) (((kmer_r) >> 2) | (twobit_comp(ch) << rc_left_shift)) + +#define prev_f(kmer_f, ch) ((kmer_f) >> 2 | twobit_repr(ch) << rc_left_shift) +#define prev_r(kmer_r, ch) ((((kmer_r) << 2) & bitmask) | (twobit_comp(ch))) + +#define set_contains(s, e) ((s).find(e) != (s).end()) + +#define CALLBACK_PERIOD 100000 + +namespace khmer +{ +// +// Hashgraph: Extension of Hashtable to support graph operations. +// + +class Hashgraph: public Hashtable { + + friend class SubsetPartition; + friend class LabelHash; + friend class Traverser; + +protected: + unsigned int _tag_density; + + explicit Hashgraph(WordLength ksize, Storage * s) + : Hashtable(ksize, s) + { + _tag_density = DEFAULT_TAG_DENSITY; + if (!(_tag_density % 2 == 0)) { + throw khmer_exception(); + } + partition = new SubsetPartition(this); + _all_tags_spin_lock = 0; + } + + // clean up the partition structure. 
+ virtual ~Hashgraph( ) + { + delete partition; + } + + // empty the partition structure + void _clear_all_partitions() + { + if (partition != NULL) { + partition->_clear_all_partitions(); + } + } + + uint32_t _all_tags_spin_lock; +public: + // default master partitioning + SubsetPartition * partition; + + // tags for sparse graph implementation + SeenSet all_tags; + + // tags at which to stop traversal + SeenSet stop_tags; + + // tags used in repartitioning + SeenSet repart_small_tags; + + // set the minimum density of tagging. + void _set_tag_density(unsigned int d) + { + // must be odd; can't be set if tags exist. + if (!(d % 2 == 0) || !all_tags.empty()) { + throw khmer_exception(); + } + _tag_density = d; + } + + unsigned int _get_tag_density() const { return _tag_density; } + + void add_tag(HashIntoType tag) { all_tags.insert(tag); } + void add_stop_tag(HashIntoType tag) { stop_tags.insert(tag); } + + size_t n_tags() const { return all_tags.size(); } + + void divide_tags_into_subsets(unsigned int subset_size, SeenSet& divvy); + + void add_kmer_to_tags(HashIntoType kmer) { all_tags.insert(kmer); } + + void clear_tags() { all_tags.clear(); } + + // Consume reads & build sparse graph. + void consume_fasta_and_tag( + std::string const &filename, + unsigned int &total_reads, + unsigned long long &n_consumed + ); + + // Count every k-mer from a stream of FASTA or FASTQ reads, + // using the supplied parser. + // Tag certain ones on the connectivity graph. + void consume_fasta_and_tag( + read_parsers:: IParser * parser, + unsigned int &total_reads, + unsigned long long &n_consumed + ); + + // consume a string & add sparse graph nodes. 
+ void consume_sequence_and_tag(const std::string& seq, + unsigned long long& n_consumed, + SeenSet * new_tags = 0); + + + // consume an already-partitioned file & load in the partition IDs + void consume_partitioned_fasta(const std::string &filename, + unsigned int &total_reads, + unsigned long long &n_consumed); + + // trim the given sequence on stoptags + size_t trim_on_stoptags(std::string sequence) const; + + // @@ + unsigned int traverse_from_kmer(Kmer start, + unsigned int radius, + KmerSet &keeper, + unsigned int max_count = MAX_KEEPER_SIZE) + const; + + // print, save, and load the set of tags. + void print_tagset(std::string); + void save_tagset(std::string); + void load_tagset(std::string, bool clear_tags=true); + + // print, save and load the set of stop tags. + void print_stop_tags(std::string); + void save_stop_tags(std::string); + void load_stop_tags(std::string filename, bool clear_tags=true); + + // @@ + void extract_unique_paths(std::string seq, + unsigned int min_length, + float min_unique_f, + std::vector &results); + + // @@ + void calc_connected_graph_size(Kmer node, + unsigned long long& count, + KmerSet& keeper, + const unsigned long long threshold=0, + bool break_on_circum=false) const; + + // Calculate the graph degree of the given k-mer. + unsigned int kmer_degree(HashIntoType kmer_f, HashIntoType kmer_r); + unsigned int kmer_degree(const char * kmer_s); + + // Find all nodes with a degree > 2. + void find_high_degree_nodes(const char * sequence, + SeenSet& high_degree_nodes) const; + + // Find the maximal linear path (nodes degree <= 2) + unsigned int traverse_linear_path(const Kmer start_kmer, + SeenSet &adjacencies, + SeenSet &nodes, Hashtable& bf, + SeenSet &high_degree_nodes) const; + + // + // for debugging/testing purposes only! + // + + // check partition map validity. + void _validate_pmap() + { + if (partition) { + partition->_validate_pmap(); + } + } + +}; + +// Hashgraph-derived class with ByteStorage. 
+class Countgraph : public khmer::Hashgraph +{ +public: + explicit Countgraph(WordLength ksize, std::vector sizes) + : Hashgraph(ksize, new ByteStorage(sizes)) { } ; +}; + +// Hashgraph-derived class with BitStorage. +class Nodegraph : public Hashgraph { +public: + explicit Nodegraph(WordLength ksize, std::vector sizes) + : Hashgraph(ksize, new BitStorage(sizes)) { } ; + + void update_from(const Nodegraph &other); +}; + +} + +#define ACQUIRE_ALL_TAGS_SPIN_LOCK \ + while (!__sync_bool_compare_and_swap( &_all_tags_spin_lock, 0, 1 )); + +#define RELEASE_ALL_TAGS_SPIN_LOCK \ + __sync_bool_compare_and_swap( &_all_tags_spin_lock, 1, 0 ); + +#endif // HASHGRAPH_HH diff --git a/lib/hashtable.cc b/lib/hashtable.cc index d4a0bbcf07..5ab085a874 100644 --- a/lib/hashtable.cc +++ b/lib/hashtable.cc @@ -45,7 +45,6 @@ Contact: khmer-project@idyll.org #include #include -#include "counting.hh" #include "hashtable.hh" #include "khmer.hh" #include "traversal.hh" @@ -242,327 +241,115 @@ bool Hashtable::median_at_least(const std::string &s, return false; } -void Hashgraph::save_tagset(std::string outfilename) +void Hashtable::get_kmers(const std::string &s, + std::vector &kmers_vec) const { - ofstream outfile(outfilename.c_str(), ios::binary); - const size_t tagset_size = n_tags(); - unsigned int save_ksize = _ksize; - - HashIntoType * buf = new HashIntoType[tagset_size]; - - outfile.write(SAVED_SIGNATURE, 4); - unsigned char version = SAVED_FORMAT_VERSION; - outfile.write((const char *) &version, 1); - - unsigned char ht_type = SAVED_TAGS; - outfile.write((const char *) &ht_type, 1); - - outfile.write((const char *) &save_ksize, sizeof(save_ksize)); - outfile.write((const char *) &tagset_size, sizeof(tagset_size)); - outfile.write((const char *) &_tag_density, sizeof(_tag_density)); - - unsigned int i = 0; - for (SeenSet::iterator pi = all_tags.begin(); pi != all_tags.end(); - ++pi, i++) { - buf[i] = *pi; + if (s.length() < _ksize) { + return; } - - outfile.write((const char *) buf, 
sizeof(HashIntoType) * tagset_size); - if (outfile.fail()) { - delete[] buf; - throw khmer_file_exception(strerror(errno)); + for (unsigned int i = 0; i < s.length() - _ksize + 1; i++) { + std::string sub = s.substr(i, _ksize); + kmers_vec.push_back(sub); } - outfile.close(); - - delete[] buf; } -void Hashgraph::load_tagset(std::string infilename, bool clear_tags) + +void Hashtable::get_kmer_hashes(const std::string &s, + std::vector &kmers_vec) const { - ifstream infile; - - // configure ifstream to raise exceptions for everything. - infile.exceptions(std::ifstream::failbit | std::ifstream::badbit | - std::ifstream::eofbit); - - try { - infile.open(infilename.c_str(), ios::binary); - } catch (std::ifstream::failure &e) { - std::string err; - if (!(infile.is_open())) { - err = "Cannot open tagset file: " + infilename; - } else { - err = "Unknown error in opening file: " + infilename; - } - throw khmer_file_exception(err); - } catch (const std::exception &e) { - // Catching std::exception is a stopgap for - // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66145 - std::string err = "Unknown error opening file: " + infilename + " " - + strerror(errno); - throw khmer_file_exception(err); - } + KmerIterator kmers(s.c_str(), _ksize); - if (clear_tags) { - all_tags.clear(); + while(!kmers.done()) { + HashIntoType kmer = kmers.next(); + kmers_vec.push_back(kmer); } +} - unsigned char version, ht_type; - unsigned int save_ksize = 0; - - size_t tagset_size = 0; - HashIntoType * buf = NULL; - - try { - char signature[4]; - infile.read(signature, 4); - infile.read((char *) &version, 1); - infile.read((char *) &ht_type, 1); - if (!(std::string(signature, 4) == SAVED_SIGNATURE)) { - std::ostringstream err; - err << "Incorrect file signature 0x"; - for(size_t i=0; i < 4; ++i) { - err << std::hex << (int) signature[i]; - } - err << " while reading tagset from " << infilename - << "; should be " << SAVED_SIGNATURE; - throw khmer_file_exception(err.str()); - } else if (!(version == 
SAVED_FORMAT_VERSION)) { - std::ostringstream err; - err << "Incorrect file format version " << (int) version - << " while reading tagset from " << infilename - << "; should be " << (int) SAVED_FORMAT_VERSION; - throw khmer_file_exception(err.str()); - } else if (!(ht_type == SAVED_TAGS)) { - std::ostringstream err; - err << "Incorrect file format type " << (int) ht_type - << " while reading tagset from " << infilename; - throw khmer_file_exception(err.str()); - } - - infile.read((char *) &save_ksize, sizeof(save_ksize)); - if (!(save_ksize == _ksize)) { - std::ostringstream err; - err << "Incorrect k-mer size " << save_ksize - << " while reading tagset from " << infilename; - throw khmer_file_exception(err.str()); - } - infile.read((char *) &tagset_size, sizeof(tagset_size)); - infile.read((char *) &_tag_density, sizeof(_tag_density)); +void Hashtable::get_kmer_hashes_as_hashset(const std::string &s, + SeenSet& hashes) const +{ + KmerIterator kmers(s.c_str(), _ksize); - buf = new HashIntoType[tagset_size]; + while(!kmers.done()) { + HashIntoType kmer = kmers.next(); + hashes.insert(kmer); + } +} - infile.read((char *) buf, sizeof(HashIntoType) * tagset_size); - for (unsigned int i = 0; i < tagset_size; i++) { - all_tags.insert(buf[i]); - } +void Hashtable::get_kmer_counts(const std::string &s, + std::vector &counts) const +{ + KmerIterator kmers(s.c_str(), _ksize); - delete[] buf; - } catch (std::ifstream::failure &e) { - std::string err = "Error reading data from: " + infilename; - if (buf != NULL) { - delete[] buf; - } - throw khmer_file_exception(err); - /* Yes, this is boneheaded. Unfortunately, there is a bug in gcc > 5 - * regarding the basic_ios::failure that makes it impossible to catch - * with more specificty. So, we catch *all* exceptions after trying to - * get the ifstream::failure, and assume it must have been the buggy one. 
- * Unfortunately, this would also cause us to catch the - * khmer_file_exceptions thrown above, so we catch them again first and - * rethrow them :) If this is understandably irritating to you, please - * bother the gcc devs at: - * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66145 - * - * See also: http://media4.giphy.com/media/3o6UBpHgaXFDNAuttm/giphy.gif - */ - } catch (khmer_file_exception &e) { - throw e; - } catch (const std::exception &e) { - std::string err = "Unknown error opening file: " + infilename + " " - + strerror(errno); - throw khmer_file_exception(err); + while(!kmers.done()) { + HashIntoType kmer = kmers.next(); + BoundedCounterType c = this->get_count(kmer); + counts.push_back(c); } } -void Hashgraph::consume_sequence_and_tag(const std::string& seq, - unsigned long long& n_consumed, - SeenSet * found_tags) +BoundedCounterType Hashtable::get_min_count(const std::string &s) { - bool kmer_tagged; - - KmerIterator kmers(seq.c_str(), _ksize); - HashIntoType kmer; + KmerIterator kmers(s.c_str(), _ksize); - unsigned int since = _tag_density / 2 + 1; + BoundedCounterType min_count = MAX_KCOUNT; while(!kmers.done()) { - kmer = kmers.next(); - bool is_new_kmer; - - // Set the bits for the kmer in the various hashtables, - // and report on whether or not they had already been set. - // This is probably better than first testing and then setting the bits, - // as a failed test essentially results in doing the same amount of work - // twice. 
- if ((is_new_kmer = store->test_and_set_bits( kmer ))) { - ++n_consumed; - } - -#if (1) - if (is_new_kmer) { - ++since; - } else { - ACQUIRE_ALL_TAGS_SPIN_LOCK - kmer_tagged = set_contains(all_tags, kmer); - RELEASE_ALL_TAGS_SPIN_LOCK - if (kmer_tagged) { - since = 1; - if (found_tags) { - found_tags->insert(kmer); - } - } else { - ++since; - } - } -#else - if (!is_new_kmer && set_contains(all_tags, kmer)) { - since = 1; - if (found_tags) { - found_tags->insert(kmer); - } - } else { - since++; - } -#endif - - if (since >= _tag_density) { - ACQUIRE_ALL_TAGS_SPIN_LOCK - all_tags.insert(kmer); - RELEASE_ALL_TAGS_SPIN_LOCK - if (found_tags) { - found_tags->insert(kmer); - } - since = 1; - } + HashIntoType kmer = kmers.next(); - } // iteration over kmers + BoundedCounterType count = this->get_count(kmer); - if (since >= _tag_density/2 - 1) { - ACQUIRE_ALL_TAGS_SPIN_LOCK - all_tags.insert(kmer); // insert the last k-mer, too. - RELEASE_ALL_TAGS_SPIN_LOCK - if (found_tags) { - found_tags->insert(kmer); + if (this->get_count(kmer) < min_count) { + min_count = count; } } + return min_count; } -// -// consume_fasta_and_tag: consume a FASTA file of reads, tagging reads every -// so often. -// - -// TODO? Inline in header. -void -Hashgraph:: -consume_fasta_and_tag( - std:: string const &filename, - unsigned int &total_reads, unsigned long long &n_consumed -) -{ - IParser * parser = - IParser::get_parser( filename ); - - consume_fasta_and_tag( - parser, - total_reads, n_consumed - ); - - delete parser; -} - -void -Hashgraph:: -consume_fasta_and_tag( - read_parsers:: IParser * parser, - unsigned int &total_reads, unsigned long long &n_consumed -) +BoundedCounterType Hashtable::get_max_count(const std::string &s) { - Read read; - - // TODO? Delete the following assignments. - total_reads = 0; - n_consumed = 0; + BoundedCounterType max_count = 0; - // Iterate through the reads and consume their k-mers. 
- while (!parser->is_complete( )) { + KmerIterator kmers(s.c_str(), _ksize); - try { - read = parser->get_next_read( ); - } catch (NoMoreReadsAvailable &e) { - // Bail out if this error is raised - break; - } + while(!kmers.done()) { + HashIntoType kmer = kmers.next(); - if (check_and_normalize_read( read.sequence )) { - unsigned long long this_n_consumed = 0; - consume_sequence_and_tag( read.sequence, this_n_consumed ); + BoundedCounterType count = this->get_count(kmer); - __sync_add_and_fetch( &n_consumed, this_n_consumed ); - __sync_add_and_fetch( &total_reads, 1 ); + if (count > max_count) { + max_count = count; } - } // while reads left for parser - + } + return max_count; } -// -// divide_tags_into_subsets - take all of the tags in 'all_tags', and -// divide them into subsets (based on starting tag) of <= given size. -// - -void Hashgraph::divide_tags_into_subsets(unsigned int subset_size, - SeenSet& divvy) +uint64_t * +Hashtable::abundance_distribution( + read_parsers::IParser * parser, + Hashtable * tracking) { - unsigned int i = 0; + uint64_t * dist = new uint64_t[MAX_BIGCOUNT + 1]; + uint64_t i; - for (SeenSet::const_iterator si = all_tags.begin(); si != all_tags.end(); - ++si) { - if (i % subset_size == 0) { - divvy.insert(*si); - i = 0; - } - i++; + for (i = 0; i <= MAX_BIGCOUNT; i++) { + dist[i] = 0; } -} - -// -// consume_partitioned_fasta: consume a FASTA file of reads -// - -void Hashgraph::consume_partitioned_fasta(const std::string &filename, - unsigned int &total_reads, - unsigned long long &n_consumed) -{ - total_reads = 0; - n_consumed = 0; - IParser* parser = IParser::get_parser(filename.c_str()); Read read; - string seq = ""; - - // reset the master subset partition - delete partition; - partition = new SubsetPartition(this); + string name; + string seq; - // - // iterate through the FASTA file & consume the reads. - // + // if not, could lead to overflow. 
+ if (sizeof(BoundedCounterType) != 2) { + delete[] dist; + throw khmer_exception(); + } - while(!parser->is_complete()) { + while(!parser->is_complete()) { try { read = parser->get_next_read(); } catch (NoMoreReadsAvailable &exc) { @@ -571,109 +358,65 @@ void Hashgraph::consume_partitioned_fasta(const std::string &filename, seq = read.sequence; if (check_and_normalize_read(seq)) { - // First, figure out what the partition is (if non-zero), and save that. - PartitionID p = _parse_partition_id(read.name); + KmerIterator kmers(seq.c_str(), _ksize); - // Then consume the sequence - n_consumed += consume_string(seq); // @CTB why are we doing this? + while(!kmers.done()) { + HashIntoType kmer = kmers.next(); - // Next, compute the tag & set the partition, if nonzero - HashIntoType kmer = hash_dna(seq.c_str()); - all_tags.insert(kmer); - if (p > 0) { - partition->set_partition_id(kmer, p); + if (!tracking->get_count(kmer)) { + tracking->count(kmer); + + BoundedCounterType n = get_count(kmer); + dist[n]++; + } } - } - // reset the sequence info, increment read number - total_reads++; + name.clear(); + seq.clear(); + } } + return dist; +} + +uint64_t * Hashtable::abundance_distribution( + std::string filename, + Hashtable * tracking) +{ + IParser* parser = IParser::get_parser(filename.c_str()); + + uint64_t * distribution = abundance_distribution(parser, tracking); delete parser; + return distribution; } -////////////////////////////////////////////////////////////////////// -// graph stuff - -void Hashgraph::calc_connected_graph_size(Kmer start, - unsigned long long& count, - KmerSet& keeper, - const unsigned long long threshold, - bool break_on_circum) +unsigned long Hashtable::trim_on_abundance( + std::string seq, + BoundedCounterType min_abund) const { - const BoundedCounterType val = get_count(start); - - if (val == 0) { - return; + if (!check_and_normalize_read(seq)) { + return 0; } - KmerQueue node_q; - node_q.push(start); - - // Avoid high-circumference k-mers - 
Traverser traverser(this); - - KmerFilter filter = [&] (const Kmer& n) { - return break_on_circum && traverser.degree(n) > 4; - }; - traverser.push_filter(filter); - - while(!node_q.empty()) { - Kmer node = node_q.front(); - node_q.pop(); - - // have we already seen me? don't count; exit. - if (set_contains(keeper, node)) { - continue; - } - - // is this in stop_tags? - if (set_contains(stop_tags, node)) { - continue; - } - - // keep track of both seen kmers, and counts. - keeper.insert(node); - - count += 1; + KmerIterator kmers(seq.c_str(), _ksize); - // are we past the threshold? truncate search. - if (threshold && count >= threshold) { - return; - } + HashIntoType kmer; - // otherwise, explore in all directions. - traverser.traverse(node, node_q); + if (kmers.done()) { + return 0; } -} - -unsigned int Hashgraph::kmer_degree(HashIntoType kmer_f, HashIntoType kmer_r) -{ - Traverser traverser(this); - Kmer node = build_kmer(kmer_f, kmer_r); - return traverser.degree(node); -} - -unsigned int Hashgraph::kmer_degree(const char * kmer_s) -{ - Traverser traverser(this); - Kmer node = build_kmer(kmer_s); - return traverser.degree(node); -} + kmer = kmers.next(); -size_t Hashgraph::trim_on_stoptags(std::string seq) const -{ - if (!check_and_normalize_read(seq)) { + if (kmers.done() || get_count(kmer) < min_abund) { return 0; } - KmerIterator kmers(seq.c_str(), _ksize); - - size_t i = _ksize - 2; + unsigned long i = _ksize; while (!kmers.done()) { - HashIntoType kmer = kmers.next(); - if (set_contains(stop_tags, kmer)) { + kmer = kmers.next(); + + if (get_count(kmer) < min_abund) { return i; } i++; @@ -682,440 +425,95 @@ size_t Hashgraph::trim_on_stoptags(std::string seq) const return seq.length(); } -unsigned int Hashgraph::traverse_from_kmer(Kmer start, - unsigned int radius, - KmerSet &keeper, - unsigned int max_count) +unsigned long Hashtable::trim_below_abundance( + std::string seq, + BoundedCounterType max_abund) const { - - KmerQueue node_q; - std::queue 
breadth_q; - unsigned int cur_breadth = 0; - unsigned int total = 0; - unsigned int nfound = 0; - - KmerFilter filter = [&] (const Kmer& n) { - return set_contains(keeper, n); - }; - Traverser traverser(this, filter); - - node_q.push(start); - breadth_q.push(0); - - while(!node_q.empty()) { - Kmer node = node_q.front(); - node_q.pop(); - - unsigned int breadth = breadth_q.front(); - breadth_q.pop(); - - if (breadth > radius) { - break; - } - - if (max_count && total > max_count) { - break; - } - - if (set_contains(keeper, node)) { - continue; - } - - if (set_contains(stop_tags, node)) { - continue; - } - - // keep track of seen kmers - keeper.insert(node); - total++; - - if (!(breadth >= cur_breadth)) { // keep track of watermark, for debugging. - throw khmer_exception(); - } - if (breadth > cur_breadth) { - cur_breadth = breadth; - } - - nfound = traverser.traverse_right(node, node_q); - for (unsigned int i = 0; i max_abund) { + return 0; } - printfile.close(); -} - -void Hashgraph::print_tagset(std::string infilename) -{ - ofstream printfile(infilename.c_str()); + unsigned long i = _ksize; + while (!kmers.done()) { + kmer = kmers.next(); - unsigned int i = 0; - for (SeenSet::iterator pi = all_tags.begin(); pi != all_tags.end(); - ++pi, i++) { - std::string kmer = _revhash(*pi, _ksize); - printfile << kmer << "\n"; + if (get_count(kmer) > max_abund) { + return i; + } + i++; } - printfile.close(); + return seq.length(); } -void Hashgraph::extract_unique_paths(std::string seq, - unsigned int min_length, - float min_unique_f, - std::vector &results) +std::vector Hashtable::find_spectral_error_positions( + std::string seq, + BoundedCounterType max_abund) +const { - if (seq.size() < min_length) { - return; + std::vector posns; + if (!check_and_normalize_read(seq)) { + throw khmer_exception("invalid read"); } - float max_seen = 1.0 - min_unique_f; - - min_length = min_length - _ksize + 1; // adjust for k-mer size. 
- KmerIterator kmers(seq.c_str(), _ksize); - std::deque seen_queue; - unsigned int n_already_seen = 0; - unsigned int n_kmers = 0; + HashIntoType kmer = kmers.next(); + if (kmers.done()) { + return posns; + } - // first, put together an array for presence/absence of the k-mer - // at each given position. + // find the first trusted k-mer while (!kmers.done()) { - HashIntoType kmer = kmers.next(); - - if (get_count(kmer)) { - seen_queue.push_back(true); - n_already_seen++; - } else { - seen_queue.push_back(false); + if (get_count(kmer) > max_abund) { + break; } - n_kmers++; + kmer = kmers.next(); } - // next, run through this array with 'i'. - - unsigned int i = 0; - while (i < n_kmers - min_length) { - unsigned int seen_counter, j; - - // For each starting 'i', count the number of 'seen' k-mers in the - // given window. - - // yes, inefficient n^2 algorithm. sue me. - for (seen_counter = 0, j = 0; j < min_length; j++) { - if (seen_queue[i + j]) { - seen_counter++; - } - } - - // If the fraction seen is small enough to be interesting, suggesting - // that this, in fact, a "new" window -- extend until it isn't, and - // then extract. + if (kmers.done()) { + return posns; + } - if (!(j == min_length)) { - throw khmer_exception(); + // did we bypass some erroneous k-mers? call the last one. + if (kmers.get_start_pos() > 0) { + // if we are well past the first k, forget the whole thing (!? @CTB) + if (kmers.get_start_pos() >= _ksize && 0) { + return posns; } - if ( ((float)seen_counter / (float) j) <= max_seen) { - unsigned int start = i; + posns.push_back(kmers.get_start_pos() - 1); + } - // extend the window until the end of the sequence... - while ((start + min_length) < n_kmers) { - if (seen_queue[start]) { - seen_counter--; - } - if (seen_queue[start + min_length]) { - seen_counter++; - } - start++; + while (!kmers.done()) { + kmer = kmers.next(); + if (get_count(kmer) <= max_abund) { // error! 
+ posns.push_back(kmers.get_end_pos() - 1); - // ...or until we've seen too many of the k-mers. - if (((float)seen_counter / (float) min_length) > max_seen) { + // find next good + while (!kmers.done()) { + kmer = kmers.next(); + if (get_count(kmer) > max_abund) { // a good stretch again. break; } } - - // adjust for ending point. - if (start + min_length == n_kmers) { // potentially decrement twice at end - if (((float)seen_counter / (float) min_length) > max_seen) { - start--; - } - start--; - } else { - start -= 2; - } - - // ...and now extract the relevant portion of the sequence, and adjust - // starting pos'n. - results.push_back(seq.substr(i, start + min_length + _ksize - i)); - - i = start + min_length + 1; - } else { - i++; } } -} - - -void Hashtable::get_kmers(const std::string &s, - std::vector &kmers_vec) const -{ - if (s.length() < _ksize) { - return; - } - for (unsigned int i = 0; i < s.length() - _ksize + 1; i++) { - std::string sub = s.substr(i, _ksize); - kmers_vec.push_back(sub); - } -} - - -void Hashtable::get_kmer_hashes(const std::string &s, - std::vector &kmers_vec) const -{ - KmerIterator kmers(s.c_str(), _ksize); - - while(!kmers.done()) { - HashIntoType kmer = kmers.next(); - kmers_vec.push_back(kmer); - } -} - -void Hashtable::get_kmer_hashes_as_hashset(const std::string &s, - SeenSet& hashes) const -{ - KmerIterator kmers(s.c_str(), _ksize); - - while(!kmers.done()) { - HashIntoType kmer = kmers.next(); - hashes.insert(kmer); - } -} - - -void Hashtable::get_kmer_counts(const std::string &s, - std::vector &counts) const -{ - KmerIterator kmers(s.c_str(), _ksize); - - while(!kmers.done()) { - HashIntoType kmer = kmers.next(); - BoundedCounterType c = this->get_count(kmer); - counts.push_back(c); - } -} - -void Hashgraph::find_high_degree_nodes(const char * s, - SeenSet& high_degree_nodes) -const -{ - Traverser traverser(this); - KmerIterator kmers(s, _ksize); - - unsigned long n = 0; - while(!kmers.done()) { - n++; - if (n % 10000 == 0) { - 
std::cout << "... find_high_degree_nodes: " << n << "\n"; - std::cout << std::flush; - } - Kmer kmer = kmers.next(); - if ((traverser.degree(kmer)) > 2) { - high_degree_nodes.insert(kmer); - } - } -} - -unsigned int Hashgraph::traverse_linear_path(const Kmer seed_kmer, - SeenSet &adjacencies, - SeenSet &visited, Hashtable &bf, - SeenSet &high_degree_nodes) -const -{ - unsigned int size = 0; - - Traverser traverser(this); - - // if this k-mer is in the Bloom filter, truncate search. - // This prevents paths from being traversed in two directions. - if (bf.get_count(seed_kmer)) { - return 0; - } - - std::vector to_be_visited; - to_be_visited.push_back(seed_kmer); - - while (to_be_visited.size()) { - Kmer kmer = to_be_visited.back(); - to_be_visited.pop_back(); - - visited.insert(kmer); - size += 1; - - KmerQueue node_q; - traverser.traverse(kmer, node_q); - - while (node_q.size()) { - Kmer node = node_q.front(); - node_q.pop(); - - if (set_contains(high_degree_nodes, node)) { - // if there are any adjacent high degree nodes, record; - adjacencies.insert(node); - // also, add this to the stop Bloom filter. 
- bf.count(kmer); - } else if (set_contains(visited, node)) { - // do nothing - already visited - ; - } else { - to_be_visited.push_back(node); - } - } - } - return size; + return posns; } // vim: set sts=2 sw=2: diff --git a/lib/hashtable.hh b/lib/hashtable.hh index 1d1a76b7cf..7c4f67113e 100644 --- a/lib/hashtable.hh +++ b/lib/hashtable.hh @@ -66,16 +66,6 @@ struct IParser; } // namespace read_parsers } // namespace khmer -#define MAX_KEEPER_SIZE int(1e6) - -#define next_f(kmer_f, ch) ((((kmer_f) << 2) & bitmask) | (twobit_repr(ch))) -#define next_r(kmer_r, ch) (((kmer_r) >> 2) | (twobit_comp(ch) << rc_left_shift)) - -#define prev_f(kmer_f, ch) ((kmer_f) >> 2 | twobit_repr(ch) << rc_left_shift) -#define prev_r(kmer_r, ch) ((((kmer_r) << 2) & bitmask) | (twobit_comp(ch))) - -#define set_contains(s, e) ((s).find(e) != (s).end()) - #define CALLBACK_PERIOD 100000 namespace khmer @@ -289,179 +279,49 @@ public: { return store->get_raw_tables(); } -}; -// -// Hashgraph: Extension of Hashtable to support graph operations. -// + // find the minimum k-mer count in the given sequence + BoundedCounterType get_min_count(const std::string &s); -class Hashgraph: public Hashtable -{ + // find the maximum k-mer count in the given sequence + BoundedCounterType get_max_count(const std::string &s); - friend class SubsetPartition; - friend class LabelHash; - friend class Traverser; - -protected: - unsigned int _tag_density; + // calculate the abundance distribution of kmers in the given file. 
+ uint64_t * abundance_distribution(read_parsers::IParser * parser, + Hashtable * tracking); + uint64_t * abundance_distribution(std::string filename, + Hashtable * tracking); - explicit Hashgraph(WordLength ksize, Storage * s) - : Hashtable(ksize, s) - { - _tag_density = DEFAULT_TAG_DENSITY; - if (!(_tag_density % 2 == 0)) { - throw khmer_exception(); - } - partition = new SubsetPartition(this); - _all_tags_spin_lock = 0; - } + // return the index of the first position in the sequence with k-mer + // abundance below min_abund. + unsigned long trim_on_abundance(std::string seq, + BoundedCounterType min_abund) const; - virtual ~Hashgraph( ) - { - delete partition; - } + // return the index of the first position in the sequence with k-mer + // abundance above max_abund. + unsigned long trim_below_abundance(std::string seq, + BoundedCounterType max_abund) const; - void _clear_all_partitions() - { - if (partition != NULL) { - partition->_clear_all_partitions(); - } - } + // detect likely positions of errors + std::vector find_spectral_error_positions(std::string seq, + BoundedCounterType min_abund) const; +}; - uint32_t _all_tags_spin_lock; +// Hashtable-derived class with ByteStorage. +class Counttable : public khmer::Hashtable +{ public: - SubsetPartition * partition; - SeenSet all_tags; - SeenSet stop_tags; - SeenSet repart_small_tags; - - virtual void save_tagset(std::string); - virtual void load_tagset(std::string, bool clear_tags=true); - - void _set_tag_density(unsigned int d) - { - // must be odd; can't be set if tags exist. - if (!(d % 2 == 0) || !all_tags.empty()) { - throw khmer_exception(); - } - _tag_density = d; - } - - unsigned int _get_tag_density() const - { - return _tag_density; - } - - void add_tag(HashIntoType tag) - { - all_tags.insert(tag); - } - void add_stop_tag(HashIntoType tag) - { - stop_tags.insert(tag); - } - - // Partitioning stuff. 
- - size_t n_tags() const - { - return all_tags.size(); - } - - void divide_tags_into_subsets(unsigned int subset_size, SeenSet& divvy); - - void add_kmer_to_tags(HashIntoType kmer) - { - all_tags.insert(kmer); - } - - void clear_tags() - { - all_tags.clear(); - } - - // Count every k-mer in a FASTA or FASTQ file. - // Tag certain ones on the connectivity graph. - void consume_fasta_and_tag( - std::string const &filename, - unsigned int &total_reads, - unsigned long long &n_consumed - ); - - // Count every k-mer from a stream of FASTA or FASTQ reads, - // using the supplied parser. - // Tag certain ones on the connectivity graph. - void consume_fasta_and_tag( - read_parsers:: IParser * parser, - unsigned int &total_reads, - unsigned long long &n_consumed - ); - - void consume_sequence_and_tag(const std::string& seq, - unsigned long long& n_consumed, - SeenSet * new_tags = 0); - - - void consume_partitioned_fasta(const std::string &filename, - unsigned int &total_reads, - unsigned long long &n_consumed); - - size_t trim_on_stoptags(std::string sequence) const; - - unsigned int traverse_from_kmer(Kmer start, - unsigned int radius, - KmerSet &keeper, - unsigned int max_count = MAX_KEEPER_SIZE) - const; - - virtual void print_tagset(std::string); - virtual void print_stop_tags(std::string); - virtual void save_stop_tags(std::string); - void load_stop_tags(std::string filename, bool clear_tags=true); - - void extract_unique_paths(std::string seq, - unsigned int min_length, - float min_unique_f, - std::vector &results); - - void calc_connected_graph_size(Kmer node, - unsigned long long& count, - KmerSet& keeper, - const unsigned long long threshold=0, - bool break_on_circum=false) const; - - - unsigned int kmer_degree(HashIntoType kmer_f, HashIntoType kmer_r); - unsigned int kmer_degree(const char * kmer_s); - - // - void find_high_degree_nodes(const char * sequence, - SeenSet& high_degree_nodes) const; - - unsigned int traverse_linear_path(const Kmer start_kmer, - 
SeenSet &adjacencies, - SeenSet &nodes, Hashtable& bf, - SeenSet &high_degree_nodes) const; - - // - // for debugging/testing purposes only! - // - - // check partition map validity. - void _validate_pmap() - { - if (partition) { - partition->_validate_pmap(); - } - } - + explicit Counttable(WordLength ksize, std::vector sizes) + : Hashtable(ksize, new ByteStorage(sizes)) { } ; +}; + +// Hashtable-derived class with BitStorage. +class Nodetable : public Hashtable { +public: + explicit Nodetable(WordLength ksize, std::vector sizes) + : Hashtable(ksize, new BitStorage(sizes)) { } ; }; -} - -#define ACQUIRE_ALL_TAGS_SPIN_LOCK \ - while (!__sync_bool_compare_and_swap( &_all_tags_spin_lock, 0, 1 )); -#define RELEASE_ALL_TAGS_SPIN_LOCK \ - __sync_bool_compare_and_swap( &_all_tags_spin_lock, 1, 0 ); +} #endif // HASHTABLE_HH diff --git a/lib/kmer_filters.cc b/lib/kmer_filters.cc index c34bd98285..ba65b25d08 100644 --- a/lib/kmer_filters.cc +++ b/lib/kmer_filters.cc @@ -120,7 +120,7 @@ KmerFilter get_simple_label_intersect_filter(const LabelSet& src_labels, KmerFilter get_junction_count_filter(const Kmer& src_node, - CountingHash * junctions, + Countgraph * junctions, const unsigned int min_cov) { KmerFilter filter = [=] (const Kmer& dst_node) { diff --git a/lib/kmer_filters.hh b/lib/kmer_filters.hh index 82d0f7f3a5..b40b831f83 100644 --- a/lib/kmer_filters.hh +++ b/lib/kmer_filters.hh @@ -41,7 +41,6 @@ Contact: khmer-project@idyll.org #include "khmer.hh" #include "kmer_hash.hh" -#include "counting.hh" #include "labelhash.hh" @@ -65,7 +64,7 @@ KmerFilter get_stop_bf_filter(const Hashtable * stop_bf); KmerFilter get_visited_filter(const SeenSet * visited); KmerFilter get_junction_count_filter(const Kmer& src_node, - CountingHash * junctions, + Countgraph * junctions, const unsigned int min_cov = 2); } diff --git a/lib/labelhash.cc b/lib/labelhash.cc index b76fdf0c84..02ceedcdf1 100644 --- a/lib/labelhash.cc +++ b/lib/labelhash.cc @@ -41,8 +41,7 @@ Contact: 
khmer-project@idyll.org #include // IWYU pragma: keep #include -#include "hashbits.hh" -#include "hashtable.hh" +#include "hashgraph.hh" #include "khmer_exception.hh" #include "labelhash.hh" #include "read_parsers.hh" diff --git a/lib/labelhash.hh b/lib/labelhash.hh index 574d2f9141..8f1d5347b8 100644 --- a/lib/labelhash.hh +++ b/lib/labelhash.hh @@ -44,7 +44,7 @@ Contact: khmer-project@idyll.org #include #include -#include "hashtable.hh" +#include "hashgraph.hh" #include "khmer.hh" #include "read_parsers.hh" diff --git a/lib/read_aligner.hh b/lib/read_aligner.hh index bb727e1913..cb0c456e1e 100644 --- a/lib/read_aligner.hh +++ b/lib/read_aligner.hh @@ -48,8 +48,8 @@ Contact: khmer-project@idyll.org #include #include -#include "counting.hh" #include "khmer.hh" +#include "hashgraph.hh" #include "kmer_hash.hh" #define READ_ALIGNER_DEBUG 0 @@ -226,7 +226,7 @@ private: const HashIntoType bitmask; const size_t rc_left_shift; - CountingHash* m_ch; + khmer::Countgraph* m_ch; ScoringMatrix m_sm; size_t m_trusted_cutoff; @@ -244,7 +244,7 @@ public: Alignment* Align(const std::string&); Alignment* AlignForward(const std::string&); - ReadAligner(CountingHash* ch, + ReadAligner(khmer::Countgraph* ch, BoundedCounterType trusted_cutoff, double bits_theta) : bitmask(comp_bitmask(ch->ksize())), rc_left_shift(ch->ksize() * 2 - 2), @@ -265,7 +265,7 @@ public: #endif } - ReadAligner(CountingHash* ch, + ReadAligner(khmer::Countgraph* ch, BoundedCounterType trusted_cutoff, double bits_theta, double* scoring_matrix, double* transitions) : bitmask(comp_bitmask(ch->ksize())), diff --git a/lib/storage.cc b/lib/storage.cc index ab27ced8f1..3836b10270 100644 --- a/lib/storage.cc +++ b/lib/storage.cc @@ -133,7 +133,7 @@ void BitStorage::save(std::string outfilename, WordLength ksize) } /** - * Loads @param infilename into Hashbits, with error checking on + * Loads @param infilename into BitStorage, with error checking on * file type and file version. Populates _counts internally. 
*/ void BitStorage::load(std::string infilename, WordLength &ksize) @@ -249,7 +249,7 @@ void BitStorage::load(std::string infilename, WordLength &ksize) } } -void CountingHashFile::save( +void ByteStorageFile::save( const std::string &outfilename, WordLength ksize, const ByteStorage &store) @@ -259,14 +259,14 @@ void CountingHashFile::save( std::string type = filename.substr(found + 1); if (type == "gz") { - CountingHashGzFileWriter(filename, ksize, store); + ByteStorageGzFileWriter(filename, ksize, store); } else { - CountingHashFileWriter(filename, ksize, store); + ByteStorageFileWriter(filename, ksize, store); } } -CountingHashFileReader::CountingHashFileReader( +ByteStorageFileReader::ByteStorageFileReader( const std::string &infilename, WordLength& ksize, ByteStorage &store) @@ -400,7 +400,7 @@ CountingHashFileReader::CountingHashFileReader( } } -CountingHashGzFileReader::CountingHashGzFileReader( +ByteStorageGzFileReader::ByteStorageGzFileReader( const std::string &infilename, WordLength &ksize, ByteStorage &store) @@ -576,7 +576,7 @@ CountingHashGzFileReader::CountingHashGzFileReader( gzclose(infile); } -CountingHashFileWriter::CountingHashFileWriter( +ByteStorageFileWriter::ByteStorageFileWriter( const std::string &outfilename, const WordLength ksize, const ByteStorage& store) @@ -634,7 +634,7 @@ CountingHashFileWriter::CountingHashFileWriter( outfile.close(); } -CountingHashGzFileWriter::CountingHashGzFileWriter( +ByteStorageGzFileWriter::ByteStorageGzFileWriter( const std::string &outfilename, const WordLength ksize, const ByteStorage &store) @@ -739,7 +739,7 @@ CountingHashGzFileWriter::CountingHashGzFileWriter( gzclose(outfile); } -void CountingHashFile::load( +void ByteStorageFile::load( const std::string &infilename, WordLength &ksize, ByteStorage &store) @@ -749,19 +749,19 @@ void CountingHashFile::load( std::string type = filename.substr(found + 1); if (type == "gz") { - CountingHashGzFileReader(filename, ksize, store); + 
ByteStorageGzFileReader(filename, ksize, store); } else { - CountingHashFileReader(filename, ksize, store); + ByteStorageFileReader(filename, ksize, store); } } void ByteStorage::save(std::string outfilename, WordLength ksize) { - CountingHashFile::save(outfilename, ksize, *this); + ByteStorageFile::save(outfilename, ksize, *this); } void ByteStorage::load(std::string infilename, WordLength& ksize) { - CountingHashFile::load(infilename, ksize, *this); + ByteStorageFile::load(infilename, ksize, *this); } diff --git a/lib/storage.hh b/lib/storage.hh index f33de2bc4b..f51d13a0b9 100644 --- a/lib/storage.hh +++ b/lib/storage.hh @@ -41,8 +41,6 @@ Contact: khmer-project@idyll.org namespace khmer { typedef std::map KmerCountMap; -class Hashbits; -class CountingHash; // // base Storage class for hashtable-related storage of information in memory. @@ -87,7 +85,6 @@ public: class BitStorage : public Storage { - friend class Hashbits; protected: std::vector _tablesizes; size_t _n_tables; @@ -95,6 +92,7 @@ protected: uint64_t _n_unique_kmers; Byte ** _counts; +public: BitStorage(std::vector& tablesizes) : _tablesizes(tablesizes) { @@ -236,20 +234,19 @@ protected: * */ -class CountingHashFile; -class CountingHashFileReader; -class CountingHashFileWriter; -class CountingHashGzFileReader; -class CountingHashGzFileWriter; - -class ByteStorage : public Storage -{ - friend class CountingHashFile; - friend class CountingHashFileReader; - friend class CountingHashFileWriter; - friend class CountingHashGzFileReader; - friend class CountingHashGzFileWriter; - friend class CountingHash; +class ByteStorageFile; +class ByteStorageFileReader; +class ByteStorageFileWriter; +class ByteStorageGzFileReader; +class ByteStorageGzFileWriter; + +class ByteStorage : public Storage { + friend class ByteStorageFile; + friend class ByteStorageFileReader; + friend class ByteStorageFileWriter; + friend class ByteStorageGzFileReader; + friend class ByteStorageGzFileWriter; + friend class CountGraph; 
protected: unsigned int _max_count; unsigned int _max_bigcount; @@ -423,7 +420,7 @@ public: // Helper classes for saving ByteStorage objs to disk & loading them. -class CountingHashFile +class ByteStorageFile { public: static void load(const std::string &infilename, @@ -434,35 +431,35 @@ public: const ByteStorage &store); }; -class CountingHashFileReader : public CountingHashFile +class ByteStorageFileReader : public ByteStorageFile { public: - CountingHashFileReader(const std::string &infilename, + ByteStorageFileReader(const std::string &infilename, WordLength &ksize, ByteStorage &store); }; -class CountingHashGzFileReader : public CountingHashFile +class ByteStorageGzFileReader : public ByteStorageFile { public: - CountingHashGzFileReader(const std::string &infilename, + ByteStorageGzFileReader(const std::string &infilename, WordLength &ksize, ByteStorage &store); }; -class CountingHashFileWriter : public CountingHashFile +class ByteStorageFileWriter : public ByteStorageFile { public: - CountingHashFileWriter(const std::string &outfilename, + ByteStorageFileWriter(const std::string &outfilename, const WordLength ksize, const ByteStorage &store); }; -class CountingHashGzFileWriter : public CountingHashFile +class ByteStorageGzFileWriter : public ByteStorageFile { public: - CountingHashGzFileWriter(const std::string &outfilename, + ByteStorageGzFileWriter(const std::string &outfilename, const WordLength ksize, const ByteStorage &store); }; diff --git a/lib/subset.cc b/lib/subset.cc index 3314df36aa..aa6ecc7caf 100644 --- a/lib/subset.cc +++ b/lib/subset.cc @@ -44,8 +44,7 @@ Contact: khmer-project@idyll.org #include #include -#include "counting.hh" -#include "hashtable.hh" +#include "hashgraph.hh" #include "khmer_exception.hh" #include "read_parsers.hh" #include "subset.hh" @@ -1193,7 +1192,7 @@ const void SubsetPartition::partition_average_coverages( PartitionCountMap &cm, - CountingHash * ht) const + Countgraph * ht) const { PartitionCountMap csum; 
PartitionCountMap cN; @@ -1218,7 +1217,7 @@ unsigned long long SubsetPartition::repartition_largest_partition( unsigned int distance, unsigned int threshold, unsigned int frequency, - CountingHash &counting) + Countgraph &counting) { PartitionCountMap cm; unsigned int n_unassigned = 0; diff --git a/lib/subset.hh b/lib/subset.hh index 1fc49b2fe1..02a710bc1f 100644 --- a/lib/subset.hh +++ b/lib/subset.hh @@ -46,8 +46,7 @@ Contact: khmer-project@idyll.org namespace khmer { -class CountingHash; -class Hashbits; +class Countgraph; class Hashgraph; struct pre_partition_info { @@ -157,10 +156,10 @@ public: unsigned int& n_unassigned) const; void partition_average_coverages(PartitionCountMap &cm, - CountingHash * ht) const; + Countgraph * ht) const; unsigned long long repartition_largest_partition(unsigned int, unsigned int, - unsigned int, CountingHash&); + unsigned int, Countgraph&); void repartition_a_partition(const SeenSet& partition_tags); void _clear_partition(PartitionID, SeenSet& partition_tags); diff --git a/lib/test-Colors.cc b/lib/test-Colors.cc index bcafd84ad6..28e06a05d9 100644 --- a/lib/test-Colors.cc +++ b/lib/test-Colors.cc @@ -50,7 +50,7 @@ int main() sizes + sizeof(sizes) / sizeof(HashIntoType) ); khmer::LabelHash * lh_pointer = new khmer::LabelHash(20, sizes_vec); - khmer::Hashbits * hb_pointer = (khmer::Hashbits *)lh_pointer; + khmer::Nodegraph * hb_pointer = (khmer::Hashbits *)lh_pointer; std::cout << "lh_pointer n_tags: " << lh_pointer->n_tags() << std::endl; std::cout << "hb_pointer n_tags: " << hb_pointer->n_tags() << std::endl; diff --git a/lib/test-compile.cc b/lib/test-compile.cc index 124ac9ad8e..d272613e0e 100644 --- a/lib/test-compile.cc +++ b/lib/test-compile.cc @@ -39,10 +39,10 @@ Contact: khmer-project@idyll.org // This file is used to test compilation with libkhmer.a/libkhmer.so, after // installation -#include +#include int main() { - khmer::CountingHash test(1,1); + khmer::Countgraph test(1,1); return 0; } diff --git a/setup.py 
b/setup.py index 6b20d71202..b252af3aa7 100755 --- a/setup.py +++ b/setup.py @@ -127,14 +127,16 @@ def check_for_openmp(): BUILD_DEPENDS = [] BUILD_DEPENDS.extend(path_join("lib", bn + ".hh") for bn in [ - "khmer", "kmer_hash", "hashtable", "counting", "hashbits", "labelhash", + "khmer", "kmer_hash", "hashtable", "labelhash", "hashgraph", "hllcounter", "khmer_exception", "read_aligner", "subset", "read_parsers", "kmer_filters", "traversal", "assembler", "alphabets", "storage"]) +BUILD_DEPENDS.extend(path_join("khmer", bn + ".hh") for bn in [ + "_cpy_counttable", "_cpy_hashgraph", "_cpy_nodetable"]) SOURCES = ["khmer/_khmer.cc"] SOURCES.extend(path_join("lib", bn + ".cc") for bn in [ - "read_parsers", "kmer_hash", "hashtable", - "hashbits", "labelhash", "counting", "subset", "read_aligner", + "read_parsers", "kmer_hash", "hashtable", "hashgraph", + "labelhash", "subset", "read_aligner", "hllcounter", "traversal", "kmer_filters", "assembler", "alphabets", "storage"]) diff --git a/tests/test_countgraph.py b/tests/test_countgraph.py index fa2032b695..f523df6eb6 100644 --- a/tests/test_countgraph.py +++ b/tests/test_countgraph.py @@ -1476,3 +1476,10 @@ def test_counting_load_bigcount(): print(i, count_table.count('ATATATATAT')) count = count_table.get('ATATATATAT') assert count == 500 + + +def test_bad_create(): + try: + countgraph = khmer._Countgraph(5, []) + except ValueError as err: + assert 'tablesizes needs to be one or more numbers' in str(err) diff --git a/tests/test_cpython_hierarchy.py b/tests/test_cpython_hierarchy.py new file mode 100644 index 0000000000..55232534db --- /dev/null +++ b/tests/test_cpython_hierarchy.py @@ -0,0 +1,64 @@ +# This file is part of khmer, https://github.com/dib-lab/khmer/, and is +# Copyright (C) 2016, The Regents of the University of California. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# +# * Neither the name of the University of California nor the names +# of its contributors may be used to endorse or promote products +# derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Contact: khmer-project@idyll.org +# pylint: disable=C0111,C0103,missing-docstring,no-member,protected-access + +from __future__ import print_function +from __future__ import absolute_import + +import khmer + +import pytest +from . 
import khmer_tst_utils as utils + + +def test_countgraph_vs_table(): + x = khmer.Counttable(4, 21, 3) + y = khmer.Countgraph(4, 21, 3) + + assert hasattr(x, 'add') + assert hasattr(y, 'add') + + assert not hasattr(x, 'consume_and_tag') + assert hasattr(y, 'consume_and_tag') + + +def test_nodegraph_vs_table(): + x = khmer.Nodetable(4, 21, 3) + y = khmer.Nodegraph(4, 21, 3) + + assert hasattr(x, 'add') + assert hasattr(y, 'add') + + assert not hasattr(x, 'consume_and_tag') + assert hasattr(y, 'consume_and_tag') diff --git a/tests/test_nodegraph.py b/tests/test_nodegraph.py index 49226bbfcf..78a3fef410 100644 --- a/tests/test_nodegraph.py +++ b/tests/test_nodegraph.py @@ -61,6 +61,13 @@ def test_toobig(): print(str(err)) +def test_bad_create(): + try: + nodegraph = khmer._Nodegraph(5, []) + except ValueError as err: + assert 'tablesizes needs to be one or more numbers' in str(err) + + def test__get_set_tag_density(): nodegraph = khmer._Nodegraph(32, [1]) orig = nodegraph._get_tag_density()