Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add get_kmers() and get_kmer_counts() functions #1049

Merged
merged 11 commits into from
Jun 9, 2015
9 changes: 9 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
2015-06-04 Titus Brown <titus@idyll.org>

* lib/hashtable.{cc,hh}: add Hashtable::get_kmers()
and Hashtable::get_kmer_counts().
* khmer/_khmermodule.cc: add CPython functions for get_kmers() and
get_kmer_counts(); reorganize hashtable_methods a bit.
* tests/test_counting_hash.py: add tests for get_kmers() and
get_kmer_counts().

2015-06-04 Titus Brown <titus@idyll.org>

* khmer/_khmermodule.cc: added error handling to load_partitionmap.
Expand Down
185 changes: 150 additions & 35 deletions khmer/_khmermodule.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2281,61 +2281,165 @@ hashtable_extract_unique_paths(khmer_KHashtable_Object * me, PyObject * args)
return x;
}


static
PyObject *
hashtable_get_kmers(khmer_KHashtable_Object * me, PyObject * args)
{
Hashtable * hashtable = me->hashtable;
const char * sequence;

if (!PyArg_ParseTuple(args, "s", &sequence)) {
return NULL;
}

std::vector<std::string> kmers;

hashtable->get_kmers(sequence, kmers);

PyObject * x = PyList_New(kmers.size());
for (unsigned int i = 0; i < kmers.size(); i++) {
PyObject * obj = PyBytes_FromString(kmers[i].c_str());
PyList_SET_ITEM(x, i, obj);
}

return x;
}

static
PyObject *
hashtable_get_kmer_counts(khmer_KHashtable_Object * me, PyObject * args)
{
Hashtable * hashtable = me->hashtable;
const char * sequence;

if (!PyArg_ParseTuple(args, "s", &sequence)) {
return NULL;
}

std::vector<BoundedCounterType> counts;
hashtable->get_kmer_counts(sequence, counts);

PyObject * x = PyList_New(counts.size());
for (unsigned int i = 0; i <counts.size(); i++) {
PyObject * obj = PyInt_FromLong(counts[i]);
PyList_SET_ITEM(x, i, obj);
}

return x;
}


static PyMethodDef khmer_hashtable_methods[] = {
{ "ksize", (PyCFunction)hashtable_get_ksize, METH_VARARGS, "" },
//
// Basic methods
//

{ "ksize",
(PyCFunction)hashtable_get_ksize, METH_VARARGS,
"Returns the k-mer size of this graph."
},
{ "hashsizes", (PyCFunction)hashtable_get_hashsizes, METH_VARARGS, "" },
{ "n_unique_kmers", (PyCFunction)hashtable_n_unique_kmers, METH_VARARGS, "Count the number of unique kmers" },
{ "n_occupied", (PyCFunction)hashtable_n_occupied, METH_VARARGS, "Count the number of occupied bins" },
{ "n_entries", (PyCFunction)hashtable_n_entries, METH_VARARGS, "" },
{ "count", (PyCFunction)hashtable_count, METH_VARARGS, "Count the given kmer" },
{ "consume", (PyCFunction)hashtable_consume, METH_VARARGS, "Count all k-mers in the given string" },
{ "consume_fasta", (PyCFunction)hashtable_consume_fasta, METH_VARARGS, "Count all k-mers in a given file" },
{ "n_unique_kmers",
(PyCFunction)hashtable_n_unique_kmers, METH_VARARGS,
"Count the number of unique kmers in this graph."
},
{
"consume_fasta_with_reads_parser", (PyCFunction)hashtable_consume_fasta_with_reads_parser,
METH_VARARGS, "Count all k-mers using a given reads parser"
"n_occupied", (PyCFunction)hashtable_n_occupied, METH_VARARGS,
"Count the number of occupied bins."
},
{ "n_entries", (PyCFunction)hashtable_n_entries, METH_VARARGS, "" },
{ "count",
(PyCFunction)hashtable_count, METH_VARARGS,
"Increment the count of this k-mer."
},
{ "consume",
(PyCFunction)hashtable_consume, METH_VARARGS,
"Increment the counts of all of the k-mers in the string."
},
{ "consume_fasta",
(PyCFunction)hashtable_consume_fasta, METH_VARARGS,
"Incrment the counts of all the k-mers in the sequences in the "
"given file"
},
{ "consume_fasta_with_reads_parser",
(PyCFunction)hashtable_consume_fasta_with_reads_parser, METH_VARARGS,
"Count all k-mers retrieved with this reads parser object."
},
{ "get",
(PyCFunction)hashtable_get, METH_VARARGS,
"Retrieve the count for the given k-mer."
},
{ "load",
(PyCFunction)hashtable_load, METH_VARARGS,
"Load the graph from the specified file." },
{ "save",
(PyCFunction)hashtable_save, METH_VARARGS,
"Save the graph to the specified file."
},
{ "get_median_count",
(PyCFunction)hashtable_get_median_count, METH_VARARGS,
"Get the median, average, and stddev of the k-mer counts "
" in the string"
},
{ "get_kmers",
(PyCFunction)hashtable_get_kmers, METH_VARARGS,
"Generate an ordered list of all substrings of length k in the string."
},
{ "get", (PyCFunction)hashtable_get, METH_VARARGS, "Get the count for the given k-mer" },
{ "load", (PyCFunction)hashtable_load, METH_VARARGS, "" },
{ "save", (PyCFunction)hashtable_save, METH_VARARGS, "" },
{ "get_kmer_counts",
(PyCFunction)hashtable_get_kmer_counts, METH_VARARGS,
"Retrieve the counts of all k-mers in the string."
},

//
// graph/traversal functionality
//

{ "calc_connected_graph_size",
(PyCFunction)hashtable_calc_connected_graph_size, METH_VARARGS, ""
},
{ "kmer_degree",
(PyCFunction)hashtable_kmer_degree, METH_VARARGS,
"Calculate the number of immediate neighbors this k-mer has in "
"the graph."
},
{ "count_kmers_within_radius",
(PyCFunction)hashtable_count_kmers_within_radius, METH_VARARGS,
"Calculate the number of neighbors with given radius in the graph."
},

//
// tagging / sparse graph functionality
//

{ "consume_and_tag", (PyCFunction)hashtable_consume_and_tag, METH_VARARGS, "Consume a sequence and tag it" },
{ "get_tags_and_positions", (PyCFunction)hashtable_get_tags_and_positions, METH_VARARGS, "Retrieve tags and their positions in a sequence." },
{ "find_all_tags_list", (PyCFunction)hashtable_find_all_tags_list, METH_VARARGS, "Find all tags within range of the given k-mer, return as list" },
{ "consume_fasta_and_tag", (PyCFunction)hashtable_consume_fasta_and_tag, METH_VARARGS, "Count all k-mers in a given file" },
{ "get_median_count", (PyCFunction)hashtable_get_median_count, METH_VARARGS, "Get the median, average, and stddev of the k-mer counts in the string" },
{ "extract_unique_paths", (PyCFunction)hashtable_extract_unique_paths, METH_VARARGS, "" },
{ "load_stop_tags", (PyCFunction)hashtable_load_stop_tags, METH_VARARGS, "" },
{ "save_stop_tags", (PyCFunction)hashtable_save_stop_tags, METH_VARARGS, "" },
{ "print_stop_tags", (PyCFunction)hashtable_print_stop_tags, METH_VARARGS, "" },
{ "print_tagset", (PyCFunction)hashtable_print_tagset, METH_VARARGS, "" },
{ "calc_connected_graph_size", (PyCFunction)hashtable_calc_connected_graph_size, METH_VARARGS, "" },
{ "kmer_degree", (PyCFunction)hashtable_kmer_degree, METH_VARARGS, "" },
{ "trim_on_stoptags", (PyCFunction)hashtable_trim_on_stoptags, METH_VARARGS, "" },
{ "identify_stoptags_by_position", (PyCFunction)hashtable_identify_stoptags_by_position, METH_VARARGS, "" },
{ "do_subset_partition", (PyCFunction)hashtable_do_subset_partition, METH_VARARGS, "" },
{ "find_all_tags", (PyCFunction)hashtable_find_all_tags, METH_VARARGS, "" },
{ "assign_partition_id", (PyCFunction)hashtable_assign_partition_id, METH_VARARGS, "" },
{ "output_partitions", (PyCFunction)hashtable_output_partitions, METH_VARARGS, "" },
{ "find_unpart", (PyCFunction)hashtable_find_unpart, METH_VARARGS, "" },
{ "filter_if_present", (PyCFunction)hashtable_filter_if_present, METH_VARARGS, "" },
{ "add_tag", (PyCFunction)hashtable_add_tag, METH_VARARGS, "" },
{ "add_stop_tag", (PyCFunction)hashtable_add_stop_tag, METH_VARARGS, "" },
{ "get_stop_tags", (PyCFunction)hashtable_get_stop_tags, METH_VARARGS, "" },
{ "get_tagset", (PyCFunction)hashtable_get_tagset, METH_VARARGS, "" },
{ "load_tagset", (PyCFunction)hashtable_load_tagset, METH_VARARGS, "" },
{ "save_tagset", (PyCFunction)hashtable_save_tagset, METH_VARARGS, "" },
{ "n_tags", (PyCFunction)hashtable_n_tags, METH_VARARGS, "" },
{ "divide_tags_into_subsets", (PyCFunction)hashtable_divide_tags_into_subsets, METH_VARARGS, "" },
{ "_get_tag_density", (PyCFunction)hashtable__get_tag_density, METH_VARARGS, "" },
{ "_set_tag_density", (PyCFunction)hashtable__set_tag_density, METH_VARARGS, "" },

// partitioning
{ "do_subset_partition", (PyCFunction)hashtable_do_subset_partition, METH_VARARGS, "" },
{ "find_all_tags", (PyCFunction)hashtable_find_all_tags, METH_VARARGS, "" },
{ "assign_partition_id", (PyCFunction)hashtable_assign_partition_id, METH_VARARGS, "" },
{ "output_partitions", (PyCFunction)hashtable_output_partitions, METH_VARARGS, "" },
{ "find_unpart", (PyCFunction)hashtable_find_unpart, METH_VARARGS, "" },
{ "load_partitionmap", (PyCFunction)hashtable_load_partitionmap, METH_VARARGS, "" },
{ "save_partitionmap", (PyCFunction)hashtable_save_partitionmap, METH_VARARGS, "" },
{ "_validate_partitionmap", (PyCFunction)hashtable__validate_partitionmap, METH_VARARGS, "" },
{ "_get_tag_density", (PyCFunction)hashtable__get_tag_density, METH_VARARGS, "" },
{ "_set_tag_density", (PyCFunction)hashtable__set_tag_density, METH_VARARGS, "" },
{
"consume_fasta_and_tag_with_reads_parser", (PyCFunction)hashtable_consume_fasta_and_tag_with_reads_parser,
{ "consume_fasta_and_traverse", (PyCFunction)hashtable_consume_fasta_and_traverse, METH_VARARGS, "" },
{ "consume_fasta_and_tag_with_reads_parser", (PyCFunction)hashtable_consume_fasta_and_tag_with_reads_parser,
METH_VARARGS, "Count all k-mers using a given reads parser"
},
{ "consume_fasta_and_traverse", (PyCFunction)hashtable_consume_fasta_and_traverse, METH_VARARGS, "" },
{ "consume_fasta_and_tag_with_stoptags", (PyCFunction)hashtable_consume_fasta_and_tag_with_stoptags, METH_VARARGS, "Count all k-mers in a given file" },
{ "consume_partitioned_fasta", (PyCFunction)hashtable_consume_partitioned_fasta, METH_VARARGS, "Count all k-mers in a given file" },
{ "join_partitions_by_path", (PyCFunction)hashtable_join_partitions_by_path, METH_VARARGS, "" },
{ "merge_subset", (PyCFunction)hashtable_merge_subset, METH_VARARGS, "" },
Expand All @@ -2350,9 +2454,20 @@ static PyMethodDef khmer_hashtable_methods[] = {
{ "join_partitions", (PyCFunction)hashtable_join_partitions, METH_VARARGS, "" },
{ "get_partition_id", (PyCFunction)hashtable_get_partition_id, METH_VARARGS, "" },
{ "is_single_partition", (PyCFunction)hashtable_is_single_partition, METH_VARARGS, "" },
{ "count_kmers_within_radius", (PyCFunction)hashtable_count_kmers_within_radius, METH_VARARGS, "" },
{ "traverse_from_tags", (PyCFunction)hashtable_traverse_from_tags, METH_VARARGS, "" },
{ "repartition_largest_partition", (PyCFunction)hashtable_repartition_largest_partition, METH_VARARGS, "" },

// stop tags
{ "load_stop_tags", (PyCFunction)hashtable_load_stop_tags, METH_VARARGS, "" },
{ "save_stop_tags", (PyCFunction)hashtable_save_stop_tags, METH_VARARGS, "" },
{ "print_stop_tags", (PyCFunction)hashtable_print_stop_tags, METH_VARARGS, "" },
{ "trim_on_stoptags", (PyCFunction)hashtable_trim_on_stoptags, METH_VARARGS, "" },
{ "identify_stoptags_by_position", (PyCFunction)hashtable_identify_stoptags_by_position, METH_VARARGS, "" },
{ "filter_if_present", (PyCFunction)hashtable_filter_if_present, METH_VARARGS, "" },
{ "add_stop_tag", (PyCFunction)hashtable_add_stop_tag, METH_VARARGS, "" },
{ "get_stop_tags", (PyCFunction)hashtable_get_stop_tags, METH_VARARGS, "" },
{ "consume_fasta_and_tag_with_stoptags", (PyCFunction)hashtable_consume_fasta_and_tag_with_stoptags, METH_VARARGS, "Count all k-mers in a given file" },

{NULL, NULL, 0, NULL} /* sentinel */
};

Expand Down
45 changes: 29 additions & 16 deletions lib/hashtable.cc
Original file line number Diff line number Diff line change
Expand Up @@ -193,24 +193,10 @@ void Hashtable::get_median_count(const std::string &s,
float &stddev)
{
std::vector<BoundedCounterType> counts;
KMerIterator kmers(s.c_str(), _ksize);

while(!kmers.done()) {
HashIntoType kmer = kmers.next();
BoundedCounterType count = this->get_count(kmer);
counts.push_back(count);
}

if (!counts.size()) {
throw khmer_exception();
}
this->get_kmer_counts(s, counts);

if (!counts.size()) {
median = 0;
average = 0;
stddev = 0;

return;
throw khmer_exception("no k-mer counts for this string; too short?");
}

average = 0;
Expand Down Expand Up @@ -1502,4 +1488,31 @@ void Hashtable::extract_unique_paths(std::string seq,
}
}
}


void Hashtable::get_kmers(const std::string &s,
std::vector<std::string> &kmers_vec) const
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This has no dependency on Hashtable; it should go elsewhere.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It uses ksize.

Titus Brown, ctbrown@ucdavis.edu

On Jun 19, 2015, at 11:40 AM, Michael R. Crusoe notifications@github.com wrote:

In lib/hashtable.cc:

@@ -1522,4 +1508,43 @@ void Hashtable::extract_unique_paths(std::string seq,
}
}
}
+
+
+void Hashtable::get_kmers(const std::string &s,

  •                      std::vectorstd::string &kmers_vec) const
    
    This has no dependency on Hashtable; it should go elsewhere.


Reply to this email directly or view it on GitHub.

{
if (s.length() < _ksize) {
return;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In other cases where the sequence is shorter than K, we raise an exception; is this a case of letting an error pass silently?

}
for (unsigned int i = 0; i < s.length() - _ksize + 1; i++) {
std::string sub = s.substr(i, i + _ksize);
kmers_vec.push_back(sub);
}
}


void Hashtable::get_kmer_counts(const std::string &s,
std::vector<BoundedCounterType> &counts) const
{
KMerIterator kmers(s.c_str(), _ksize);

while(!kmers.done()) {
HashIntoType kmer = kmers.next();
BoundedCounterType c = this->get_count(kmer);
counts.push_back(c);
}
}

// vim: set sts=2 sw=2:
8 changes: 8 additions & 0 deletions lib/hashtable.hh
Original file line number Diff line number Diff line change
Expand Up @@ -462,6 +462,14 @@ public:

return kmer_degree(kmer_f, kmer_r);
}

// return all k-mer substrings, on the forward strand.
void get_kmers(const std::string &s, std::vector<std::string> &kmers)
const;

// return counts of all k-mers in this string.
void get_kmer_counts(const std::string &s,
std::vector<BoundedCounterType> &counts) const;
};
};

Expand Down
66 changes: 66 additions & 0 deletions tests/test_counting_hash.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,17 @@ def test_simple_median():
assert int(stddev * 100) == 50 # .5


def test_median_too_short():
hi = khmer.new_counting_hash(6, 1e6, 2)

hi.consume("AAAAAA")
try:
hi.get_median_count("A")
assert 0, "this should fail"
except ValueError:
pass


def test_simple_kadian():
hi = khmer.new_counting_hash(6, 1e6, 2)
hi.consume("ACTGCTATCTCTAGAGCTATG")
Expand Down Expand Up @@ -311,6 +322,61 @@ def test_2_kadian():
assert x == 1, x


def test_get_kmer_counts_too_short():
hi = khmer.new_counting_hash(6, 1e6, 2)

hi.consume("AAAAAA")
counts = hi.get_kmer_counts("A")
assert len(counts) == 0


def test_get_kmer_counts():
hi = khmer.new_counting_hash(6, 1e6, 2)

hi.consume("AAAAAA")
counts = hi.get_kmer_counts("AAAAAA")
print counts
assert len(counts) == 1
assert counts[0] == 1

hi.consume("AAAAAA")
counts = hi.get_kmer_counts("AAAAAA")
print counts
assert len(counts) == 1
assert counts[0] == 2

hi.consume("AAAAAT")
counts = hi.get_kmer_counts("AAAAAAT")
print counts
assert len(counts) == 2
assert counts[0] == 2
assert counts[1] == 1

hi.consume("AAAAAT")
counts = hi.get_kmer_counts("AAAAAAT")
print counts
assert len(counts) == 2
assert counts[0] == 2
assert counts[1] == 2

hi.consume("AAAAAT")
counts = hi.get_kmer_counts("AAAAAAT")
print counts
assert len(counts) == 2
assert counts[0] == 2
assert counts[1] == 3


def test_get_kmers():
hi = khmer.new_counting_hash(6, 1e6, 2)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

too large and not needed

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+def test_get_kmers():

  • hi = khmer.new_counting_hash(6, 1e6, 2)

too large and not needed

While I appreciate that it's unnecessary, this is only 2 MB ;). I'll fix it
but -0 on this kind of thing in the future.


kmers = hi.get_kmers("AAAAAA")
assert kmers == ["AAAAAA"]

kmers = hi.get_kmers("AAAAAAT")
assert kmers == ["AAAAAA", "AAAAAT"]


def test_save_load():
inpath = utils.get_test_data('random-20-a.fa')
savepath = utils.get_temp_filename('tempcountingsave0.ht')
Expand Down