-
Notifications
You must be signed in to change notification settings - Fork 295
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add get_kmers() and get_kmer_counts() functions #1049
Changes from 5 commits
d42177c
ae175c0
4a66496
d1cf0b2
527dac5
56e1990
6de148b
1de9d5a
045e5f9
1f41c51
7be805b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -193,24 +193,10 @@ void Hashtable::get_median_count(const std::string &s, | |
float &stddev) | ||
{ | ||
std::vector<BoundedCounterType> counts; | ||
KMerIterator kmers(s.c_str(), _ksize); | ||
|
||
while(!kmers.done()) { | ||
HashIntoType kmer = kmers.next(); | ||
BoundedCounterType count = this->get_count(kmer); | ||
counts.push_back(count); | ||
} | ||
|
||
if (!counts.size()) { | ||
throw khmer_exception(); | ||
} | ||
this->get_kmer_counts(s, counts); | ||
|
||
if (!counts.size()) { | ||
median = 0; | ||
average = 0; | ||
stddev = 0; | ||
|
||
return; | ||
throw khmer_exception("no k-mer counts for this string; too short?"); | ||
} | ||
|
||
average = 0; | ||
|
@@ -1502,4 +1488,31 @@ void Hashtable::extract_unique_paths(std::string seq, | |
} | ||
} | ||
} | ||
|
||
|
||
void Hashtable::get_kmers(const std::string &s, | ||
std::vector<std::string> &kmers_vec) const | ||
{ | ||
if (s.length() < _ksize) { | ||
return; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In other cases where the sequence is shorter than K, we raise an exception; is this a case of letting an error pass silently? |
||
} | ||
for (unsigned int i = 0; i < s.length() - _ksize + 1; i++) { | ||
std::string sub = s.substr(i, i + _ksize); | ||
kmers_vec.push_back(sub); | ||
} | ||
} | ||
|
||
|
||
void Hashtable::get_kmer_counts(const std::string &s, | ||
std::vector<BoundedCounterType> &counts) const | ||
{ | ||
KMerIterator kmers(s.c_str(), _ksize); | ||
|
||
while(!kmers.done()) { | ||
HashIntoType kmer = kmers.next(); | ||
BoundedCounterType c = this->get_count(kmer); | ||
counts.push_back(c); | ||
} | ||
} | ||
|
||
// vim: set sts=2 sw=2: |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -217,6 +217,17 @@ def test_simple_median(): | |
assert int(stddev * 100) == 50 # .5 | ||
|
||
|
||
def test_median_too_short():
    """get_median_count() on a string shorter than k must raise ValueError."""
    counting = khmer.new_counting_hash(6, 1e6, 2)
    counting.consume("AAAAAA")

    try:
        counting.get_median_count("A")
    except ValueError:
        # Expected outcome: the one-character query is rejected.
        return

    assert 0, "this should fail"
|
||
|
||
def test_simple_kadian(): | ||
hi = khmer.new_counting_hash(6, 1e6, 2) | ||
hi.consume("ACTGCTATCTCTAGAGCTATG") | ||
|
@@ -311,6 +322,61 @@ def test_2_kadian(): | |
assert x == 1, x | ||
|
||
|
||
def test_get_kmer_counts_too_short():
    """get_kmer_counts() on a string shorter than k yields no counts."""
    counting = khmer.new_counting_hash(6, 1e6, 2)
    counting.consume("AAAAAA")

    assert len(counting.get_kmer_counts("A")) == 0
|
||
|
||
def test_get_kmer_counts():
    """Counts reported per k-mer track successive consume() calls."""
    counting = khmer.new_counting_hash(6, 1e6, 2)

    # Each step: (sequence to consume, sequence to query, expected counts).
    steps = [
        ("AAAAAA", "AAAAAA", [1]),
        ("AAAAAA", "AAAAAA", [2]),
        ("AAAAAT", "AAAAAAT", [2, 1]),
        ("AAAAAT", "AAAAAAT", [2, 2]),
        ("AAAAAT", "AAAAAAT", [2, 3]),
    ]

    for consumed, queried, expected in steps:
        counting.consume(consumed)
        counts = counting.get_kmer_counts(queried)
        print(counts)
        assert len(counts) == len(expected)
        for position, wanted in enumerate(expected):
            assert counts[position] == wanted
|
||
|
||
def test_get_kmers():
    """get_kmers() returns the overlapping 6-mers of a string, in order."""
    # NOTE(review): a reviewer flagged this table size as larger than this
    # test needs; the author agreed to shrink it. Left unchanged here.
    counting = khmer.new_counting_hash(6, 1e6, 2)

    expectations = [
        ("AAAAAA", ["AAAAAA"]),
        ("AAAAAAT", ["AAAAAA", "AAAAAT"]),
    ]
    for sequence, wanted in expectations:
        kmers = counting.get_kmers(sequence)
        assert kmers == wanted
|
||
|
||
def test_save_load(): | ||
inpath = utils.get_test_data('random-20-a.fa') | ||
savepath = utils.get_temp_filename('tempcountingsave0.ht') | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This has no dependency on Hashtable; it should go elsewhere.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It uses ksize.
Titus Brown, ctbrown@ucdavis.edu