Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simplify the hashing code by removing sliding window optimization &c. #1432

Closed
wants to merge 14 commits into from
Closed
14 changes: 2 additions & 12 deletions khmer/_khmer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -114,17 +114,7 @@ extern "C" {
static bool convert_PyObject_to_Kmer(PyObject * value,
Kmer& kmer, WordLength ksize)
{
if (PyInt_Check(value) || PyLong_Check(value)) {
HashIntoType h;
if (PyLong_Check(value)) {
h = PyLong_AsUnsignedLongLong(value);
} else {
h = PyInt_AsLong(value);
}

kmer.set_from_unique_hash(h, ksize);
return true;
} else if (PyUnicode_Check(value)) {
if (PyUnicode_Check(value)) {
std::string s = PyBytes_AsString(PyUnicode_AsEncodedString(
value, "utf-8", "strict"));
if (strlen(s.c_str()) != ksize) {
Expand All @@ -146,7 +136,7 @@ static bool convert_PyObject_to_Kmer(PyObject * value,
return true;
} else {
PyErr_SetString(PyExc_ValueError,
"k-mers must be either a hash or a string");
"k-mers must be a string");
return false;
}
}
Expand Down
2 changes: 1 addition & 1 deletion lib/counting.cc
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ const

KmerIterator kmers(seq.c_str(), _ksize);

HashIntoType kmer = kmers.next();
HashIntoType kmer = kmers.first();
if (kmers.done()) {
return posns;
}
Expand Down
7 changes: 0 additions & 7 deletions lib/hashtable.cc
Original file line number Diff line number Diff line change
Expand Up @@ -244,13 +244,6 @@ bool Hashtable::median_at_least(const std::string &s,
//////////////////////////////////////////////////////////////////////
// graph stuff

// Return the degree of a k-mer supplied as a pair of hash values.
//
// kmer_f, kmer_r: the two hash values handed to build_kmer(); presumably the
//                 forward and reverse-complement hashes of the same k-mer
//                 (inferred from the names -- confirm against build_kmer()).
//
// Returns whatever Traverser::degree() reports for the resulting Kmer node.
unsigned int Hashtable::kmer_degree(HashIntoType kmer_f, HashIntoType kmer_r)
{
// The traverser is constructed over this hashtable.
Traverser traverser(this);
Kmer node = build_kmer(kmer_f, kmer_r);
return traverser.degree(node);
}

unsigned int Hashtable::kmer_degree(const char * kmer_s)
{
Traverser traverser(this);
Expand Down
1 change: 0 additions & 1 deletion lib/hashtable.hh
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,6 @@ public:
unsigned int max_count = MAX_KEEPER_SIZE)
const;

unsigned int kmer_degree(HashIntoType kmer_f, HashIntoType kmer_r);
unsigned int kmer_degree(const char * kmer_s);

// return all k-mer substrings, on the forward strand.
Expand Down
124 changes: 23 additions & 101 deletions lib/kmer_hash.cc
Original file line number Diff line number Diff line change
Expand Up @@ -55,53 +55,49 @@ using namespace std;
namespace khmer
{

HashIntoType _hash(const char * kmer, const WordLength k,
HashIntoType& _h, HashIntoType& _r)
// _hash_forward: return the hash from the forward direction only.

HashIntoType _hash_forward(const char * kmer, WordLength k)
{
// sizeof(HashIntoType) * 8 bits / 2 bits/base
if (!(k <= sizeof(HashIntoType)*4) || !(strlen(kmer) >= k)) {
if (!(k <= sizeof(HashIntoType)*4) || strlen(kmer) < k) {
throw khmer_exception("Supplied kmer string doesn't match the underlying k-size.");
}

HashIntoType h = 0, r = 0;
HashIntoType h = 0;

h |= twobit_repr(kmer[0]);
r |= twobit_comp(kmer[k-1]);

for (WordLength i = 1, j = k - 2; i < k; i++, j--) {
for (WordLength i = 1; i < k; i++) {
h = h << 2;
r = r << 2;

h |= twobit_repr(kmer[i]);
r |= twobit_comp(kmer[j]);
}

_h = h;
_r = r;

return uniqify_rc(h, r);
return h;
}

// _hash: return the maximum of the forward and reverse hash.

HashIntoType _hash(const char * kmer, const WordLength k)
HashIntoType _hash(const char * kmer, const WordLength k,
HashIntoType& _h, HashIntoType& _r)
{
HashIntoType h = 0;
HashIntoType r = 0;
std::string _revcomp(const std::string&);
std::string fwd(kmer);
fwd = fwd.substr(0, k);
std::string rc = _revcomp(fwd);

return khmer::_hash(kmer, k, h, r);
_h = _hash_forward(fwd.c_str(), k);
_r = _hash_forward(rc.c_str(), k);

return uniqify_rc(_h, _r);
}

// _hash_forward: return the hash from the forward direction only.
// _hash: return the maximum of the forward and reverse hash.

HashIntoType _hash_forward(const char * kmer, WordLength k)
HashIntoType _hash(const char * kmer, const WordLength k)
{
HashIntoType h = 0;
HashIntoType r = 0;


khmer::_hash(kmer, k, h, r);
return h; // return forward only
return khmer::_hash(kmer, k, h, r);
}

HashIntoType _hash(const std::string kmer, const WordLength k)
Expand All @@ -123,17 +119,11 @@ std::string _revhash(HashIntoType hash, WordLength k)
{
std::string s = "";

unsigned int val = hash & 3;
s += revtwobit_repr(val);

s += "A";
for (WordLength i = 1; i < k; i++) {
hash = hash >> 2;
val = hash & 3;
s += revtwobit_repr(val);
s += "A";
}

reverse(s.begin(), s.end());

return s;
}

Expand All @@ -145,7 +135,7 @@ std::string _revcomp(const std::string& kmer)
for (size_t i=0; i < ksize; ++i) {
char complement;

switch(kmer[i]) {
switch(toupper(kmer[i])) {
case 'A':
complement = 'T';
break;
Expand Down Expand Up @@ -198,72 +188,4 @@ HashIntoType _hash_murmur_forward(const std::string& kmer)
khmer::_hash_murmur(kmer, h, r);
return h;
}

// Construct an iterator over the k-mers of a C string.
//
// seq: NUL-terminated sequence (its length is taken with strlen below). Only
//      the pointer is stored in _seq, so the buffer presumably has to outlive
//      the iterator -- confirm against the declaration of _seq.
// k:   k-mer size, forwarded to the KmerFactory base (presumably what sets
//      _ksize -- confirm in KmerFactory).
//
// No hashing happens here; the first k-mer is computed lazily by first(),
// which next() invokes on its first call.
KmerIterator::KmerIterator(const char * seq,
unsigned char k) :
KmerFactory(k), _seq(seq)
{
// Build a mask with the low 2*_ksize bits set (two bits are added per
// iteration); next() uses it to truncate the forward hash after shifting.
bitmask = 0;
for (unsigned char i = 0; i < _ksize; i++) {
bitmask = (bitmask << 2) | 3;
}
// Shift amount next() uses to place a 2-bit value in the highest 2-bit slot
// of a _ksize-long hash.
_nbits_sub_1 = (_ksize*2 - 2);

index = _ksize - 1;
length = strlen(_seq);
_kmer_f = 0;
_kmer_r = 0;

// Cleared so that the first call to next() goes through first().
initialized = false;
}

// Hash the k-mer at the start of _seq and reset the iteration position.
//
// f, r: output parameters; receive the values _hash() stored in _kmer_f and
//       _kmer_r respectively.
//
// Returns a Kmer constructed from _kmer_f, _kmer_r and the value returned by
// _hash().
//
// Note: this function does not touch `initialized`; next() sets that flag
// itself before delegating here.
Kmer KmerIterator::first(HashIntoType& f, HashIntoType& r)
{
HashIntoType x;
// _hash() fills _kmer_f and _kmer_r through its reference parameters and
// returns a single combined hash value.
x = _hash(_seq, _ksize, _kmer_f, _kmer_r);

f = _kmer_f;
r = _kmer_r;

// next() reads _seq[index], so this makes it continue with the character
// immediately after the first k-mer.
index = _ksize;

return Kmer(_kmer_f, _kmer_r, x);
}

// Advance to the next k-mer and return it.
//
// f, r: output parameters; receive the updated _kmer_f and _kmer_r.
//
// On the first call (while `initialized` is false) this delegates to first().
// On later calls it consumes one character of _seq and updates both stored
// hashes incrementally instead of rehashing the whole k-mer.
//
// Throws khmer_exception if done() is already true on entry, or if advancing
// `index` moves it past `length`.
Kmer KmerIterator::next(HashIntoType& f, HashIntoType& r)
{
if (done()) {
throw khmer_exception();
}

if (!initialized) {
initialized = true;
return first(f, r);
}

// Read the incoming character before advancing; the bounds check below runs
// against the already-incremented index.
unsigned char ch = _seq[index];
index++;
if (!(index <= length)) {
throw khmer_exception();
}

// left-shift the previous hash over
_kmer_f = _kmer_f << 2;

// 'or' in the current nt
_kmer_f |= twobit_repr(ch);

// mask off the 2 bits we shifted over.
_kmer_f &= bitmask;

// now handle reverse complement
// The reverse hash moves the opposite way: drop its lowest 2 bits, then put
// twobit_comp(ch) into the highest 2-bit slot (shift of _nbits_sub_1).
_kmer_r = _kmer_r >> 2;
_kmer_r |= (twobit_comp(ch) << _nbits_sub_1);

f = _kmer_f;
r = _kmer_r;

return build_kmer(_kmer_f, _kmer_r);
}

}
Loading