Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mqf integration2 #1873

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
3 changes: 3 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,12 @@ matrix:
osx_image: xcode7.3
env:
- TESTATTR="'not linux and not known_failing and not huge'"
- CXXFLAGS="$CXXFLAGS -nostdinc+"
- CXX="clang++ -stdlib=libc++"
before_install:
- source ci_scripts/install.sh


# command to install common dependencies
install:
- python --version
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ clean: FORCE
cd src/oxli && $(MAKE) clean || true
cd tests && rm -rf khmertest_* || true
rm -f pytests.xml
cd third-party/cqf && make clean || true
cd third-party/MQF && make clean || true
rm -f $(EXTENSION_MODULE)
rm -f khmer/*.pyc scripts/*.pyc tests/*.pyc oxli/*.pyc \
sandbox/*.pyc khmer/__pycache__/* sandbox/__pycache__/* \
Expand Down
4 changes: 2 additions & 2 deletions examples/c++-api/Makefile
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
# Compiler flags for the C++ API examples: C++11 plus the include paths for
# the oxli headers and the bundled third-party libraries.
# The MQF include directory is spelled in upper case so that it agrees with
# the path used by the top-level Makefile (third-party/MQF); a lower-case
# spelling is not found on case-sensitive filesystems.
CXXFLAGS=--std=c++11 \
-I ../../include/ \
-I ../../third-party/smhasher \
-I ../../third-party/cqf \
-I ../../third-party/MQF \
-I ../../third-party/seqan/core/include/ \
-I ../../third-party/rollinghash

TESTS=exact-counting bloom consume

%: %.cc ../../src/oxli/liboxli.a
$(CXX) $(CXXFLAGS) $< ../../src/oxli/liboxli.a -o $@
$(CXX) $(CXXFLAGS) $< ../../src/oxli/liboxli.a -o $@ -lstdc++

../../src/oxli/liboxli.a:
cd ../../src/oxli && make
Expand Down
12 changes: 10 additions & 2 deletions include/oxli/hashtable.hh
Original file line number Diff line number Diff line change
Expand Up @@ -614,10 +614,18 @@ public:
// Hashtable-derived class with QFStorage.
// Each constructor allocates a QFStorage and hands it, together with ksize,
// to the MurmurHashtable base class.
class QFCounttable : public oxli::MurmurHashtable
{
public:
// Builds the table on a QFStorage constructed from `size` alone.
explicit QFCounttable(WordLength ksize, int size)
: MurmurHashtable(ksize, new QFStorage(size)) { } ;
// Builds the table on a QFStorage constructed from `size` and `slotsize`;
// both values are forwarded unchanged.
explicit QFCounttable(WordLength ksize, int size,int slotsize)
: MurmurHashtable(ksize, new QFStorage(size,slotsize)) { } ;
};

// Hashtable-derived class with BufferedMQFStorage.
class BufferedQFCounttable : public oxli::MurmurHashtable
{
public:
    // ksize is passed to the MurmurHashtable base; size and slotsize are
    // forwarded unchanged to the BufferedMQFStorage that backs this table.
    explicit BufferedQFCounttable(WordLength ksize, int size, int slotsize)
        : MurmurHashtable(ksize, new BufferedMQFStorage(size, slotsize))
    {
    }
};


// Hashtable-derived class with BitStorage.
class Nodetable : public oxli::MurmurHashtable
{
Expand Down
188 changes: 146 additions & 42 deletions include/oxli/storage.hh
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ Contact: khmer-project@idyll.org
using MuxGuard = std::lock_guard<std::mutex>;

#include "gqf.h"
#include "bufferedMQF.h"

namespace oxli {
typedef std::unordered_map<HashIntoType, BoundedCounterType> KmerCountMap;
Expand Down Expand Up @@ -410,53 +411,156 @@ public:
*
* \brief A Quotient Filter storage
*/
class QFStorage : public Storage {
class QFStorage : public Storage
{
protected:
QF cf;
QF mf;

public:
QFStorage(int size) {
// size is the power of two to specify the number of slots in
// the filter (2**size). Third argument sets the number of bits used
// in the key (current value of size+8 is copied from the CQF example)
// Final argument is the number of bits allocated for the value, which
// we do not use.
qf_init(&cf, (1ULL << size), size+8, 0);
}

~QFStorage() { qf_destroy(&cf); }

BoundedCounterType test_and_set_bits(HashIntoType khash) {
BoundedCounterType x = get_count(khash);
add(khash);
return !x;
}

//
bool add(HashIntoType khash) {
bool is_new = get_count(khash) == 0;
qf_insert(&cf, khash % cf.range, 0, 1);
return is_new;
}

// get the count for the given k-mer hash.
const BoundedCounterType get_count(HashIntoType khash) const {
return qf_count_key_value(&cf, khash % cf.range, 0);
}

// Accessors for protected/private table info members
// xnslots is larger than nslots. It includes some extra slots to deal
// with some details of how the counting is implemented
std::vector<uint64_t> get_tablesizes() const { return {cf.xnslots}; }
const size_t n_tables() const { return 1; }
const uint64_t n_unique_kmers() const { return cf.ndistinct_elts; }
const uint64_t n_occupied() const { return cf.noccupied_slots; }
void save(std::string outfilename, WordLength ksize);
void load(std::string infilename, WordLength &ksize);

Byte **get_raw_tables() { return nullptr; }
QFStorage(int size,int slotSize)
{
// size is the power of two to specify the number of slots in
// the filter (2**size). The third argument to qf_init is size+slotSize
// rather than the fixed size+8 used by the earlier CQF call.
// NOTE(review): presumably this is still the key width in bits, and the
// remaining arguments are fixed constants whose meaning should be confirmed
// against the MQF gqf.h header.
_supports_bigcount = true;
qf_init(&mf, (1ULL << size), size+slotSize, 0,2,0,true,"",2038074761);



}

~QFStorage()
{
qf_destroy(&mf);
}

BoundedCounterType test_and_set_bits(HashIntoType khash)
{
BoundedCounterType x = get_count(khash);
add(khash);
return !x;
}

//
bool add(HashIntoType khash)
{
bool is_new = get_count(khash) == 0;
qf_insert(&mf, khash % mf.metadata->range, 1,false,false);
return is_new;
}

// get the count for the given k-mer hash.
const BoundedCounterType get_count(HashIntoType khash) const
{
return qf_count_key(&mf, khash % mf.metadata->range);
}

// Accessors for protected/private table info members
// xnslots is larger than nslots. It includes some extra slots to deal
// with some details of how the counting is implemented
std::vector<uint64_t> get_tablesizes() const
{
return {mf.metadata->xnslots};
}
const size_t n_tables() const
{
return 1;
}
const uint64_t n_unique_kmers() const
{
return mf.metadata->ndistinct_elts;
}
const uint64_t n_occupied() const
{
return mf.metadata->noccupied_slots;
}
void save(std::string outfilename, WordLength ksize);
void load(std::string infilename, WordLength &ksize);

Byte **get_raw_tables()
{
return nullptr;
}
};

/*
 * \class BufferedMQFStorage
 *
 * \brief A Storage implementation backed by a bufferedMQF.
 *
 * Every counting operation defined inline here is forwarded to the
 * bufferedMQF `main`. Hashes are reduced modulo
 * `main.disk->metadata->range` before being inserted or queried. The QF
 * `buffer` is initialised and destroyed together with `main`, but none of
 * the inline methods below read or write it.
 */
class BufferedMQFStorage : public Storage
{
protected:
    QF buffer;
    bufferedMQF main;

public:
    // size is the power of two that gives the number of slots requested
    // for the second size argument of bufferedMQF_init (2**size); the first
    // size argument is a quarter of that (2**(size-2)).
    // slotSize is added to the slot exponent to form the width argument
    // handed to qf_init / bufferedMQF_init.
    // NOTE(review): presumably the width is the key size in bits and the two
    // size arguments are the in-memory and on-disk slot counts -- confirm
    // against bufferedMQF.h. Assumes 0 <= size < 64.
    BufferedMQFStorage(int size, int slotSize)
    {
        _supports_bigcount = true;
        qf_init(&buffer, (1ULL << 15), 15 + slotSize, 0, 2, 0, true, "", 2038074761);

        // The original expression `1ULL << (size-2)` shifts by a negative
        // amount when size < 2, which is undefined behaviour. Clamp the
        // exponent at zero; for size >= 2 the value is unchanged.
        const int reducedBits = (size > 2) ? (size - 2) : 0;
        bufferedMQF_init(&main, (1ULL << reducedBits), (1ULL << size),
                         size + slotSize, 0, 2, "");
    }

    ~BufferedMQFStorage()
    {
        qf_destroy(&buffer);
        bufferedMQF_destroy(&main);
    }

    // Adds one occurrence of khash and returns non-zero exactly when its
    // count was zero before the call.
    BoundedCounterType test_and_set_bits(HashIntoType khash)
    {
        BoundedCounterType x = get_count(khash);
        add(khash);
        return !x;
    }

    // Inserts one occurrence of khash into `main`.
    // Returns true when the count observed before the insert was zero.
    bool add(HashIntoType khash)
    {
        bool is_new = get_count(khash) == 0;
        bufferedMQF_insert(&main, khash % main.disk->metadata->range, 1, false, false);
        return is_new;
    }

    // get the count for the given k-mer hash.
    const BoundedCounterType get_count(HashIntoType khash) const
    {
        return bufferedMQF_count_key(&main, khash % main.disk->metadata->range);
    }

    // Accessors for table info; all values are read from the metadata of
    // `main.disk`.
    // xnslots is larger than nslots. It includes some extra slots to deal
    // with some details of how the counting is implemented
    std::vector<uint64_t> get_tablesizes() const
    {
        return {main.disk->metadata->xnslots};
    }
    const size_t n_tables() const
    {
        return 1;
    }
    const uint64_t n_unique_kmers() const
    {
        return main.disk->metadata->ndistinct_elts;
    }
    const uint64_t n_occupied() const
    {
        return main.disk->metadata->noccupied_slots;
    }

    // Declared here, defined outside this header section.
    void save(std::string outfilename, WordLength ksize);
    void load(std::string infilename, WordLength &ksize);

    // This storage exposes no raw byte tables.
    Byte **get_raw_tables()
    {
        return nullptr;
    }
};

/*
* \class ByteStorage
Expand Down
2 changes: 1 addition & 1 deletion khmer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@

from khmer._khmer import FILETYPES

from khmer._oxli.graphs import (Counttable, QFCounttable, Nodetable,
from khmer._oxli.graphs import (Counttable, QFCounttable,BufferedQFCounttable, Nodetable,
CyclicCounttable,
SmallCounttable, Countgraph, SmallCountgraph,
Nodegraph)
Expand Down
10 changes: 8 additions & 2 deletions khmer/_oxli/graphs.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,11 @@ cdef extern from "oxli/hashtable.hh" namespace "oxli" nogil:
CpNodetable(WordLength, vector[uint64_t])

cdef cppclass CpQFCounttable "oxli::QFCounttable" (CpHashtable):
CpQFCounttable(WordLength, uint64_t) except +oxli_raise_py_error
CpQFCounttable(WordLength, uint64_t,uint64_t) except +oxli_raise_py_error


# Cython declaration of the C++ class oxli::BufferedQFCounttable.
cdef cppclass CpBufferedQFCounttable "oxli::BufferedQFCounttable" (CpHashtable):
# Constructor; C++ exceptions are routed through oxli_raise_py_error.
# NOTE(review): declared with uint64_t arguments here, while the C++
# constructor in hashtable.hh takes (WordLength, int, int) -- confirm the
# narrowing is intended.
CpBufferedQFCounttable(WordLength, uint64_t,uint64_t) except +oxli_raise_py_error


cdef extern from "oxli/hashgraph.hh" namespace "oxli" nogil:
Expand Down Expand Up @@ -215,7 +219,7 @@ cdef extern from "oxli/labelhash.hh" namespace "oxli":
uint64_t &,
CallbackFn,
void *)
void consume_seqfile_and_tag_with_labels[SeqIO](const string &,
void _seqfile_and_tag_with_labels[SeqIO](const string &,
uint32_t &,
uint64_t &)
void consume_seqfile_and_tag_with_labels[SeqIO](
Expand Down Expand Up @@ -259,6 +263,8 @@ cdef class Hashtable:
cdef class QFCounttable(Hashtable):
cdef shared_ptr[CpQFCounttable] _qf_this

# Extension type wrapping the C++ CpBufferedQFCounttable.
cdef class BufferedQFCounttable(Hashtable):
# Shared pointer to the wrapped C++ object.
cdef shared_ptr[CpBufferedQFCounttable] _qf_this

cdef class SmallCounttable(Hashtable):
cdef shared_ptr[CpSmallCounttable] _st_this
Expand Down
54 changes: 50 additions & 4 deletions khmer/_oxli/graphs.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ from khmer._khmer import ReadParser

CYTHON_TABLES = (Hashtable, Nodetable, Counttable, CyclicCounttable,
SmallCounttable,
QFCounttable, Nodegraph, Countgraph, SmallCountgraph)
QFCounttable,BufferedQFCounttable, Nodegraph, Countgraph, SmallCountgraph)


cdef class Hashtable:
Expand Down Expand Up @@ -368,27 +368,73 @@ cdef class QFCounttable(Hashtable):
Set the number of slots used by the counting quotient filter. This
determines the amount of memory used and how many k-mers can be entered
into the datastructure. Each slot uses roughly 1.3 bytes.
slot size: integer
"""

def __cinit__(self, int k, uint64_t size):
def __cinit__(self, int k, uint64_t size,uint64_t slotsize):
# size has to be a power of two
power_of_two = ((size & (size - 1) == 0) and
(size != 0))
if not power_of_two:
raise ValueError("size has to be a power of two, not"
" {}.".format(size))
if type(self) is QFCounttable:
self._qf_this = make_shared[CpQFCounttable](k, <uint64_t>log(size, 2))
self._qf_this = make_shared[CpQFCounttable](k, <uint64_t>log(size, 2),slotsize)
self._ht_this = <shared_ptr[CpHashtable]>self._qf_this


@classmethod
def load(cls, file_name):
"""Load the graph from the specified file."""
cdef QFCounttable table = cls(1, 1)
cdef QFCounttable table = cls(1, 1,1)
deref(table._qf_this).load(_bstring(file_name))
return table


cdef class BufferedQFCounttable(Hashtable):
    """Hashtable wrapper around the C++ ``oxli::BufferedQFCounttable``.

    Parameters
    ----------
    k : integer
        k-mer size

    size : integer
        Number of slots. Must be a non-zero power of two; its base-2
        logarithm is what is passed on to the C++ constructor.

    slotsize : integer
        Passed unchanged to the C++ constructor.
    """

    def __cinit__(self, int k, uint64_t size,uint64_t slotsize):
        # Reject zero and any value with more than one bit set.
        if size == 0 or (size & (size - 1)) != 0:
            raise ValueError("size has to be a power of two, not"
                             " {}.".format(size))

        # Only allocate the C++ object when this exact type is instantiated.
        if type(self) is BufferedQFCounttable:
            self._qf_this = make_shared[CpBufferedQFCounttable](k, <uint64_t>log(size, 2),slotsize)
            self._ht_this = <shared_ptr[CpHashtable]>self._qf_this

    @classmethod
    def load(cls, file_name):
        """Create a table and load its contents from ``file_name``.

        A placeholder instance is built with ``cls(1, 1, 1)`` and the load
        is then delegated to the wrapped C++ object.
        """
        cdef BufferedQFCounttable table = cls(1, 1,1)
        deref(table._qf_this).load(_bstring(file_name))
        return table


cdef class Counttable(Hashtable):

def __cinit__(self, int k, uint64_t starting_size, int n_tables,
Expand Down
Loading