diff --git a/.travis.yml b/.travis.yml index 2eb1bad..2b407ca 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,7 @@ sudo: required cache: pip +os: linux +dist: bionic language: python python: - "3.6" @@ -7,9 +9,11 @@ env: global: - COMMIT=${TRAVIS_COMMIT::8} install: + - sudo apt-get install libdb-dev - pip3 install cython - - pip install -r requirements.txt - - pip install -r optional-requirements.txt + - pip3 install -r requirements.txt + - pip3 install -r optional-requirements.txt + - pip3 install . script: - py.test -s -vv bigsi/tests/ after_success: diff --git a/bigsi/bitvector.py b/bigsi/bitvector.py deleted file mode 100644 index efc94ad..0000000 --- a/bigsi/bitvector.py +++ /dev/null @@ -1,29 +0,0 @@ -from bitarray import bitarray -import numpy as np - - -class BitArray(bitarray): - def __init__(self, *args, **kwargs): - super().__init__() - - def setbit(self, i, bit): - if i < 0: - raise ValueError("Index must be >= 0") - try: - self[i] = bit - return self - except IndexError: - self.extend([False] * (1 + i - self.length())) - return self.setbit(i, bit) - - def getbit(self, i): - try: - return self[i] - except IndexError: - return False - - def indexes(self): - return np.where(self)[0].tolist() - - def colours(self): - return self.indexes() diff --git a/bigsi/bloom/bloomfilter.py b/bigsi/bloom/bloomfilter.py index de4dfaf..ff971a2 100644 --- a/bigsi/bloom/bloomfilter.py +++ b/bigsi/bloom/bloomfilter.py @@ -18,6 +18,7 @@ def __init__(self, m, h): self.m = m self.h = h self.bitarray = bitarray(self.m) + self.bitarray.setall(0) def __hashes(self, element): return generate_hashes(element, self.h, self.m) @@ -30,10 +31,3 @@ def update(self, elements): for e in list(elements): self.add(e) return self - - -def load_bitarray(f): - bloomfilter = bitarray() - with open(f, "rb") as inf: - bloomfilter.fromfile(inf) - return bloomfilter diff --git a/bigsi/graph/bigsi.py b/bigsi/graph/bigsi.py index fb66009..83a416a 100644 --- a/bigsi/graph/bigsi.py +++ b/bigsi/graph/bigsi.py @@ -11,7 +11,7 @@ from bigsi.utils import convert_query_kmers from bigsi.utils import seq_to_kmers from bigsi.utils import bitwise_and -from bigsi.utils import non_zero_bitarrary_positions +from bigsi.utils import non_zero_bitarray_positions from bigsi.storage import get_storage from bigsi.scoring import Scorer from bigsi.constants import DEFAULT_NPROC @@ -190,7 +190,7 @@ def search(self, seq, threshold=1.0, score=False): ] def exact_filter(self, kmers_to_colours): - colours_with_all_kmers = non_zero_bitarrary_positions( + colours_with_all_kmers = non_zero_bitarray_positions( bitwise_and(kmers_to_colours.values()) ) samples = self.get_sample_list(colours_with_all_kmers) diff --git a/bigsi/tests/bloom/test_bloomfilter.py b/bigsi/tests/bloom/test_bloomfilter.py new file mode 100644 index 0000000..8c4c88e --- /dev/null +++ b/bigsi/tests/bloom/test_bloomfilter.py @@ -0,0 +1,80 @@ +from bitarray import bitarray +from random import choice +from hypothesis import assume, given, strategies as st + +from bigsi.bloom import generate_hashes +from bigsi.bloom import BloomFilter + + +def test_generate_hashes(): + assert generate_hashes("ATT", 3, 25) == {2, 15, 17} + assert generate_hashes("ATT", 1, 25) == {15} + assert generate_hashes("ATT", 2, 50) == {15, 27} + + +@given(len_bloom_filter=st.integers(min_value=1, max_value=1000), + num_hash_functions=st.integers(min_value=1, max_value=3)) +def test_bloomfilter_created_with_initialisation(len_bloom_filter, num_hash_functions): + bloom_filter = BloomFilter(m=len_bloom_filter, h=num_hash_functions) + assert bloom_filter.bitarray == bitarray("0" * len_bloom_filter) + + +@given(len_bloom_filter=st.integers(min_value=100, max_value=2000), + num_hash_functions=st.integers(min_value=1, max_value=3), + len_kmer=st.integers(min_value=3, max_value=31), + num_kmers=st.integers(min_value=1, max_value=10)) +def test_bloomfilter_updated_success(len_bloom_filter, num_hash_functions, len_kmer, num_kmers): + kmers = _generate_random_kmers(len_kmer, num_kmers) + hashes = _generate_kmer_hashes(kmers, len_bloom_filter, num_hash_functions) + + expected = bitarray("0" * len_bloom_filter) + for h in hashes: + expected[h] = True + + bloom_filter = BloomFilter(m=len_bloom_filter, h=num_hash_functions) + bloom_filter.update(kmers) + + assert bloom_filter.bitarray == expected + + +@given(len_bloom_filter=st.integers(min_value=100, max_value=2000), + num_hash_functions=st.integers(min_value=1, max_value=3), + len_kmer=st.integers(min_value=3, max_value=31), + num_kmers=st.integers(min_value=1, max_value=10)) +def test_bloomfilters_updated_with_same_kmers(len_bloom_filter, num_hash_functions, len_kmer, num_kmers): + kmers = _generate_random_kmers(len_kmer, num_kmers) + + bloom_filter1 = BloomFilter(m=len_bloom_filter, h=num_hash_functions) + bloom_filter1.update(kmers) + bloom_filter2 = BloomFilter(m=len_bloom_filter, h=num_hash_functions) + bloom_filter2.update(kmers) + + assert bloom_filter1.bitarray == bloom_filter2.bitarray + + +@given(len_bloom_filter=st.integers(min_value=100, max_value=2000), + num_hash_functions=st.integers(min_value=1, max_value=3), + len_kmer=st.integers(min_value=3, max_value=31), + num_kmers=st.integers(min_value=1, max_value=10)) +def test_bloomfilters_updated_with_different_kmers(len_bloom_filter, num_hash_functions, len_kmer, num_kmers): + kmers1 = _generate_random_kmers(len_kmer, num_kmers) + hashes1 = _generate_kmer_hashes(kmers1, len_bloom_filter, num_hash_functions) + kmers2 = _generate_random_kmers(len_kmer, num_kmers) + hashes2 = _generate_kmer_hashes(kmers2, len_bloom_filter, num_hash_functions) + + assume(hashes1 != hashes2) + + bloom_filter1 = BloomFilter(m=len_bloom_filter, h=num_hash_functions) + bloom_filter1.update(kmers1) + bloom_filter2 = BloomFilter(m=len_bloom_filter, h=num_hash_functions) + bloom_filter2.update(kmers2) + + assert bloom_filter1.bitarray != bloom_filter2.bitarray + + +def _generate_random_kmers(len_kmer, num_kmers): + return [''.join(choice("ACGT") for _ in range(len_kmer)) for _ in range(num_kmers)] + + +def _generate_kmer_hashes(kmers, len_bloom_filter, num_hash_functions): + return {h for kmer in kmers for h in generate_hashes(kmer, num_hash_functions, len_bloom_filter)} diff --git a/bigsi/tests/bloom/test_create_bloomfilter.py b/bigsi/tests/bloom/test_create_bloomfilter.py deleted file mode 100644 index 74d519c..0000000 --- a/bigsi/tests/bloom/test_create_bloomfilter.py +++ /dev/null @@ -1,21 +0,0 @@ -from bigsi.bloom import generate_hashes -from bigsi.bloom import BloomFilter - - -def test_generate_hashes(): - assert generate_hashes("ATT", 3, 25) == {2, 15, 17} - assert generate_hashes("ATT", 1, 25) == {15} - assert generate_hashes("ATT", 2, 50) == {15, 27} - - -def test_create_bloom(): - for i in range(3): - kmers1 = ["ATT", "ATC"] - bloomfilter1 = BloomFilter(m=25, h=3) - bloomfilter1.update(kmers1) - - kmers2 = ["ATT", "ATT"] - bloomfilter2 = BloomFilter(m=25, h=3) - bloomfilter2.update(kmers2) - - assert bloomfilter1.bitarray != bloomfilter2.bitarray diff --git a/bigsi/tests/cli/__init__.py b/bigsi/tests/cli/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/bigsi/tests/graph/__init__.py b/bigsi/tests/graph/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/bigsi/tests/graph/test_end_to_end.py b/bigsi/tests/graph/test_end_to_end.py index 1d74381..c4e9efb 100644 --- a/bigsi/tests/graph/test_end_to_end.py +++ b/bigsi/tests/graph/test_end_to_end.py @@ -91,9 +91,6 @@ def test_exact_search(): bigsi.delete() -@pytest.mark.skip( - reason="Passes in isolation, but fails when run with the rest of the tests" -) def test_inexact_search(): for config in CONFIGS: get_storage(config).delete_all() diff --git a/bigsi/tests/matrix/__init__.py b/bigsi/tests/matrix/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/bigsi/tests/storage/__init__.py b/bigsi/tests/storage/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/bigsi/tests/utils/test_fncts.py b/bigsi/tests/utils/test_fncts.py new file mode 100644 index 0000000..5a8bd29 --- /dev/null +++ b/bigsi/tests/utils/test_fncts.py @@ -0,0 +1,16 @@ +from bitarray import bitarray +from hypothesis import given, strategies as st + +from bigsi.utils.fncts import non_zero_bitarray_positions + + +@given(byte_values=st.lists(min_size=0, max_size=2000, elements=st.integers(min_value=0, max_value=255))) +def test_non_zero_bitarrary_positions_success(byte_values): + bit_array = bitarray() + bit_array.frombytes(bytes(byte_values)) + + expected = [index for index, bit in enumerate(bit_array) if bit] + + result = non_zero_bitarray_positions(bit_array) + + assert result == expected diff --git a/bigsi/utils/fncts.py b/bigsi/utils/fncts.py index 22b2b60..b21dc1b 100644 --- a/bigsi/utils/fncts.py +++ b/bigsi/utils/fncts.py @@ -1,9 +1,6 @@ -import hashlib -import struct -import sys import logging -from functools import reduce import numpy as np +from functools import reduce from itertools import islice, chain logger = logging.getLogger(__name__) @@ -25,8 +22,8 @@ def bitwise_and(bitarrays): return reduce(lambda x, y: x & y, bitarrays) -def non_zero_bitarrary_positions(bitarray): - return np.where(bitarray)[0].tolist() +def non_zero_bitarray_positions(bitarray): + return np.where(np.unpackbits(bitarray))[0].tolist() def chunks(l, n): diff --git a/optional-requirements.txt b/optional-requirements.txt index 14bfc29..b9fca7f 100644 --- a/optional-requirements.txt +++ b/optional-requirements.txt @@ -1,4 +1,4 @@ # Optional Storage -python-rocksdb +#python-rocksdb # temporarily commented out so that installation will not fail if not using RocksDB bsddb3==6.2.5 uWSGI==2.0.18 \ No newline at end of file