Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix bitarray related bugs #1

Merged
merged 14 commits into from
Apr 7, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@
sudo: required
cache: pip
os: linux
dist: bionic
language: python
python:
- "3.6"
env:
global:
- COMMIT=${TRAVIS_COMMIT::8}
install:
- sudo apt-get install libdb-dev
- pip3 install cython
- pip install -r requirements.txt
- pip install -r optional-requirements.txt
- pip3 install -r requirements.txt
- pip3 install -r optional-requirements.txt
- pip3 install .
script:
- py.test -s -vv bigsi/tests/
after_success:
Expand Down
29 changes: 0 additions & 29 deletions bigsi/bitvector.py

This file was deleted.

8 changes: 1 addition & 7 deletions bigsi/bloom/bloomfilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ def __init__(self, m, h):
self.m = m
self.h = h
self.bitarray = bitarray(self.m)
self.bitarray.setall(0)

def __hashes(self, element):
return generate_hashes(element, self.h, self.m)
Expand All @@ -30,10 +31,3 @@ def update(self, elements):
for e in list(elements):
self.add(e)
return self


def load_bitarray(f):
bloomfilter = bitarray()
with open(f, "rb") as inf:
bloomfilter.fromfile(inf)
return bloomfilter
4 changes: 2 additions & 2 deletions bigsi/graph/bigsi.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from bigsi.utils import convert_query_kmers
from bigsi.utils import seq_to_kmers
from bigsi.utils import bitwise_and
from bigsi.utils import non_zero_bitarrary_positions
from bigsi.utils import non_zero_bitarray_positions
from bigsi.storage import get_storage
from bigsi.scoring import Scorer
from bigsi.constants import DEFAULT_NPROC
Expand Down Expand Up @@ -190,7 +190,7 @@ def search(self, seq, threshold=1.0, score=False):
]

def exact_filter(self, kmers_to_colours):
colours_with_all_kmers = non_zero_bitarrary_positions(
colours_with_all_kmers = non_zero_bitarray_positions(
bitwise_and(kmers_to_colours.values())
)
samples = self.get_sample_list(colours_with_all_kmers)
Expand Down
80 changes: 80 additions & 0 deletions bigsi/tests/bloom/test_bloomfilter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
from bitarray import bitarray
from random import choice
from hypothesis import assume, given, strategies as st

from bigsi.bloom import generate_hashes
from bigsi.bloom import BloomFilter


def test_generate_hashes():
    """Pin the hash positions produced for a fixed k-mer.

    Each call passes the element, the number of hash functions and the Bloom
    filter size, and the expected value is the exact set of positions
    returned, so any change to the hashing scheme is caught here.
    """
    assert generate_hashes("ATT", 3, 25) == {2, 15, 17}
    assert generate_hashes("ATT", 1, 25) == {15}
    assert generate_hashes("ATT", 2, 50) == {15, 27}


@given(
    len_bloom_filter=st.integers(min_value=1, max_value=1000),
    num_hash_functions=st.integers(min_value=1, max_value=3),
)
def test_bloomfilter_created_with_initialisation(len_bloom_filter, num_hash_functions):
    """A newly constructed BloomFilter must have every bit cleared."""
    bloom_filter = BloomFilter(m=len_bloom_filter, h=num_hash_functions)

    # Compare against an explicit all-zero bitarray of the same length.
    assert bloom_filter.bitarray == bitarray("0" * len_bloom_filter)


@given(
    len_bloom_filter=st.integers(min_value=100, max_value=2000),
    num_hash_functions=st.integers(min_value=1, max_value=3),
    len_kmer=st.integers(min_value=3, max_value=31),
    num_kmers=st.integers(min_value=1, max_value=10),
)
def test_bloomfilter_updated_success(len_bloom_filter, num_hash_functions, len_kmer, num_kmers):
    """Updating a BloomFilter sets exactly the bits its k-mers hash to."""
    kmers = _generate_random_kmers(len_kmer, num_kmers)
    hashes = _generate_kmer_hashes(kmers, len_bloom_filter, num_hash_functions)

    # Build the expected bit pattern independently of BloomFilter: start from
    # all zeros and switch on every position any k-mer hashes to.
    expected = bitarray("0" * len_bloom_filter)
    for position in hashes:
        expected[position] = True

    bloom_filter = BloomFilter(m=len_bloom_filter, h=num_hash_functions)
    bloom_filter.update(kmers)

    assert bloom_filter.bitarray == expected


@given(
    len_bloom_filter=st.integers(min_value=100, max_value=2000),
    num_hash_functions=st.integers(min_value=1, max_value=3),
    len_kmer=st.integers(min_value=3, max_value=31),
    num_kmers=st.integers(min_value=1, max_value=10),
)
def test_bloomfilters_updated_with_same_kmers(len_bloom_filter, num_hash_functions, len_kmer, num_kmers):
    """Two filters with the same parameters and k-mers end up identical."""
    kmers = _generate_random_kmers(len_kmer, num_kmers)

    bloom_filter1 = BloomFilter(m=len_bloom_filter, h=num_hash_functions)
    bloom_filter1.update(kmers)
    bloom_filter2 = BloomFilter(m=len_bloom_filter, h=num_hash_functions)
    bloom_filter2.update(kmers)

    assert bloom_filter1.bitarray == bloom_filter2.bitarray


@given(
    len_bloom_filter=st.integers(min_value=100, max_value=2000),
    num_hash_functions=st.integers(min_value=1, max_value=3),
    len_kmer=st.integers(min_value=3, max_value=31),
    num_kmers=st.integers(min_value=1, max_value=10),
)
def test_bloomfilters_updated_with_different_kmers(len_bloom_filter, num_hash_functions, len_kmer, num_kmers):
    """Filters fed k-mers with different hash sets must differ."""
    kmers1 = _generate_random_kmers(len_kmer, num_kmers)
    hashes1 = _generate_kmer_hashes(kmers1, len_bloom_filter, num_hash_functions)
    kmers2 = _generate_random_kmers(len_kmer, num_kmers)
    hashes2 = _generate_kmer_hashes(kmers2, len_bloom_filter, num_hash_functions)

    # Two independent random draws can collide on the same set of positions;
    # discard those examples, since the filters would then legitimately match.
    assume(hashes1 != hashes2)

    bloom_filter1 = BloomFilter(m=len_bloom_filter, h=num_hash_functions)
    bloom_filter1.update(kmers1)
    bloom_filter2 = BloomFilter(m=len_bloom_filter, h=num_hash_functions)
    bloom_filter2.update(kmers2)

    assert bloom_filter1.bitarray != bloom_filter2.bitarray


def _generate_random_kmers(len_kmer, num_kmers):
    """Return ``num_kmers`` random strings of length ``len_kmer`` over "ACGT".

    Each character is drawn independently with ``random.choice``; duplicates
    among the returned k-mers are possible.
    """
    return [
        "".join(choice("ACGT") for _ in range(len_kmer))
        for _ in range(num_kmers)
    ]


def _generate_kmer_hashes(kmers, len_bloom_filter, num_hash_functions):
    """Return the union of hash positions over all ``kmers``.

    Every k-mer is hashed with ``generate_hashes`` using
    ``num_hash_functions`` hashes and a filter size of ``len_bloom_filter``;
    the results are merged into a single set.
    """
    return {
        position
        for kmer in kmers
        for position in generate_hashes(kmer, num_hash_functions, len_bloom_filter)
    }
21 changes: 0 additions & 21 deletions bigsi/tests/bloom/test_create_bloomfilter.py

This file was deleted.

Empty file removed bigsi/tests/cli/__init__.py
Empty file.
Empty file removed bigsi/tests/graph/__init__.py
Empty file.
3 changes: 0 additions & 3 deletions bigsi/tests/graph/test_end_to_end.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,6 @@ def test_exact_search():
bigsi.delete()


@pytest.mark.skip(
reason="Passes in isolation, but fails when run with the rest of the tests"
)
def test_inexact_search():
for config in CONFIGS:
get_storage(config).delete_all()
Expand Down
Empty file removed bigsi/tests/matrix/__init__.py
Empty file.
Empty file removed bigsi/tests/storage/__init__.py
Empty file.
16 changes: 16 additions & 0 deletions bigsi/tests/utils/test_fncts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from bitarray import bitarray
from hypothesis import given, strategies as st

from bigsi.utils.fncts import non_zero_bitarray_positions


@given(
    byte_values=st.lists(
        min_size=0,
        max_size=2000,
        elements=st.integers(min_value=0, max_value=255),
    )
)
def test_non_zero_bitarrary_positions_success(byte_values):
    """non_zero_bitarray_positions returns the index of every set bit.

    The input is built from whole bytes, so only bit lengths that are a
    multiple of eight (including the empty bitarray) are exercised.
    """
    bit_array = bitarray()
    bit_array.frombytes(bytes(byte_values))

    # Reference answer computed bit by bit, independently of numpy.
    expected = [index for index, bit in enumerate(bit_array) if bit]

    result = non_zero_bitarray_positions(bit_array)

    assert result == expected
9 changes: 3 additions & 6 deletions bigsi/utils/fncts.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
import hashlib
import struct
import sys
import logging
from functools import reduce
import numpy as np
from functools import reduce
from itertools import islice, chain

logger = logging.getLogger(__name__)
Expand All @@ -25,8 +22,8 @@ def bitwise_and(bitarrays):
return reduce(lambda x, y: x & y, bitarrays)


def non_zero_bitarrary_positions(bitarray):
return np.where(bitarray)[0].tolist()
def non_zero_bitarray_positions(bitarray):
    """Return the indices of all set bits as a list of ints.

    The argument is expanded to one element per bit with ``np.unpackbits``
    and the positions of the non-zero elements are returned in ascending
    order.

    NOTE(review): ``np.unpackbits`` expands whole bytes, so for an input
    whose bit length is not a multiple of eight any trailing padding bits
    would presumably be included -- confirm callers only pass byte-aligned
    data.
    """
    # unpackbits flattens its input by default, so the result is 1-D and
    # flatnonzero yields the same indices as np.where(...)[0].
    bits = np.unpackbits(bitarray)
    return np.flatnonzero(bits).tolist()


def chunks(l, n):
Expand Down
2 changes: 1 addition & 1 deletion optional-requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Optional Storage
python-rocksdb
#python-rocksdb # temporarily commented out so that installation will not fail if not using RocksDB
bsddb3==6.2.5
uWSGI==2.0.18