Skip to content

Commit

Permalink
Convert numba generator to a function
Browse files Browse the repository at this point in the history
  • Loading branch information
ekiefl committed Mar 4, 2020
1 parent aecacd5 commit 1d25bf1
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 15 deletions.
12 changes: 5 additions & 7 deletions anvio/contigops.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,11 +258,11 @@ def run_SCVs(self, bam):
# which genes we include for SCV analysis. Resolving issue
# https://github.com/merenlab/anvio/issues/1358 would enable a more elegant
# scenario, where this would not happen.
gene_overlap_start, gene_overlap_stop = next(utils.get_constant_value_blocks(gene_id_per_nt_in_read, gene_id))
gene_overlap_start, gene_overlap_stop = utils.get_constant_value_blocks(gene_id_per_nt_in_read, gene_id)[0]
gene_overlap_start += read.reference_start
gene_overlap_stop += read.reference_start - 1
start_index = next(utils.find_value_index(read[:, 0], gene_overlap_start))
stop_index = next(utils.find_value_index(read[:, 0], gene_overlap_stop))
start_index = utils.find_value_index(read[:, 0], gene_overlap_start)
stop_index = utils.find_value_index(read[:, 0], gene_overlap_stop)
segment_that_overlaps_gene = read[start_index:stop_index+1]

if gene_id not in gene_calls:
Expand Down Expand Up @@ -304,16 +304,14 @@ def run_SCVs(self, bam):
# must determine by how many nts on each side we must trim
base_positions = self.split.per_position_info['base_pos_in_codon'][block_start_split:block_end_split]

first_pos = next(utils.find_value_index(base_positions, (1 if gene_call['direction'] == 'f' else 3)))
last_pos = next(utils.find_value_index(base_positions, (3 if gene_call['direction'] == 'f' else 1), reverse_search=True))
first_pos = utils.find_value_index(base_positions, (1 if gene_call['direction'] == 'f' else 3))
last_pos = utils.find_value_index(base_positions, (3 if gene_call['direction'] == 'f' else 1), reverse_search=True)

if last_pos - first_pos < 3:
# the required trimming creates a sequence that is less than a codon long.
# We cannot use this read.
continue

# We use gapless_segment.trim instead of slicing with gapless_segment[a:b]
# because slicing makes a copy, whereas gapless_segment.trim modifies in place
gapless_segment = gapless_segment[first_pos:(last_pos+1), :]

# Update these for posterity
Expand Down
16 changes: 8 additions & 8 deletions anvio/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1559,18 +1559,18 @@ def get_constant_value_blocks(array, value):

@jit(nopython=True)
def find_value_index(x, val, reverse_search=False):
"""Generator that returns indices where a value is found
"""Returns first instance of indices where a value is found
Created this because unlike np.where, this is a generator that stops after the first instance is
found. If you only want the first instance, this algorithm is therefore preferrable for array
sizes < 1000 (see examples)
Created this because unlike np.where, this stops after the first instance is found. If you only
want the first instance, this algorithm is therefore preferrable for array sizes < 1000 (see
examples)
Parameters
==========
x : 1D array
val : number
Yield indices of x where the value == val.
return index of x where the value == val.
reverse_search : bool, False
Search in reverse order
Expand All @@ -1580,17 +1580,17 @@ def find_value_index(x, val, reverse_search=False):
>>> import numpy as np
>>> import anvio.utils as utils
>>> x = np.arange(1000)
>>> %timeit next(utils.find_value_index(x, 999, rev=True))
>>> %timeit utils.find_value_index(x, 999, rev=True)
574 ns ± 15.8 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
>>> %timeit next(utils.find_value_index(x, 999))
>>> %timeit utils.find_value_index(x, 999)
2.21 µs ± 36.7 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
>>> %timeit np.where(x == 999)[0][0]
2.91 µs ± 563 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
"""

for i in range(len(x)) if not reverse_search else range(len(x)-1, -1, -1):
if x[i] == val:
yield i
return i


def convert_SSM_to_single_accession(matrix_data):
Expand Down

1 comment on commit 1d25bf1

@ekiefl
Copy link
Contributor Author

@ekiefl ekiefl commented on 1d25bf1 Mar 4, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After this commit and the last, the memory profile for running SCVs looks like this and ran in 0:00:57.889664:

image

Whereas before it looked like this and ran in 0:01:02.596473:

image

Both the memory leak and the speed difference really indicate that numbas yield statement is in its infancy.

Please sign in to comment.