From f6ef62cfdf909adac1b10ea86555cd218f8b2a74 Mon Sep 17 00:00:00 2001 From: Cole Lyman Date: Fri, 10 Dec 2021 15:29:59 -0700 Subject: [PATCH] Fix bug in `find_indels_substitutions` This bug occurred when there was a deletion at the end of a sequence, and was thus not properly accounted for. --- CRISPResso2/CRISPRessoCOREResources.pyx | 77 +++++++++++++++---------- 1 file changed, 48 insertions(+), 29 deletions(-) diff --git a/CRISPResso2/CRISPRessoCOREResources.pyx b/CRISPResso2/CRISPRessoCOREResources.pyx index fcf7fadb..8a849eb5 100644 --- a/CRISPResso2/CRISPRessoCOREResources.pyx +++ b/CRISPResso2/CRISPRessoCOREResources.pyx @@ -78,35 +78,54 @@ def find_indels_substitutions(read_seq_al, ref_seq_al, _include_indx): start_insertion = idx - 1 current_insertion_size += 1 - substitution_n = len(substitution_positions) - - #the remainder of positions are with reference to the original reference sequence indexes we calculated above - all_deletion_positions=[] - deletion_positions=[] - deletion_coordinates=[] - deletion_sizes=[] - - all_insertion_positions=[] - all_insertion_left_positions=[] - insertion_positions=[] - insertion_coordinates = [] - insertion_sizes=[] - - include_indx_set = set(_include_indx) - for p in re_find_indels.finditer(read_seq_al): - st,en=p.span() - ref_st = 0 - if st-1 > 0: - ref_st = ref_positions[st] - ref_en = idx-1 - if en < len(ref_positions): - ref_en = ref_positions[en] - all_deletion_positions.extend(range(ref_st,ref_en)) - inc_del_pos = include_indx_set.intersection(range(ref_st,ref_en)) - if(len(inc_del_pos)>0): - deletion_positions.extend(range(ref_st,ref_en)) - deletion_coordinates.append((ref_st,ref_en)) - deletion_sizes.append(en-st) + if read_seq_al[idx_c] == '-' and start_deletion == -1: # this is the first part of a deletion + if idx_c - 1 > 0: + start_deletion = ref_positions[idx_c] + else: + start_deletion = 0 + elif read_seq_al[idx_c] != '-' and start_deletion != -1: # this is the end of a deletion + end_deletion = ref_positions[idx_c] + all_deletion_positions.extend(range(start_deletion, end_deletion)) + if include_indx_set.intersection(range(start_deletion, end_deletion)): + deletion_positions.extend(range(start_deletion, end_deletion)) + deletion_coordinates.append((start_deletion, end_deletion)) + deletion_sizes.append(end_deletion - start_deletion) + start_deletion = -1 + + if start_deletion != -1: + end_deletion = ref_positions[seq_len - 1] + all_deletion_positions.extend(range(start_deletion, end_deletion)) + if include_indx_set.intersection(range(start_deletion, end_deletion)): + deletion_positions.extend(range(start_deletion, end_deletion)) + deletion_coordinates.append((start_deletion, end_deletion)) + deletion_sizes.append(end_deletion - start_deletion) + + cdef size_t substitution_n = len(substitution_positions) + cdef size_t deletion_n = sum(deletion_sizes) + cdef size_t insertion_n = sum(insertion_sizes) + + return { + 'all_insertion_positions': all_insertion_positions, + 'all_insertion_left_positions': all_insertion_left_positions, + 'insertion_positions': insertion_positions, + 'insertion_coordinates': insertion_coordinates, + 'insertion_sizes': insertion_sizes, + 'insertion_n': insertion_n, + + 'all_deletion_positions': all_deletion_positions, + 'deletion_positions': deletion_positions, + 'deletion_coordinates': deletion_coordinates, + 'deletion_sizes': deletion_sizes, + 'deletion_n': deletion_n, + + 'all_substitution_positions': all_substitution_positions, + 'substitution_positions': substitution_positions, + 'all_substitution_values': np.array(all_substitution_values), + 'substitution_values': np.array(substitution_values), + 'substitution_n': substitution_n, + + 'ref_positions': ref_positions, + } deletion_n = np.sum(deletion_sizes)