Merge branch 'master' into wgs-whole-region-deleted-v2.3.1

edilytics · Dec 6, 2024 · 7dbd5eb
2 parents 1f175dd + 0232b06
commit 7dbd5eb
Show file tree

Hide file tree

Showing 14 changed files with 30,096 additions and 21,891 deletions.
diff --git a/CRISPResso2/CRISPResso2Align.c b/CRISPResso2/CRISPResso2Align.c
diff --git a/CRISPResso2/CRISPResso2Align.pyx b/CRISPResso2/CRISPResso2Align.pyx
@@ -17,9 +17,7 @@ cdef extern from "stdlib.h":
 cdef extern from "Python.h":
     ctypedef void PyObject
 
-ctypedef np.int_t DTYPE_INT
-ctypedef np.uint_t DTYPE_UINT
-ctypedef np.int8_t DTYPE_BOOL
+ctypedef long DTYPE_LONG
 
 cdef size_t UP = 1, LEFT = 2, DIAG = 3, NONE = 4
 cdef size_t MARRAY = 1, IARRAY = 2, JARRAY = 3
@@ -38,7 +36,7 @@ def read_matrix(path):
     The score for a 'C' changing to an 'A' is stored in the matrix as:
         mat[ord('C'), ord('A')] = score
     """
-    cdef np.ndarray[DTYPE_INT, ndim=2] a
+    cdef np.ndarray[DTYPE_LONG, ndim=2] a
     cdef size_t ai = 0, i
     cdef int v, mat_size
 
@@ -50,7 +48,7 @@ def read_matrix(path):
             headers = [ord(x) for x in line.split(' ') if x]
         mat_size = max(headers) + 1
 
-        a = np.zeros((mat_size, mat_size), dtype=int)
+        a = np.zeros((mat_size, mat_size), dtype=long)
 
         line = fh.readline()
         while line:
@@ -72,7 +70,7 @@ def make_matrix(match_score=5, mismatch_score=-4, n_mismatch_score=-2, n_match_s
     n_mismatch_score: score for matching a nucleotide with 'N'
     n_match_score: score for 'N' matching an 'N'
     """
-    cdef np.ndarray[DTYPE_INT, ndim=2] a
+    cdef np.ndarray[DTYPE_LONG, ndim=2] a
     cdef size_t ai = 0, i
     cdef int v, mat_size
 
@@ -82,7 +80,7 @@ def make_matrix(match_score=5, mismatch_score=-4, n_mismatch_score=-2, n_match_s
 
     nuc_ords = [ord(x) for x in ['A','T','C','G']]
 
-    a = np.zeros((mat_size, mat_size), dtype=int)
+    a = np.zeros((mat_size, mat_size), dtype=long)
 
     for nuc in nuc_ords:
       for nuc2 in nuc_ords:
@@ -102,8 +100,8 @@ def make_matrix(match_score=5, mismatch_score=-4, n_mismatch_score=-2, n_match_s
 
 @cython.boundscheck(False)
 @cython.nonecheck(False)
-def global_align(str pystr_seqj, str pystr_seqi, np.ndarray[DTYPE_INT, ndim=2] matrix,
-          np.ndarray[DTYPE_INT,ndim=1] gap_incentive, int gap_open=-1,
+def global_align(str pystr_seqj, str pystr_seqi, np.ndarray[DTYPE_LONG, ndim=2] matrix,
+          np.ndarray[DTYPE_LONG,ndim=1] gap_incentive, int gap_open=-1,
           int gap_extend=-1):
     """
     Global sequence alignment (needleman-wunsch) on seq i and j.

diff --git a/CRISPResso2/CRISPRessoCORE.py b/CRISPResso2/CRISPRessoCORE.py
@@ -819,9 +819,9 @@ def process_bam(bam_filename, bam_chr_loc, output_bam, variantCache, ref_names,
         crispresso_cmd_to_write = ' '.join(sys.argv)
         sam_out.write('@PG\tID:crispresso2\tPN:crispresso2\tVN:'+CRISPRessoShared.__version__+'\tCL:"'+crispresso_cmd_to_write+'"\n')
         if bam_chr_loc != "":
-            proc = sb.Popen(['samtools', 'view', bam_filename, bam_chr_loc], stdout=sb.PIPE, encoding='utf-8')
+            proc = sb.Popen(['samtools', 'view', '-F', args.samtools_exclude_flags, bam_filename, bam_chr_loc], stdout=sb.PIPE, encoding='utf-8')
         else:
-            proc = sb.Popen(['samtools', 'view', bam_filename], stdout=sb.PIPE, encoding='utf-8')
+            proc = sb.Popen(['samtools', 'view', '-F', args.samtools_exclude_flags, bam_filename], stdout=sb.PIPE, encoding='utf-8')
         num_reads = 0
 
         # Reading through the bam file and enriching variantCache as a dictionary with the following:
@@ -2335,7 +2335,7 @@ def get_prime_editing_guides(this_amp_seq, this_amp_name, ref0_seq, prime_edited
 
                     #subtract any indices in 'exclude_idxs' -- e.g. in case some of the cloned include_idxs were near the read ends (excluded)
                     this_exclude_idxs = sorted(list(set(refs[ref_name]['exclude_idxs'])))
-                    this_include_idxs = sorted(list(set(np.setdiff1d(this_include_idxs, this_exclude_idxs))))
+                    this_include_idxs = sorted(map(int, set(np.setdiff1d(this_include_idxs, this_exclude_idxs))))
 
                     refs[ref_name]['gap_incentive'] = this_gap_incentive
                     refs[ref_name]['sgRNA_cut_points'] = this_cut_points
@@ -2360,8 +2360,8 @@ def get_prime_editing_guides(this_amp_seq, this_amp_name, ref0_seq, prime_edited
                         )
 
                     #subtract any indices in 'exclude_idxs' -- e.g. in case some of the cloned include_idxs were near the read ends (excluded)
-                    this_exclude_idxs = sorted(list(set(refs[ref_name]['exclude_idxs'])))
-                    this_include_idxs = sorted(list(set(np.setdiff1d(this_include_idxs, this_exclude_idxs))))
+                    this_exclude_idxs = sorted(map(int, set(refs[ref_name]['exclude_idxs'])))
+                    this_include_idxs = sorted(map(int, set(np.setdiff1d(this_include_idxs, this_exclude_idxs))))
                     refs[ref_name]['include_idxs'] = this_include_idxs
                     refs[ref_name]['exclude_idxs'] = this_exclude_idxs
 
@@ -3361,11 +3361,15 @@ def calculate_99_max(d):
             ref_info_file.write(refString)
             np.set_printoptions(linewidth=1000**1000) #no line breaks
             for ref_name in ref_names:
+                if isinstance(refs[ref_name]['include_idxs'], np.ndarray):
+                    refs[ref_name]['include_idxs'] = refs[ref_name]['include_idxs'].tolist()
+                if isinstance(refs[ref_name]['exclude_idxs'], np.ndarray):
+                    refs[ref_name]['exclude_idxs'] = refs[ref_name]['exclude_idxs'].tolist()
                 refString = ( refs[ref_name]['name'] + "\t" +
                     str(refs[ref_name]['sequence']) + "\t" +
                     str(refs[ref_name]['sequence_length']) + "\t" +
                     str(refs[ref_name]['min_aln_score']) + "\t" +
-                    str(refs[ref_name]['gap_incentive']) + "\t" +
+                    str(refs[ref_name]['gap_incentive'].tolist()) + "\t" +
                     str(refs[ref_name]['sgRNA_cut_points']) + "\t" +
                     str(refs[ref_name]['sgRNA_plot_cut_points']) + "\t" +
                     str(refs[ref_name]['sgRNA_intervals']) + "\t" +