From 3eba6150965654a9009a8d893ceaf93fc1564ae4 Mon Sep 17 00:00:00 2001
From: Cole Lyman <cole@colelyman.com>
Date: Wed, 28 Aug 2024 16:33:48 -0400
Subject: [PATCH] Move `get_n_reads_fastq` to CRISPRessoShared

---
 CRISPResso2/CRISPRessoCORE.py       |  7 ++-----
 CRISPResso2/CRISPRessoPooledCORE.py | 12 +++---------
 CRISPResso2/CRISPRessoShared.py     |  6 ++++++
 CRISPResso2/CRISPRessoWGSCORE.py    |  4 ----
 4 files changed, 11 insertions(+), 18 deletions(-)

diff --git a/CRISPResso2/CRISPRessoCORE.py b/CRISPResso2/CRISPRessoCORE.py
index ebdd4d23..ceabc274 100644
--- a/CRISPResso2/CRISPRessoCORE.py
+++ b/CRISPResso2/CRISPRessoCORE.py
@@ -139,9 +139,6 @@ def get_avg_read_length_fastq(fastq_filename):
      p = sb.Popen(cmd, shell=True, stdout=sb.PIPE)
      return int(p.communicate()[0].strip())
 
-def get_n_reads_fastq(fastq_filename):
-    p = sb.Popen('gunzip -c' if fastq_filename.endswith('.gz') else 'cat' + ' < "%s" | wc -l' % fastq_filename, shell=True, stdout=sb.PIPE)
-    return round(float(p.communicate()[0]) / 4.0)
 
 def get_n_reads_bam(bam_filename,bam_chr_loc=""):
     cmd = "samtools view -c " + bam_filename + " " + bam_chr_loc
@@ -2458,7 +2455,7 @@ def get_prime_editing_guides(this_amp_seq, this_amp_name, ref0_seq, prime_edited
 
         N_READS_INPUT = 0
         if args.fastq_r1:
-            N_READS_INPUT = get_n_reads_fastq(args.fastq_r1)
+            N_READS_INPUT = CRISPRessoShared.get_n_reads_fastq(args.fastq_r1)
         elif args.bam_input:
             N_READS_INPUT = get_n_reads_bam(args.bam_input, args.bam_chr_loc)
 
@@ -2620,7 +2617,7 @@ def get_prime_editing_guides(this_amp_seq, this_amp_name, ref0_seq, prime_edited
         if args.bam_input:
             N_READS_AFTER_PREPROCESSING = N_READS_INPUT
         else:
-            N_READS_AFTER_PREPROCESSING=get_n_reads_fastq(processed_output_filename)
+            N_READS_AFTER_PREPROCESSING = CRISPRessoShared.get_n_reads_fastq(processed_output_filename)
         if N_READS_AFTER_PREPROCESSING == 0:
             raise CRISPRessoShared.NoReadsAfterQualityFilteringException('No reads in input or no reads survived the average or single bp quality filtering.')
 
diff --git a/CRISPResso2/CRISPRessoPooledCORE.py b/CRISPResso2/CRISPRessoPooledCORE.py
index 14184e13..2d9a00ed 100644
--- a/CRISPResso2/CRISPRessoPooledCORE.py
+++ b/CRISPResso2/CRISPRessoPooledCORE.py
@@ -145,10 +145,6 @@ def get_read_length_from_cigar(cigar_string):
             result += int(length)
     return result
 
-def get_n_reads_fastq(fastq_filename):
-     p = sb.Popen(('z' if fastq_filename.endswith('.gz') else '' ) +"cat < %s | wc -l" % fastq_filename, shell=True, stdout=sb.PIPE)
-     n_reads = int(float(p.communicate()[0])/4.0)
-     return n_reads
 
 def get_n_reads_bam(bam_filename):
     p = sb.Popen("samtools view -c %s" % bam_filename, shell=True, stdout=sb.PIPE)
@@ -611,10 +607,10 @@ def main():
                 N_READS_INPUT = get_n_reads_bam(args.aligned_pooled_bam)
                 N_READS_AFTER_PREPROCESSING = N_READS_INPUT
             else:
-                N_READS_INPUT = get_n_reads_fastq(args.fastq_r1)
+                N_READS_INPUT = CRISPRessoShared.get_n_reads_fastq(args.fastq_r1)
                 if args.split_interleaved_input:
                     N_READS_INPUT /= 2
-                N_READS_AFTER_PREPROCESSING = get_n_reads_fastq(processed_output_filename)
+                N_READS_AFTER_PREPROCESSING = CRISPRessoShared.get_n_reads_fastq(processed_output_filename)
 
             crispresso2_info['running_info']['finished_steps']['count_input_reads'] = (N_READS_INPUT, N_READS_AFTER_PREPROCESSING)
             CRISPRessoShared.write_crispresso_info(
@@ -852,7 +848,7 @@ def main():
             n_reads_aligned_amplicons=[]
             crispresso_cmds = []
             for idx, row in df_template.iterrows():
-                this_n_reads = get_n_reads_fastq(row['Demultiplexed_fastq.gz_filename'])
+                this_n_reads = CRISPRessoShared.get_n_reads_fastq(row['Demultiplexed_fastq.gz_filename'])
                 n_reads_aligned_amplicons.append(this_n_reads)
                 info('\n Processing:%s with %d reads'%(idx,this_n_reads))
                 this_amp_seq = row['amplicon_seq']
@@ -1552,8 +1548,6 @@ def rreplace(s, old, new):
 
         #if many reads weren't aligned, print those out for the user
         if RUNNING_MODE != 'ONLY_GENOME':
-            #N_READS_INPUT=get_n_reads_fastq(args.fastq_r1)
-            #N_READS_AFTER_PREPROCESSING=get_n_reads_fastq(processed_output_filename)
             tot_reads_aligned = df_summary_quantification['Reads_aligned'].fillna(0).sum()
             tot_reads = df_summary_quantification['Reads_total'].sum()
 
diff --git a/CRISPResso2/CRISPRessoShared.py b/CRISPResso2/CRISPRessoShared.py
index 78d2e052..4ab4ed0e 100644
--- a/CRISPResso2/CRISPRessoShared.py
+++ b/CRISPResso2/CRISPRessoShared.py
@@ -722,6 +722,12 @@ def get_command_output(command):
             break
 
 
+def get_n_reads_fastq(fastq_filename):
+    """Count the number of reads from fastq_filename."""
+    p = sb.Popen('gunzip -c' if fastq_filename.endswith('.gz') else 'cat' + ' < "%s" | wc -l' % fastq_filename, shell=True, stdout=sb.PIPE)
+    return round(float(p.communicate()[0]) / 4.0)
+
+
 def get_most_frequent_reads(fastq_r1, fastq_r2, number_of_reads_to_consider, fastp_command, min_paired_end_reads_overlap, split_interleaved_input=False, debug=False):
     """
     Get the most frequent amplicon from a fastq file (or after merging a r1 and r2 fastq file).
diff --git a/CRISPResso2/CRISPRessoWGSCORE.py b/CRISPResso2/CRISPRessoWGSCORE.py
index a50b1668..10426c41 100644
--- a/CRISPResso2/CRISPRessoWGSCORE.py
+++ b/CRISPResso2/CRISPRessoWGSCORE.py
@@ -203,10 +203,6 @@ def write_trimmed_fastq(in_bam_filename, bpstart, bpend, out_fastq_filename):
 pd=check_library('pandas')
 np=check_library('numpy')
 
-def get_n_reads_fastq(fastq_filename):
-     p = sb.Popen(('z' if fastq_filename.endswith('.gz') else '' ) +"cat < %s | wc -l" % fastq_filename, shell=True, stdout=sb.PIPE)
-     n_reads = int(float(p.communicate()[0])/4.0)
-     return n_reads
 
 def extract_reads(row):
     if row.sequence: