Fixing bug that miscounts number of reads if extra newline characters…

… are added or omitted from end of file
edilytics · Oct 11, 2024 · ac7acf9 · ac7acf9
1 parent 14139c3
commit ac7acf9
Show file tree

Hide file tree

Showing 5 changed files with 135 additions and 16 deletions.
diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml
@@ -63,6 +63,7 @@ jobs:
         uses: actions/checkout@v3
         with:
           repository: edilytics/CRISPResso2_tests
+          ref: "trevor/get_n_reads_fix"
           # ref: '<BRANCH-NAME>' # update to specific branch
 
       - name: Run Basic

diff --git a/CRISPResso2/CRISPRessoCORE.py b/CRISPResso2/CRISPRessoCORE.py
@@ -138,10 +138,6 @@ def get_avg_read_length_fastq(fastq_filename):
      p = sb.Popen(cmd, shell=True, stdout=sb.PIPE)
      return int(p.communicate()[0].strip())
 
-def get_n_reads_fastq(fastq_filename):
-    p = sb.Popen(('z' if fastq_filename.endswith('.gz') else '' ) +"cat < \"%s\" | wc -l" % fastq_filename, shell=True, stdout=sb.PIPE)
-    return int(float(p.communicate()[0])/4.0)
-
 def get_n_reads_bam(bam_filename,bam_chr_loc=""):
     cmd = "samtools view -c " + bam_filename + " " + bam_chr_loc
     p = sb.Popen(cmd, shell=True, stdout=sb.PIPE)
@@ -2458,7 +2454,7 @@ def get_prime_editing_guides(this_amp_seq, this_amp_name, ref0_seq, prime_edited
 
         N_READS_INPUT = 0
         if args.fastq_r1:
-            N_READS_INPUT = get_n_reads_fastq(args.fastq_r1)
+            N_READS_INPUT = CRISPRessoShared.get_n_reads_fastq(args.fastq_r1)
         elif args.bam_input:
             N_READS_INPUT = get_n_reads_bam(args.bam_input, args.bam_chr_loc)
 
@@ -2620,7 +2616,7 @@ def get_prime_editing_guides(this_amp_seq, this_amp_name, ref0_seq, prime_edited
         if args.bam_input:
             N_READS_AFTER_PREPROCESSING = N_READS_INPUT
         else:
-            N_READS_AFTER_PREPROCESSING=get_n_reads_fastq(processed_output_filename)
+            N_READS_AFTER_PREPROCESSING=CRISPRessoShared.get_n_reads_fastq(processed_output_filename)
         if N_READS_AFTER_PREPROCESSING == 0:
             raise CRISPRessoShared.NoReadsAfterQualityFilteringException('No reads in input or no reads survived the average or single bp quality filtering.')
 

diff --git a/CRISPResso2/CRISPRessoPooledCORE.py b/CRISPResso2/CRISPRessoPooledCORE.py
@@ -145,11 +145,6 @@ def get_read_length_from_cigar(cigar_string):
             result += int(length)
     return result
 
-def get_n_reads_fastq(fastq_filename):
-     p = sb.Popen(('z' if fastq_filename.endswith('.gz') else '' ) +"cat < %s | wc -l" % fastq_filename, shell=True, stdout=sb.PIPE)
-     n_reads = int(float(p.communicate()[0])/4.0)
-     return n_reads
-
 def get_n_reads_bam(bam_filename):
     p = sb.Popen("samtools view -c %s" % bam_filename, shell=True, stdout=sb.PIPE)
     return int(p.communicate()[0])
@@ -611,10 +606,10 @@ def main():
                 N_READS_INPUT = get_n_reads_bam(args.aligned_pooled_bam)
                 N_READS_AFTER_PREPROCESSING = N_READS_INPUT
             else:
-                N_READS_INPUT = get_n_reads_fastq(args.fastq_r1)
+                N_READS_INPUT = CRISPRessoShared.get_n_reads_fastq(args.fastq_r1)
                 if args.split_interleaved_input:
                     N_READS_INPUT /= 2
-                N_READS_AFTER_PREPROCESSING = get_n_reads_fastq(processed_output_filename)
+                N_READS_AFTER_PREPROCESSING = CRISPRessoShared.get_n_reads_fastq(processed_output_filename)
 
             crispresso2_info['running_info']['finished_steps']['count_input_reads'] = (N_READS_INPUT, N_READS_AFTER_PREPROCESSING)
             CRISPRessoShared.write_crispresso_info(
@@ -854,7 +849,7 @@ def main():
             n_reads_aligned_amplicons=[]
             crispresso_cmds = []
             for idx, row in df_template.iterrows():
-                this_n_reads = get_n_reads_fastq(row['Demultiplexed_fastq.gz_filename'])
+                this_n_reads = CRISPRessoShared.get_n_reads_fastq(row['Demultiplexed_fastq.gz_filename'])
                 n_reads_aligned_amplicons.append(this_n_reads)
                 info('\n Processing:%s with %d reads'%(idx,this_n_reads))
                 this_amp_seq = row['amplicon_seq']
@@ -1554,8 +1549,8 @@ def rreplace(s, old, new):
 
         #if many reads weren't aligned, print those out for the user
         if RUNNING_MODE != 'ONLY_GENOME':
-            #N_READS_INPUT=get_n_reads_fastq(args.fastq_r1)
-            #N_READS_AFTER_PREPROCESSING=get_n_reads_fastq(processed_output_filename)
+            #N_READS_INPUT=CRISPRessoShared.get_n_reads_fastq(args.fastq_r1)
+            #N_READS_AFTER_PREPROCESSING=CRISPRessoShared.get_n_reads_fastq(processed_output_filename)
             tot_reads_aligned = df_summary_quantification['Reads_aligned'].fillna(0).sum()
             tot_reads = df_summary_quantification['Reads_total'].sum()
 

diff --git a/CRISPResso2/CRISPRessoShared.py b/CRISPResso2/CRISPRessoShared.py
@@ -492,6 +492,15 @@ def assert_fastq_format(file_path, max_lines_to_check=100):
         raise InputFileFormatException('File %s is not in fastq format!' % (file_path)) from e
 
 
+def get_n_reads_fastq(fastq_filename):
+    if not os.path.exists(fastq_filename) or os.path.getsize(fastq_filename) == 0:
+        return 0
+
+    p = sb.Popen(('z' if fastq_filename.endswith('.gz') else '' ) +"cat < %s | grep -c ." % fastq_filename, shell=True, stdout=sb.PIPE)
+    n_reads = int(float(p.communicate()[0])/4.0)
+    return n_reads
+
+
 def check_output_folder(output_folder):
     """
     Checks to see that the CRISPResso run has completed, and gathers the amplicon info for that run

diff --git a/tests/unit_tests/test_CRISPRessoShared.py b/tests/unit_tests/test_CRISPRessoShared.py
@@ -1,4 +1,7 @@
 from CRISPResso2 import CRISPResso2Align, CRISPRessoShared
+import tempfile
+import os
+import gzip
 
 ALN_MATRIX = CRISPResso2Align.read_matrix('./CRISPResso2/EDNAFULL')
 
@@ -28,6 +31,120 @@ def test_get_relative_coordinates():
     assert s1inds_gap_right == [0, 1, 2, 3, 4]
 
 
+def test_get_n_reads_fastq():
+    with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.fastq') as f:
+        f.write("@SEQ_ID\n")
+        f.write("GATTACA\n")
+        f.write("+\n")
+        f.write("AAAAAAA\n")  # Ensure the file content is correct and ends with a newline
+        f.flush()  # Flush the content to disk
+        os.fsync(f.fileno())  # Ensure all internal buffers associated with the file are written to disk
+
+    # Since the file needs to be read by another process, we ensure it's closed and written before passing it
+    assert CRISPRessoShared.get_n_reads_fastq(f.name) == 1
+
+    # Clean up: delete the file after the test
+    os.remove(f.name)
+
+def test_get_n_reads_fastq_gzip():
+    with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.fastq') as f:
+        f.write("@SEQ_ID\n")
+        f.write("GATTACA\n")
+        f.write("+\n")
+        f.write("AAAAAAA\n")  # Ensure the file content is correct and ends with a newline
+        f.flush()  # Flush the content to disk
+        os.fsync(f.fileno())  # Ensure all internal buffers associated with the file are written to disk
+
+    # gzip file
+    with open(f.name, 'rb') as f_in, gzip.open(f.name + '.gz', 'wb') as f_out:
+        f_out.writelines(f_in)
+
+    # Since the file needs to be read by another process, we ensure it's closed and written before passing it
+    assert CRISPRessoShared.get_n_reads_fastq(f.name + '.gz') == 1
+
+    # Clean up: delete the file after the test
+    os.remove(f.name)
+    os.remove(f.name + '.gz')
+
+
+def test_get_n_reads_fastq_three_extra_newlines():
+    with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.fastq') as f:
+        f.write("@SEQ_ID\n")
+        f.write("GATTACA\n")
+        f.write("+\n")
+        f.write("AAAAAAA\n")  # Ensure the file content is correct and ends with a newline
+        f.write("\n\n")  # Ensure the file content is correct and ends with a newline
+        f.flush()  # Flush the content to disk
+        os.fsync(f.fileno())  # Ensure all internal buffers associated with the file are written to disk
+
+    # Since the file needs to be read by another process, we ensure it's closed and written before passing it
+    assert CRISPRessoShared.get_n_reads_fastq(f.name) == 1
+
+    # Clean up: delete the file after the test
+    os.remove(f.name)
+
+
+def test_get_n_reads_fastq_four_extra_newlines():
+    with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.fastq') as f:
+        f.write("@SEQ_ID\n")
+        f.write("GATTACA\n")
+        f.write("+\n")
+        f.write("AAAAAAA\n")  # Ensure the file content is correct and ends with a newline
+        f.write("\n\n\n\n\n\n\n\n")  # Ensure the file content is correct and ends with a newline
+        f.flush()  # Flush the content to disk
+        os.fsync(f.fileno())  # Ensure all internal buffers associated with the file are written to disk
+
+    # Since the file needs to be read by another process, we ensure it's closed and written before passing it
+    assert CRISPRessoShared.get_n_reads_fastq(f.name) == 1
+
+    # Clean up: delete the file after the test
+    os.remove(f.name)
+
+
+def test_get_n_reads_fastq_100_reads():
+    with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.fastq') as f:
+        for i in range(100):
+            f.write("@SEQ_ID\n")
+            f.write("GATTACA\n")
+            f.write("+\n")
+            f.write("AAAAAAA\n")
+        f.flush()  # Flush the content to disk
+        os.fsync(f.fileno())  # Ensure all internal buffers associated with the file are written to disk
+
+    # Since the file needs to be read by another process, we ensure it's closed and written before passing it
+    assert CRISPRessoShared.get_n_reads_fastq(f.name) == 100
+
+    # Clean up: delete the file after the test
+    os.remove(f.name)
+
+def test_get_n_reads_fastq_no_newline():
+    with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.fastq') as f:
+        f.write("@SEQ_ID\n")
+        f.write("GATTACA\n")
+        f.write("+\n")
+        f.write("AAAAAAA")  # Ensure the file content is correct and ends with a newline
+        f.flush()  # Flush the content to disk
+        os.fsync(f.fileno())  # Ensure all internal buffers associated with the file are written to disk
+
+    # Since the file needs to be read by another process, we ensure it's closed and written before passing it
+    assert CRISPRessoShared.get_n_reads_fastq(f.name) == 1
+
+    # Clean up: delete the file after the test
+    os.remove(f.name)
+
+
+def test_get_n_reads_fastq_empty_file():
+    with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.fastq') as f:
+        f.flush()  # Flush the content to disk
+        os.fsync(f.fileno())  # Ensure all internal buffers associated with the file are written to disk
+
+    # Since the file needs to be read by another process, we ensure it's closed and written before passing it
+    assert CRISPRessoShared.get_n_reads_fastq(f.name) == 0
+
+    # Clean up: delete the file after the test
+    os.remove(f.name)
+
+
 def test_get_relative_coordinates_to_gap():
     # unaligned sequences
     seq_1 = 'TTCGT'
@@ -98,3 +215,4 @@ def test_get_quant_window_ranges_from_include_idxs_single_gap():
 def test_get_quant_window_ranges_from_include_idxs_multiple_gaps():
     include_idxs = [50, 51, 52, 53, 55, 56, 57, 58, 60]
     assert CRISPRessoShared.get_quant_window_ranges_from_include_idxs(include_idxs) == [(50, 53), (55, 58), (60, 60)]
+