Skip to content

Commit

Permalink
Fixing bug that miscounts number of reads if extra newline characters…
Browse files Browse the repository at this point in the history
… are added or omitted from end of file
  • Loading branch information
trevormartinj7 committed Oct 11, 2024
1 parent 14139c3 commit ac7acf9
Show file tree
Hide file tree
Showing 5 changed files with 135 additions and 16 deletions.
1 change: 1 addition & 0 deletions .github/workflows/integration_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ jobs:
uses: actions/checkout@v3
with:
repository: edilytics/CRISPResso2_tests
ref: "trevor/get_n_reads_fix"
# ref: '<BRANCH-NAME>' # update to specific branch

- name: Run Basic
Expand Down
8 changes: 2 additions & 6 deletions CRISPResso2/CRISPRessoCORE.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,10 +138,6 @@ def get_avg_read_length_fastq(fastq_filename):
p = sb.Popen(cmd, shell=True, stdout=sb.PIPE)
return int(p.communicate()[0].strip())

def get_n_reads_fastq(fastq_filename):
p = sb.Popen(('z' if fastq_filename.endswith('.gz') else '' ) +"cat < \"%s\" | wc -l" % fastq_filename, shell=True, stdout=sb.PIPE)
return int(float(p.communicate()[0])/4.0)

def get_n_reads_bam(bam_filename,bam_chr_loc=""):
cmd = "samtools view -c " + bam_filename + " " + bam_chr_loc
p = sb.Popen(cmd, shell=True, stdout=sb.PIPE)
Expand Down Expand Up @@ -2458,7 +2454,7 @@ def get_prime_editing_guides(this_amp_seq, this_amp_name, ref0_seq, prime_edited

N_READS_INPUT = 0
if args.fastq_r1:
N_READS_INPUT = get_n_reads_fastq(args.fastq_r1)
N_READS_INPUT = CRISPRessoShared.get_n_reads_fastq(args.fastq_r1)
elif args.bam_input:
N_READS_INPUT = get_n_reads_bam(args.bam_input, args.bam_chr_loc)

Expand Down Expand Up @@ -2620,7 +2616,7 @@ def get_prime_editing_guides(this_amp_seq, this_amp_name, ref0_seq, prime_edited
if args.bam_input:
N_READS_AFTER_PREPROCESSING = N_READS_INPUT
else:
N_READS_AFTER_PREPROCESSING=get_n_reads_fastq(processed_output_filename)
N_READS_AFTER_PREPROCESSING=CRISPRessoShared.get_n_reads_fastq(processed_output_filename)
if N_READS_AFTER_PREPROCESSING == 0:
raise CRISPRessoShared.NoReadsAfterQualityFilteringException('No reads in input or no reads survived the average or single bp quality filtering.')

Expand Down
15 changes: 5 additions & 10 deletions CRISPResso2/CRISPRessoPooledCORE.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,11 +145,6 @@ def get_read_length_from_cigar(cigar_string):
result += int(length)
return result

def get_n_reads_fastq(fastq_filename):
p = sb.Popen(('z' if fastq_filename.endswith('.gz') else '' ) +"cat < %s | wc -l" % fastq_filename, shell=True, stdout=sb.PIPE)
n_reads = int(float(p.communicate()[0])/4.0)
return n_reads

def get_n_reads_bam(bam_filename):
p = sb.Popen("samtools view -c %s" % bam_filename, shell=True, stdout=sb.PIPE)
return int(p.communicate()[0])
Expand Down Expand Up @@ -611,10 +606,10 @@ def main():
N_READS_INPUT = get_n_reads_bam(args.aligned_pooled_bam)
N_READS_AFTER_PREPROCESSING = N_READS_INPUT
else:
N_READS_INPUT = get_n_reads_fastq(args.fastq_r1)
N_READS_INPUT = CRISPRessoShared.get_n_reads_fastq(args.fastq_r1)
if args.split_interleaved_input:
N_READS_INPUT /= 2
N_READS_AFTER_PREPROCESSING = get_n_reads_fastq(processed_output_filename)
N_READS_AFTER_PREPROCESSING = CRISPRessoShared.get_n_reads_fastq(processed_output_filename)

crispresso2_info['running_info']['finished_steps']['count_input_reads'] = (N_READS_INPUT, N_READS_AFTER_PREPROCESSING)
CRISPRessoShared.write_crispresso_info(
Expand Down Expand Up @@ -854,7 +849,7 @@ def main():
n_reads_aligned_amplicons=[]
crispresso_cmds = []
for idx, row in df_template.iterrows():
this_n_reads = get_n_reads_fastq(row['Demultiplexed_fastq.gz_filename'])
this_n_reads = CRISPRessoShared.get_n_reads_fastq(row['Demultiplexed_fastq.gz_filename'])
n_reads_aligned_amplicons.append(this_n_reads)
info('\n Processing:%s with %d reads'%(idx,this_n_reads))
this_amp_seq = row['amplicon_seq']
Expand Down Expand Up @@ -1554,8 +1549,8 @@ def rreplace(s, old, new):

#if many reads weren't aligned, print those out for the user
if RUNNING_MODE != 'ONLY_GENOME':
#N_READS_INPUT=get_n_reads_fastq(args.fastq_r1)
#N_READS_AFTER_PREPROCESSING=get_n_reads_fastq(processed_output_filename)
#N_READS_INPUT=CRISPRessoShared.get_n_reads_fastq(args.fastq_r1)
#N_READS_AFTER_PREPROCESSING=CRISPRessoShared.get_n_reads_fastq(processed_output_filename)
tot_reads_aligned = df_summary_quantification['Reads_aligned'].fillna(0).sum()
tot_reads = df_summary_quantification['Reads_total'].sum()

Expand Down
9 changes: 9 additions & 0 deletions CRISPResso2/CRISPRessoShared.py
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,15 @@ def assert_fastq_format(file_path, max_lines_to_check=100):
raise InputFileFormatException('File %s is not in fastq format!' % (file_path)) from e


def get_n_reads_fastq(fastq_filename):
if not os.path.exists(fastq_filename) or os.path.getsize(fastq_filename) == 0:
return 0

p = sb.Popen(('z' if fastq_filename.endswith('.gz') else '' ) +"cat < %s | grep -c ." % fastq_filename, shell=True, stdout=sb.PIPE)
n_reads = int(float(p.communicate()[0])/4.0)
return n_reads


def check_output_folder(output_folder):
"""
Checks to see that the CRISPResso run has completed, and gathers the amplicon info for that run
Expand Down
118 changes: 118 additions & 0 deletions tests/unit_tests/test_CRISPRessoShared.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from CRISPResso2 import CRISPResso2Align, CRISPRessoShared
import tempfile
import os
import gzip

ALN_MATRIX = CRISPResso2Align.read_matrix('./CRISPResso2/EDNAFULL')

Expand Down Expand Up @@ -28,6 +31,120 @@ def test_get_relative_coordinates():
assert s1inds_gap_right == [0, 1, 2, 3, 4]


def test_get_n_reads_fastq():
with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.fastq') as f:
f.write("@SEQ_ID\n")
f.write("GATTACA\n")
f.write("+\n")
f.write("AAAAAAA\n") # Ensure the file content is correct and ends with a newline
f.flush() # Flush the content to disk
os.fsync(f.fileno()) # Ensure all internal buffers associated with the file are written to disk

# Since the file needs to be read by another process, we ensure it's closed and written before passing it
assert CRISPRessoShared.get_n_reads_fastq(f.name) == 1

# Clean up: delete the file after the test
os.remove(f.name)

def test_get_n_reads_fastq_gzip():
with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.fastq') as f:
f.write("@SEQ_ID\n")
f.write("GATTACA\n")
f.write("+\n")
f.write("AAAAAAA\n") # Ensure the file content is correct and ends with a newline
f.flush() # Flush the content to disk
os.fsync(f.fileno()) # Ensure all internal buffers associated with the file are written to disk

# gzip file
with open(f.name, 'rb') as f_in, gzip.open(f.name + '.gz', 'wb') as f_out:
f_out.writelines(f_in)

# Since the file needs to be read by another process, we ensure it's closed and written before passing it
assert CRISPRessoShared.get_n_reads_fastq(f.name + '.gz') == 1

# Clean up: delete the file after the test
os.remove(f.name)
os.remove(f.name + '.gz')


def test_get_n_reads_fastq_three_extra_newlines():
with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.fastq') as f:
f.write("@SEQ_ID\n")
f.write("GATTACA\n")
f.write("+\n")
f.write("AAAAAAA\n") # Ensure the file content is correct and ends with a newline
f.write("\n\n") # Ensure the file content is correct and ends with a newline
f.flush() # Flush the content to disk
os.fsync(f.fileno()) # Ensure all internal buffers associated with the file are written to disk

# Since the file needs to be read by another process, we ensure it's closed and written before passing it
assert CRISPRessoShared.get_n_reads_fastq(f.name) == 1

# Clean up: delete the file after the test
os.remove(f.name)


def test_get_n_reads_fastq_four_extra_newlines():
with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.fastq') as f:
f.write("@SEQ_ID\n")
f.write("GATTACA\n")
f.write("+\n")
f.write("AAAAAAA\n") # Ensure the file content is correct and ends with a newline
f.write("\n\n\n\n\n\n\n\n") # Ensure the file content is correct and ends with a newline
f.flush() # Flush the content to disk
os.fsync(f.fileno()) # Ensure all internal buffers associated with the file are written to disk

# Since the file needs to be read by another process, we ensure it's closed and written before passing it
assert CRISPRessoShared.get_n_reads_fastq(f.name) == 1

# Clean up: delete the file after the test
os.remove(f.name)


def test_get_n_reads_fastq_100_reads():
with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.fastq') as f:
for i in range(100):
f.write("@SEQ_ID\n")
f.write("GATTACA\n")
f.write("+\n")
f.write("AAAAAAA\n")
f.flush() # Flush the content to disk
os.fsync(f.fileno()) # Ensure all internal buffers associated with the file are written to disk

# Since the file needs to be read by another process, we ensure it's closed and written before passing it
assert CRISPRessoShared.get_n_reads_fastq(f.name) == 100

# Clean up: delete the file after the test
os.remove(f.name)

def test_get_n_reads_fastq_no_newline():
with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.fastq') as f:
f.write("@SEQ_ID\n")
f.write("GATTACA\n")
f.write("+\n")
f.write("AAAAAAA") # Ensure the file content is correct and ends with a newline
f.flush() # Flush the content to disk
os.fsync(f.fileno()) # Ensure all internal buffers associated with the file are written to disk

# Since the file needs to be read by another process, we ensure it's closed and written before passing it
assert CRISPRessoShared.get_n_reads_fastq(f.name) == 1

# Clean up: delete the file after the test
os.remove(f.name)


def test_get_n_reads_fastq_empty_file():
with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.fastq') as f:
f.flush() # Flush the content to disk
os.fsync(f.fileno()) # Ensure all internal buffers associated with the file are written to disk

# Since the file needs to be read by another process, we ensure it's closed and written before passing it
assert CRISPRessoShared.get_n_reads_fastq(f.name) == 0

# Clean up: delete the file after the test
os.remove(f.name)


def test_get_relative_coordinates_to_gap():
# unaligned sequences
seq_1 = 'TTCGT'
Expand Down Expand Up @@ -98,3 +215,4 @@ def test_get_quant_window_ranges_from_include_idxs_single_gap():
def test_get_quant_window_ranges_from_include_idxs_multiple_gaps():
include_idxs = [50, 51, 52, 53, 55, 56, 57, 58, 60]
assert CRISPRessoShared.get_quant_window_ranges_from_include_idxs(include_idxs) == [(50, 53), (55, 58), (60, 60)]

0 comments on commit ac7acf9

Please sign in to comment.