Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix get_n_fastq function #108

Merged
merged 9 commits into from
Dec 13, 2024
8 changes: 2 additions & 6 deletions CRISPResso2/CRISPRessoCORE.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,10 +138,6 @@ def get_avg_read_length_fastq(fastq_filename):
p = sb.Popen(cmd, shell=True, stdout=sb.PIPE)
return int(p.communicate()[0].strip())

def get_n_reads_fastq(fastq_filename):
p = sb.Popen(('z' if fastq_filename.endswith('.gz') else '' ) +"cat < \"%s\" | wc -l" % fastq_filename, shell=True, stdout=sb.PIPE)
return int(float(p.communicate()[0])/4.0)

def get_n_reads_bam(bam_filename,bam_chr_loc=""):
cmd = "samtools view -c " + bam_filename + " " + bam_chr_loc
p = sb.Popen(cmd, shell=True, stdout=sb.PIPE)
Expand Down Expand Up @@ -2458,7 +2454,7 @@ def get_prime_editing_guides(this_amp_seq, this_amp_name, ref0_seq, prime_edited

N_READS_INPUT = 0
if args.fastq_r1:
N_READS_INPUT = get_n_reads_fastq(args.fastq_r1)
N_READS_INPUT = CRISPRessoShared.get_n_reads_fastq(args.fastq_r1)
elif args.bam_input:
N_READS_INPUT = get_n_reads_bam(args.bam_input, args.bam_chr_loc)

Expand Down Expand Up @@ -2620,7 +2616,7 @@ def get_prime_editing_guides(this_amp_seq, this_amp_name, ref0_seq, prime_edited
if args.bam_input:
N_READS_AFTER_PREPROCESSING = N_READS_INPUT
else:
N_READS_AFTER_PREPROCESSING=get_n_reads_fastq(processed_output_filename)
N_READS_AFTER_PREPROCESSING=CRISPRessoShared.get_n_reads_fastq(processed_output_filename)
if N_READS_AFTER_PREPROCESSING == 0:
raise CRISPRessoShared.NoReadsAfterQualityFilteringException('No reads in input or no reads survived the average or single bp quality filtering.')

Expand Down
15 changes: 4 additions & 11 deletions CRISPResso2/CRISPRessoPooledCORE.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,11 +145,6 @@ def get_read_length_from_cigar(cigar_string):
result += int(length)
return result

def get_n_reads_fastq(fastq_filename):
p = sb.Popen(('z' if fastq_filename.endswith('.gz') else '' ) +"cat < %s | wc -l" % fastq_filename, shell=True, stdout=sb.PIPE)
n_reads = int(float(p.communicate()[0])/4.0)
return n_reads

def get_n_reads_bam(bam_filename):
p = sb.Popen("samtools view -c %s" % bam_filename, shell=True, stdout=sb.PIPE)
return int(p.communicate()[0])
Expand Down Expand Up @@ -629,10 +624,10 @@ def main():
N_READS_INPUT = get_n_reads_bam(args.aligned_pooled_bam)
N_READS_AFTER_PREPROCESSING = N_READS_INPUT
else:
N_READS_INPUT = get_n_reads_fastq(args.fastq_r1)
N_READS_INPUT = CRISPRessoShared.get_n_reads_fastq(args.fastq_r1)
if args.split_interleaved_input:
N_READS_INPUT /= 2
N_READS_AFTER_PREPROCESSING = get_n_reads_fastq(processed_output_filename)
N_READS_AFTER_PREPROCESSING = CRISPRessoShared.get_n_reads_fastq(processed_output_filename)

crispresso2_info['running_info']['finished_steps']['count_input_reads'] = (N_READS_INPUT, N_READS_AFTER_PREPROCESSING)
CRISPRessoShared.write_crispresso_info(
Expand Down Expand Up @@ -880,7 +875,7 @@ def main():
n_reads_aligned_amplicons=[]
crispresso_cmds = []
for idx, row in df_template.iterrows():
this_n_reads = get_n_reads_fastq(row['Demultiplexed_fastq.gz_filename'])
this_n_reads = CRISPRessoShared.get_n_reads_fastq(row['Demultiplexed_fastq.gz_filename'])
n_reads_aligned_amplicons.append(this_n_reads)
info('\n Processing:%s with %d reads'%(idx,this_n_reads))
this_amp_seq = row['amplicon_seq']
Expand Down Expand Up @@ -1159,7 +1154,7 @@ def rreplace(s, old, new):
END{ \
if (fastq_filename!="NA") {if (num_records < __MIN_READS__){\
record_log_str = record_log_str chr_id"\t"bpstart"\t"bpend"\t"num_records"\tNA\n"} \
else{printf("%s",fastq_records)>fastq_filename;close(fastq_filename); system("gzip -f "fastq_filename); record_log_str = record_log_str chr_id"\t"bpstart"\t"bpend"\t"num_records"\t"fastq_filename".gz\n"} \
else{print(fastq_records)>fastq_filename;close(fastq_filename); system("gzip -f "fastq_filename); record_log_str = record_log_str chr_id"\t"bpstart"\t"bpend"\t"num_records"\t"fastq_filename".gz\n"} \
}\
print record_log_str > "__DEMUX_CHR_LOGFILENAME__" \
}' '''
Expand Down Expand Up @@ -1579,8 +1574,6 @@ def rreplace(s, old, new):

#if many reads weren't aligned, print those out for the user
if RUNNING_MODE != 'ONLY_GENOME':
#N_READS_INPUT=get_n_reads_fastq(args.fastq_r1)
Colelyman marked this conversation as resolved.
Show resolved Hide resolved
#N_READS_AFTER_PREPROCESSING=get_n_reads_fastq(processed_output_filename)
tot_reads_aligned = df_summary_quantification['Reads_aligned'].fillna(0).sum()
tot_reads = df_summary_quantification['Reads_total'].sum()

Expand Down
9 changes: 9 additions & 0 deletions CRISPResso2/CRISPRessoShared.py
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,15 @@ def assert_fastq_format(file_path, max_lines_to_check=100):
raise InputFileFormatException('File %s is not in fastq format!' % (file_path)) from e


def get_n_reads_fastq(fastq_filename):
if not os.path.exists(fastq_filename) or os.path.getsize(fastq_filename) == 0:
return 0

p = sb.Popen(('z' if fastq_filename.endswith('.gz') else '' ) +"cat < %s | grep -c ." % fastq_filename, shell=True, stdout=sb.PIPE)
n_reads = int(float(p.communicate()[0])/4.0)
return n_reads


def check_output_folder(output_folder):
"""
Checks to see that the CRISPResso run has completed, and gathers the amplicon info for that run
Expand Down
118 changes: 118 additions & 0 deletions tests/unit_tests/test_CRISPRessoShared.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from CRISPResso2 import CRISPResso2Align, CRISPRessoShared
import tempfile
import os
import gzip

ALN_MATRIX = CRISPResso2Align.read_matrix('./CRISPResso2/EDNAFULL')

Expand Down Expand Up @@ -28,6 +31,120 @@ def test_get_relative_coordinates():
assert s1inds_gap_right == [0, 1, 2, 3, 4]


def test_get_n_reads_fastq():
with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.fastq') as f:
f.write("@SEQ_ID\n")
f.write("GATTACA\n")
f.write("+\n")
f.write("AAAAAAA\n") # Ensure the file content is correct and ends with a newline
f.flush() # Flush the content to disk
os.fsync(f.fileno()) # Ensure all internal buffers associated with the file are written to disk

# Since the file needs to be read by another process, we ensure it's closed and written before passing it
assert CRISPRessoShared.get_n_reads_fastq(f.name) == 1

# Clean up: delete the file after the test
os.remove(f.name)

def test_get_n_reads_fastq_gzip():
with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.fastq') as f:
f.write("@SEQ_ID\n")
f.write("GATTACA\n")
f.write("+\n")
f.write("AAAAAAA\n") # Ensure the file content is correct and ends with a newline
f.flush() # Flush the content to disk
os.fsync(f.fileno()) # Ensure all internal buffers associated with the file are written to disk

# gzip file
with open(f.name, 'rb') as f_in, gzip.open(f.name + '.gz', 'wb') as f_out:
f_out.writelines(f_in)

# Since the file needs to be read by another process, we ensure it's closed and written before passing it
assert CRISPRessoShared.get_n_reads_fastq(f.name + '.gz') == 1

# Clean up: delete the file after the test
os.remove(f.name)
os.remove(f.name + '.gz')


def test_get_n_reads_fastq_three_extra_newlines():
with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.fastq') as f:
f.write("@SEQ_ID\n")
f.write("GATTACA\n")
f.write("+\n")
f.write("AAAAAAA\n") # Ensure the file content is correct and ends with a newline
f.write("\n\n") # Ensure the file content is correct and ends with a newline
f.flush() # Flush the content to disk
os.fsync(f.fileno()) # Ensure all internal buffers associated with the file are written to disk

# Since the file needs to be read by another process, we ensure it's closed and written before passing it
assert CRISPRessoShared.get_n_reads_fastq(f.name) == 1

# Clean up: delete the file after the test
os.remove(f.name)


def test_get_n_reads_fastq_four_extra_newlines():
with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.fastq') as f:
f.write("@SEQ_ID\n")
f.write("GATTACA\n")
f.write("+\n")
f.write("AAAAAAA\n") # Ensure the file content is correct and ends with a newline
f.write("\n\n\n\n\n\n\n\n") # Ensure the file content is correct and ends with a newline
f.flush() # Flush the content to disk
os.fsync(f.fileno()) # Ensure all internal buffers associated with the file are written to disk

# Since the file needs to be read by another process, we ensure it's closed and written before passing it
assert CRISPRessoShared.get_n_reads_fastq(f.name) == 1

# Clean up: delete the file after the test
os.remove(f.name)


def test_get_n_reads_fastq_100_reads():
with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.fastq') as f:
for i in range(100):
f.write("@SEQ_ID\n")
f.write("GATTACA\n")
f.write("+\n")
f.write("AAAAAAA\n")
f.flush() # Flush the content to disk
os.fsync(f.fileno()) # Ensure all internal buffers associated with the file are written to disk

# Since the file needs to be read by another process, we ensure it's closed and written before passing it
assert CRISPRessoShared.get_n_reads_fastq(f.name) == 100

# Clean up: delete the file after the test
os.remove(f.name)

def test_get_n_reads_fastq_no_newline():
with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.fastq') as f:
f.write("@SEQ_ID\n")
f.write("GATTACA\n")
f.write("+\n")
f.write("AAAAAAA") # Ensure the file content is correct and ends with a newline
f.flush() # Flush the content to disk
os.fsync(f.fileno()) # Ensure all internal buffers associated with the file are written to disk

# Since the file needs to be read by another process, we ensure it's closed and written before passing it
assert CRISPRessoShared.get_n_reads_fastq(f.name) == 1

# Clean up: delete the file after the test
os.remove(f.name)


def test_get_n_reads_fastq_empty_file():
with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.fastq') as f:
f.flush() # Flush the content to disk
os.fsync(f.fileno()) # Ensure all internal buffers associated with the file are written to disk

# Since the file needs to be read by another process, we ensure it's closed and written before passing it
assert CRISPRessoShared.get_n_reads_fastq(f.name) == 0

# Clean up: delete the file after the test
os.remove(f.name)


def test_get_relative_coordinates_to_gap():
# unaligned sequences
seq_1 = 'TTCGT'
Expand Down Expand Up @@ -98,3 +215,4 @@ def test_get_quant_window_ranges_from_include_idxs_single_gap():
def test_get_quant_window_ranges_from_include_idxs_multiple_gaps():
include_idxs = [50, 51, 52, 53, 55, 56, 57, 58, 60]
assert CRISPRessoShared.get_quant_window_ranges_from_include_idxs(include_idxs) == [(50, 53), (55, 58), (60, 60)]

Loading