From 3eba6150965654a9009a8d893ceaf93fc1564ae4 Mon Sep 17 00:00:00 2001 From: Cole Lyman Date: Wed, 28 Aug 2024 16:33:48 -0400 Subject: [PATCH] Move `get_n_reads_fastq` to CRISPRessoShared --- CRISPResso2/CRISPRessoCORE.py | 7 ++----- CRISPResso2/CRISPRessoPooledCORE.py | 12 +++--------- CRISPResso2/CRISPRessoShared.py | 6 ++++++ CRISPResso2/CRISPRessoWGSCORE.py | 4 ---- 4 files changed, 11 insertions(+), 18 deletions(-) diff --git a/CRISPResso2/CRISPRessoCORE.py b/CRISPResso2/CRISPRessoCORE.py index ebdd4d23..ceabc274 100644 --- a/CRISPResso2/CRISPRessoCORE.py +++ b/CRISPResso2/CRISPRessoCORE.py @@ -139,9 +139,6 @@ def get_avg_read_length_fastq(fastq_filename): p = sb.Popen(cmd, shell=True, stdout=sb.PIPE) return int(p.communicate()[0].strip()) -def get_n_reads_fastq(fastq_filename): - p = sb.Popen('gunzip -c' if fastq_filename.endswith('.gz') else 'cat' + ' < "%s" | wc -l' % fastq_filename, shell=True, stdout=sb.PIPE) - return round(float(p.communicate()[0]) / 4.0) def get_n_reads_bam(bam_filename,bam_chr_loc=""): cmd = "samtools view -c " + bam_filename + " " + bam_chr_loc @@ -2458,7 +2455,7 @@ def get_prime_editing_guides(this_amp_seq, this_amp_name, ref0_seq, prime_edited N_READS_INPUT = 0 if args.fastq_r1: - N_READS_INPUT = get_n_reads_fastq(args.fastq_r1) + N_READS_INPUT = CRISPRessoShared.get_n_reads_fastq(args.fastq_r1) elif args.bam_input: N_READS_INPUT = get_n_reads_bam(args.bam_input, args.bam_chr_loc) @@ -2620,7 +2617,7 @@ def get_prime_editing_guides(this_amp_seq, this_amp_name, ref0_seq, prime_edited if args.bam_input: N_READS_AFTER_PREPROCESSING = N_READS_INPUT else: - N_READS_AFTER_PREPROCESSING=get_n_reads_fastq(processed_output_filename) + N_READS_AFTER_PREPROCESSING = CRISPRessoShared.get_n_reads_fastq(processed_output_filename) if N_READS_AFTER_PREPROCESSING == 0: raise CRISPRessoShared.NoReadsAfterQualityFilteringException('No reads in input or no reads survived the average or single bp quality filtering.') diff --git a/CRISPResso2/CRISPRessoPooledCORE.py b/CRISPResso2/CRISPRessoPooledCORE.py index 14184e13..2d9a00ed 100644 --- a/CRISPResso2/CRISPRessoPooledCORE.py +++ b/CRISPResso2/CRISPRessoPooledCORE.py @@ -145,10 +145,6 @@ def get_read_length_from_cigar(cigar_string): result += int(length) return result -def get_n_reads_fastq(fastq_filename): - p = sb.Popen(('z' if fastq_filename.endswith('.gz') else '' ) +"cat < %s | wc -l" % fastq_filename, shell=True, stdout=sb.PIPE) - n_reads = int(float(p.communicate()[0])/4.0) - return n_reads def get_n_reads_bam(bam_filename): p = sb.Popen("samtools view -c %s" % bam_filename, shell=True, stdout=sb.PIPE) @@ -611,10 +607,10 @@ def main(): N_READS_INPUT = get_n_reads_bam(args.aligned_pooled_bam) N_READS_AFTER_PREPROCESSING = N_READS_INPUT else: - N_READS_INPUT = get_n_reads_fastq(args.fastq_r1) + N_READS_INPUT = CRISPRessoShared.get_n_reads_fastq(args.fastq_r1) if args.split_interleaved_input: N_READS_INPUT /= 2 - N_READS_AFTER_PREPROCESSING = get_n_reads_fastq(processed_output_filename) + N_READS_AFTER_PREPROCESSING = CRISPRessoShared.get_n_reads_fastq(processed_output_filename) crispresso2_info['running_info']['finished_steps']['count_input_reads'] = (N_READS_INPUT, N_READS_AFTER_PREPROCESSING) CRISPRessoShared.write_crispresso_info( @@ -852,7 +848,7 @@ def main(): n_reads_aligned_amplicons=[] crispresso_cmds = [] for idx, row in df_template.iterrows(): - this_n_reads = get_n_reads_fastq(row['Demultiplexed_fastq.gz_filename']) + this_n_reads = CRISPRessoShared.get_n_reads_fastq(row['Demultiplexed_fastq.gz_filename']) n_reads_aligned_amplicons.append(this_n_reads) info('\n Processing:%s with %d reads'%(idx,this_n_reads)) this_amp_seq = row['amplicon_seq'] @@ -1552,8 +1548,6 @@ def rreplace(s, old, new): #if many reads weren't aligned, print those out for the user if RUNNING_MODE != 'ONLY_GENOME': - #N_READS_INPUT=get_n_reads_fastq(args.fastq_r1) - #N_READS_AFTER_PREPROCESSING=get_n_reads_fastq(processed_output_filename) tot_reads_aligned = df_summary_quantification['Reads_aligned'].fillna(0).sum() tot_reads = df_summary_quantification['Reads_total'].sum() diff --git a/CRISPResso2/CRISPRessoShared.py b/CRISPResso2/CRISPRessoShared.py index 78d2e052..4ab4ed0e 100644 --- a/CRISPResso2/CRISPRessoShared.py +++ b/CRISPResso2/CRISPRessoShared.py @@ -722,6 +722,12 @@ def get_command_output(command): break +def get_n_reads_fastq(fastq_filename): + """Count the number of reads from fastq_filename.""" + p = sb.Popen('gunzip -c' if fastq_filename.endswith('.gz') else 'cat' + ' < "%s" | wc -l' % fastq_filename, shell=True, stdout=sb.PIPE) + return round(float(p.communicate()[0]) / 4.0) + + def get_most_frequent_reads(fastq_r1, fastq_r2, number_of_reads_to_consider, fastp_command, min_paired_end_reads_overlap, split_interleaved_input=False, debug=False): """ Get the most frequent amplicon from a fastq file (or after merging a r1 and r2 fastq file). diff --git a/CRISPResso2/CRISPRessoWGSCORE.py b/CRISPResso2/CRISPRessoWGSCORE.py index a50b1668..10426c41 100644 --- a/CRISPResso2/CRISPRessoWGSCORE.py +++ b/CRISPResso2/CRISPRessoWGSCORE.py @@ -203,10 +203,6 @@ def write_trimmed_fastq(in_bam_filename, bpstart, bpend, out_fastq_filename): pd=check_library('pandas') np=check_library('numpy') -def get_n_reads_fastq(fastq_filename): - p = sb.Popen(('z' if fastq_filename.endswith('.gz') else '' ) +"cat < %s | wc -l" % fastq_filename, shell=True, stdout=sb.PIPE) - n_reads = int(float(p.communicate()[0])/4.0) - return n_reads def extract_reads(row): if row.sequence: