From 01940aad96195d05b952cfe8a73f2cf53f48a644 Mon Sep 17 00:00:00 2001 From: npbhavya Date: Thu, 1 Aug 2024 10:24:27 +0930 Subject: [PATCH] smaller fixes to the summary.py --- sphae/workflow/rules/3.qc_qa.smk | 30 ++++++---------- sphae/workflow/scripts/pick_phage_contigs.py | 7 ++-- sphae/workflow/scripts/summary.py | 38 ++++++++++++-------- 3 files changed, 40 insertions(+), 35 deletions(-) diff --git a/sphae/workflow/rules/3.qc_qa.smk b/sphae/workflow/rules/3.qc_qa.smk index f265516..a5f59de 100644 --- a/sphae/workflow/rules/3.qc_qa.smk +++ b/sphae/workflow/rules/3.qc_qa.smk @@ -71,25 +71,21 @@ rule run_seqkit_short: threads: config['resources']['smalljob']['cpu'], log: - os.path.join(dir_log, "seqkit", "{sample}_seqkit_short.log"), + os.path.join(dir_log, "seqkit", "{sample}_bases_short.log"), shell: """ - if [[ -s input.r1 ]]; then - seqkit stats {input.r1} -T > {params.r1_temp} - seqkit stats {input.r2} -T > {params.r2_temp} + seqkit stats {input.r1} -T > {params.r1_temp} + seqkit stats {input.r2} -T > {params.r2_temp} - # Extract numeric values from the second line of the output files - value1=$(awk 'NR==2 {{print $5}}' {params.r1_temp}) - value2=$(awk 'NR==2 {{print $5}}' {params.r2_temp}) + # Extract numeric values from the second line of the output files + value1=$(awk 'NR==2 {{print $5}}' {params.r1_temp}) + value2=$(awk 'NR==2 {{print $5}}' {params.r2_temp}) - # Add the values - sum=$((value1 + value2)) + # Add the values + sum=$((value1 + value2)) - # Write the sum to output.r - echo $sum > {output.r} - else - echo 0 > {output.r} - fi + # Write the sum to output.r + echo $sum > {output.r} """ rule filtlong_long: @@ -135,9 +131,5 @@ rule run_seqkit_long: os.path.join(dir_log, "{sample}_long_seqkit.log"), shell: """ - if [[ -s {input.fastq} ]] ; then - seqkit stats {input.fastq} -T -N 50 -N 90 | awk 'NR==2 {{print $5}}' > {output.r} - - fi - echo 0 > {output.r} + seqkit stats {input.fastq} -T -N 50 -N 90 | awk 'NR==2 {{print $5}}' > {output.r} """ diff --git a/sphae/workflow/scripts/pick_phage_contigs.py b/sphae/workflow/scripts/pick_phage_contigs.py index 2a071aa..a373d3e 100755 --- a/sphae/workflow/scripts/pick_phage_contigs.py +++ b/sphae/workflow/scripts/pick_phage_contigs.py @@ -17,8 +17,8 @@ def picking_contigs(file,out): datav = data[data["Length_x"] > 1000] datav = datav[datav["Prediction"] == "Virus"] datac = datav[datav["completeness"]> 70.00] - print (len(data)) - print (data) + #print (len(data)) + #print (datac) else: open(out, 'a').close() return None @@ -38,12 +38,15 @@ def picking_contigs(file,out): #print ("entering this if statement") if (datac["Connections"] == 0).any(): print("The genome is assembled, yay!") + #print("writing to", out) datac.to_csv(out, index=False) else: print ("The genome has some regions that are frgamented, but mostly assembled") print ("Take a look at the assembly graph file in bandage for more information on where the genome is fragmented") datac.to_csv(out, index=False) + with open(out, 'a'): + os.utime(out, None) picking_contigs(snakemake.input.csv, snakemake.output.out) diff --git a/sphae/workflow/scripts/summary.py b/sphae/workflow/scripts/summary.py index 7c033a8..60a7550 100755 --- a/sphae/workflow/scripts/summary.py +++ b/sphae/workflow/scripts/summary.py @@ -56,27 +56,23 @@ def count_hypothetical_proteins(gbk_file): count += 1 return count -def get_num(params): - length=(input_files['read']) - sum_len_value = length[0] - - return sum_len_value - def write_single_genome_summary(input_files, summary): with open(input_files['table'], 'r') as table_file: lines = table_file.readlines() if len(lines) > 1: # Ensure there is at least one data line fields = lines[1].strip().split(',') # Split the second line into fields - read_length = get_num(params) - summary.write(f"Total length of reads after QC and subsampling:{read_length}\n") + filename = input_files['read'] + with open(filename, 'r') as file: + contents = file.read() + summary.write(f"Total length of reads after QC and subsampling: {contents}\n") # Ensure fields list has enough elements to access the desired indices if len(fields) >= 23: summary.write(f"Length: {lines[1].split(',')[2]}\n") if fields[3].strip() == 'True': - summary.write("Circular: True\t") + summary.write("Circular: True\n") else: summary.write("Circular: False\n") @@ -179,9 +175,9 @@ def write_single_genome_summary(input_files, summary): #CRISPR spacers if (len(open(input_files['spacers']).readlines()) == 0) and (len(open(input_files['acr']).readlines()) == 0): - summary.write("No CRISPR spacers or anti CRISPR spacers found\n") + summary.write("No anti CRISPR spacers found\n") elif (len(open(input_files['spacers']).readlines()) != 0): - summary.write("CRISPR spacers found\n") + summary.write("anti-CRISPR spacers found\n") shutil.copy(input_files['spacers'], outdir) elif (len(open(input_files['acr']).readlines()) != 0): summary.write("anti-CRISPR spacers found\n") @@ -197,8 +193,10 @@ def write_single_genome_summary(input_files, summary): def write_multiple_genome_summary(input_files, summary): summary.write("Multiple phages assembled from this sample\n") summary.write("Their characteristics are:\n") - read_length = get_num(params) - summary.write(f"Total length of reads after QC and subsampling:{read_length}\n") + filename = input_files['read'] + with open(filename, 'r') as file: + contents = file.read() + summary.write(f"Total length of reads after QC and subsampling: {contents}\n") summary.write("\n\n") with open(input_files['table'], 'r') as table_file: @@ -336,7 +334,7 @@ def write_multiple_genome_summary(input_files, summary): out_spacers = f"{outdir}/{samplenames}_pharokka_crispr.tsv" out_acr = f"{outdir}/{samplenames}_phold_acr.tsv" if spacers_lines_count == 0 and phold_lines_count == 0: - summary.write("No CRISPR spacers or anti CRISPR spacers found\n") + summary.write("No anti CRISPR spacers found\n") elif spacers_lines_count != 0: summary.write("anti-CRISPR spacers found\n") shutil.copy(spacers_pattern, out_spacers) @@ -381,14 +379,26 @@ def analyze_assembly(input_files, output_summary, params): elif line_count == 1 : with open(output_summary, 'w') as summary: summary.write(f"Sample: {params['sample']}\n") + filename = input_files['read'] + with open(filename, 'r') as file: + contents = file.read() + summary.write(f"Total length of reads after QC and subsampling: {contents}\n") summary.write("No contigs from the assembly were assigned viral, likely contigs too short in size\n") else: with open(output_summary, 'w') as summary: summary.write(f"Sample: {params['sample']}\n") + filename = input_files['read'] + with open(filename, 'r') as file: + contents = file.read() + summary.write(f"Total length of reads after QC and subsampling: {contents}\n") summary.write("No contigs from the assembly were assigned viral, likely contigs too short in size\n") else: with open(output_summary, 'w') as summary: summary.write(f"Sample: {params['sample']}\n") + filename = input_files['read'] + with open(filename, 'r') as file: + contents = file.read() + summary.write(f"Total length of reads after QC and subsampling: {contents}\n") summary.write("Failed during assembly\n") # Replace input_files and output_params with the actual paths to your input/output files and parameters