From 01940aad96195d05b952cfe8a73f2cf53f48a644 Mon Sep 17 00:00:00 2001
From: npbhavya <npbhavya@gmail.com>
Date: Thu, 1 Aug 2024 10:24:27 +0930
Subject: [PATCH] smaller fixes to the summary.py

---
 sphae/workflow/rules/3.qc_qa.smk             | 30 ++++++----------
 sphae/workflow/scripts/pick_phage_contigs.py |  7 ++--
 sphae/workflow/scripts/summary.py            | 38 ++++++++++++--------
 3 files changed, 40 insertions(+), 35 deletions(-)

diff --git a/sphae/workflow/rules/3.qc_qa.smk b/sphae/workflow/rules/3.qc_qa.smk
index f265516..a5f59de 100644
--- a/sphae/workflow/rules/3.qc_qa.smk
+++ b/sphae/workflow/rules/3.qc_qa.smk
@@ -71,25 +71,21 @@ rule run_seqkit_short:
     threads: 
         config['resources']['smalljob']['cpu'],
     log:
-        os.path.join(dir_log, "seqkit", "{sample}_seqkit_short.log"),
+        os.path.join(dir_log, "seqkit", "{sample}_bases_short.log"),
     shell:
         """
-        if [[ -s input.r1 ]]; then
-            seqkit stats {input.r1} -T > {params.r1_temp}
-            seqkit stats {input.r2} -T > {params.r2_temp}
+        seqkit stats {input.r1} -T > {params.r1_temp}
+        seqkit stats {input.r2} -T > {params.r2_temp}
 
-            # Extract numeric values from the second line of the output files
-            value1=$(awk 'NR==2 {{print $5}}' {params.r1_temp})
-            value2=$(awk 'NR==2 {{print $5}}' {params.r2_temp})
+        # Extract numeric values from the second line of the output files
+        value1=$(awk 'NR==2 {{print $5}}' {params.r1_temp})
+        value2=$(awk 'NR==2 {{print $5}}' {params.r2_temp})
 
-            # Add the values
-            sum=$((value1 + value2))
+        # Add the values
+        sum=$((value1 + value2))
 
-            # Write the sum to output.r
-            echo $sum > {output.r}
-        else
-            echo 0 > {output.r}
-        fi
+        # Write the sum to output.r
+        echo $sum > {output.r}
         """
 
 rule filtlong_long:
@@ -135,9 +131,5 @@ rule run_seqkit_long:
         os.path.join(dir_log, "{sample}_long_seqkit.log"),
     shell:
         """
-        if [[ -s {input.fastq} ]] ; then
-            seqkit stats {input.fastq} -T -N 50 -N 90 | awk 'NR==2 {{print $5}}' > {output.r}
-
-        fi
-        echo 0 > {output.r}
+        seqkit stats {input.fastq} -T -N 50 -N 90 | awk 'NR==2 {{print $5}}' > {output.r}
         """
diff --git a/sphae/workflow/scripts/pick_phage_contigs.py b/sphae/workflow/scripts/pick_phage_contigs.py
index 2a071aa..a373d3e 100755
--- a/sphae/workflow/scripts/pick_phage_contigs.py
+++ b/sphae/workflow/scripts/pick_phage_contigs.py
@@ -17,8 +17,8 @@ def picking_contigs(file,out):
         datav = data[data["Length_x"] > 1000]
         datav = datav[datav["Prediction"] == "Virus"]
         datac = datav[datav["completeness"]> 70.00]
-        print (len(data))
-        print (data)
+        #print (len(data))
+        #print (datac)
     else:
         open(out, 'a').close()
         return None
@@ -38,12 +38,15 @@ def picking_contigs(file,out):
         #print ("entering this if statement")
         if (datac["Connections"] == 0).any():
             print("The genome is assembled, yay!")
+            #print("writing to", out)
             datac.to_csv(out, index=False)
         else:
             print ("The genome has some regions that are frgamented, but mostly assembled")
             print ("Take a look at the assembly graph file in bandage for more information on where the genome is fragmented")
             datac.to_csv(out, index=False)
 
+    with open(out, 'a'):
+        os.utime(out, None)
 
 picking_contigs(snakemake.input.csv, snakemake.output.out)
 
diff --git a/sphae/workflow/scripts/summary.py b/sphae/workflow/scripts/summary.py
index 7c033a8..60a7550 100755
--- a/sphae/workflow/scripts/summary.py
+++ b/sphae/workflow/scripts/summary.py
@@ -56,27 +56,23 @@ def count_hypothetical_proteins(gbk_file):
                         count += 1
     return count
 
-def get_num(params):
-    length=(input_files['read'])
-    sum_len_value = length[0]
-
-    return sum_len_value
-
 def write_single_genome_summary(input_files, summary):
     with open(input_files['table'], 'r') as table_file:
         lines = table_file.readlines()
         if len(lines) > 1:  # Ensure there is at least one data line
             fields = lines[1].strip().split(',')  # Split the second line into fields
 
-            read_length = get_num(params)
-            summary.write(f"Total length of reads after QC and subsampling:{read_length}\n")
+            filename = input_files['read']
+            with open(filename, 'r') as file:
+                contents = file.read()
+            summary.write(f"Total length of reads after QC and subsampling: {contents}\n")
         
         # Ensure fields list has enough elements to access the desired indices
             if len(fields) >= 23:
                 summary.write(f"Length: {lines[1].split(',')[2]}\n")
 
                 if fields[3].strip() == 'True':
-                    summary.write("Circular: True\t")
+                    summary.write("Circular: True\n")
                 else:
                     summary.write("Circular: False\n")
 
@@ -179,9 +175,9 @@ def write_single_genome_summary(input_files, summary):
 
         #CRISPR spacers 
         if (len(open(input_files['spacers']).readlines()) == 0) and (len(open(input_files['acr']).readlines()) == 0):
-            summary.write("No CRISPR spacers or anti CRISPR spacers found\n")
+            summary.write("No anti CRISPR spacers found\n")
         elif (len(open(input_files['spacers']).readlines()) != 0):
-            summary.write("CRISPR spacers found\n")
+            summary.write("anti-CRISPR spacers found\n")
             shutil.copy(input_files['spacers'], outdir)
         elif (len(open(input_files['acr']).readlines()) != 0):
             summary.write("anti-CRISPR spacers found\n")
@@ -197,8 +193,10 @@ def write_single_genome_summary(input_files, summary):
 def write_multiple_genome_summary(input_files, summary):
     summary.write("Multiple phages assembled from this sample\n")
     summary.write("Their characteristics are:\n")
-    read_length = get_num(params)
-    summary.write(f"Total length of reads after QC and subsampling:{read_length}\n")
+    filename = input_files['read']
+    with open(filename, 'r') as file:
+        contents = file.read()
+    summary.write(f"Total length of reads after QC and subsampling: {contents}\n")
     summary.write("\n\n")
 
     with open(input_files['table'], 'r') as table_file:
@@ -336,7 +334,7 @@ def write_multiple_genome_summary(input_files, summary):
                 out_spacers = f"{outdir}/{samplenames}_pharokka_crispr.tsv"
                 out_acr = f"{outdir}/{samplenames}_phold_acr.tsv"
                 if spacers_lines_count == 0 and  phold_lines_count == 0:
-                    summary.write("No CRISPR spacers or anti CRISPR spacers found\n")
+                    summary.write("No anti CRISPR spacers found\n")
                 elif spacers_lines_count != 0:
                     summary.write("anti-CRISPR spacers found\n")
                     shutil.copy(spacers_pattern, out_spacers)
@@ -381,14 +379,26 @@ def analyze_assembly(input_files, output_summary, params):
             elif line_count == 1 :
                 with open(output_summary, 'w') as summary:
                     summary.write(f"Sample: {params['sample']}\n")
+                    filename = input_files['read']
+                    with open(filename, 'r') as file:
+                        contents = file.read()
+                    summary.write(f"Total length of reads after QC and subsampling: {contents}\n")
                     summary.write("No contigs from the assembly were assigned viral, likely contigs too short in size\n")
         else:
             with open(output_summary, 'w') as summary:
                 summary.write(f"Sample: {params['sample']}\n")
+                filename = input_files['read']
+                with open(filename, 'r') as file:
+                    contents = file.read()
+                summary.write(f"Total length of reads after QC and subsampling: {contents}\n")
                 summary.write("No contigs from the assembly were assigned viral, likely contigs too short in size\n")
     else:
         with open(output_summary, 'w') as summary:
             summary.write(f"Sample: {params['sample']}\n")
+            filename = input_files['read']
+            with open(filename, 'r') as file:
+                contents = file.read()
+            summary.write(f"Total length of reads after QC and subsampling: {contents}\n")
             summary.write("Failed during assembly\n")            
 
 # Replace input_files and output_params with the actual paths to your input/output files and parameters