Skip to content

Commit

Permalink
a bunch of changes noted in Changes.md
Browse files Browse the repository at this point in the history
  • Loading branch information
npbhavya committed Jul 25, 2024
1 parent 75f555a commit 1f948b1
Show file tree
Hide file tree
Showing 18 changed files with 212 additions and 1,425 deletions.
13 changes: 11 additions & 2 deletions Changes.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@
- Updating the QC rule to touch the output file so the error is correctly recorded
- the annotated function table after the 3Ps weren't being generated, so added that in
- add the number of unknown proteins to the final summary file

- updated phynteny yaml file to pin the numpy version
- added a python script to the misc folder to merge the RESULTS to one tsv file
- adding the total read length to summary file
- updating the taxa description in the summary file to include the lowest taxa and the isolated host from the Pharokka INPHARED result


## v1.4.2
Expand All @@ -35,4 +38,10 @@ Sphae can be run on illumina reads and nanopore reads
- Assembly - [Megahit](https://github.com/voutcn/megahit) for Illumina reads and [Flye](https://github.com/fenderglass/Flye) + [Medaka](https://github.com/nanoporetech/medaka) for Nanopore reads
Post assembly the reads are run through [CheckV](https://bitbucket.org/berkeleylab/CheckV/src), [viraverify](https://github.com/ablab/viralVerify) and looking into graph connection to confirm the genome is assembled
- Annotation - [Pharokka](https://github.com/gbouras13/pharokka) followed by [Phynteny](https://github.com/susiegriggo/Phynteny)
- Generate a summary report file
- Generate a summary report file


## Ideas to add
add another module to map the reads to the assembled genome
- other fun things to do here
- calculate the mutations/variants per position?
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ Run the below command,
sphae install

#Install database to specific directory
sphae install --db-dir <directory>
sphae install --db_dir <directory>
```

Install the databases to a directory, `sphae/workflow/databases`
Expand Down
11 changes: 0 additions & 11 deletions misc/NOTES.md

This file was deleted.

7 changes: 6 additions & 1 deletion misc/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,9 @@ This has a list of misc scripts that can be run on the sphae output
| tail protein | 2 | 1 | 1 |
| holin | 0 | 0 | 0 |



- merging_sphae_output.py: Take a directory of RESULTS from the sphae output, to write the output to a tsv file

For example,
`python merging_sphae_output.py KlebPhages_v1.4-out/RESULTS/ test.tsv`

81 changes: 81 additions & 0 deletions misc/merging_sphae_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#!/usr/bin/env python3

import os
import pandas as pd
import argparse

# Define function to parse a summary file
# Define function to parse a summary file
def parse_summary_file(file_path):
    """Parse one sphae ``*_summary.txt`` file into a list of entry dicts.

    A file normally yields one dict per phage.  Keys are the lower-cased,
    underscore-joined labels from each ``key: value`` line; lines with no
    colon become boolean flags.  When the file reports multiple assembled
    phages, one dict is emitted per ``Sample name:`` section, each carrying
    the parent ``sample`` name.

    Bug fixes vs. the previous version:
    - the pre-header entry was appended twice when "Multiple phages" was
      seen (the dict was appended but never reset, so the first
      ``Sample name:`` line appended the same object again);
    - blank lines used to create a bogus ``''`` key set to ``True``.

    :param file_path: path to a ``*_summary.txt`` file
    :return: list of dicts, one per parsed entry
    """
    entries = []
    entry = {}
    current_sample = None  # remembered so per-phage entries inherit it
    multiple_phages = False
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue  # skip blank lines (previously polluted entries with a '' key)
            if 'Sample:' in line:
                # Extract the sample name
                current_sample = line.split(':', 1)[1].strip()
                entry['sample'] = current_sample
            elif "No contigs from the assembly were assigned viral" in line:
                # Custom handling for the special case
                entry['length'] = "No contigs from the assembly were assigned viral, likely contigs too short in size"
            elif 'Multiple phages assembled from this sample' in line:
                # Start of multiple phage entries
                multiple_phages = True
                if entry:
                    entries.append(entry)
                entry = {}  # reset so the header entry is not appended a second time
            elif line.startswith('Sample name:') and multiple_phages:
                # New phage entry within multiple phages
                if entry:
                    entries.append(entry)
                entry = {'sample': current_sample,
                         'sample_name': line.split(':', 1)[1].strip()}
            elif ':' in line:
                # Standard key-value pair
                key, value = line.split(':', 1)
                key = key.strip().lower().replace(' ', '_').replace('(', '').replace(')', '')
                entry[key] = value.strip()
            else:
                # Lines without a colon become boolean flags
                line_cleaned = line.lower().replace(' ', '_').replace('(', '').replace(')', '')
                entry[line_cleaned] = True

    # Append the last entry if any
    if entry:
        entries.append(entry)

    return entries

def main():
    """Collect sphae summary files under a base directory into one TSV.

    Scans each immediate sub-directory of ``base_dir`` for files ending in
    ``_summary.txt``, parses them with :func:`parse_summary_file`, and writes
    all entries as a single tab-separated table.
    """
    parser = argparse.ArgumentParser(description='Parse summary files and convert to a DataFrame.')
    parser.add_argument('base_dir', type=str, help='Base directory containing summary files.')
    parser.add_argument('output_file', type=str, help='Output file name for the resulting DataFrame.')
    args = parser.parse_args()

    # Accumulate parsed entries from every sample sub-directory.
    rows = []
    for entry_name in os.listdir(args.base_dir):
        sub_dir = os.path.join(args.base_dir, entry_name)
        if not os.path.isdir(sub_dir):
            continue  # skip stray files at the top level
        for candidate in os.listdir(sub_dir):
            if candidate.endswith('_summary.txt'):
                rows.extend(parse_summary_file(os.path.join(sub_dir, candidate)))

    # One row per parsed entry, written as tab-separated values.
    pd.DataFrame(rows).to_csv(args.output_file, sep='\t', index=False)

    print(f"DataFrame has been successfully written to '{args.output_file}'.")
2 changes: 1 addition & 1 deletion sphae/workflow/envs/phynteny.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@ dependencies:
- python==3.10.0
- pip
- pip:
- numpy==1.21.0
- numpy==1.26.4
- scikit-learn==1.2.2
- phynteny==0.1.13
3 changes: 2 additions & 1 deletion sphae/workflow/envs/qc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ channels:
dependencies:
- fastp>=0.23.4
- filtlong>=0.2.1
- rasusa>=2.0.0
- rasusa>=2.0.0
- seqkit >=2.6.1
2 changes: 2 additions & 0 deletions sphae/workflow/rules/10.final-reporting.smk
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ rule annotate_summary_paired:
rule summarize_paired:
input:
#this is literally here to make sure there is a file for each sample
r=os.path.join(dir_fastp, "{sample}_fastp.txt"),
assembly=os.path.join(dir_megahit, "{sample}-pr", "log"),
table= os.path.join(dir_genome, "{sample}-pr", "{sample}-genome-candidates.csv"),
genome=os.path.join(dir_genome, "{sample}-pr", "{sample}.fasta"),
Expand Down Expand Up @@ -168,6 +169,7 @@ rule annotate_summary_longreads:

rule summarize_longread:
input:
r=os.path.join(dir_nanopore, "{sample}_filt.txt"),
assembly=os.path.join(dir_flye, "{sample}-sr", "flye.log"),
table= os.path.join(dir_genome, "{sample}-sr", "{sample}-genome-candidates.csv"),
genome=os.path.join(dir_genome, "{sample}-sr", "{sample}.fasta"),
Expand Down
2 changes: 2 additions & 0 deletions sphae/workflow/rules/2.targets.smk
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@ targets['qc'] = []
if config['args']['sequencing'] == 'paired':
for sample in samples_names:
targets['qc'].append(expand(os.path.join(dir_fastp, "{sample}_subsampled_{r12}.fastq.gz"), sample=sample, r12=["R1", "R2"]))
targets['qc'].append(expand(os.path.join(dir_fastp, "{sample}_fastp.txt"), sample=sample))
elif config['args']['sequencing'] == 'longread':
for sample in samples_names:
targets['qc'].append(expand(os.path.join(dir_nanopore, "{sample}_filt.fastq.gz"), sample=sample))
targets['qc'].append(expand(os.path.join(dir_nanopore, "{sample}_filt.txt"), sample=sample))

if config['args']['sequencing'] == 'paired':
targets['assemble'].append(expand(os.path.join(dir_megahit, "{sample}-pr", "final.contigs.fa"),sample=samples_names))
Expand Down
63 changes: 62 additions & 1 deletion sphae/workflow/rules/3.qc_qa.smk
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,43 @@ rule rasusa:
touch {output.r2}
"""

# Total read length (bases) of the subsampled paired-end reads.
# Writes the R1+R2 sum_len to {sample}_fastp.txt, or 0 when R1 is empty/missing.
rule run_seqkit_short:
    input:
        r1 = os.path.join(dir_fastp,"{sample}_subsampled_R1.fastq.gz"),
        r2 = os.path.join(dir_fastp,"{sample}_subsampled_R2.fastq.gz"),
    output:
        r=os.path.join(dir_fastp, "{sample}_fastp.txt")
    params:
        r1_temp = os.path.join(dir_fastp, "{sample}_r1.txt"),
        r2_temp = os.path.join(dir_fastp, "{sample}_r2.txt"),
    conda:
        os.path.join(dir_env, "qc.yaml")
    resources:
        mem =config['resources']['smalljob']['mem'],
        time = config['resources']['smalljob']['time']
    threads:
        config['resources']['smalljob']['cpu'],
    log:
        os.path.join(dir_log, "seqkit", "{sample}_seqkit_short.log"),
    shell:
        """
        # BUG FIX: the test used the literal string "input.r1" (missing braces),
        # so the else branch always ran and the output was always 0.
        if [[ -s {input.r1} ]]; then
            seqkit stats {input.r1} -T > {params.r1_temp} 2> {log}
            seqkit stats {input.r2} -T > {params.r2_temp} 2>> {log}
            # Column 5 of `seqkit stats -T` is sum_len (total bases)
            value1=$(awk 'NR==2 {{print $5}}' {params.r1_temp})
            value2=$(awk 'NR==2 {{print $5}}' {params.r2_temp})
            # Add the values
            sum=$((value1 + value2))
            # Write the sum to output.r
            echo $sum > {output.r}
        else
            echo 0 > {output.r}
        fi
        """

rule filtlong_long:
"""
Expand All @@ -79,4 +116,28 @@ rule filtlong_long:
"""
filtlong --target_bases {params.target_bases} --min_mean_q {params.qual} --min_length {params.length} {input.fastq} | pigz > {output.fastq} 2> {log}
touch {output.fastq}
"""
"""

# Total read length (bases) of the filtered nanopore reads.
# Writes sum_len to {sample}_filt.txt, or 0 when the input is empty/missing.
rule run_seqkit_long:
    input:
        fastq=os.path.join(dir_nanopore, "{sample}_filt.fastq.gz"),
    output:
        r=os.path.join(dir_nanopore, "{sample}_filt.txt"),
    conda:
        os.path.join(dir_env, "qc.yaml")
    resources:
        cpu =config['resources']['smalljob']['cpu'],
        mem =config['resources']['smalljob']['mem'],
        time = config['resources']['smalljob']['time']
    threads:
        config['resources']['smalljob']['cpu'],
    log:
        os.path.join(dir_log, "{sample}_long_seqkit.log"),
    shell:
        """
        # BUG FIX: `echo 0` previously ran unconditionally after the if-block,
        # overwriting the seqkit result; it now runs only on empty input.
        if [[ -s {input.fastq} ]] ; then
            seqkit stats {input.fastq} -T -N 50 -N 90 2> {log} | awk 'NR==2 {{print $5}}' > {output.r}
        else
            echo 0 > {output.r}
        fi
        """
13 changes: 13 additions & 0 deletions sphae/workflow/scripts/summary-annot.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,16 @@ def copy_files(input_files, params):
shutil.copy(input_files['gbk'], params['gbks'])
shutil.copy(input_files['plot'], params['plots'])

def count_hypothetical_proteins(params):
    """Count CDS features annotated as 'hypothetical protein' in the GenBank file.

    NOTE(review): ``params`` is accepted but never used — the function reads
    the name ``input_files`` from enclosing/global scope instead. This only
    works if ``input_files`` happens to be defined at module level when this
    is called; consider passing the GenBank path explicitly. TODO confirm.
    """
    count = 0
    # SeqIO is presumably Bio.SeqIO imported elsewhere in this script — verify.
    for record in SeqIO.parse(input_files['gbk'], "genbank"):
        for feature in record.features:
            if feature.type == "CDS":
                if "product" in feature.qualifiers:
                    # NOTE(review): qualifiers["product"] appears to be a list,
                    # so this counts only exact "hypothetical protein" values,
                    # not substring matches — confirm that is the intent.
                    if "hypothetical protein" in feature.qualifiers["product"]:
                        count += 1
    return count

def generate_summary(input_files, output_summary, params):
copy_files(input_files, params)
with open(output_summary, 'w') as summary:
Expand All @@ -25,6 +35,9 @@ def generate_summary(input_files, output_summary, params):
count_value = cds_data['Count'].values[0]
summary.write(f"Number of CDS: {count_value}\n")

hypothetical_protein_count = count_hypothetical_proteins(params)
summary.write(f"Total number of CDS annotated as 'hypothetical protein': {hypothetical_protein_count}\n")

with open(input_files['cdden'], 'r') as cdden:
cdn=pd.read_csv(cdden, sep='\t')
gc_percent = cdn['gc_perc'].values[0]
Expand Down
Loading

0 comments on commit 1f948b1

Please sign in to comment.