Skip to content

Commit

Permalink
a bunch of changes noted in Changes.md
Browse files Browse the repository at this point in the history
  • Loading branch information
npbhavya committed Jul 25, 2024
1 parent 75f555a commit 1f948b1
Show file tree
Hide file tree
Showing 18 changed files with 212 additions and 1,425 deletions.
13 changes: 11 additions & 2 deletions Changes.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@
- Updating the QC rule to touch the output file so the error is correctly recorded
- the annotated function table after the 3Ps weren't being generated, so added that in
- add the number of unknown proteins to the final summary file

- updated phynteny yaml file to pin the numpy version
- added a python script to the misc folder to merge the RESULTS to one tsv file
- adding the total read length to summary file
- updating the taxa description in the summary file to include the lowest taxa and the isolated host from the Pharokka INPHARED result


## v1.4.2
Expand All @@ -35,4 +38,10 @@ Sphae can be run on illumina reads and nanopore reads
- Assembly - [Megahit](https://github.com/voutcn/megahit) for Illumina reads and [Flye](https://github.com/fenderglass/Flye) + [Medaka](https://github.com/nanoporetech/medaka) for Nanopore reads
Post assembly the reads are run through [CheckV](https://bitbucket.org/berkeleylab/CheckV/src), [viraverify](https://github.com/ablab/viralVerify) and looking into graph connection to confirm the genome is assembled
- Annotation - [Pharokka](https://github.com/gbouras13/pharokka) followed by [Phynteny](https://github.com/susiegriggo/Phynteny)
- Generate a summary report file
- Generate a summary report file


## Ideas to add
add another module to map the reads to the assembled genome
- other fun things to do here
- calculate the mutations/variants per position?
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ Run the below command,
sphae install

#Install database to specific directory
sphae install --db-dir <directory>
sphae install --db_dir <directory>
```

Install the databases to a directory, `sphae/workflow/databases`
Expand Down
11 changes: 0 additions & 11 deletions misc/NOTES.md

This file was deleted.

7 changes: 6 additions & 1 deletion misc/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,9 @@ This has a list of misc scripts that can be run on the sphae output
| tail protein | 2 | 1 | 1 |
| holin | 0 | 0 | 0 |



- merging_sphae_output.py: Take a directory of RESULTS from the sphae output, to write the output to a tsv file

For example,
`python merging_sphae_output.py KlebPhages_v1.4-out/RESULTS/ test.tsv`

81 changes: 81 additions & 0 deletions misc/merging_sphae_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
#!/usr/bin/env python3

import os
import pandas as pd
import argparse

# Define function to parse a summary file
# Define function to parse a summary file
def parse_summary_file(file_path):
    """Parse one sphae ``*_summary.txt`` file into a list of entry dicts.

    A file normally yields one dict per phage.  Keys are the lower-cased,
    underscore-joined labels from each ``key: value`` line; lines with no
    colon become boolean flags.  When the file reports multiple assembled
    phages, one dict is emitted per ``Sample name:`` section, each carrying
    the parent ``sample`` name.

    Bug fixes vs. the previous version:
    - the pre-header entry was appended twice when "Multiple phages" was
      seen (the dict was appended but never reset, so the first
      ``Sample name:`` line appended the same object again);
    - blank lines used to create a bogus ``''`` key set to ``True``.

    :param file_path: path to a ``*_summary.txt`` file
    :return: list of dicts, one per parsed entry
    """
    entries = []
    entry = {}
    current_sample = None  # remembered so per-phage entries inherit it
    multiple_phages = False
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                continue  # skip blank lines (previously polluted entries with a '' key)
            if 'Sample:' in line:
                # Extract the sample name
                current_sample = line.split(':', 1)[1].strip()
                entry['sample'] = current_sample
            elif "No contigs from the assembly were assigned viral" in line:
                # Custom handling for the special case
                entry['length'] = "No contigs from the assembly were assigned viral, likely contigs too short in size"
            elif 'Multiple phages assembled from this sample' in line:
                # Start of multiple phage entries
                multiple_phages = True
                if entry:
                    entries.append(entry)
                entry = {}  # reset so the header entry is not appended a second time
            elif line.startswith('Sample name:') and multiple_phages:
                # New phage entry within multiple phages
                if entry:
                    entries.append(entry)
                entry = {'sample': current_sample,
                         'sample_name': line.split(':', 1)[1].strip()}
            elif ':' in line:
                # Standard key-value pair
                key, value = line.split(':', 1)
                key = key.strip().lower().replace(' ', '_').replace('(', '').replace(')', '')
                entry[key] = value.strip()
            else:
                # Lines without a colon become boolean flags
                line_cleaned = line.lower().replace(' ', '_').replace('(', '').replace(')', '')
                entry[line_cleaned] = True

    # Append the last entry if any
    if entry:
        entries.append(entry)

    return entries

def main():
    """Collect sphae summary files under a base directory into one TSV.

    Scans each immediate sub-directory of ``base_dir`` for files ending in
    ``_summary.txt``, parses them with :func:`parse_summary_file`, and writes
    all entries as a single tab-separated table.
    """
    parser = argparse.ArgumentParser(description='Parse summary files and convert to a DataFrame.')
    parser.add_argument('base_dir', type=str, help='Base directory containing summary files.')
    parser.add_argument('output_file', type=str, help='Output file name for the resulting DataFrame.')
    args = parser.parse_args()

    # Accumulate parsed entries from every sample sub-directory.
    rows = []
    for entry_name in os.listdir(args.base_dir):
        sub_dir = os.path.join(args.base_dir, entry_name)
        if not os.path.isdir(sub_dir):
            continue  # skip stray files at the top level
        for candidate in os.listdir(sub_dir):
            if candidate.endswith('_summary.txt'):
                rows.extend(parse_summary_file(os.path.join(sub_dir, candidate)))

    # One row per parsed entry, written as tab-separated values.
    pd.DataFrame(rows).to_csv(args.output_file, sep='\t', index=False)

    print(f"DataFrame has been successfully written to '{args.output_file}'.")
2 changes: 1 addition & 1 deletion sphae/workflow/envs/phynteny.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@ dependencies:
- python==3.10.0
- pip
- pip:
- numpy==1.21.0
- numpy==1.26.4
- scikit-learn==1.2.2
- phynteny==0.1.13
3 changes: 2 additions & 1 deletion sphae/workflow/envs/qc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ channels:
dependencies:
- fastp>=0.23.4
- filtlong>=0.2.1
- rasusa>=2.0.0
- rasusa>=2.0.0
- seqkit >=2.6.1
2 changes: 2 additions & 0 deletions sphae/workflow/rules/10.final-reporting.smk
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ rule annotate_summary_paired:
rule summarize_paired:
input:
#this is literally here to make sure there is a file for each sample
r=os.path.join(dir_fastp, "{sample}_fastp.txt"),
assembly=os.path.join(dir_megahit, "{sample}-pr", "log"),
table= os.path.join(dir_genome, "{sample}-pr", "{sample}-genome-candidates.csv"),
genome=os.path.join(dir_genome, "{sample}-pr", "{sample}.fasta"),
Expand Down Expand Up @@ -168,6 +169,7 @@ rule annotate_summary_longreads:

rule summarize_longread:
input:
r=os.path.join(dir_nanopore, "{sample}_filt.txt"),
assembly=os.path.join(dir_flye, "{sample}-sr", "flye.log"),
table= os.path.join(dir_genome, "{sample}-sr", "{sample}-genome-candidates.csv"),
genome=os.path.join(dir_genome, "{sample}-sr", "{sample}.fasta"),
Expand Down
2 changes: 2 additions & 0 deletions sphae/workflow/rules/2.targets.smk
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@ targets['qc'] = []
if config['args']['sequencing'] == 'paired':
for sample in samples_names:
targets['qc'].append(expand(os.path.join(dir_fastp, "{sample}_subsampled_{r12}.fastq.gz"), sample=sample, r12=["R1", "R2"]))
targets['qc'].append(expand(os.path.join(dir_fastp, "{sample}_fastp.txt"), sample=sample))
elif config['args']['sequencing'] == 'longread':
for sample in samples_names:
targets['qc'].append(expand(os.path.join(dir_nanopore, "{sample}_filt.fastq.gz"), sample=sample))
targets['qc'].append(expand(os.path.join(dir_nanopore, "{sample}_filt.txt"), sample=sample))

if config['args']['sequencing'] == 'paired':
targets['assemble'].append(expand(os.path.join(dir_megahit, "{sample}-pr", "final.contigs.fa"),sample=samples_names))
Expand Down
63 changes: 62 additions & 1 deletion sphae/workflow/rules/3.qc_qa.smk
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,43 @@ rule rasusa:
touch {output.r2}
"""

# Total read length (bases) of the subsampled paired-end reads.
# Writes the R1+R2 sum_len to {sample}_fastp.txt, or 0 when R1 is empty/missing.
rule run_seqkit_short:
    input:
        r1 = os.path.join(dir_fastp,"{sample}_subsampled_R1.fastq.gz"),
        r2 = os.path.join(dir_fastp,"{sample}_subsampled_R2.fastq.gz"),
    output:
        r=os.path.join(dir_fastp, "{sample}_fastp.txt")
    params:
        r1_temp = os.path.join(dir_fastp, "{sample}_r1.txt"),
        r2_temp = os.path.join(dir_fastp, "{sample}_r2.txt"),
    conda:
        os.path.join(dir_env, "qc.yaml")
    resources:
        mem =config['resources']['smalljob']['mem'],
        time = config['resources']['smalljob']['time']
    threads:
        config['resources']['smalljob']['cpu'],
    log:
        os.path.join(dir_log, "seqkit", "{sample}_seqkit_short.log"),
    shell:
        """
        # BUG FIX: the test used the literal string "input.r1" (missing braces),
        # so the else branch always ran and the output was always 0.
        if [[ -s {input.r1} ]]; then
            seqkit stats {input.r1} -T > {params.r1_temp} 2> {log}
            seqkit stats {input.r2} -T > {params.r2_temp} 2>> {log}
            # Column 5 of `seqkit stats -T` is sum_len (total bases)
            value1=$(awk 'NR==2 {{print $5}}' {params.r1_temp})
            value2=$(awk 'NR==2 {{print $5}}' {params.r2_temp})
            # Add the values
            sum=$((value1 + value2))
            # Write the sum to output.r
            echo $sum > {output.r}
        else
            echo 0 > {output.r}
        fi
        """

rule filtlong_long:
"""
Expand All @@ -79,4 +116,28 @@ rule filtlong_long:
"""
filtlong --target_bases {params.target_bases} --min_mean_q {params.qual} --min_length {params.length} {input.fastq} | pigz > {output.fastq} 2> {log}
touch {output.fastq}
"""
"""

# Total read length (bases) of the filtered nanopore reads.
# Writes sum_len to {sample}_filt.txt, or 0 when the input is empty/missing.
rule run_seqkit_long:
    input:
        fastq=os.path.join(dir_nanopore, "{sample}_filt.fastq.gz"),
    output:
        r=os.path.join(dir_nanopore, "{sample}_filt.txt"),
    conda:
        os.path.join(dir_env, "qc.yaml")
    resources:
        cpu =config['resources']['smalljob']['cpu'],
        mem =config['resources']['smalljob']['mem'],
        time = config['resources']['smalljob']['time']
    threads:
        config['resources']['smalljob']['cpu'],
    log:
        os.path.join(dir_log, "{sample}_long_seqkit.log"),
    shell:
        """
        # BUG FIX: `echo 0` previously ran unconditionally after the if-block,
        # overwriting the seqkit result; it now runs only on empty input.
        if [[ -s {input.fastq} ]] ; then
            seqkit stats {input.fastq} -T -N 50 -N 90 2> {log} | awk 'NR==2 {{print $5}}' > {output.r}
        else
            echo 0 > {output.r}
        fi
        """
13 changes: 13 additions & 0 deletions sphae/workflow/scripts/summary-annot.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,16 @@ def copy_files(input_files, params):
shutil.copy(input_files['gbk'], params['gbks'])
shutil.copy(input_files['plot'], params['plots'])

def count_hypothetical_proteins(params):
    """Count CDS features annotated as 'hypothetical protein' in the GenBank file.

    NOTE(review): ``params`` is accepted but never used — the function reads
    the name ``input_files`` from enclosing/global scope instead. This only
    works if ``input_files`` happens to be defined at module level when this
    is called; consider passing the GenBank path explicitly. TODO confirm.
    """
    count = 0
    # SeqIO is presumably Bio.SeqIO imported elsewhere in this script — verify.
    for record in SeqIO.parse(input_files['gbk'], "genbank"):
        for feature in record.features:
            if feature.type == "CDS":
                if "product" in feature.qualifiers:
                    # NOTE(review): qualifiers["product"] appears to be a list,
                    # so this counts only exact "hypothetical protein" values,
                    # not substring matches — confirm that is the intent.
                    if "hypothetical protein" in feature.qualifiers["product"]:
                        count += 1
    return count

def generate_summary(input_files, output_summary, params):
copy_files(input_files, params)
with open(output_summary, 'w') as summary:
Expand All @@ -25,6 +35,9 @@ def generate_summary(input_files, output_summary, params):
count_value = cds_data['Count'].values[0]
summary.write(f"Number of CDS: {count_value}\n")

hypothetical_protein_count = count_hypothetical_proteins(params)
summary.write(f"Total number of CDS annotated as 'hypothetical protein': {hypothetical_protein_count}\n")

with open(input_files['cdden'], 'r') as cdden:
cdn=pd.read_csv(cdden, sep='\t')
gc_percent = cdn['gc_perc'].values[0]
Expand Down
Loading

0 comments on commit 1f948b1

Please sign in to comment.