Skip to content

Commit

Permalink
✨ Added the genome discarded after MASH step to the summary
Browse files Browse the repository at this point in the history
  • Loading branch information
rdenise committed Nov 20, 2023
1 parent aaadbb4 commit 9cdf2eb
Show file tree
Hide file tree
Showing 13 changed files with 223 additions and 152 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ nosetests.xml
coverage.xml
*,cover
.hypothesis/
.mypy_cache*

# Translations
*.mo
Expand Down
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
FROM mambaorg/micromamba:1.4.9
LABEL maintainer="Remi Denise (rdenise@ucc.ie)"
LABEL version="0.2.3"
LABEL version="0.2.4"

RUN micromamba install -y -n base -c conda-forge -c bioconda taxmyphage==0.2.3 && \
RUN micromamba install -y -n base -c conda-forge -c bioconda taxmyphage==0.2.4 && \
micromamba clean --all --yes
WORKDIR /app
ENTRYPOINT ["/usr/local/bin/_entrypoint.sh", "taxmyphage"]
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@ exclude = [
"maybe*",
"*old*",
"*data*",
".DS_Store"
".DS_Store",
"taxmyphage/bin*",
]

[tool.poetry.dependencies]
Expand Down
5 changes: 5 additions & 0 deletions taxmyphage/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
"""
This module is used to set the version of the package.
It uses the version information from the package metadata.
"""

from importlib.metadata import version

__version__ = version(__package__)
45 changes: 23 additions & 22 deletions taxmyphage/__main__.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
#!/usr/bin/env python3

############################################################################################################
"""
This module is used for classifying bacteriophages based on their genomes.
It provides functionalities for checking and installing necessary databases,
handling files, and performing classifications.
"""

###################################################################################################
# Imports
############################################################################################################
###################################################################################################


import os, sys
import os
from icecream import ic
from tqdm import tqdm
from Bio import SeqIO
import time
from datetime import timedelta

from taxmyphage import cli
from taxmyphage.download_check import (
Expand All @@ -19,21 +21,20 @@
check_VMR,
install_db,
)
from taxmyphage.utils import create_folder, print_ok, CheckSoftware
from taxmyphage.handle_files import create_files_and_result_paths, read_VMR
from taxmyphage.utils import create_folder, CheckSoftware
from taxmyphage.handle_files import read_VMR
from taxmyphage.actions import all_classification, viridic
from taxmyphage.classify import (
classification_mash,
classification_viridic,
classification,
)

############################################################################################################

###################################################################################################
# Functions
############################################################################################################
###################################################################################################


def main():
"""
This is the main function of the script
"""
# Set up the arguments
args, nargs = cli.cli()

Expand All @@ -44,7 +45,7 @@ def main():
ic.disable()

# this is the location of where the script and the databases are stored
VMR_path = os.path.join(args.db_folder, "VMR.xlsx")
vmr_path = os.path.join(args.db_folder, "VMR.xlsx")
blastdb_path = os.path.join(args.db_folder, "Bacteriophage_genomes.fasta")
mash_index_path = os.path.join(args.db_folder, "ICTV_2023.msh")

Expand All @@ -53,7 +54,7 @@ def main():
CheckSoftware(args.makeblastdb)

install_db(
VMR_path=VMR_path,
VMR_path=vmr_path,
blastdb_path=blastdb_path,
mash_index_path=mash_index_path,
output=args.db_folder,
Expand Down Expand Up @@ -91,10 +92,10 @@ def main():
mash_dist = args.dist

# Check if the VMR file exists
check_VMR(VMR_path)
check_VMR(vmr_path)

# read in the VMR data
taxa_df = read_VMR(VMR_path=VMR_path)
taxa_df = read_VMR(VMR_path=vmr_path)

# Check if the mash index exists
check_mash_index(mash_index_path)
Expand Down Expand Up @@ -126,9 +127,9 @@ def main():
)


############################################################################################################
###################################################################################################

if __name__ == "__main__":
main()

############################################################################################################
###################################################################################################
58 changes: 38 additions & 20 deletions taxmyphage/actions.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,23 @@
############################################################################################################
"""
This module provides functionalities for classifying bacteriophages based on their genomes.
It includes functions for running all classification methods, generating heatmaps, and handling
files.
"""

####################################################################################################
# Imports
############################################################################################################
####################################################################################################


import os, sys
import os
import sys
import time
from datetime import timedelta
from argparse import Namespace
from icecream import ic
from tqdm import tqdm
from Bio import SeqIO
import time
from datetime import timedelta
import pandas as pd
from argparse import Namespace

from taxmyphage.utils import print_error, print_ok, create_folder, print_warn
from taxmyphage.pmv import PoorMansViridic
Expand All @@ -23,9 +30,9 @@
)


############################################################################################################
####################################################################################################
# Functions
############################################################################################################
####################################################################################################


def all_classification(
Expand Down Expand Up @@ -104,7 +111,7 @@ def all_classification(
query = os.path.join(results_path, "query.fasta")

# create a fasta file with just the query genome and add query_ to the id
with open(query, "w") as output_fid:
with open(query, "w", encoding="utf-8") as output_fid:
genome.name = genome.description = ""
genome.id = f"query_{genome_id}"
SeqIO.write(genome, output_fid, "fasta")
Expand All @@ -131,6 +138,17 @@ def all_classification(
)

if mash_df.empty:
dict_taxonomy[genome_id] = {
"Realm": "Unknown",
"Kingdom": "Unknown",
"Phylum": "Unknown",
"Class": "Unknown",
"Order": "Unknown",
"Family": "Unknown",
"Subfamily": "Unknown",
"Genus": "New_genus",
"Species": "New_species",
}
continue

merged_df, copy_merged_df = classification_viridic(
Expand All @@ -155,16 +173,16 @@ def all_classification(
prefix=args.prefix,
)

dict_taxonomy[genome.id] = genome_taxo
dict_taxonomy[genome_id] = genome_taxo

run_time = str(timedelta(seconds=time.time() - timer_start))
print(f"Run time for {genome.id}: {run_time}\n", file=sys.stderr)
print(f"Run time for {genome_id}: {run_time}\n", file=sys.stderr)
print("-" * 80, file=sys.stderr)

# write the taxonomy to a csv file
taxonomy_tsv = os.path.join(args.output, "Summary_taxonomy.tsv")

with open(taxonomy_tsv, "w") as output_fid:
with open(taxonomy_tsv, "w", encoding="utf-8") as output_fid:
output_fid.write(
"Genome\tRealm\tKingdom\tPhylum\tClass\tOrder\tFamily\tSubfamily\tGenus\tSpecies\tFull_taxonomy\n"
)
Expand Down Expand Up @@ -214,7 +232,7 @@ def all_classification(
return


############################################################################################################
####################################################################################################


def viridic(args: Namespace, threads: int, verbose: bool) -> None:
Expand All @@ -240,7 +258,7 @@ def viridic(args: Namespace, threads: int, verbose: bool) -> None:

reference = os.path.join(args.output, "reference_pmv.fasta")

with open(reference, "wt") as f:
with open(reference, "wt", encoding="utf-8") as f:
read_write_fasta(args.reference, f)

# Not possible if matrix not square at the moment
Expand All @@ -261,7 +279,7 @@ def viridic(args: Namespace, threads: int, verbose: bool) -> None:
print_ok(f"\nCalculating PoorManVIRIDIC in {results_path}...")

# run VIRIDIC
PMV = PoorMansViridic(
pmv = PoorMansViridic(
file=tmp_fasta,
reference=reference,
nthreads=threads,
Expand All @@ -270,19 +288,19 @@ def viridic(args: Namespace, threads: int, verbose: bool) -> None:
makeblastdb_exe=args.makeblastdb,
)

dfT, pmv_outfile = PMV.run()
dfT, pmv_outfile = pmv.run()

# heatmap and distances
if args.Figure:
print_ok("\nWill calculate and save heatmaps now\n")
accession_genus_dict = {name: "" for name in PMV.dfM.A.unique()}
heatmap(PMV.dfM, heatmap_file, top_right_matrix, accession_genus_dict)
accession_genus_dict = {name: "" for name in pmv.dfM.A.unique()}
heatmap(pmv.dfM, heatmap_file, top_right_matrix, accession_genus_dict)
else:
print_error("\nSkipping calculating heatmaps and saving them\n ")

PMV.save_similarities(similarities_file)
pmv.save_similarities(similarities_file)

df = PMV.dfM
df = pmv.dfM
df = df[df.A != df.B].set_index(["A", "B"])

# print(df.to_csv(sep='\t', float_format='%.4f'))
Expand Down
37 changes: 22 additions & 15 deletions taxmyphage/classify.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,21 @@
"""
This module provides functionalities for classifying bacteriophages based on their genomes.
It includes functions for running mash, parsing mash output, running BLAST, parsing BLAST output,
and performing the classification.
"""

####################################################################################################
# IMPORTS
####################################################################################################

import io
import os
import subprocess
import sys
from icecream import ic
import pandas as pd
from Bio import SeqIO
from textwrap import dedent
from typing import Tuple, Dict
import pandas as pd
from Bio import SeqIO
from icecream import ic

from taxmyphage.pmv import PoorMansViridic
from taxmyphage.plot import heatmap
Expand Down Expand Up @@ -105,7 +110,7 @@ def classification_mash(

number_hits = mash_df.shape[0]

# get the number of genomes wih mash distance < 0.2
# get the number of genomes wih mash distance < args.dist
if number_hits < 1:
print_error(
dedent(
Expand All @@ -117,14 +122,16 @@ def classification_mash(
)
)

os.system(f"touch {taxa_csv_output_path}")
with open(taxa_csv_output_path, "w", encoding="utf-8") as wt:
wt.write("No hits were found with the default settings\n")

return pd.DataFrame(), accession_genus_dict
else:
print_res(
dedent(
f"""
Number of phage genomes detected with mash distance of < {dist} is:{number_hits}
"""
Number of phage genomes detected with mash distance of < {dist} is:{number_hits}
"""
)
)

Expand Down Expand Up @@ -312,28 +319,28 @@ def classification_viridic(
SeqIO.write(SeqIO.parse(file, "fasta"), merged_file, "fasta")

# run VIRIDIC
PMV = PoorMansViridic(
pmv = PoorMansViridic(
file = viridic_in_path,
reference = viridic_in_path,
nthreads=threads,
verbose=verbose,
blastn_exe=blastn_exe,
makeblastdb_exe=makeblastdb_exe,
)
df1, pmv_outfile = PMV.run()
df1, pmv_outfile = pmv.run()

ic(df1)
ic(pmv_outfile)
ic(PMV.dfM)
ic(pmv.dfM)

# heatmap and distances
if Figure:
print_ok("\nWill calculate and save heatmaps now\n")
heatmap(PMV.dfM, heatmap_file, top_right_matrix, accession_genus_dict)
heatmap(pmv.dfM, heatmap_file, top_right_matrix, accession_genus_dict)
else:
print_error("\nSkipping calculating heatmaps and saving them\n ")

PMV.save_similarities(similarities_file)
pmv.save_similarities(similarities_file)

# merge the ICTV dataframe with the results of viridic
# fill in missing with Not Defined yet
Expand Down Expand Up @@ -395,8 +402,8 @@ def new_genus(
with open(summary_output_path, "w") as file:
file.write(
f"""Try running again with if you larger distance if you want a Figure.
The query is both a new genus and species\n
New genus\tNew species\n"""
The query is both a new genus and species\n
New genus\tNew species\n"""
)

return {
Expand Down
Loading

0 comments on commit 9cdf2eb

Please sign in to comment.