diff --git a/HISTORY.md b/HISTORY.md index ef5a88e..08ba3bf 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,12 @@ History ======= +1.7.0 (2024-01-17) +------------------ + +* Adds `pharokka_multiplotter.py` to plot multiple phage contigs at once +* Adds separate contig FASTA files if `-s -m` is run (in `single_fastas`) + 1.6.1 (2024-01-17) ------------------ diff --git a/README.md b/README.md index d6c3db3..9d3393b 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,7 @@ If you are looking for rapid standardised annotation of bacterial genomes, pleas - [Paper](#paper) - [Pharokka with Galaxy Europe Webserver](#pharokka-with-galaxy-europe-webserver) - [Brief Overview](#brief-overview) + - [Pharokka v 1.7.0 Update](#pharokka-v-170-update) - [Pharokka v 1.6.0 Update (11 January 2024)](#pharokka-v-160-update-11-january-2024) - [Pharokka v 1.5.0 Update (20 September 2023)](#pharokka-v-150-update-20-september-2023) - [Pharokka v 1.4.0 Update (27 August 2023)](#pharokka-v-140-update-27-august-2023) @@ -97,6 +98,18 @@ So if you can't get `pharokka` to install on your machine for whatever reason or `pharokka` uses [PHANOTATE](https://github.com/deprekate/PHANOTATE), the only gene prediction program tailored to bacteriophages, as the default program for gene prediction. [Prodigal](https://github.com/hyattpd/Prodigal) implemented with [pyrodigal](https://github.com/althonos/pyrodigal) and [Prodigal-gv](https://github.com/apcamargo/prodigal-gv) implemented with [pyrodigal-gv](https://github.com/althonos/pyrodigal-gv) are also available as alternatives. Following this, functional annotations are assigned by matching each predicted coding sequence (CDS) to the [PHROGs](https://phrogs.lmge.uca.fr), [CARD](https://card.mcmaster.ca) and [VFDB](http://www.mgc.ac.cn/VFs/main.htm) databases using [MMseqs2](https://github.com/soedinglab/MMseqs2). As of v1.4.0, `pharokka` will also match each CDS to the PHROGs database using more sensitive Hidden Markov Models using [PyHMMER](https://github.com/althonos/pyhmmer). Pharokka's main output is a GFF file suitable for using in downstream pangenomic pipelines like [Roary](https://sanger-pathogens.github.io/Roary/). `pharokka` also generates a `cds_functions.tsv` file, which includes counts of CDSs, tRNAs, tmRNAs, CRISPRs and functions assigned to CDSs according to the PHROGs database. See the full [usage](#usage) and check out the full [documentation](https://pharokka.readthedocs.io) for more details. +## Pharokka v 1.7.0 Update + +You can run `pharokka_multiplotter.py` to plot as many phage(s) as you want. + +It requires the `pharokka` output Genbank file (here, `pharokka.gbk`). It will save plots for each contig in the output directory (here `pharokka_plots_output_directory`). + +e.g. + +``` +pharokka_multiplotter.py -g pharokka.gbk -o pharokka_plots_output_directory +``` + ## Pharokka v 1.6.0 Update (11 January 2024) * Fixes a variety of bugs (#300 `pharokka_proteins.py` crashing if it found VFDB hits, #303 errors in the `.tbl` format, #316 errors with types and where custom HMM dbs had identical scored hits, #317 types and #320 deprecated GC function) @@ -160,7 +173,6 @@ SAOMS1 phage (GenBank: MW460250.1) was isolated and sequenced by: Yerushalmy, O. Please see [plotting](docs/plotting.md) for details on all plotting parameter options. - # Installation ## Conda Installation diff --git a/bin/pharokka.py b/bin/pharokka.py index 03f4a0c..335d42f 100755 --- a/bin/pharokka.py +++ b/bin/pharokka.py @@ -11,42 +11,21 @@ from custom_db import run_custom_pyhmmer from databases import check_db_installation from hmm import run_pyhmmer -from input_commands import ( - check_dependencies, - get_input, - instantiate_dirs, - instantiate_split_output, - validate_and_extract_genbank, - validate_custom_hmm, - validate_fasta, - validate_gene_predictor, - validate_meta, - validate_terminase, - validate_threads, -) +from input_commands import (check_dependencies, get_input, instantiate_dirs, + instantiate_split_output, + validate_and_extract_genbank, validate_custom_hmm, + validate_fasta, validate_gene_predictor, + validate_meta, validate_terminase, + validate_threads) from loguru import logger from post_processing import Pharok, remove_post_processing_files -from processes import ( - concat_phanotate_meta, - concat_trnascan_meta, - convert_gff_to_gbk, - reorient_terminase, - run_aragorn, - run_dnaapler, - run_mash_dist, - run_mash_sketch, - run_minced, - run_mmseqs, - run_phanotate, - run_phanotate_fasta_meta, - run_phanotate_txt_meta, - run_pyrodigal, - run_pyrodigal_gv, - run_trna_scan, - run_trnascan_meta, - split_input_fasta, - translate_fastas, -) +from processes import (concat_phanotate_meta, concat_trnascan_meta, + convert_gff_to_gbk, reorient_terminase, run_aragorn, + run_dnaapler, run_mash_dist, run_mash_sketch, + run_minced, run_mmseqs, run_phanotate, + run_phanotate_fasta_meta, run_phanotate_txt_meta, + run_pyrodigal, run_pyrodigal_gv, run_trna_scan, + run_trnascan_meta, split_input_fasta, translate_fastas) from util import count_contigs, get_version # add this to make sure of deprecation warning with biopython @@ -455,6 +434,11 @@ def main(): pharok.update_fasta_headers() pharok.update_final_output() + # output single gffs in meta mode + if args.split == True and args.meta == True: + # splits the faa into single .faa + pharok.split_faas_singles() + # extract terL pharok.extract_terl() diff --git a/bin/pharokka_multiplotter.py b/bin/pharokka_multiplotter.py new file mode 100755 index 0000000..7d7ec04 --- /dev/null +++ b/bin/pharokka_multiplotter.py @@ -0,0 +1,264 @@ +#!/usr/bin/env python3 +import argparse +import os +import shutil +import sys +from argparse import RawTextHelpFormatter +from pathlib import Path + +from loguru import logger +from plot import create_single_plot +from pycirclize.parser import Genbank +from util import get_version + + +def get_input(): + """gets input for pharokka_plotter.py + :return: args + """ + parser = argparse.ArgumentParser( + description="pharokka_multiplotter.py: pharokka plotting function for muliple phages", + formatter_class=RawTextHelpFormatter, + ) + parser.add_argument( + "-g", + "--genbank", + action="store", + required=True, + help="Input genbank file from Pharokka.", + ) + parser.add_argument( + "-o", + "--outdir", + action="store", + help="Pharokka output directory.", + required=True, + ) + parser.add_argument( + "-f", "--force", help="Overwrites the output plot file.", action="store_true" + ) + parser.add_argument( + "--label_hypotheticals", + help="Flag to label hypothetical or unknown proteins. By default these are not labelled.", + action="store_true", + ) + parser.add_argument( + "--remove_other_features_labels", + help="Flag to remove labels for tRNA/tmRNA/CRISPRs. By default these are labelled. \nThey will still be plotted in black.", + action="store_true", + ) + parser.add_argument( + "--title_size", + action="store", + default="20", + help="Controls title size. Must be an integer. Defaults to 20.", + ) + parser.add_argument( + "--label_size", + action="store", + default="8", + help="Controls annotation label size. Must be an integer. Defaults to 8.", + ) + parser.add_argument( + "--interval", + action="store", + default="5000", + help="Axis tick interval. Must be an integer. Must be an integer. Defaults to 5000.", + ) + parser.add_argument( + "--truncate", + action="store", + default="20", + help="Number of characters to include in annoation labels before truncation with ellipsis. \nMust be an integer. Defaults to 20.", + ) + parser.add_argument( + "--dpi", + action="store", + default="600", + help="Resultion (dots per inch). Must be an integer. Defaults to 600.", + ) + parser.add_argument( + "--annotations", + action="store", + default="1", + help="Controls the proporition of annotations labelled. Must be a number between 0 and 1 inclusive. \n0 = no annotations, 0.5 = half of the annotations, 1 = all annotations. \nDefaults to 1. Chosen in order of CDS size.", + ) + parser.add_argument( + "-t", "--plot_title", action="store", default="Phage", help="Plot name." + ) + parser.add_argument( + "--label_ids", + action="store", + default="", + help="Text file with list of CDS IDs (from gff file) that are guaranteed to be labelled.", + ) + args = parser.parse_args() + return args + + +if __name__ == "__main__": + args = get_input() + # preamble + logger.add(lambda _: sys.exit(1), level="ERROR") + logger.info(f"Starting Pharokka v{get_version()}") + logger.info("Running pharokka_multiplotter.py to plot your phages.") + logger.info("Command executed: {}", args) + logger.info("Repository homepage is https://github.com/gbouras13/pharokka") + logger.info("Written by George Bouras: george.bouras@adelaide.edu.au") + logger.info("Checking your inputs.") + + try: + int(args.interval) + except: + logger.error( + f"--interval {args.interval} specified is not an integer. Please check your input and try again." + ) + + try: + int(args.label_size) + except: + logger.error( + f"--label_size {args.label_size} specified is not an integer. Please check your input and try again." + ) + + try: + int(args.title_size) + except: + logger.error( + f"--title_size {args.title_size} specified is not an integer. Please check your input and try again." + ) + + try: + int(args.dpi) + except: + logger.error( + f"--dpi {args.dpi} specified is not an integer. Please check your input and try again." + ) + + try: + float(args.annotations) + except: + logger.error( + f"--annotations {args.annotations} specified is not a float. Please check your input and try again." + ) + + if args.force == True: + if os.path.exists(args.outdir) == True: + logger.info(f"Removing {args.outdir} as --force was specified.") + shutil.rmtree(args.outdir) + else: + logger.warning( + f"--force was specified even though the output plot directory {args.outdir} does not already exist." + ) + logger.warning("Continuing") + else: + if os.path.exists(args.outdir) == True: + logger.error( + f"Output directoey {args.outdir} already exists and force was not specified. Please specify -f or --force to overwrite the output directory." + ) + + # instantiate outdir + if Path(args.outdir).exists() is False: + Path(args.outdir).mkdir(parents=True, exist_ok=True) + + # check label_ids + + # list of all IDs that need to be labelled from file + label_force_list = [] + + if args.label_ids != "": + logger.info( + f"You have specified a file {args.label_ids} containing a list of CDS IDs to force label." + ) + # check if it is a file + if os.path.isfile(args.label_ids) == False: + logger.error(f"{args.label_ids} was not found.") + # check if it contains text + try: + # Open the file in read mode + with open(Path(args.label_ids), "r") as file: + # Read the first character + first_char = file.read(1) + + # read all the labels + with open(Path(args.label_ids)) as f: + ignore_dict = {x.rstrip().split()[0] for x in f} + # label force list + label_force_list = list(ignore_dict) + + except FileNotFoundError: + logger.warning( + f"{args.label_id} contains no text. No contigs will be ignored" + ) + + logger.info("All files checked.") + + # single threaded plots + threads = 1 + + gbk = Genbank(args.genbank) + + # gets all contigs and seqs + gb_seq_dict = gbk.get_seqid2seq() + + gb_size_dict = gbk.get_seqid2size() + + contig_count = len(gb_seq_dict) + + # gets all features - will get all regardless of type (tRNA etc from pharokka) + gb_feature_dict = gbk.get_seqid2features() + + # check label_ids + # list of all IDs that need to be labelled from file + label_force_list = [] + + label_ids = args.label_ids + + if label_ids != "": + logger.info( + f"You have specified a file {label_ids} containing a list of CDS IDs to force label." + ) + # check if it is a file + if Path(label_ids).exists() is False: + logger.error(f"{label_ids} was not found.") + # check if it contains text + try: + # Open the file in read mode + with open(Path(label_ids), "r") as file: + # Read the first character + # will error if file is empty + first_char = file.read(1) + + # read all the labels + with open(Path(label_ids)) as f: + ignore_dict = {x.rstrip().split()[0] for x in f} + # label force list + label_force_list = list(ignore_dict) + + except FileNotFoundError: + logger.warning(f"{label_ids} contains no text. No contigs will be ignored") + + # if there is 1 contig, then all the parameters will apply + + for contig_id, contig_sequence in gb_seq_dict.items(): + logger.info(f"Plotting {contig_id}") + + create_single_plot( + contig_id, + contig_sequence, + contig_count, + gb_size_dict, + gb_feature_dict, + gbk, + int(args.interval), + float(args.annotations), + int(args.title_size), + args.plot_title, + int(args.truncate), + args.outdir, + int(args.dpi), + float(args.label_size), + args.label_hypotheticals, + args.remove_other_features_labels, + label_force_list, + ) diff --git a/bin/pharokka_proteins.py b/bin/pharokka_proteins.py index 11e4f30..ee5937c 100755 --- a/bin/pharokka_proteins.py +++ b/bin/pharokka_proteins.py @@ -6,20 +6,12 @@ from pathlib import Path from databases import check_db_installation -from input_commands import ( - check_dependencies, - instantiate_dirs, - validate_fasta, - validate_threads, -) +from input_commands import (check_dependencies, instantiate_dirs, + validate_fasta, validate_threads) from loguru import logger from post_processing import remove_directory, remove_file -from proteins import ( - Pharok_Prot, - get_input_proteins, - run_mmseqs_proteins, - run_pyhmmer_proteins, -) +from proteins import (Pharok_Prot, get_input_proteins, run_mmseqs_proteins, + run_pyhmmer_proteins) from util import get_version diff --git a/bin/plot.py b/bin/plot.py index f3425c6..e72f295 100644 --- a/bin/plot.py +++ b/bin/plot.py @@ -1,4 +1,10 @@ +from pathlib import Path +from typing import Dict, List + import numpy as np +from Bio import SeqUtils +from Bio.Seq import Seq +from Bio.SeqFeature import SeqFeature from loguru import logger from matplotlib.lines import Line2D from matplotlib.patches import Patch @@ -587,3 +593,624 @@ def create_plot( # Save the image as an SVG fig.savefig(svg_plot_file, format="svg", dpi=dpi) + + +### for multiplotter + + +def create_single_plot( + contig_id: str, + contig_sequence: Seq, + contig_count: int, + gb_size_dict: Dict[str, int], + gb_feature_dict: Dict[str, List[SeqFeature]], + gbk: Genbank, + interval: int, + annotations: float, + title_size: float, + plot_title: str, + truncate: int, + output: Path, + dpi: int, + label_size: int, + label_hypotheticals: bool, + remove_other_features_labels: bool, + label_force_list: List[str], +) -> None: + """ + Create a Circos plot for a given contig. + + Args: + contig_id (str): Identifier of the contig. + contig_sequence (Seq): Nucleotide sequence of the contig. + contig_count (int): Count of contigs. + gb_size_dict (Dict[str, int]): Dictionary containing sizes of contigs. + gb_feature_dict (Dict[str, List[SeqFeature]]): Dictionary containing features of contigs. + gbk (Genbank): Parser for GenBank files from pycirclize. + interval (int): Interval for x-axis ticks. + annotations (int): Number of annotations to plot. + title_size (Union[int, float]): Font size for the plot title. + plot_title (str): Title of the plot. + truncate (int): Number of characters to truncate CDS labels to. + output (str): Output directory path. + dpi (int): Dots per inch for the output plot. + label_size (Union[int, float]): Font size for labels. + label_hypotheticals (bool): Whether to include hypothetical labels. + remove_other_features_labels (bool): Whether to remove labels for other features. + label_force_list (List[str]): List of feature IDs to force label. + + Returns: + None + """ + + png_plot_file: Path = Path(output) / f"{contig_id}.png" + svg_plot_file: Path = Path(output) / f"{contig_id}.svg" + + # instantiate circos + seq_len = gb_size_dict[contig_id] + circos = Circos(sectors={contig_id: seq_len}) + + if contig_count == 1 and contig_count != None: + circos.text(plot_title, size=int(title_size), r=190) + else: + circos.text(contig_id, size=int(title_size), r=190) + + sector = circos.get_sector(contig_id) + cds_track = sector.add_track((70, 80)) + cds_track.axis(fc="#EEEEEE", ec="none") + + data_dict = { + "acr_defense_vfdb_card": {"col": "#FF0000", "fwd_list": [], "rev_list": []}, + "unk": {"col": "#AAAAAA", "fwd_list": [], "rev_list": []}, + "other": {"col": "#4deeea", "fwd_list": [], "rev_list": []}, + "tail": {"col": "#74ee15", "fwd_list": [], "rev_list": []}, + "transcription": {"col": "#ffe700", "fwd_list": [], "rev_list": []}, + "dna": {"col": "#f000ff", "fwd_list": [], "rev_list": []}, + "lysis": {"col": "#001eff", "fwd_list": [], "rev_list": []}, + "moron": {"col": "#8900ff", "fwd_list": [], "rev_list": []}, + "int": {"col": "#E0B0FF", "fwd_list": [], "rev_list": []}, + "head": {"col": "#ff008d", "fwd_list": [], "rev_list": []}, + "con": {"col": "#5A5A5A", "fwd_list": [], "rev_list": []}, + } + + fwd_features = [ + feature + for feature in gb_feature_dict[contig_id] + if feature.location.strand == 1 + ] + rev_features = [ + feature + for feature in gb_feature_dict[contig_id] + if feature.location.strand == -1 + ] + cds_features = [ + feature for feature in gb_feature_dict[contig_id] if feature.type == "CDS" + ] + trna_features = [ + feature for feature in gb_feature_dict[contig_id] if feature.type == "tRNA" + ] + tmrna_features = [ + feature for feature in gb_feature_dict[contig_id] if feature.type == "tmRNA" + ] + crispr_features = [ + feature + for feature in gb_feature_dict[contig_id] + if feature.type == "repeat_region" + ] + + # fwd features first + + for f in fwd_features: + if f.type == "CDS": + if ( + "vfdb" in f.qualifiers.get("phrog")[0] + or "card" in f.qualifiers.get("phrog")[0] + or "acr" in f.qualifiers.get("phrog")[0] + or "defensefinder" in f.qualifiers.get("phrog")[0] + ): # vfdb or card or acr or defensefinder + data_dict["acr_defense_vfdb_card"]["fwd_list"].append(f) + else: # no vfdb or card + if f.qualifiers.get("function")[0] == "unknown function": + data_dict["unk"]["fwd_list"].append(f) + elif f.qualifiers.get("function")[0] == "other": + data_dict["other"]["fwd_list"].append(f) + elif f.qualifiers.get("function")[0] == "tail": + data_dict["tail"]["fwd_list"].append(f) + elif f.qualifiers.get("function")[0] == "transcription regulation": + data_dict["transcription"]["fwd_list"].append(f) + elif ( + "DNA" in f.qualifiers.get("function")[0] + ): # to make compatible with pharokka + data_dict["dna"]["fwd_list"].append(f) + elif f.qualifiers.get("function")[0] == "lysis": + data_dict["lysis"]["fwd_list"].append(f) + elif ( + "moron" in f.qualifiers.get("function")[0] + ): # to make compatible with pharokka + data_dict["moron"]["fwd_list"].append(f) + elif f.qualifiers.get("function")[0] == "integration and excision": + data_dict["int"]["fwd_list"].append(f) + elif f.qualifiers.get("function")[0] == "head and packaging": + data_dict["head"]["fwd_list"].append(f) + elif f.qualifiers.get("function")[0] == "connector": + data_dict["con"]["fwd_list"].append(f) + + for f in rev_features: + if f.type == "CDS": + if ( + "vfdb" in f.qualifiers.get("phrog")[0] + or "card" in f.qualifiers.get("phrog")[0] + or "acr" in f.qualifiers.get("phrog")[0] + or "defensefinder" in f.qualifiers.get("phrog")[0] + ): # vfdb or card or acr or defensefinder + data_dict["acr_defense_vfdb_card"]["rev_list"].append(f) + else: # no vfdb or card + if f.qualifiers.get("function")[0] == "unknown function": + data_dict["unk"]["rev_list"].append(f) + elif f.qualifiers.get("function")[0] == "other": + data_dict["other"]["rev_list"].append(f) + elif f.qualifiers.get("function")[0] == "tail": + data_dict["tail"]["rev_list"].append(f) + elif f.qualifiers.get("function")[0] == "transcription regulation": + data_dict["transcription"]["rev_list"].append(f) + elif ( + "DNA" in f.qualifiers.get("function")[0] + ): # to make compatible with pharokka + data_dict["dna"]["rev_list"].append(f) + elif f.qualifiers.get("function")[0] == "lysis": + data_dict["lysis"]["rev_list"].append(f) + elif ( + "moron" in f.qualifiers.get("function")[0] + ): # to make compatible with pharokka + data_dict["moron"]["rev_list"].append(f) + elif f.qualifiers.get("function")[0] == "integration and excision": + data_dict["int"]["rev_list"].append(f) + elif f.qualifiers.get("function")[0] == "head and packaging": + data_dict["head"]["rev_list"].append(f) + elif f.qualifiers.get("function")[0] == "connector": + data_dict["con"]["rev_list"].append(f) + + # add the tracks now + # fwd + for key in data_dict.keys(): + cds_track.genomic_features( + data_dict[key]["fwd_list"], + plotstyle="arrow", + r_lim=(75, 80), + fc=data_dict[key]["col"], + ) + # rev + cds_track.genomic_features( + data_dict[key]["rev_list"], + plotstyle="arrow", + r_lim=(70, 75), + fc=data_dict[key]["col"], + ) + + #### Extra Features + ################################################### + + extras_col = "black" + + fwd_list = [] + for f in fwd_features: + if f.type in ["tRNA", "tmRNA", "tmRNA"]: + fwd_list.append(f) + + cds_track.genomic_features( + fwd_list, + plotstyle="arrow", + r_lim=(75, 80), + fc=extras_col, + ) + + rev_list = [] + for f in rev_features: + if f.type in ["tRNA", "tmRNA", "tmRNA"]: + rev_list.append(f) + + cds_track.genomic_features( + rev_list, + plotstyle="arrow", + r_lim=(70, 75), + fc=extras_col, + ) + + ################################## + ####### thin out extra features ######### + ################################## + + if remove_other_features_labels == False: + # trna + pos_list_trna, labels_trna, length_list_trna = [], [], [] + for f in trna_features: + start, end = int(str(f.location.end)), int(str(f.location.start)) + pos = (start + end) / 2.0 + length = end - start + label = "tRNA" + pos_list_trna.append(pos) + labels_trna.append(label) + length_list_trna.append(length) + + # if trnas exist + if len(length_list_trna) > 0: + # thin out the trnas to avoid overlaps + # Create an empty list to store the filtered indices + filtered_indices_trna = [] + # add the first tRNA + filtered_indices_trna.append(0) + + for i in range(1, len(length_list_trna)): + # If the position of the trna is at least 500bp away from the previous, add it + if pos_list_trna[i] > (pos_list_trna[i - 1] + 500): + filtered_indices_trna.append(i) + + # Use the filtered indices to create new lists for pos_list, labels, and length_list + pos_list_trna = [pos_list_trna[i] for i in filtered_indices_trna] + labels_trna = [labels_trna[i] for i in filtered_indices_trna] + length_list_trna = [length_list_trna[i] for i in filtered_indices_trna] + + # tmrna + pos_list_tmrna, labels_tmrna, length_list_tmrna = [], [], [] + for f in tmrna_features: + start, end = int(str(f.location.end)), int(str(f.location.start)) + pos = (start + end) / 2.0 + length = end - start + label = "tmRNA" + pos_list_tmrna.append(pos) + labels_tmrna.append(label) + length_list_tmrna.append(length) + + if len(length_list_tmrna) > 0: + # thin out the trnas to avoid overlaps + # Create an empty list to store the filtered indices + filtered_indices_tmrna = [] + # add the first tmRNA + filtered_indices_tmrna.append(0) + + for i in range(1, len(length_list_tmrna)): + # If the position of the tmRNA is at least 500bp away from the previous, add it + if pos_list_tmrna[i] > (pos_list_tmrna[i - 1] + 500): + filtered_indices_tmrna.append(i) + + # Use the filtered indices to create new lists for pos_list, labels, and length_list + pos_list_tmrna = [pos_list_tmrna[i] for i in filtered_indices_tmrna] + labels_tmrna = [labels_tmrna[i] for i in filtered_indices_tmrna] + length_list_tmrna = [length_list_tmrna[i] for i in filtered_indices_tmrna] + + # crispr + pos_list_crispr, labels_crispr, length_list_crispr = [], [], [] + for f in crispr_features: + start, end = int(str(f.location.end)), int(str(f.location.start)) + pos = (start + end) / 2.0 + length = end - start + label = "CRISPR" + pos_list_crispr.append(pos) + labels_crispr.append(label) + length_list_crispr.append(length) + + if len(length_list_crispr) > 0: + # thin out the crisprs to avoid overlaps + # Create an empty list to store the filtered indices + filtered_indices_crispr = [] + # add the first crispr + filtered_indices_crispr.append(0) + + for i in range(1, len(length_list_tmrna)): + # If the position of the crispr is at least 500bp away from the previous, add it + if pos_list_crispr[i] > (pos_list_crispr[i - 1] + 500): + filtered_indices_crispr.append(i) + + # Use the filtered indices to create new lists for pos_list, labels, and length_list + pos_list_crispr = [pos_list_crispr[i] for i in filtered_indices_crispr] + labels_crispr = [labels_crispr[i] for i in filtered_indices_crispr] + length_list_crispr = [ + length_list_crispr[i] for i in filtered_indices_crispr + ] + + ################################## + ####### truncate CDS labels + ################################## + + # Extract CDS product labels + pos_list, labels, length_list, id_list = [], [], [], [] + for f in cds_features: + start, end = int(str(f.location.end)), int(str(f.location.start)) + pos = (start + end) / 2.0 + length = end - start + label = f.qualifiers.get("product", [""])[0] + id = f.qualifiers.get("ID", [""])[0] + + # skip hypotheticals if the flag is false (default) + if id in label_force_list: # if in the list + if len(label) > truncate: + label = label[:truncate] + "..." + pos_list.append(pos) + labels.append(label) + length_list.append(length) + id_list.append(id) + continue # to break if in the list + else: + if label_hypotheticals is False: + if ( + label == "" + or label.startswith("hypothetical") + or label.startswith("unknown") + ): + continue # if hypothetical not in the list + else: # all others + if len(label) > truncate: + label = label[:truncate] + "..." + pos_list.append(pos) + labels.append(label) + length_list.append(length) + id_list.append(id) + + ################################################### + #### thin out CDS annotations + ################################################### + + if annotations == 0: + logger.info( + "By inputting --annotations 0 you have chosen to plot no annotations. Continuing" + ) + elif annotations > 1: + logger.info( + "You have input a --annotations value greater than 1. Setting to 1 (will plot all annotations). Continuing" + ) + annotations = 1 + elif annotations < 0: + logger.info( + "You have input a --annotations value less than 0. Setting to 0 (will plot no annotations). Continuing" + ) + annotations = 0 + + ####### running the sparsity + + quantile_length = np.quantile(length_list, annotations) + # Create an empty list to store the filtered indices + filtered_indices = [] + + # Loop through the indices of the length_list + for i in range(len(length_list)): + # If the length at this index is greater than or equal to the median, add the index to filtered_indices + # captures the once in the label force list + if (length_list[i] < quantile_length) or (id_list[i] in label_force_list): + filtered_indices.append(i) + + # Use the filtered indices to create new lists for pos_list, labels, and length_list + pos_list = [pos_list[i] for i in filtered_indices] + labels = [labels[i] for i in filtered_indices] + length_list = [length_list[i] for i in filtered_indices] + + # Plot CDS product labels on outer position + cds_track.xticks( + pos_list, + labels, + label_orientation="vertical", + show_bottom_line=True, + label_size=label_size, + line_kws=dict(ec="grey"), + ) + + ################################################### + # set other features + ################################################### + if remove_other_features_labels == False: + # add trnas + cds_track.xticks( + pos_list_trna, + labels_trna, + label_orientation="vertical", + show_bottom_line=True, + label_size=label_size, + line_kws=dict(ec="grey"), + ) + # add tmrnas + cds_track.xticks( + pos_list_tmrna, + labels_tmrna, + label_orientation="vertical", + show_bottom_line=True, + label_size=label_size, + line_kws=dict(ec="grey"), + ) + # add crisprs + cds_track.xticks( + pos_list_crispr, + labels_crispr, + label_orientation="vertical", + show_bottom_line=True, + label_size=label_size, + line_kws=dict(ec="grey"), + ) + + ################################################### + # set gc content and skew coordinates + ################################################### + gc_content_start = 42.5 + gc_content_end = 60 + gc_skew_start = 25 + gc_skew_end = 42.5 + + # Plot GC content + gc_content_track = sector.add_track((gc_content_start, gc_content_end)) + pos_list, gc_contents = gbk.calc_gc_content(seq=contig_sequence) + gc_contents = ( + gc_contents - SeqUtils.gc_fraction(contig_sequence) * 100 + ) # needs biopython >=1.80 + positive_gc_contents = np.where(gc_contents > 0, gc_contents, 0) + negative_gc_contents = np.where(gc_contents < 0, gc_contents, 0) + abs_max_gc_content = np.max(np.abs(gc_contents)) + vmin, vmax = -abs_max_gc_content, abs_max_gc_content + gc_content_track.fill_between( + pos_list, positive_gc_contents, 0, vmin=vmin, vmax=vmax, color="black" + ) + gc_content_track.fill_between( + pos_list, negative_gc_contents, 0, vmin=vmin, vmax=vmax, color="grey" + ) + + # Plot GC skew + gc_skew_track = sector.add_track((gc_skew_start, gc_skew_end)) + + pos_list, gc_skews = gbk.calc_gc_skew(seq=contig_sequence) + positive_gc_skews = np.where(gc_skews > 0, gc_skews, 0) + negative_gc_skews = np.where(gc_skews < 0, gc_skews, 0) + abs_max_gc_skew = np.max(np.abs(gc_skews)) + vmin, vmax = -abs_max_gc_skew, abs_max_gc_skew + gc_skew_track.fill_between( + pos_list, positive_gc_skews, 0, vmin=vmin, vmax=vmax, color="green" + ) + gc_skew_track.fill_between( + pos_list, negative_gc_skews, 0, vmin=vmin, vmax=vmax, color="purple" + ) + + label_size = int(label_size) + + # Plot xticks & intervals on inner position + cds_track.xticks_by_interval( + interval=int(interval), + outer=False, + show_bottom_line=False, + label_formatter=lambda v: f"{v/ 1000:.0f} Kb", # no decimal place + label_orientation="vertical", + line_kws=dict(ec="grey"), + label_size=7, + ) + + ################################ + # phrog legend + ############################### + + # # Add legend + handle_phrogs = [ + Patch(color=data_dict["unk"]["col"], label="Unknown Function"), + Patch(color=data_dict["other"]["col"], label="Other Function"), + Patch( + color=data_dict["transcription"]["col"], label="Transcription Regulation" + ), + Patch( + color=data_dict["dna"]["col"], label="DNA/RNA & nucleotide \n metabolism" + ), + Patch(color=data_dict["lysis"]["col"], label="Lysis"), + Patch( + color=data_dict["moron"]["col"], + label="Moron, auxiliary metabolic \n gene & host takeover", + ), + Patch(color=data_dict["int"]["col"], label="Integration & excision"), + Patch(color=data_dict["head"]["col"], label="Head & packaging"), + Patch(color=data_dict["con"]["col"], label="Connector"), + Patch(color=data_dict["tail"]["col"], label="Tail"), + Patch( + color=data_dict["acr_defense_vfdb_card"]["col"], + label="Virulence Factor/AMR", + ), + ] + + fig = circos.plotfig() + + phrog_legend_coords = (0.10, 1.185) + phrog_legend = circos.ax.legend( + handles=handle_phrogs, + bbox_to_anchor=phrog_legend_coords, + fontsize=9.5, + loc="center", + title="PHROG CDS", + handlelength=2, + ) + + circos.ax.add_artist(phrog_legend) + + ################################ + # gc and other features legend + ############################### + + handle_gc_content = [ + Line2D( + [], + [], + color="black", + label="Positive GC Content", + marker="^", + ms=6, + ls="None", + ), + Line2D( + [], + [], + color="grey", + label="Negative GC Content", + marker="v", + ms=6, + ls="None", + ), + ] + + handle_gc_skew = [ + Line2D( + [], [], color="green", label="Positive GC Skew", marker="^", ms=6, ls="None" + ), + Line2D( + [], + [], + color="purple", + label="Negative GC Skew", + marker="v", + ms=6, + ls="None", + ), + ] + + handle_other_features = [Patch(color=extras_col, label="tRNA/tmRNA/CRISPR")] + + # shrink plot a bit (0.8) + box = circos.ax.get_position() + circos.ax.set_position([box.x0, box.y0, box.width * 0.65, box.height * 0.9]) + + # gc content and skew coordinates + gc_content_anchor = (0.92, 1.30) + gc_skew_anchor = (0.92, 1.20) + + gc_legend_cont = circos.ax.legend( + handles=handle_gc_content, + bbox_to_anchor=gc_content_anchor, + loc="center", + fontsize=9.5, + title="GC Content", + handlelength=2, + ) + + circos.ax.add_artist(gc_legend_cont) + + gc_legend_skew = circos.ax.legend( + handles=handle_gc_skew, + bbox_to_anchor=gc_skew_anchor, + loc="center", + fontsize=9.5, + title="GC Skew", + handlelength=2, + ) + + circos.ax.add_artist(gc_legend_skew) + + # other features + other_features_anchor = (0.92, 1.10) + + other_features_legend = circos.ax.legend( + handles=handle_other_features, + bbox_to_anchor=other_features_anchor, + loc="center", + fontsize=9.5, + title="Other Features", + handlelength=2, + ) + + circos.ax.add_artist(other_features_legend) + + dpi = int(dpi) + + # save as png + fig.savefig(png_plot_file, dpi=dpi) + + # Save the image as an SVG + fig.savefig(svg_plot_file, format="svg", dpi=dpi) diff --git a/bin/post_processing.py b/bin/post_processing.py index b835ae7..4f44a1b 100644 --- a/bin/post_processing.py +++ b/bin/post_processing.py @@ -442,7 +442,11 @@ def get_contig_name_lengths(self): gc.append(round(gc_fraction(record.seq), 2)) # pyrodigal-gv lookup from the dict if self.gene_predictor == "prodigal-gv": - transl_table = transl_table_dict[record.id] + # try catch clause if contig too small to have a gene + try: + transl_table = transl_table_dict[record.id] + except: + transl_table = "No_CDS_called" transl_tables.append(transl_table) @@ -1432,6 +1436,25 @@ def split_fasta_singles(self): with open(os.path.join(single_fastas, contig + ".fasta"), "w") as f: SeqIO.write(dna_record, f, "fasta") + def split_faas_singles(self): + """Splits the .faa fasta into separate single fasta files for output based on contig names + + :param input_fasta: input multifasta file + :param out_dir: output directory + """ + + single_faas = os.path.join(self.out_dir, "single_faas") + check_and_create_directory(single_faas) + faa_file = os.path.join(self.out_dir, f"{self.gene_predictor}.faa") + faa_sequences = SeqIO.parse(open(faa_file), "fasta") + + for record in faa_sequences: + # remove the last 9 chars e.g. _CDS_0001 + protein_id = record.id[:-9] + # needs to be append + with open(os.path.join(single_faas, f"{protein_id}.faa"), "a") as f: + SeqIO.write(record, f, "fasta") + def write_tophits_vfdb_card(self): """ Outputs top_hits_vfdb.tsv and top_hits_card.tsv diff --git a/bin/processes.py b/bin/processes.py index 96322f5..9e3a913 100644 --- a/bin/processes.py +++ b/bin/processes.py @@ -994,3 +994,42 @@ def run_dnaapler(filepath_in, contig_count, out_dir, threads, logdir): dnaapler_success = False return dnaapler_success + + +# #### skani + +# # no need to sketch +# # takes like 10 secs to run this + +# # pharokka_v1.4.0_databases % skani dist -q ../pharokka/tests/test_data/overall/Meta_example/combined_meta.fasta -r 1Jan2024_genomes.fa --qi --ri --slow -m 200 -n 1 > out + + +# def run_skani(input_fasta, out_dir, db_dir, logdir): +# """ +# Runs mash +# :param filepath_in: input filepath +# :param out_dir: output directory +# :param mash_distance: mash distance - float +# :param logger logger +# :return: +# """ + +# inphared_fasta = os.path.join(db_dir, "1Jan2024_genomes.fa.gz") +# skani_tsv = os.path.join(out_dir, "skani_out.tsv") + +# mash_dist = ExternalTool( +# tool="skani", +# input="", +# output="", +# params=f" dist -q {input_fasta} -r {inphared_fasta} --qi --ri --slow -m 200 -n 1 ", +# logdir=logdir, +# outfile=skani_tsv, +# ) + +# # need to write to stdout + +# try: +# ExternalTool.run_tool(mash_dist, to_stdout=True) + +# except: +# logger.error("Error with mash dist\n") diff --git a/bin/proteins.py b/bin/proteins.py index ab3ce81..9ccd9db 100644 --- a/bin/proteins.py +++ b/bin/proteins.py @@ -16,14 +16,12 @@ from Bio import SeqIO from external_tools import ExternalTool from loguru import logger -from post_processing import ( - process_card_results, - process_pyhmmer_results, - process_vfdb_results, -) +from post_processing import (process_card_results, process_pyhmmer_results, + process_vfdb_results) from pyhmmer.easel import SequenceFile from pyhmmer.plan7 import HMM, HMMFile -from util import count_contigs, get_contig_headers, get_version, remove_directory +from util import (count_contigs, get_contig_headers, get_version, + remove_directory) Result = collections.namedtuple("Result", ["protein", "phrog", "bitscore", "evalue"]) diff --git a/bin/version.py b/bin/version.py index f49459c..14d9d2f 100644 --- a/bin/version.py +++ b/bin/version.py @@ -1 +1 @@ -__version__ = "1.6.1" +__version__ = "1.7.0" diff --git a/docs/custom.md b/docs/custom.md index 89ea0eb..be3f669 100644 --- a/docs/custom.md +++ b/docs/custom.md @@ -4,7 +4,7 @@ The easiest way to do this is with the `create_custom_hmm.py` script that comes with `pharokka` as follows: -``` +```bash create_custom_hmm.py -i -o -p ``` @@ -14,7 +14,7 @@ The resulting output you need to feed into `pharokka.py` with the `--custom_hmm` `create_custom_hmm.py` creates HMM profiles for each file in the input directory. It takes the name as the custom annotation for that HMM. It requires FASTA format MSAs (multiple sequence alignments) e.g. -``` +```bash >gene1 --------------------MERCMYDLSHFSFMTGRIGQLQTLTCIPVVSGDSFSVNFQGVFRLSPLRRNMVIDCMVDLFAFYVPYRHVYG-----DDWENFMLQGTDETVTFGTAD-FTTN-AVEYLGSQ-----R-TTGVAPLWRLASYNQIWNRYFRVPSDTSAILSNTALE---A-GVNKNKWGRYCARPKVIWSTGVDTTTD--AAD--REVTVTANQLDITDLDAVKGRYASEQRRDWFG-QRYNDILKNQFGGGAGTDADERPTLIMRKQHFLSGYDVDGTGDATLGQYSGKSAAVCDLQIPRKWFPEHGSL-WIMALLRFPTIHEEEIHYLFKKSQPTYDEISGDPDIWERKAPIEHQVQDFFTESTDTTSLGFMPYGQWYRYHPNVVHNQFTEVTGFSFVSA-RPT-----SKDTARYIQDDEYTPTFQTTSLGHWQSQARVDVEAVRPIPGPLKSIFAGV------------ >gene2 @@ -25,7 +25,7 @@ MSKSSGYATTSKPQEVVTRDIERKPYDLSHWSFKTGHIGRLQTFSVIPVVAGDSIELNMSAVLRLSPLRHFMYLDAVVDL For example, if my input directory was called `my_tail_msa` and contained 3 MSA files names `tail_family_1.fasta`, `tail_family_2.fasta` and `tail_family_3.fasta`, the following command will create HMM profiles for all 3 MSA files, and combine them together into a `h3m` file with the prefix `tail`: -``` +```bash create_custom_hmm.py -i my_tail_msa -o tail_hmms -p tail ``` @@ -35,13 +35,13 @@ The resulting file to be used with `--custom_hmm` is `tail_hmms/tail.h3m` So on a genome, you would run `pharokka` as follows: -``` +```bash pharokka.py -i -o -d -t --custom_hmm tail_hmms/tail.h3m ``` A gene hit to e.g. the tail_family_2 HMM profile found by PyHMMER will be indicated in the `custom_hmm_id` column of the `pharokka_cds_final_merged_output.tsv` output file as `tail_family_2`, with information about the `custom_hmm_bitscore` `custom_hmm_evalue` for bitscore and evalue from PyHMMER. The annotations will supplement any PHROG annotation found and will be included in the `pharokka.gff` and `pharokka.gbk` files as 'custom_annotation=tail_family_2' in this example. -``` +```bash usage: create_custom_hmm.py [-h] [-i INDIR] [-o OUTDIR] [-p PREFIX] [-f] [-V] create_custom_hmm.py: Creates HMMs from FASTA formatted MSAs with PyHMMER for use with Pharokka v1.4.0 and higher. diff --git a/docs/install.md b/docs/install.md index ecfc87c..1d6a9f2 100644 --- a/docs/install.md +++ b/docs/install.md @@ -4,7 +4,7 @@ The easiest way to install `pharokka` is via conda. For inexperienced command line users, this method is highly recommended. -``` +```bash conda install -c bioconda pharokka ``` @@ -12,7 +12,7 @@ This will install all the dependencies along with `pharokka`. The dependencies a If conda is taking a long time to solve the environment, try using mamba: -``` +```bash conda install mamba mamba install -c bioconda pharokka ``` @@ -21,7 +21,7 @@ mamba install -c bioconda pharokka As of v1.4.0, you can also install the python components of `pharokka` with pip. -``` +```bash pip install pharokka ``` @@ -31,7 +31,7 @@ You will still need to install the non-python dependencies manually. Alternatively, the development version of `pharokka` (which may include new, untested features) can be installed manually via github. -``` +```bash git clone https://github.com/gbouras13/pharokka.git cd pharokka pip install -e . @@ -42,7 +42,7 @@ The dependencies found in environment.yml will then need to be installed manuall For example using conda to install the required dependencies: -``` +```bash conda env create -f environment.yml conda activate pharokka_env # assuming you are in the pharokka directory @@ -67,7 +67,7 @@ If this does not work, you an alternatively download the databases from Zenodo a If you prefer to use the command line: -``` +```bash wget "https://zenodo.org/record/8267900/files/pharokka_v1.4.0_databases.tar.gz" tar -xzf pharokka_v1.4.0_databases.tar.gz ``` @@ -93,7 +93,7 @@ For Mac (Intel, will also work with M1): 4. After installation is complete, you should add the following channels to your conda configuration: -``` +```bash conda config --add channels defaults conda config --add channels bioconda conda config --add channels conda-forge @@ -105,7 +105,7 @@ conda config --add channels conda-forge 6. Finally, I would recommend installing pharokka into a fresh environment. For example to create an environment called pharokkaENV with pharokka installed: -``` +```bash mamba create -n pharokkaENV pharokka conda activate pharokkaENV install_databases.py -h diff --git a/docs/plotting.md b/docs/plotting.md index 4864e67..e627faa 100644 --- a/docs/plotting.md +++ b/docs/plotting.md @@ -1,5 +1,7 @@ # Plotting +## `pharokka_plotter.py` + Pharokka v1.3.0 implements `pharokka_plotter.py`, which creates a simple circular genome plot using [pyCirclize](https://github.com/moshi4/pyCirclize) with output in PNG format. All CDS are coloured according to their PHROG functional group. It is reasonably customisable and is designed for single input phage contigs. If an input FASTA with multiple contigs is entered, it will only plot the first contig. It requires the input FASTA, Pharokka output directory, and the `-p` or `--prefix` value used with Pharokka if specified. @@ -68,7 +70,7 @@ If you want to specify specific labels, you can do this by including a list of l e.g. if you make a text file called `labels.txt` -``` +```bash EHOONAYF_CDS_0007 EHOONAYF_CDS_0009 EHOONAYF_CDS_0023 @@ -78,7 +80,7 @@ EHOONAYF_CDS_0023 -``` +```bash usage: pharokka_plotter.py [-h] -i INFILE [-n PLOT_NAME] [-o OUTDIR] [--gff GFF] [--genbank GENBANK] [-p PREFIX] [-t PLOT_TITLE] [-f] [--label_hypotheticals] [--remove_other_features_labels] [--title_size TITLE_SIZE] [--label_size LABEL_SIZE] [--interval INTERVAL] [--truncate TRUNCATE] [--dpi DPI] [--annotations ANNOTATIONS] [--label_ids LABEL_IDS] @@ -125,3 +127,54 @@ options: ![Image](SAOMS1_plot.png) SAOMS1 phage (GenBank: MW460250.1) was isolated and sequenced by: Yerushalmy, O., Alkalay-Oren, S., Coppenhagen-Glazer, S. and Hazan, R. from the Institute of Dental Sciences and School of Dental Medicine, Hebrew University, Israel. + +## `pharokka_multiplotter.py` + +After receiving a number of requests, `pharokka` v1.7.0 implements `pharokka_multiplotter.py` that can be used to plot multiple contigs. + +It requires the `pharokka` output Genbank file (here, `pharokka.gbk`). It will save plots for each contig in the output directory (here `pharokka_plots_output_directory`). + +e.g. + +``` +pharokka_multiplotter.py -g pharokka.gbk -o pharokka_plots_output_directory +``` + +The rest of the parameters are the same as `pharokka_plotter.py` + +```bash +usage: pharokka_multiplotter.py [-h] -g GENBANK -o OUTDIR [-f] [--label_hypotheticals] [--remove_other_features_labels] + [--title_size TITLE_SIZE] [--label_size LABEL_SIZE] [--interval INTERVAL] [--truncate TRUNCATE] [--dpi DPI] + [--annotations ANNOTATIONS] [-t PLOT_TITLE] [--label_ids LABEL_IDS] + +pharokka_multiplotter.py: pharokka plotting function for muliple phages + +options: + -h, --help show this help message and exit + -g GENBANK, --genbank GENBANK + Input genbank file from Pharokka. + -o OUTDIR, --outdir OUTDIR + Pharokka output directory. + -f, --force Overwrites the output plot file. + --label_hypotheticals + Flag to label hypothetical or unknown proteins. By default these are not labelled. + --remove_other_features_labels + Flag to remove labels for tRNA/tmRNA/CRISPRs. By default these are labelled. + They will still be plotted in black. + --title_size TITLE_SIZE + Controls title size. Must be an integer. Defaults to 20. + --label_size LABEL_SIZE + Controls annotation label size. Must be an integer. Defaults to 8. + --interval INTERVAL Axis tick interval. Must be an integer. Must be an integer. Defaults to 5000. + --truncate TRUNCATE Number of characters to include in annoation labels before truncation with ellipsis. + Must be an integer. Defaults to 20. + --dpi DPI Resultion (dots per inch). Must be an integer. Defaults to 600. + --annotations ANNOTATIONS + Controls the proporition of annotations labelled. Must be a number between 0 and 1 inclusive. + 0 = no annotations, 0.5 = half of the annotations, 1 = all annotations. + Defaults to 1. Chosen in order of CDS size. + -t PLOT_TITLE, --plot_title PLOT_TITLE + Plot name. + --label_ids LABEL_IDS + Text file with list of CDS IDs (from gff file) that are guaranteed to be labelled. +``` \ No newline at end of file diff --git a/docs/proteins.md b/docs/proteins.md index 96de362..77f231f 100644 --- a/docs/proteins.md +++ b/docs/proteins.md @@ -32,7 +32,7 @@ e.g. | protein_2 | 243 | 315 | endolysin | lysis | -``` +```bash usage: pharokka_proteins.py [-h] [-i INFILE] [-o OUTDIR] [-d DATABASE] [-t THREADS] [-f] [-p PREFIX] [-e EVALUE] [--hmm_only] [--mmseqs2_only] [-V] [--citation] pharokka_proteins.py: fast phage protein annotation script diff --git a/docs/run.md b/docs/run.md index a788c89..cab6a32 100644 --- a/docs/run.md +++ b/docs/run.md @@ -6,7 +6,7 @@ Once `install_databases.py` has been run, pharokka requires an input fasta file. You can run `pharokka` using the following command: -``` +```bash pharokka.py -i -o -t -p ``` @@ -14,7 +14,7 @@ pharokka.py -i -o -t -p As of v1.4.0, you can speed up `pharokka` for small inputs (e.g. phage isolates or a small number of contigs) using `-fast`. Note: this will run only PyHMMER against PHROGs (so no VFDB or CARD annotations will be done). -``` +```bash pharokka.py -i -o -d -t --fast ``` @@ -22,55 +22,55 @@ A prefix is not required - by default it is `pharokka` e.g. `pharokka.gff` To specify a different database directory (recommended) from the default: -``` +```bash pharokka.py -i -o -d -t -p ``` To overwrite an existing output directory, use -f or `--force`: -``` +```bash pharokka.py -i -o -d -t -p -f ``` To specify a custom locus tag in the gff/genbank file, use -l: -``` +```bash pharokka.py -i -o -d -t -p -f -l ``` In versions from v0.1.7 and afterwards, the ability to specify an E-value threshold for CDS functional assignment using MMseqs2 was added using the -e flag. It defaults to 1E-5. -``` +```bash pharokka.py -i -o -d -t -e ``` To use Prodigal (pyrodigal) gene predictions instead of PHANOTATE use `-g prodigal` -``` +```bash pharokka.py -i -o -d -t -g prodigal ``` To use Prodigal-gv (pyrodigal-gv) gene predictions instead of PHANOTATE use `-g prodigal-gv`. This is recommended for metagenomic datasets where some phages likely have [alternate genetic codes](https://github.com/apcamargo/prodigal-gv). -``` +```bash pharokka.py -i -o -d -t -g prodigal-gv ``` If you are annotating more than 1 contig, it is recommended that you run pharokka in meta mode using the `-m` flag, which will enable pharokka to finish faster by making full use of all available threads when running tRNAscan-SE 2. -``` +```bash pharokka.py -i -o -d -t -m -g prodigal-gv ``` As of v1.5.0, you can skip running mash to find the closest match for each contig in INPHARED using `skip_mash`. -``` +```bash pharokka.py -i -o -d -t --skip_mash ``` As of v1.5.0, you can skip running tRNAscan-SE 2, MinCED and Aragorn fusingg `skip_extra_annotations`. -``` +```bash pharokka.py -i -o -d -t --skip_extra_annotations ``` @@ -80,49 +80,49 @@ pharokka.py -i -o -d -t -o -d -t --mmseqs2_only ``` As of v1.4.0, `pharokka` will automatically run `--mmseqs2_only` if you specify `-m` or `--meta` for meta mode. To force `pharokka` to use PyHMMER on PHROGs in meta mode as well, use `--meta_hmm`. Note: this might take a long time if you have a large metagenome! -``` +```bash pharokka.py -i -o -d -t -m --meta_hmm ``` You can also force `pharokka` to only use PyHMMER on PHROGs in meta mode using `--fast`. -``` +```bash pharokka.py -i -o -d -t -m --fast ``` If you are running meta mode, you can optionally specify split mode using the `-s` flag. This will add output directories with separated FASTA, genbank and gff files for each input contig called `single_fastas`, `single_gbks` and `single_gffs`. -``` +```bash pharokka.py -i -o -d -t -m -s ``` There is also support for alternative genetic codes if `pharokka` is run with prodigal as a gene predictor using the `-c` flag. See Prodigal's [documentation](https://github.com/hyattpd/prodigal/wiki/Advice-by-Input-Type#alternate-genetic-codes), along with [Yutin et al. 2021](https://doi.org/10.1038/s41467-022-32979-6) and [Peters et al. 2022](https://doi.org/10.1038/s41467-022-32979-6) for more information and background on phage stop codon reassignment. -``` +```bash pharokka.py -i -o -d -t -g prodigal -c 4 ``` As of v1.4.0 you can specify a genbank input file with `--genbank`. This will take all (custom) CDS calls in genbank file and PHANOTATE/pyrodigal will not be run. So if you have done manual gene curation, this option is recommended. -``` +```bash pharokka.py -i -o -d -t --genbank ``` As of v1.4.0 you want to automatically reorient your phage to begin with the large terminase subunit, you can use `--dnaapler` which will run [dnaapler](https://github.com/gbouras13/dnaapler) to automatically reorient your phage. The reoriented FASTA will be named `_dnaapler_reoriented.fasta` in the output directory. -``` +```bash pharokka.py -i -o -d -t --dnaapler ``` `--dnaapler` is recommended for reorientation as of v1.4.0. However, if you need to reorient your phage to begin at a certain point (for example, to reorient the phage to begin with the large terminase subunit) or if `dnaapler` isn't working for you, you can acheive this using the `--terminase` command along with specifying the `--terminase_strand` denoting the strandedness of the large terminase subunit (must be 'pos' or 'neg') and `--terminase_start` denoting the start co-ordinate. This requires the input to be a single contig, or else `pharokka` will fail and warn you. For example, if you annotate your phage using `pharokka`, and then see that your terminase large subunit begins at coordinate 9503 on the positive strand, you could run the following to reorient it to begin at the large terminase subunit: -``` +```bash pharokka.py -i -o -d -t --terminase --terminase_start 9503 --terminase_strand pos ``` @@ -132,7 +132,7 @@ Additionally, it should be noted that if you want to reorient to begin with the Then you need to specify the following to reorient the phage to begin with the large terminase subunit as follows: -``` +```bash pharokka.py \ -i \ -o \ @@ -147,7 +147,7 @@ Of course, you can also use this functionality to reorient your phage however yo As of v1.6.0 you can specify a custom mash distance threshold vs the INPHARED database with `--mash_distance` -``` +```bash pharokka.py -i -o -d -t --mash_distance 0.05 ``` @@ -155,12 +155,12 @@ You can also specify custom arguments to be passed to [MINced](https://doi.org/1 Note 2 things: (1) that you need to leave off the leading hyphen (i.e. `"minNR 2"` not `"-minNR 2"`) and (2) you will need to use quotation marks (as you extra arguments will contain spaces) -``` +```bash pharokka.py -i -o -d -t --minced_args "minNR 2 -minRL 21" ``` -``` +```bash usage: pharokka.py [-h] [-i INFILE] [-o OUTDIR] [-d DATABASE] [-t THREADS] [-f] [-p PREFIX] [-l LOCUSTAG] [-g GENE_PREDICTOR] [-m] [-s] [-c CODING_TABLE] [-e EVALUE] [--fast] [--mmseqs2_only] [--meta_hmm] [--dnaapler] [--custom_hmm CUSTOM_HMM] [--genbank] [--terminase] [--terminase_strand TERMINASE_STRAND] [--terminase_start TERMINASE_START] [--skip_extra_annotations] diff --git a/environment.yml b/environment.yml index 361620c..2de8df8 100644 --- a/environment.yml +++ b/environment.yml @@ -5,8 +5,8 @@ channels: - defaults dependencies: - bcbio-gff >=0.7.0 - - biopython >=1.78,<1.82 - - phanotate >=1.5.0 + - biopython >=1.80 + - phanotate >=1.5.0,<1.6.4 - mmseqs2 ==13.45111 - trnascan-se >=2.0.9 - minced >=0.4.2 diff --git a/setup.py b/setup.py index 89a5e8e..d39961a 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ def package_files(directory): setup( name="Pharokka", - version="1.6.1", + version="1.7.0", author="George Bouras", author_email="george.bouras@adelaide.edu.au", description="Fast phage annotation tool", @@ -28,6 +28,7 @@ def package_files(directory): url="https://github.com/gbouras13/pharokka", scripts=[ "bin/pharokka.py", + "bin/pharokka_multiplotter.py", "bin/pharokka_plotter.py", "bin/install_databases.py", "bin/pharokka_proteins.py", @@ -72,7 +73,7 @@ def package_files(directory): "loguru>=0.5.4", "pyyaml>=6.0", "pandas>=1.4.2", - "biopython>=1.76", + "biopython>=1.80", "pyhmmer>=0.10.0", "black>=22.3.0", "isort>=5.10.1", diff --git a/tests/test_external_commands.py b/tests/test_external_commands.py index 7fbe2d4..6aa874c 100644 --- a/tests/test_external_commands.py +++ b/tests/test_external_commands.py @@ -14,15 +14,8 @@ import pytest from loguru import logger -from bin.processes import ( - run_aragorn, - run_mash_sketch, - run_minced, - run_phanotate, - run_pyrodigal, - run_pyrodigal_gv, -) - +from bin.processes import (run_aragorn, run_mash_sketch, run_minced, + run_phanotate, run_pyrodigal, run_pyrodigal_gv) # import functions from bin.util import remove_directory diff --git a/tests/test_input_commands.py b/tests/test_input_commands.py index 4fb84b2..20d715a 100755 --- a/tests/test_input_commands.py +++ b/tests/test_input_commands.py @@ -6,7 +6,6 @@ """ import sys - # import import unittest from pathlib import Path @@ -15,16 +14,10 @@ import pytest from loguru import logger -from bin.input_commands import ( - instantiate_dirs, - validate_fasta, - validate_gene_predictor, - validate_meta, - validate_strand, - validate_terminase, - validate_terminase_start, - validate_threads, -) +from bin.input_commands import (instantiate_dirs, validate_fasta, + validate_gene_predictor, validate_meta, + validate_strand, validate_terminase, + validate_terminase_start, validate_threads) from bin.util import remove_directory # test data diff --git a/tests/test_overall.py b/tests/test_overall.py index 8660687..eb1dea5 100755 --- a/tests/test_overall.py +++ b/tests/test_overall.py @@ -7,7 +7,6 @@ # import import os import shutil - # import functions import subprocess import sys diff --git a/tests/test_proteins.py b/tests/test_proteins.py index c2f1f89..c6ae293 100755 --- a/tests/test_proteins.py +++ b/tests/test_proteins.py @@ -8,7 +8,6 @@ # import import os import shutil - # import functions import subprocess import sys