Skip to content

Commit

Permalink
create db
Browse files Browse the repository at this point in the history
  • Loading branch information
gbouras13 committed Aug 23, 2023
1 parent c00babc commit a7716b0
Show file tree
Hide file tree
Showing 10 changed files with 251 additions and 28 deletions.
176 changes: 176 additions & 0 deletions bin/create_hmms.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
#!/usr/bin/env python3

"""
Create an HMM profile from each PHROG multiple sequence alignment (MSA) with PyHMMER.
Usage: create_hmms.py -i <directory of MSAs> -o <directory for HMMs>
Note: all MSAs must be in FASTA format, and each file name must contain exactly one full stop, e.g. "name.suffix".
"""

import os
from pathlib import Path
import pyhmmer
import shutil
from util import get_version

# Amino-acid alphabet used by main() when reading MSAs and building HMMs.
alphabet = pyhmmer.easel.Alphabet.amino()
# Module-level background model for that alphabet; note that main() assigns
# its own local `background`, so this global is not what the build step uses.
background = pyhmmer.plan7.Background(alphabet)
import argparse
import os
import sys
from argparse import RawTextHelpFormatter
from pathlib import Path
from loguru import logger



def get_input():
    """Parse the command-line arguments for create_hmms.py.

    -i/--indir and -o/--outdir are mandatory: without them the script can only
    fail later with an unrelated traceback, so argparse now rejects the
    command line up front with a usage message. -h and -V still work on their
    own because argparse handles them before checking required options.

    :return: args, the parsed namespace with the attributes indir, outdir,
        prefix and force
    """
    parser = argparse.ArgumentParser(
        description="create_hmms.py: Creates HMMs from FASTA formatted MSAs with PyHMMER for use with pharokka v1.4.0 and higher.",
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument(
        "-i",
        "--indir",
        action="store",
        required=True,
        help="Input directory containing FASTA formatted Multiple Sequence Alignments.",
    )
    parser.add_argument(
        "-o",
        "--outdir",
        action="store",
        required=True,
        help="Output directory to store HMM profiles.",
    )
    parser.add_argument(
        "-p",
        "--prefix",
        action="store",
        help="Prefix used to name HMMs. The relevant file be 'prefix'.h3m",
        default="pharokka",
    )
    parser.add_argument(
        "-f", "--force", help="Overwrites the output directory.", action="store_true"
    )
    parser.add_argument(
        "-V",
        "--version",
        help="Print pharokka Version",
        action="version",
        version=get_version(),
    )
    return parser.parse_args()


def main():
    """Build one HMM per FASTA MSA and press them into a combined database.

    Steps:
      1. parse the command line with get_input();
      2. validate the input directory and prepare the output directory
         (an existing output is only removed when -f/--force is given);
      3. build an HMM with PyHMMER for every FASTA formatted MSA in the input
         directory and write it to "<outdir>/<name>.hmm", where <name> is the
         MSA file name without its suffix;
      4. read the written HMMs back and press them together into the files
         prefixed "<outdir>/<prefix>".

    Any message logged at ERROR level terminates the program with exit
    status 1 (see the extra logger sink registered on the first line).
    """
    # Make every logger.error(...) fatal: this sink exits the process as soon
    # as an ERROR record is emitted, so no explicit sys.exit follows the
    # logger.error calls below.
    logger.add(lambda _: sys.exit(1), level="ERROR")
    logger.info(f"Starting pharokka v{get_version()} - create_hmms.py")
    args = get_input()

    MSA_dir = args.indir
    HMM_dir = args.outdir

    # Fail early with a clear message rather than a traceback further down.
    if not MSA_dir or not os.path.isdir(MSA_dir):
        logger.error(
            f"The input directory {MSA_dir} does not exist. Please specify a directory of FASTA formatted MSAs with -i or --indir."
        )
    if not HMM_dir:
        logger.error(
            "No output directory was specified. Please specify one with -o or --outdir."
        )

    # Never let --force delete the MSAs: refuse an output directory that is
    # the input directory itself or one of its parents.
    real_in = os.path.realpath(MSA_dir)
    real_out = os.path.realpath(HMM_dir)
    if real_in == real_out or real_in.startswith(real_out.rstrip(os.sep) + os.sep):
        logger.error(
            f"The output directory {HMM_dir} contains (or is) the input directory {MSA_dir}. Please specify a different output directory."
        )

    #### force
    if args.force:
        if os.path.isdir(HMM_dir):
            logger.info(
                f"Removing output directory {HMM_dir} as -f or --force was specified."
            )
            shutil.rmtree(HMM_dir)
        elif os.path.isfile(HMM_dir):
            logger.info(
                f"Removing output file {HMM_dir} as -f or --force was specified."
            )
            os.remove(HMM_dir)
        else:
            logger.info(
                f"--force was specified even though the output directory {HMM_dir} does not already exist. Continuing."
            )
    elif os.path.isdir(HMM_dir) or os.path.isfile(HMM_dir):
        logger.error(
            f"The output directory {HMM_dir} already exists and force was not specified. Please specify -f or --force to overwrite it."
        )

    # Create the output directory (including any missing parent directories)
    if not os.path.exists(HMM_dir):
        os.makedirs(HMM_dir)

    logger.info(
        f"Creating HMMs in the directory {HMM_dir} from MSAs in the directory {MSA_dir}."
    )

    # The builder and the background model do not depend on the individual
    # MSA, so they are created once instead of once per file.
    builder = pyhmmer.plan7.Builder(alphabet)
    background = pyhmmer.plan7.Background(alphabet)

    # loop over each PHROG; sorted so that the result does not depend on the
    # arbitrary order returned by os.listdir
    for file in sorted(os.listdir(MSA_dir)):
        msa_path = f"{MSA_dir}/{file}"

        # Sub-directories and undecodable (binary) files are not MSAs: skip
        # them with a warning instead of crashing.
        try:
            looks_like_msa = os.path.isfile(msa_path) and is_fasta_msa(msa_path)
        except UnicodeDecodeError:
            looks_like_msa = False
        if not looks_like_msa:
            logger.warning(
                f"{msa_path} does not seem to be a FASTA formatted MSA. Skipping."
            )
            continue

        # read in the msa
        with pyhmmer.easel.MSAFile(
            msa_path, digital=True, alphabet=alphabet
        ) as msa_file:
            msa = msa_file.read()

        # the HMM is named after the file name without its suffix
        name, _ = os.path.splitext(file)
        # the name is set as bytes
        msa.name = name.encode("utf-8")

        # build the HMM and write it out
        hmm, _, _ = builder.build_msa(msa, background)
        with open(f"{HMM_dir}/{name}.hmm", "wb") as output_file:
            hmm.write(output_file)

    # to concatenate all hmms
    hmms = []
    HMM_dir = Path(HMM_dir)

    # reads and saves the hmms written above (only the ".hmm" files)
    for file_name in sorted(os.listdir(HMM_dir)):
        if not file_name.endswith(".hmm"):
            continue
        with pyhmmer.plan7.HMMFile(f"{HMM_dir}/{file_name}") as hmm_file:
            hmms.append(hmm_file.read())

    # Pressing an empty list would leave a useless database behind.
    if not hmms:
        logger.error(
            f"No HMMs were created from the directory {MSA_dir}. Please check that it contains FASTA formatted MSAs."
        )

    # writes all out together to .h3m, .h3p, .h3i, .h3f files prefixed "prefix"
    pyhmmer.hmmer.hmmpress(hmms, f"{HMM_dir}/{args.prefix}")

    logger.info("HMM creation complete.")
    logger.info(
        f"The combined file you will need to run with pharokka.py --custom_db is {HMM_dir}/{args.prefix}.h3m"
    )


def is_fasta_msa(filename):
    """Tell whether a file looks like a FASTA formatted MSA.

    A file qualifies when at least two of its lines start with ">", i.e. it
    holds more than one FASTA record. Only the headers are inspected; the
    sequences themselves are not validated.

    The file is streamed line by line and reading stops at the second header,
    so large alignments are not loaded into memory. Paths that are not
    regular files (directories, missing paths) and files whose content cannot
    be decoded as text yield False instead of raising.

    :param filename: path of the file to inspect
    :return: True if the file contains more than one ">" header line,
        otherwise False
    """
    if not os.path.isfile(filename):
        return False

    header_count = 0
    # errors="replace" keeps binary files from raising UnicodeDecodeError;
    # ">" is plain ASCII, so header detection is unaffected.
    with open(filename, "r", errors="replace") as handle:
        for line in handle:
            if line.startswith(">"):
                header_count += 1
                if header_count > 1:
                    return True
    return False

# Run the command-line workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
42 changes: 31 additions & 11 deletions bin/pharokka.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,39 @@

from databases import check_db_installation
from hmm import run_pyhmmer
from input_commands import (check_dependencies, get_input, instantiate_dirs,
instantiate_split_output, validate_fasta,
validate_gene_predictor, validate_meta,
validate_terminase, validate_threads)
from input_commands import (
check_dependencies,
get_input,
instantiate_dirs,
instantiate_split_output,
validate_fasta,
validate_gene_predictor,
validate_meta,
validate_terminase,
validate_threads,
)
from loguru import logger
from post_processing import Pharok, remove_post_processing_files
from processes import (concat_phanotate_meta, concat_trnascan_meta,
convert_gff_to_gbk, reorient_terminase, run_aragorn,
run_dnaapler, run_mash_dist, run_mash_sketch,
run_minced, run_mmseqs, run_phanotate,
run_phanotate_fasta_meta, run_phanotate_txt_meta,
run_pyrodigal, run_trna_scan, run_trnascan_meta,
split_input_fasta, translate_fastas)
from processes import (
concat_phanotate_meta,
concat_trnascan_meta,
convert_gff_to_gbk,
reorient_terminase,
run_aragorn,
run_dnaapler,
run_mash_dist,
run_mash_sketch,
run_minced,
run_mmseqs,
run_phanotate,
run_phanotate_fasta_meta,
run_phanotate_txt_meta,
run_pyrodigal,
run_trna_scan,
run_trnascan_meta,
split_input_fasta,
translate_fastas,
)
from util import get_version


Expand Down
16 changes: 12 additions & 4 deletions bin/pharokka_proteins.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,20 @@
from pathlib import Path

from databases import check_db_installation
from input_commands import (check_dependencies, instantiate_dirs,
validate_fasta, validate_threads)
from input_commands import (
check_dependencies,
instantiate_dirs,
validate_fasta,
validate_threads,
)
from loguru import logger
from post_processing import remove_directory, remove_file
from proteins import (Pharok_Prot, get_input_proteins, run_mmseqs_proteins,
run_pyhmmer_proteins)
from proteins import (
Pharok_Prot,
get_input_proteins,
run_mmseqs_proteins,
run_pyhmmer_proteins,
)
from util import get_version


Expand Down
10 changes: 6 additions & 4 deletions bin/proteins.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,13 @@
from Bio import SeqIO
from Bio.SeqUtils import GC
from external_tools import ExternalTool
from util import (count_contigs, get_contig_headers, get_version,
remove_directory)
from util import count_contigs, get_contig_headers, get_version, remove_directory
from loguru import logger
from post_processing import (process_card_results, process_pyhmmer_results,
process_vfdb_results)
from post_processing import (
process_card_results,
process_pyhmmer_results,
process_vfdb_results,
)
from pyhmmer.easel import SequenceFile
from pyhmmer.plan7 import HMM, HMMFile

Expand Down
File renamed without changes.
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ def package_files(directory):
"bin/proteins.py",
"bin/util.py",
"bin/version.py",
"bin/create_hmms.py",
],
packages=["pharokka_runner"],
package_dir=dict(pharokka_runner="bin"),
Expand Down
10 changes: 8 additions & 2 deletions tests/test_external_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,14 @@
import pytest
from loguru import logger

from bin.processes import (run_aragorn, run_mash_sketch, run_minced,
run_phanotate, run_pyrodigal)
from bin.processes import (
run_aragorn,
run_mash_sketch,
run_minced,
run_phanotate,
run_pyrodigal,
)

# import functions
from bin.util import remove_directory

Expand Down
15 changes: 11 additions & 4 deletions tests/test_input_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
"""

import sys

# import
import unittest
from pathlib import Path
Expand All @@ -14,10 +15,16 @@
import pytest
from loguru import logger

from bin.input_commands import (instantiate_dirs, validate_fasta,
validate_gene_predictor, validate_meta,
validate_strand, validate_terminase,
validate_terminase_start, validate_threads)
from bin.input_commands import (
instantiate_dirs,
validate_fasta,
validate_gene_predictor,
validate_meta,
validate_strand,
validate_terminase,
validate_terminase_start,
validate_threads,
)
from bin.util import remove_directory

# test data
Expand Down
4 changes: 4 additions & 0 deletions tests/test_overall.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
# import
import os
import shutil

# import functions
import subprocess
import sys
Expand All @@ -34,6 +35,7 @@
logger.add(lambda _: sys.exit(1), level="ERROR")
threads = 4


def remove_directory(dir_path):
if os.path.exists(dir_path):
shutil.rmtree(dir_path)
Expand All @@ -43,8 +45,10 @@ def remove_directory(dir_path):
def tmp_dir(tmpdir_factory):
return tmpdir_factory.mktemp("tmp")


temp_dir = Path(f"{test_data}/fake_out")


def exec_command(cmnd, stdout=subprocess.PIPE, stderr=subprocess.PIPE):
"""executes shell command and returns stdout if completes exit code 0
Parameters
Expand Down
5 changes: 2 additions & 3 deletions tests/test_proteins.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
# import
import os
import shutil

# import functions
import subprocess
import sys
Expand Down Expand Up @@ -71,9 +72,7 @@ def test_download(tmp_dir):
def test_proteins(tmp_dir):
"""test pharokka proteins"""
input_fasta: Path = f"{proteins_data}/phanotate.faa"
cmd = (
f"pharokka_proteins.py -i {input_fasta} -d {database_dir} -o {tmp_dir} -t {threads} -f"
)
cmd = f"pharokka_proteins.py -i {input_fasta} -d {database_dir} -o {tmp_dir} -t {threads} -f"
exec_command(cmd)


Expand Down

0 comments on commit a7716b0

Please sign in to comment.