db change

gbouras13 · Aug 23, 2023 · 3bd9e9c · 3bd9e9c
1 parent e219dfb
commit 3bd9e9c
Show file tree

Hide file tree

Showing 7 changed files with 29 additions and 12 deletions.
diff --git a/HISTORY.md b/HISTORY.md
@@ -1,6 +1,20 @@
 History
 =======
 
+1.4.0 (2023-08-23)
+------------------
+
+* More sensitive search for PHROGs using Hidden Markov Models (HMMs) using the amazing [PyHMMER](https://github.com/althonos/pyhmmer).
+* By default, Pharokka will now run both MMseqs2 (PHROGs, CARD and VFDB) and HMMs (PHROGs). MMseqs2 was kept for PHROGs as it provides more information than the HMM results (sequence identities, top hit PHROG protein).
+* `--fast` or `--hmm_only` which only runs HMMs on PHROGs, not MMseqs2. For phage isolates, this will be much faster than MMseqs2, but you will not get CARD or VFDB annotations. For metagenomes, this will be (much) slower though!
+* `--mmseqs_only`, which essentially runs Pharokka v1.3.2 and is the default in meta mode (`-m` or `--meta`).
+* `pharokka_proteins.py`, which takes an input file of amino acid proteins in FASTA format and runs MMseqs2 (PHROGs, CARD, VFDB) and Pyhmmer (PHROGs).
+* `--custom_hmm`, which allows for custom HMM profile databases to be used with Pharokka.
+* `create_custom_hmm.py`, which facilitates the creation of an HMM profile database from multiple sequence alignments.
+* `--dnaapler`, which automatically detects and reorients your phage to start with the large terminase subunit. For more information, see [dnaapler](https://github.com/gbouras13/dnaapler).
+* Updated databases as of 23 August 2023. You will need to download the new v1.4.0 databases. The VFDB database is now clustered at 50% sequence identity (which speeds up runtime).
+* Fixes to `-c`, which should now work with `-g prodigal` (thanks Alistair Legione).
+
 1.3.2 (2023-04-26)
 ------------------
 

diff --git a/README.md b/README.md
@@ -64,10 +64,12 @@ Pharokka v1.4.0 is a large update implementing:
 * More sensitive search for PHROGs using Hidden Markov Models (HMMs) using the amazing [PyHMMER](https://github.com/althonos/pyhmmer).
 * By default, Pharokka will now run both MMseqs2 (PHROGs, CARD and VFDB) and HMMs (PHROGs). MMseqs2 was kept for PHROGs as it provides more information than the HMM results (sequence identities, top hit PHROG protein).
 * `--fast` or `--hmm_only` which only runs HMMs on PHROGs, not MMseqs2. For phage isolates, this will be much faster than MMseqs2, but you will not get CARD or VFDB annotations. For metagenomes, this will be (much) slower though!
-* `--mmseqs_only` which will essentially runs Pharokka v1.3.2. 
+* `--mmseqs_only`, which essentially runs Pharokka v1.3.2 and is the default in meta mode (`-m` or `--meta`).
 * `pharokka_proteins.py`, which takes an input file of amino acid proteins in FASTA format and runs MMseqs2 (PHROGs, CARD, VFDB) and Pyhmmer (PHROGs).
+* `--custom_hmm`, which allows for custom HMM profile databases to be used with Pharokka.
+* `create_custom_hmm.py`, which facilitates the creation of an HMM profile database from multiple sequence alignments.
 * `--dnaapler`, which automatically detects and reorients your phage to start with the large terminase subunit. For more information, see [dnaapler](https://github.com/gbouras13/dnaapler).
-* Updated databases as of 21 August 2023. You will need to download the new v1.4.0 databases.
+* Updated databases as of 23 August 2023. You will need to download the new v1.4.0 databases. The VFDB database is now clustered at 50% sequence identity (which speeds up runtime).
 * Fixes to `-c`, which should now work with `-g prodigal` (thanks Alistair Legione).
 
 ## Pharokka v 1.3.0 Update

diff --git a/bin/databases.py b/bin/databases.py
@@ -12,7 +12,10 @@
 # for VFDB, only need the FASTA
 
 # VFDB update as of August 18 2023 (not versioned)
-mmseqs createdb VFDB_setB_pro.fas vfdb
+# clustered 
+
+mmseqs easy-cluster VFDB_setB_pro_form.fas VFDBclusterRes tmp --min-seq-id 0.5 -c 0.8 --cov-mode 1
+mmseqs createdb VFDBclusterRes_rep_seq.fasta vfdb
 
 """
 
@@ -44,11 +47,11 @@
 
 VERSION_DICTIONARY = {
     "1.4.0": {
-        "md5": "cd9cc60dfaa2de63ec23902ab6d5b9d7",
+        "md5": "c21144209b993c06fae2dac906d73b96",
         "major": 1,
         "minor": 4,
         "minorest": 0,
-        "db_url": "https://zenodo.org/record/8267900/files/pharokka_v1.4.0_databases.tar.gz",
+        "db_url": "https://zenodo.org/record/8276347/files/pharokka_v1.4.0_databases.tar.gz",
         "dir_name": "pharokka_v1.4.0_databases",
         "inphared_mash": "1Aug2023_genomes.fa.msh",
         "inphared_annot": "1Aug2023_data.tsv",

diff --git a/bin/input_commands.py b/bin/input_commands.py
@@ -111,7 +111,7 @@ def get_input():
     )
     parser.add_argument(
         "--custom_hmm",
-        help="Runs pharokka with a set ",
+        help="Run pharokka with a custom HMM profile database suffixed .h3m. \nPlease use create this with the create_custom_hmm.py script.",
         action="store",
         default="",
     )

diff --git a/bin/post_processing.py b/bin/post_processing.py
@@ -959,7 +959,6 @@ def create_tbl(
         # check if no trnas
         if self.trna_empty == False:
             trna_df = self.total_gff[self.total_gff["Method"] == "tRNAscan-SE"]
-            print(trna_df)
             # keep only trnas and pseudogenes
             trna_df.start = trna_df.start.astype(int)
             trna_df.stop = trna_df.stop.astype(int)
@@ -1896,9 +1895,7 @@ def inphared_top_hits(self):
 
         # read in the plasdb tsv
         inphared_tsv_file = os.path.join(self.db_dir, "1Aug2023_data.tsv")
-        # with open(plsdb_tsv_file, 'rb') as f:
-        #     result = chardet.detect(f.readline())
-        #     print(result)
+
         cols = [
             "Accession",
             "Description",

diff --git a/environment.yml b/environment.yml
@@ -4,8 +4,8 @@ channels:
   - bioconda
   - defaults
 dependencies:
-  - bcbio-gff
-  - biopython >=1.78,<1.81
+  - bcbio-gff >=0.7.0
+  - biopython >=1.78
   - phanotate >=1.5.0
   - mmseqs2 ==13.45111
   - trnascan-se >=2.0.9

diff --git a/setup.py b/setup.py
@@ -92,5 +92,6 @@ def package_files(directory):
         "pytest-cov>=3.0.0",
         "alive-progress>=3.0.1",
         "requests>=2.25.1",
+        "bcbio-gff >=0.7.0"
     ],
 )