diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e889026..66627e61 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,10 +7,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Added` +- [#427](https://github.com/nf-core/funcscan/pull/427) Updated AMPcombi from v0.2.2 to v2.0.1. AMP now can use multiple other databases for classifications. (by @darcy220606) + ### `Fixed` +- [#427](https://github.com/nf-core/funcscan/pull/427) Fixed the AMP reference database issues reported by users, due to non-ASCII characters. (by @darcy220606) + ### `Dependencies` +| Tool | Previous version | New version | +| -------- | ---------------- | ----------- | +| AMPcombi | 0.2.2 | 2.0.1 | + ### `Deprecated` ## v2.0.0 - [2024-09-05] diff --git a/bin/ampcombi_download.py b/bin/ampcombi_download.py index dd1373ce..c9a4f639 100755 --- a/bin/ampcombi_download.py +++ b/bin/ampcombi_download.py @@ -1,78 +1,144 @@ #!/usr/bin/env python3 ######################################### -# Authors: [Anan Ibrahim](https://github.com/brianjohnhaas), [Louisa Perelo](https://github.com/louperelo) +# Authors: [Anan Ibrahim](https://github.com/Darcy220606/AMPcombi), [Louisa Perelo](https://github.com/louperelo) # File: amp_database.py # Source: https://github.com/Darcy220606/AMPcombi/blob/main/ampcombi/amp_database.py -# Source+commit: https://github.com/Darcy220606/AMPcombi/commit/a75bc00c32ecf873a133b18cf01f172ad9cf0d2d/ampcombi/amp_database.py -# Download Date: 2023-03-08, commit: a75bc00c # This source code is licensed under the MIT license ######################################### -# TITLE: Download the DRAMP database if input db empty AND and make database compatible for diamond +# TITLE: Download the reference database specified by the user. 
import pandas as pd import requests import os -from datetime import datetime +import re import subprocess -from Bio import SeqIO -import tempfile -import shutil +import argparse +from datetime import datetime +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord +from Bio import SeqIO ######################################## -# FUNCTION: DOWNLOAD DRAMP DATABASE AND CLEAN IT +# FUNCTION: DOWNLOAD DATABASES AND CLEAN DRAMP and APD ######################################### -def download_DRAMP(db): - ##Download the (table) file and store it in a results directory - url = "http://dramp.cpu-bioinfor.org/downloads/download.php?filename=download_data/DRAMP3.0_new/general_amps.xlsx" - r = requests.get(url, allow_redirects=True) - with open(db + "/" + "general_amps.xlsx", "wb") as f: - f.write(r.content) - ##Convert excel to tab sep file and write it to a file in the DRAMP_db directly with the date its downloaded - date = datetime.now().strftime("%Y_%m_%d") - ref_amps = pd.read_excel(db + "/" + r"general_amps.xlsx") - ref_amps.to_csv(db + "/" + f"general_amps_{date}.tsv", index=None, header=True, sep="\t") - ##Download the (fasta) file and store it in a results directory - urlfasta = ( - "http://dramp.cpu-bioinfor.org/downloads/download.php?filename=download_data/DRAMP3.0_new/general_amps.fasta" - ) - z = requests.get(urlfasta) - fasta_path = os.path.join(db + "/" + f"general_amps_{date}.fasta") - with open(fasta_path, "wb") as f: - f.write(z.content) - ##Cleaning step to remove ambigous aminoacids from sequences in the database (e.g. 
zeros and brackets) - new_fasta = db + "/" + f"general_amps_{date}_clean.fasta" - seq_record = SeqIO.parse(open(fasta_path), "fasta") - with open(new_fasta, "w") as f: - for record in seq_record: - id, sequence = record.id, str(record.seq) - letters = [ - "A", - "C", - "D", - "E", - "F", - "G", - "H", - "I", - "K", - "L", - "M", - "N", - "P", - "Q", - "R", - "S", - "T", - "V", - "W", - "Y", - ] - new = "".join(i for i in sequence if i in letters) - f.write(">" + id + "\n" + new + "\n") - return os.remove(fasta_path), os.remove(db + "/" + r"general_amps.xlsx") +def download_ref_db(database, threads): + """ + Downloads a specified AMP (antimicrobial peptide) reference database based on the + provided database name and saves it to the specified directory. + This supports downloading databases only from DRAMP, APD, and UniRef100. + Parameters: + ---------- + db : str + The directory path where the downloaded database should be saved. + database : str + The name of the database to download. Must be one of 'DRAMP', 'APD', or 'UniRef100'. + threads : int + Number of threads to use when downloading the UniRef100 database with `mmseqs`. 
+ """ + # Check which database was given + if database == 'DRAMP': + # Create dir + db = 'amp_DRAMP_database' + os.makedirs(db, exist_ok=True) + # Download the file + try: + url = 'http://dramp.cpu-bioinfor.org/downloads/download.php?filename=download_data/DRAMP3.0_new/general_amps.txt' + response = requests.get(url, allow_redirects=True) + response.raise_for_status() # Check for any download errors + date = datetime.now().strftime("%Y_%m_%d") + with open(db + '/' + f'general_amps_{date}.txt', 'wb') as file: + file.write(response.content) + print(f"File downloaded successfully and saved to {db}/general_amps_{date}.txt") + # Create fasta version and clean it + db_df = pd.read_csv(f'{db}/general_amps_{date}.txt', sep='\t') + records = [] + valid_sequence_pattern = re.compile("^[ACDEFGHIKLMNPQRSTVWY]+$") + for index, row in db_df.iterrows(): + sequence = row['Sequence'] + if valid_sequence_pattern.match(sequence): + record = SeqRecord(Seq(sequence), id=str(row['DRAMP_ID']), description="") + records.append(record) + output_file = f'{db}/general_amps_{date}.fasta' + SeqIO.write(records, output_file, "fasta") + except requests.exceptions.RequestException as e: + print(f"Failed to download DRAMP AMP general database file: {e}") + return + + if database == 'APD': + # Create dir + db = 'amp_APD_database' + os.makedirs(db, exist_ok=True) + # Download the file + try: + url = 'https://aps.unmc.edu/assets/sequences/APD_sequence_release_09142020.fasta' + response = requests.get(url, allow_redirects=True, verify=False) # Disable SSL verification due to site certificate issue + response.raise_for_status() + content = response.text + print("APD AMP database downloaded successfully.") + except requests.exceptions.RequestException as e: + print(f"Failed to download content: {e}") + return + # Save the content line-by-line exactly as is + try: + with open(db + '/' + 'APD_orig.fasta', 'w') as file: + file.write(content) + with open(f'{db}/APD.fasta', 'w') as output_handle: + 
valid_sequence_pattern = re.compile("^[ACDEFGHIKLMNPQRSTVWY]+$") + for record in SeqIO.parse(f'{db}/APD_orig.fasta', "fasta"): + sequence = str(record.seq) + if valid_sequence_pattern.match(sequence): + SeqIO.write(record, output_handle, "fasta") + os.remove(db + '/' + 'APD_orig.fasta') + print(f"APD AMP database saved successfully to {db}/APD.fasta") + # Fasta to table + headers = [] + sequences = [] + seq_ids = [] + for i, record in enumerate(SeqIO.parse(f'{db}/APD.fasta', "fasta")): + sequence_id = record.description.split('|')[0] + headers.append(record.description) + sequences.append(str(record.seq)) + seq_ids.append(sequence_id) + db_df = pd.DataFrame({ + "APD_ID": seq_ids, + "APD_Description": headers, + "APD_Sequence": sequences}) + db_df.to_csv(f'{db}/APD.txt', sep='\t', index=False, header=True) + os.remove(db + '/' + 'APD.fasta') + # Table to fasta + records = [] + for index, row in db_df.iterrows(): + sequence = row['APD_Sequence'] + record = SeqRecord(Seq(sequence), id=str(row['APD_ID']), description="") + records.append(record) + output_file = f'{db}/APD.fasta' + SeqIO.write(records, output_file, "fasta") + except Exception as e: + print(f"Failed to save APD AMP database: {e}") + + if database == 'UniRef100': + # Create dir + db = 'amp_UniRef100_database' + os.makedirs(db, exist_ok=True) + # Download the file + try: + os.makedirs(f'{db}/mmseqs2', exist_ok=True) + command = f"mmseqs databases UniRef100 {db}/mmseqs2/ref_DB {db}/mmseqs2/tmp --remove-tmp-files true --threads {threads} -v 0" + subprocess.run(command, shell=True, check=True) + print(f"UniRef100 protein database downloaded successfully and saved to {db}/mmseqs2/UniRef100") + except subprocess.CalledProcessError as e: + print(f"Failed to download UniRef100 protein database: {e}") +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Downloads a specified AMP (antimicrobial peptide) reference database based on the provided database name and saves it to the specified 
directory.") + parser.add_argument("--database_id", dest="database", type=str, required=True, choices=["DRAMP", "APD", "UniRef100"], + help="Database ID - one of DRAMP, APD, or UniRef100. This parameter is required.") + parser.add_argument("--threads", type=int, default=4, + help="Number of threads supplied to mmseqs databases. Only relevant in the case of 'UniRef100'. Default is 4.") -download_DRAMP("amp_ref_database") + args = parser.parse_args() + download_ref_db(args.database, args.threads) diff --git a/conf/base.config b/conf/base.config index b16699af..3133bc74 100644 --- a/conf/base.config +++ b/conf/base.config @@ -222,6 +222,8 @@ process { withName: AMPCOMBI2_PARSETABLES { memory = { 8.GB * task.attempt } time = { 2.h * task.attempt } + errorStrategy = { task.exitStatus == 1 ? 'retry' : 'finish' } + maxRetries = 2 // Retry the process up to 2 times } withName: AMPCOMBI2_CLUSTER { diff --git a/conf/modules.config b/conf/modules.config index b8c8f747..59b5738f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -674,9 +674,9 @@ process { ] } - withName: DRAMP_DOWNLOAD { + withName: AMP_DATABASE_DOWNLOAD { publishDir = [ - path: { "${params.outdir}/databases/dramp" }, + path: { "${params.outdir}/databases/${params.amp_ampcombi_db}" }, mode: params.publish_dir_mode, enabled: params.save_db, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } diff --git a/docs/output.md b/docs/output.md index 9f71278a..577f0ae8 100644 --- a/docs/output.md +++ b/docs/output.md @@ -457,15 +457,15 @@ Note that filtered FASTA is only used for BGC workflow for run-time optimisation - `Ampcombi_parse_tables.log`: log file containing the run information from AMPcombi submodule `ampcombi2/parsetables` - `Ampcombi_complete.log`: log file containing the run information from AMPcombi submodule `ampcombi2/complete` - `Ampcombi_summary_cluster.tsv`: tab-separated table containing the clustered AMP hits. 
This is the output given when the taxonomic classification is not activated (pipeline default). - - `Ampcombi_summary_cluster_representative_seq.tsv`: tab-separated table containing the representative sequence of each cluster. This can be used in AMPcombi for constructing 3D structures using ColabFold. For more details on how to do this, please refer to the [AMPcombi documentation](https://github.com/Darcy220606/AMPcombi/blob/main/README.md). + - `Ampcombi_summary_cluster_representative_seq.tsv`: tab-separated table containing the representative sequence of each cluster. This can be used in AMPcombi for constructing 3D structures using ColabFold. For more details on how to do this, please refer to the [AMPcombi documentation](https://ampcombi.readthedocs.io/en/main/). - `Ampcombi_cluster.log`: log file containing the run information from AMPcombi submodule `ampcombi2/cluster` - `ampcombi_complete_summary_taxonomy.tsv.gz`: summarised output from all AMP workflow tools with taxonomic assignment in compressed tsv format. This is the same output as `Ampcombi_summary_cluster.tsv` file but with taxonomic classification of the contig. 
- `/contig_gbks`: contains all the contigs in gbk format that an AMP was found on using the custom parameters - `/*_ampcombi.log`: a log file generated by AMPcombi - `/*_ampcombi.tsv`: summarised output in tsv format for each sample - `/*_amp.faa*`: fasta file containing the amino acid sequences for all AMP hits for each sample - - `/*_diamond_matches.txt*`: alignment file generated by DIAMOND for each sample - AMP summary table header descriptions + - `/*_mmseqs_matches.txt*`: alignment file generated by NMseqs2 for each sample + AMP summary table header descriptions using DRAMP as reference database | Table column | Description | | ------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | @@ -477,9 +477,9 @@ Note that filtered FASTA is only used for BGC workflow for run-time optimisation | `prob_amplify` | Probability associated with the AMP prediction using `AMPLIFY` | | `evalue_hmmer` | Expected number of false positives (nonhomologous sequences) with a similar of higher score. 
This stands for how significant the hit is, the lower the evalue, the more significant the hit | | `aa_sequence` | Amino acid sequence that forms part of the contig and is AMP encoding | -| `target_id` | [DRAMP](http://dramp.cpu-bioinfor.org/) ID within the database found to be similar to the predicted AMP by `DIAMOND` alignment | +| `target_id` | [DRAMP](http://dramp.cpu-bioinfor.org/) ID within the database found to be similar to the predicted AMP by `MMseqs2` alignment | | `pident` | Percentage identity of amino acid residues that fully aligned between the `DRAMP` sequence and the predicted AMP sequence | -| `evalue` | Number of alignments of similar or better qualities that can be expected when searching a database of similar size with a random sequence distribution. This is generated by `DIAMOND` alignments using the [DRAMP](http://dramp.cpu-bioinfor.org/) AMP database. The lower the value the more significant that the hit is positive. An e-value of < 0.001 means that the this hit will be found by chance once per 1,0000 queries | +| `evalue` | Number of alignments of similar or better qualities that can be expected when searching a database of similar size with a random sequence distribution. This is generated by `MMseqs2` alignments using the [DRAMP](http://dramp.cpu-bioinfor.org/) AMP database. The lower the value the more significant that the hit is positive. 
An e-value of < 0.001 means that this hit will be found by chance once per 1,000 queries | | `Sequence` | Sequence corresponding to the `DRAMP` ID found to be similar to the predicted AMP sequence | | `Sequence_length` | Number of amino acid residues in the `DRAMP` sequence | | `Name` | Full name of the peptide copied from the database it was uploaded to | @@ -510,7 +510,9 @@ Note that filtered FASTA is only used for BGC workflow for run-time optimisation -[AMPcombi](https://github.com/Darcy220606/AMPcombi) summarizes the results of **antimicrobial peptide (AMP)** prediction tools (ampir, AMPlify, Macrel, and other non-nf-core tools) into a single table and aligns the hits against a reference AMP database for functional and taxonomic classification. It assigns the physiochemical properties (e.g. hydrophobicity, molecular weight) using the [Biopython toolkit](https://github.com/biopython/biopython). Additionally, it clusters the resulting AMP hits from all samples using [MMseqs2](https://github.com/soedinglab/MMseqs2). For further filtering for AMPs with signaling peptides, the output file `Ampcombi_summary_cluster.tsv` or `ampcombi_complete_summary_taxonomy.tsv.gz` can be used downstream as detailed [here](https://github.com/Darcy220606/AMPcombi/blob/main/README.md). +[AMPcombi](https://github.com/Darcy220606/AMPcombi) summarizes the results of **antimicrobial peptide (AMP)** prediction tools (ampir, AMPlify, Macrel, and other non-nf-core supported tools) into a single table and aligns the hits against a reference AMP database for functional, structural and taxonomic classification using [MMseqs2](https://github.com/soedinglab/MMseqs2). It further assigns the physiochemical properties (e.g. hydrophobicity, molecular weight) using the [Biopython toolkit](https://github.com/biopython/biopython) and clusters the resulting AMP hits from all samples using [MMseqs2](https://github.com/soedinglab/MMseqs2). 
To further filter the recovered AMPs using the presence of signaling peptides, the output file `Ampcombi_summary_cluster.tsv` or `ampcombi_complete_summary_taxonomy.tsv.gz` can be used downstream as detailed [here](https://ampcombi.readthedocs.io/en/main/usage.html#signal-peptide). The final tables generated may also be visualized and explored using an interactive [user interface](https://ampcombi.readthedocs.io/en/main/visualization.html). + +AMPcombi interface #### hAMRonization diff --git a/docs/usage.md b/docs/usage.md index 7d220c49..a2fce4f6 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -224,14 +224,35 @@ wget https://github.com/nf-core/funcscan/raw//bin/ampcombi_dow python3 ampcombi_download.py ``` -However, the user can also supply their own custom AMP database by following the guidelines in [AMPcombi](https://github.com/Darcy220606/AMPcombi). +In addition to [DRAMP](http://dramp.cpu-bioinfor.org/), two more reference databases can be used to classify the recovered AMPs in the AMP workflow; [APD](https://aps.unmc.edu/) and [UniRef100](https://academic.oup.com/bioinformatics/article/23/10/1282/197795). Only one database can be used at a time using `--amp_ampcombi_db database_name`. + +However, the user can also supply their own custom AMP database by following the guidelines in [AMPcombi](https://ampcombi.readthedocs.io/en/main/). This can then be passed to the pipeline with: ```bash ---amp_ampcombi_db '/// +--amp_ampcombi_db_dir_path '/// ``` -The contents of the directory should have files such as `*.dmnd` and `*.fasta` in the top level. +The contents of the directory should have files such as `*.fasta` and `*.tsv` in the top level; a fasta file and the corresponding table with structural, functional and (if reported) taxonomic classifications. AMPcombi will then generate the corresponding `mmseqs2` directory, in which all binary files are prepared for downstream alignment of the recovered AMPs with [MMseqs2](https://github.com/soedinglab/MMseqs2). 
These can also be provided by the user by setting up an mmseqs2 compatible database using `mmseqs createdb *.fasta` in a directory called `mmseqs2`. An example file structure for [DRAMP](http://dramp.cpu-bioinfor.org/) used as the reference database: + +```bash +amp_DRAMP_database/ +├── general_amps_2024_11_13.fasta +├── general_amps_2024_11_13.txt +└── mmseqs2 + ├── ref_DB + ├── ref_DB.dbtype + ├── ref_DB_h + ├── ref_DB_h.dbtype + ├── ref_DB_h.index + ├── ref_DB.index + ├── ref_DB.lookup + └── ref_DB.source +``` + +:::note{.fa-whale} +For both [DRAMP](http://dramp.cpu-bioinfor.org/) and [APD](https://aps.unmc.edu/), AMPcombi removes entries that contain any non-amino-acid residues by default. +::: :::warning The pipeline will automatically run Pyrodigal instead of Prodigal if the parameters `--run_annotation_tool prodigal --run_amp_screening` are both provided. diff --git a/modules.json b/modules.json index ec386d66..3dd4848b 100644 --- a/modules.json +++ b/modules.json @@ -12,17 +12,17 @@ }, "ampcombi2/cluster": { "branch": "master", - "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", + "git_sha": "993865fe60cb1569155fbbbe0cee113e1127abaf", "installed_by": ["modules"] }, "ampcombi2/complete": { "branch": "master", - "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", + "git_sha": "993865fe60cb1569155fbbbe0cee113e1127abaf", "installed_by": ["modules"] }, "ampcombi2/parsetables": { "branch": "master", - "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", + "git_sha": "993865fe60cb1569155fbbbe0cee113e1127abaf", "installed_by": ["modules"] }, "ampir": { diff --git a/modules/local/dramp_download.nf b/modules/local/amp_database_download.nf similarity index 50% rename from modules/local/dramp_download.nf rename to modules/local/amp_database_download.nf index 8b7eb2d1..8e2bc05a 100644 --- a/modules/local/dramp_download.nf +++ b/modules/local/amp_database_download.nf @@ -1,22 +1,26 @@ -process DRAMP_DOWNLOAD { +process AMP_DATABASE_DOWNLOAD { label 
'process_single' - conda "bioconda::ampcombi=0.2.2" + conda "bioconda::ampcombi=2.0.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/ampcombi:0.2.2--pyhdfd78af_0': - 'biocontainers/ampcombi:0.2.2--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/ampcombi:2.0.1--pyhdfd78af_0': + 'biocontainers/ampcombi:2.0.1--pyhdfd78af_0' }" + + input: + val database_id output: - path "amp_ref_database/" , emit: db - path "versions.yml" , emit: versions + path "amp_${database_id}_database" , emit: db + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when script: // This script is bundled with the pipeline, in nf-core/funcscan/bin/ """ - mkdir amp_ref_database/ - ampcombi_download.py + ampcombi_download.py \\ + --database_id $database_id \\ + --threads ${task.cpus} cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/ampcombi2/cluster/environment.yml b/modules/nf-core/ampcombi2/cluster/environment.yml index 420c955b..f9c25b04 100644 --- a/modules/nf-core/ampcombi2/cluster/environment.yml +++ b/modules/nf-core/ampcombi2/cluster/environment.yml @@ -4,4 +4,4 @@ channels: - conda-forge - bioconda dependencies: - - "bioconda::ampcombi=0.2.2" + - "bioconda::ampcombi=2.0.1" diff --git a/modules/nf-core/ampcombi2/cluster/main.nf b/modules/nf-core/ampcombi2/cluster/main.nf index 90495dba..98a19a96 100644 --- a/modules/nf-core/ampcombi2/cluster/main.nf +++ b/modules/nf-core/ampcombi2/cluster/main.nf @@ -4,8 +4,8 @@ process AMPCOMBI2_CLUSTER { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/ampcombi:0.2.2--pyhdfd78af_0': - 'biocontainers/ampcombi:0.2.2--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/ampcombi:2.0.1--pyhdfd78af_0': + 'biocontainers/ampcombi:2.0.1--pyhdfd78af_0' }" input: path(summary_file) diff --git a/modules/nf-core/ampcombi2/cluster/tests/main.nf.test b/modules/nf-core/ampcombi2/cluster/tests/main.nf.test deleted file mode 100644 index 49bee6cf..00000000 --- a/modules/nf-core/ampcombi2/cluster/tests/main.nf.test +++ /dev/null @@ -1,65 +0,0 @@ -nextflow_process { - - name "Test Process AMPCOMBI2_CLUSTER" - script "../main.nf" - process "AMPCOMBI2_CLUSTER" - - tag "modules" - tag "modules_nfcore" - tag "ampcombi2" - tag "ampcombi2/cluster" - tag "ampcombi2/complete" - - setup { - run("AMPCOMBI2_COMPLETE") { - script "../../../ampcombi2/complete/main.nf" - process { - """ - input[0] = - [ - file('https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/ampcombi/ampcombi2/sample_1_ampcombi.tsv', checkIfExists: true), - file('https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/ampcombi/ampcombi2/sample_2_ampcombi.tsv', checkIfExists: true) - ] - """ - } - } - } - - test("ampcombi2_cluster - metagenome") { - when { - process { - """ - input[0] = AMPCOMBI2_COMPLETE.out.tsv - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot( - file(process.out.cluster_tsv[0]).readLines()[0].contains("Linear/Cyclic/Branched"), - file(process.out.rep_cluster_tsv[0]).readLines()[0].contains("total_cluster_members"), - process.out.versions).match() } - ) - } - } - - test("ampcombi2_cluster - metagenome - stub") { - options "-stub" - when { - process { - """ - input[0] = AMPCOMBI2_COMPLETE.out.tsv - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - } -} diff --git a/modules/nf-core/ampcombi2/cluster/tests/main.nf.test.snap 
b/modules/nf-core/ampcombi2/cluster/tests/main.nf.test.snap index f4123c76..fd79a83b 100644 --- a/modules/nf-core/ampcombi2/cluster/tests/main.nf.test.snap +++ b/modules/nf-core/ampcombi2/cluster/tests/main.nf.test.snap @@ -4,14 +4,14 @@ true, true, [ - "versions.yml:md5,4e9aa3812bfee6ec22a1b6ccb62de2ca" + "versions.yml:md5,b629089d44775078dce5e664a455422b" ] ], "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" + "nf-test": "0.9.2", + "nextflow": "24.10.2" }, - "timestamp": "2024-04-24T12:11:40.928513749" + "timestamp": "2024-12-03T07:57:01.869983435" }, "ampcombi2_cluster - metagenome - stub": { "content": [ @@ -26,7 +26,7 @@ "Ampcombi_cluster.log:md5,d41d8cd98f00b204e9800998ecf8427e" ], "3": [ - "versions.yml:md5,4e9aa3812bfee6ec22a1b6ccb62de2ca" + "versions.yml:md5,b629089d44775078dce5e664a455422b" ], "cluster_tsv": [ "Ampcombi_summary_cluster.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" @@ -38,14 +38,14 @@ "Ampcombi_summary_cluster_representative_seq.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" ], "versions": [ - "versions.yml:md5,4e9aa3812bfee6ec22a1b6ccb62de2ca" + "versions.yml:md5,b629089d44775078dce5e664a455422b" ] } ], "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" + "nf-test": "0.9.2", + "nextflow": "24.10.2" }, - "timestamp": "2024-04-24T12:12:08.780718892" + "timestamp": "2024-12-03T07:57:23.939137628" } } \ No newline at end of file diff --git a/modules/nf-core/ampcombi2/cluster/tests/tags.yml b/modules/nf-core/ampcombi2/cluster/tests/tags.yml deleted file mode 100644 index 783f4d52..00000000 --- a/modules/nf-core/ampcombi2/cluster/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -ampcombi2/cluster: - - "modules/nf-core/ampcombi2/cluster/**" diff --git a/modules/nf-core/ampcombi2/complete/environment.yml b/modules/nf-core/ampcombi2/complete/environment.yml index 420c955b..f9c25b04 100644 --- a/modules/nf-core/ampcombi2/complete/environment.yml +++ b/modules/nf-core/ampcombi2/complete/environment.yml @@ -4,4 +4,4 @@ channels: - conda-forge - 
bioconda dependencies: - - "bioconda::ampcombi=0.2.2" + - "bioconda::ampcombi=2.0.1" diff --git a/modules/nf-core/ampcombi2/complete/main.nf b/modules/nf-core/ampcombi2/complete/main.nf index 0e4d5d53..98f62347 100644 --- a/modules/nf-core/ampcombi2/complete/main.nf +++ b/modules/nf-core/ampcombi2/complete/main.nf @@ -4,8 +4,8 @@ process AMPCOMBI2_COMPLETE { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/ampcombi:0.2.2--pyhdfd78af_0': - 'biocontainers/ampcombi:0.2.2--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/ampcombi:2.0.1--pyhdfd78af_0': + 'biocontainers/ampcombi:2.0.1--pyhdfd78af_0' }" input: path(summaries) diff --git a/modules/nf-core/ampcombi2/complete/tests/main.nf.test b/modules/nf-core/ampcombi2/complete/tests/main.nf.test deleted file mode 100644 index 176d975f..00000000 --- a/modules/nf-core/ampcombi2/complete/tests/main.nf.test +++ /dev/null @@ -1,56 +0,0 @@ -nextflow_process { - - name "Test Process AMPCOMBI2_COMPLETE" - script "../main.nf" - process "AMPCOMBI2_COMPLETE" - - tag "modules" - tag "modules_nfcore" - tag "ampcombi2" - tag "ampcombi2/complete" - - test("ampcombi2_complete - contigs") { - when { - process { - """ - input[0] = - [ - file('https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/ampcombi/ampcombi2/sample_1_ampcombi.tsv', checkIfExists: true), - file('https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/ampcombi/ampcombi2/sample_2_ampcombi.tsv', checkIfExists: true) - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot( - file(process.out.tsv[0]).readLines()[0].contains("ampir"), - process.out.versions).match() } - ) - } - } - - test("ampcombi2_complete - contigs - stub") { - options "-stub" - when { - process { - """ - input[0] = - [ - 
file('https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/ampcombi/ampcombi2/sample_1_ampcombi.tsv', checkIfExists: true), - file('https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/ampcombi/ampcombi2/sample_2_ampcombi.tsv', checkIfExists: true) - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - } -} diff --git a/modules/nf-core/ampcombi2/complete/tests/main.nf.test.snap b/modules/nf-core/ampcombi2/complete/tests/main.nf.test.snap index cd8fa18f..87435e5b 100644 --- a/modules/nf-core/ampcombi2/complete/tests/main.nf.test.snap +++ b/modules/nf-core/ampcombi2/complete/tests/main.nf.test.snap @@ -6,39 +6,39 @@ "Ampcombi_summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" ], "1": [ - + ], "2": [ - "versions.yml:md5,0aa35e86761a6c160482b8b8dbfc5440" + "versions.yml:md5,bfba0046e0cfa7b0b6d79663823f94c0" ], "log": [ - + ], "tsv": [ "Ampcombi_summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" ], "versions": [ - "versions.yml:md5,0aa35e86761a6c160482b8b8dbfc5440" + "versions.yml:md5,bfba0046e0cfa7b0b6d79663823f94c0" ] } ], "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" + "nf-test": "0.9.2", + "nextflow": "24.10.2" }, - "timestamp": "2024-04-29T11:55:16.030399422" + "timestamp": "2024-12-03T07:57:53.385349848" }, "ampcombi2_complete - contigs": { "content": [ true, [ - "versions.yml:md5,0aa35e86761a6c160482b8b8dbfc5440" + "versions.yml:md5,bfba0046e0cfa7b0b6d79663823f94c0" ] ], "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" + "nf-test": "0.9.2", + "nextflow": "24.10.2" }, - "timestamp": "2024-04-29T11:54:54.334224301" + "timestamp": "2024-12-03T07:57:40.263912946" } -} \ No newline at end of file +} diff --git a/modules/nf-core/ampcombi2/complete/tests/tags.yml b/modules/nf-core/ampcombi2/complete/tests/tags.yml deleted file mode 100644 index f8ac5fee..00000000 --- a/modules/nf-core/ampcombi2/complete/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ 
-ampcombi2/complete: - - "modules/nf-core/ampcombi2/complete/**" diff --git a/modules/nf-core/ampcombi2/parsetables/environment.yml b/modules/nf-core/ampcombi2/parsetables/environment.yml index 420c955b..f9c25b04 100644 --- a/modules/nf-core/ampcombi2/parsetables/environment.yml +++ b/modules/nf-core/ampcombi2/parsetables/environment.yml @@ -4,4 +4,4 @@ channels: - conda-forge - bioconda dependencies: - - "bioconda::ampcombi=0.2.2" + - "bioconda::ampcombi=2.0.1" diff --git a/modules/nf-core/ampcombi2/parsetables/main.nf b/modules/nf-core/ampcombi2/parsetables/main.nf index d779440b..b9d855df 100644 --- a/modules/nf-core/ampcombi2/parsetables/main.nf +++ b/modules/nf-core/ampcombi2/parsetables/main.nf @@ -1,31 +1,33 @@ process AMPCOMBI2_PARSETABLES { - tag "$meta.id" + tag "${meta.id}" label 'process_medium' conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/ampcombi:0.2.2--pyhdfd78af_0': - 'biocontainers/ampcombi:0.2.2--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/ampcombi:2.0.1--pyhdfd78af_0': + 'biocontainers/ampcombi:2.0.1--pyhdfd78af_0' }" input: tuple val(meta), path(amp_input) - path(faa_input) - path(gbk_input) - path(opt_amp_db) + path faa_input + path gbk_input + val opt_amp_db + path opt_amp_db_dir + path opt_interproscan output: - tuple val(meta), path("${meta.id}/") , emit: sample_dir - tuple val(meta), path("${meta.id}/contig_gbks/") , emit: contig_gbks - tuple val(meta), path("${meta.id}/${meta.id}_diamond_matches.txt"), emit: txt - tuple val(meta), path("${meta.id}/${meta.id}_ampcombi.tsv") , emit: tsv - tuple val(meta), path("${meta.id}/${meta.id}_amp.faa") , emit: faa - tuple val(meta), path("${meta.id}/${meta.id}_ampcombi.log") , emit: sample_log, optional:true - tuple val(meta), path("Ampcombi_parse_tables.log") , emit: full_log, optional:true - tuple val(meta), path("amp_ref_database/") , 
emit: results_db, optional:true - tuple val(meta), path("amp_ref_database/*.dmnd") , emit: results_db_dmnd, optional:true - tuple val(meta), path("amp_ref_database/*.clean.fasta") , emit: results_db_fasta, optional:true - tuple val(meta), path("amp_ref_database/*.tsv") , emit: results_db_tsv, optional:true - path "versions.yml" , emit: versions + tuple val(meta), path("${meta.id}/") , emit: sample_dir + tuple val(meta), path("${meta.id}/contig_gbks/") , emit: contig_gbks + tuple val(meta), path("${meta.id}/${meta.id}_mmseqs_matches.tsv") , emit: db_tsv + tuple val(meta), path("${meta.id}/${meta.id}_ampcombi.tsv") , emit: tsv + tuple val(meta), path("${meta.id}/${meta.id}_amp.faa") , emit: faa + tuple val(meta), path("${meta.id}/${meta.id}_ampcombi.log") , emit: sample_log , optional:true + tuple val(meta), path("Ampcombi_parse_tables.log") , emit: full_log , optional:true + tuple val(meta), path("amp_${opt_amp_db}_database/") , emit: db , optional:true + tuple val(meta), path("amp_${opt_amp_db}_database/*.txt") , emit: db_txt , optional:true + tuple val(meta), path("amp_${opt_amp_db}_database/*.fasta") , emit: db_fasta , optional:true + tuple val(meta), path("amp_${opt_amp_db}_database/mmseqs2/") , emit: db_mmseqs , optional:true + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -33,16 +35,20 @@ process AMPCOMBI2_PARSETABLES { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - def db = opt_amp_db? "--amp_database $opt_amp_db": "" + def db_dir = opt_amp_db_dir ? "--amp_database_dir ${opt_amp_db_dir}" : "" + def interpro = opt_interproscan ? 
"--interproscan_output ${opt_interproscan}" : "" + """ ampcombi parse_tables \\ - --path_list '${amp_input.collect{"$it"}.join("' '")}' \\ - --faa ${faa_input} \\ - --gbk ${gbk_input} \\ - --sample_list ${prefix} \\ - ${db} \\ - $args \\ - --threads ${task.cpus} + --path_list '${amp_input.collect { "${it}" }.join("' '")}' \\ + --faa ${faa_input} \\ + --gbk ${gbk_input} \\ + --sample_list ${prefix} \\ + --amp_database ${opt_amp_db} \\ + ${db_dir} \\ + ${interpro} \\ + ${args} \\ + --threads ${task.cpus} cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -53,20 +59,30 @@ process AMPCOMBI2_PARSETABLES { stub: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - def db = opt_amp_db? "--amp_database $opt_amp_db": "" + def db_dir = opt_amp_db_dir ? "--amp_database_dir ${opt_amp_db_dir}" : "" + def interpro = opt_interproscan ? "--interproscan_output ${opt_interproscan}" : "" + """ mkdir -p ${prefix} mkdir -p ${prefix}/contig_gbks - touch ${prefix}/${meta.id}_diamond_matches.txt + touch ${prefix}/${meta.id}_mmseqs_matches.tsv touch ${prefix}/${meta.id}_ampcombi.tsv touch ${prefix}/${meta.id}_amp.faa touch ${prefix}/${meta.id}_ampcombi.log touch Ampcombi_parse_tables.log - mkdir -p amp_ref_database - touch amp_ref_database/*.dmnd - touch amp_ref_database/*.clean.fasta - touch amp_ref_database/*.tsv + mkdir -p amp_${opt_amp_db}_database + mkdir -p amp_${opt_amp_db}_database/mmseqs2 + touch amp_${opt_amp_db}_database/*.fasta + touch amp_${opt_amp_db}_database/*.txt + touch amp_${opt_amp_db}_database/mmseqs2/ref_DB + touch amp_${opt_amp_db}_database/mmseqs2/ref_DB.dbtype + touch amp_${opt_amp_db}_database/mmseqs2/ref_DB_h + touch amp_${opt_amp_db}_database/mmseqs2/ref_DB_h.dbtype + touch amp_${opt_amp_db}_database/mmseqs2/ref_DB_h.index + touch amp_${opt_amp_db}_database/mmseqs2/ref_DB.index + touch amp_${opt_amp_db}_database/mmseqs2/ref_DB.lookup + touch amp_${opt_amp_db}_database/mmseqs2/ref_DB.source cat <<-END_VERSIONS > versions.yml 
"${task.process}": diff --git a/modules/nf-core/ampcombi2/parsetables/meta.yml b/modules/nf-core/ampcombi2/parsetables/meta.yml index 7159b117..14a0fd02 100644 --- a/modules/nf-core/ampcombi2/parsetables/meta.yml +++ b/modules/nf-core/ampcombi2/parsetables/meta.yml @@ -16,12 +16,14 @@ keywords: - ampgram - amptransformer - DRAMP + - MMseqs2 + - InterProScan tools: - ampcombi2/parsetables: description: "A parsing tool to convert and summarise the outputs from multiple AMP detection tools in a standardized format." homepage: "https://github.com/Darcy220606/AMPcombi" - documentation: "https://github.com/Darcy220606/AMPcombi" + documentation: "https://ampcombi.readthedocs.io/en/main/" tool_dev_url: "https://github.com/Darcy220606/AMPcombi/tree/dev" licence: ["MIT"] identifier: "" @@ -52,9 +54,17 @@ input: name. pattern: "*.gbk" - - opt_amp_db: + type: string + description: The name of the database to download and set up. This can either be 'DRAMP', 'APD' or 'UniRef100'. + pattern: "DRAMP|APD|UniRef100" + - - opt_amp_db_dir: type: directory description: The path to the folder containing the fasta and tsv database files. - pattern: "*/" + pattern: "path/to/amp_*_database" + - - opt_interproscan: + type: directory + description: A path to a file corresponding to the respective tsv files containing protein classifications of the annotated CDSs. The file must be the raw output from InterProScan. + pattern: "*.tsv" output: - sample_dir: - meta: @@ -78,17 +88,17 @@ output: description: The output subdirectory that contains the gbk files containing the AMP hits for each sample. pattern: "/*/contig_gbks" - - txt: + - db_tsv: - meta: type: map description: | Groovy Map containing sample information e.g. 
`[ id:'sample1', single_end:false ]` - - ${meta.id}/${meta.id}_diamond_matches.txt: + - ${meta.id}/${meta.id}_mmseqs_matches.tsv: type: file - description: An alignment file containing the results from the DIAMOND alignment + description: An alignment file containing the results from the MMseqs2 alignment step done on all AMP hits. - pattern: "/*/*_diamond_matches.txt" + pattern: "/*/*_mmseqs_matches.tsv" - tsv: - meta: type: map @@ -134,50 +144,51 @@ output: description: A log file that captures the standard output for the entire process in a log file. Can be activated by `--log`. pattern: "Ampcombi_parse_tables.log" - - results_db: + - db: - meta: type: map description: | Groovy Map containing sample information e.g. `[ id:'sample1', single_end:false ]` - - amp_ref_database/: + - amp_${opt_amp_db}_database/: type: directory - description: If the AMP reference database is not provided by the user using + description: If the AMP reference database ID is not provided by the user using the flag `--amp_database', by default the DRAMP database will be downloaded, filtered and stored in this folder. - pattern: "/amp_ref_database" - - results_db_dmnd: + pattern: "/amp_*_database" + - db_txt: - meta: type: map description: | Groovy Map containing sample information e.g. `[ id:'sample1', single_end:false ]` - - amp_ref_database/*.dmnd: + - amp_${opt_amp_db}_database/*.txt: type: file - description: AMP reference database converted to DIAMOND database format. - pattern: "/amp_ref_database/*.dmnd" - - results_db_fasta: + description: AMP reference database in tsv-format with two columns containing + header and sequence. + pattern: "/amp_*_database/*.txt" + - db_fasta: - meta: type: map description: | Groovy Map containing sample information e.g. 
`[ id:'sample1', single_end:false ]` - - amp_ref_database/*.clean.fasta: + - amp_${opt_amp_db}_database/*.fasta: type: file - description: AMP reference database fasta file, cleaned of diamond-uncompatible + description: AMP reference database fasta file, cleaned of incompatible characters. - pattern: "/amp_ref_database/*.clean.fasta" - - results_db_tsv: + pattern: "/amp_*_database/*.fasta" + - db_mmseqs: - meta: type: map description: | Groovy Map containing sample information e.g. `[ id:'sample1', single_end:false ]` - - amp_ref_database/*.tsv: - type: file - description: AMP reference database in tsv-format with two columns containing - header and sequence. - pattern: "/amp_ref_database/*.tsv" + - amp_${opt_amp_db}_database/mmseqs2/: + type: directory + description: As alignment to the reference database is carried out by MMseqs2, this directory + contains all the files generated by MMseqs2 on the fasta file of the database. + pattern: "/amp_*_database/mmseqs2" - versions: - versions.yml: type: file diff --git a/modules/nf-core/ampcombi2/parsetables/tests/main.nf.test b/modules/nf-core/ampcombi2/parsetables/tests/main.nf.test index 2d775179..272d31e6 100644 --- a/modules/nf-core/ampcombi2/parsetables/tests/main.nf.test +++ b/modules/nf-core/ampcombi2/parsetables/tests/main.nf.test @@ -28,7 +28,9 @@ nextflow_process { input[0] = amp_input input[1] = faa_input input[2] = gbk_input - input[3] = [] + input[3] = 'DRAMP' + input[4] = [] + input[5] = [] """ } } @@ -37,15 +39,17 @@ nextflow_process { assertAll( { assert process.success }, { assert snapshot(process.out.sample_dir.collect { file(it[1]).getName() } + - process.out.results_db.collect { file(it[1]).getName() } + - process.out.contig_gbks.collect { file(it[1]).getName() } + - process.out.full_log.collect { file(it[1]).readLines().contains("<--AMP_database>") } + - process.out.sample_log.collect { file(it[1]).readLines().contains("found ampir file") } + - process.out.txt.collect { file(it[1]).readLines()[0] } + - 
process.out.tsv.collect { file(it[1]).readLines()[0] } + - process.out.faa.collect { file(it[1]).readLines()[0] } + - process.out.summary_csv.collect { file(it[1]).readLines().contains("Structure_Description") } + - process.out.versions ).match() } + process.out.contig_gbks.collect { file(it[1]).getName() } + + process.out.db_tsv.collect { file(it[1]).readLines()[0] } + + process.out.tsv.collect { file(it[1]).readLines()[0] } + + process.out.faa.collect { file(it[1]).readLines()[0] } + + process.out.full_log.collect { file(it[1]).readLines().contains("File downloaded successfully") } + + process.out.sample_log.collect { file(it[1]).readLines().contains("found ampir file") } + + process.out.db.collect { file(it[1]).getName() } + + process.out.db_txt.collect { file(it[1]).readLines()[0] } + + process.out.db_fasta.collect { file(it[1]).readLines()[0] } + + process.out.db_mmseqs.collect { file(it[1]).getName() } + + process.out.versions ).match() } ) } } @@ -67,7 +71,9 @@ nextflow_process { input[0] = amp_input input[1] = faa_input input[2] = gbk_input - input[3] = [] + input[3] = 'DRAMP' + input[4] = [] + input[5] = [] """ } } diff --git a/modules/nf-core/ampcombi2/parsetables/tests/main.nf.test.snap b/modules/nf-core/ampcombi2/parsetables/tests/main.nf.test.snap index 54faf69f..47102283 100644 --- a/modules/nf-core/ampcombi2/parsetables/tests/main.nf.test.snap +++ b/modules/nf-core/ampcombi2/parsetables/tests/main.nf.test.snap @@ -3,21 +3,24 @@ "content": [ [ "sample_1", - "amp_ref_database", "contig_gbks", + null, + "sample_id\tCDS_id\tprob_ampir\tprob_amplify\taa_sequence\tmolecular_weight\thelix_fraction\tturn_fraction\tsheet_fraction\tisoelectric_point\thydrophobicity\ttransporter_protein\tcontig_id\tCDS_start\tCDS_end\tCDS_dir\tCDS_stop_codon_found", + ">BAONEE_00005", false, true, - "contig_id\ttarget_id\tpident\tevalue\tnident\tfull_qseq\tfull_sseq\tqseq\tsseq\tqcovhsp\tscovhsp", - 
"sample_id\tCDS_id\tprob_ampir\tprob_amplify\taa_sequence\ttarget_id\tpident\tevalue\tSequence\tFamily\tSource\tPDB_ID\tLinear/Cyclic/Branched\tOther_Modifications\tPubmed_ID\tReference\tmolecular_weight\thelix_fraction\tturn_fraction\tsheet_fraction\tisoelectric_point\thydrophobicity\ttransporter_protein\tcontig_id\tCDS_start\tCDS_end\tCDS_dir\tCDS_stop_codon_found", - ">BAONEE_00005", - "versions.yml:md5,f32ab4ba79e66feba755b78d7d7a1f36" + "amp_DRAMP_database", + "DRAMP_ID\tSequence\tSequence_Length\tName\tSwiss_Prot_Entry\tFamily\tGene\tSource\tActivity\tProtein_existence\tStructure\tStructure_Description\tPDB_ID\tComments\tTarget_Organism\tHemolytic_activity\tLinear/Cyclic/Branched\tN-terminal_Modification\tC-terminal_Modification\tOther_Modifications\tStereochemistry\tCytotoxicity\tBinding_Traget\tPubmed_ID\tReference\tAuthor\tTitle", + ">DRAMP00005", + "mmseqs2", + "versions.yml:md5,09f086e07825d96816d792d73eee90ca" ] ], "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" + "nf-test": "0.9.2", + "nextflow": "24.10.2" }, - "timestamp": "2024-04-24T12:05:11.848363584" + "timestamp": "2024-12-11T13:58:57.988191067" }, "ampcombi2_parsetables - metagenome - stub": { "content": [ @@ -34,7 +37,7 @@ "sample_1_amp.faa:md5,d41d8cd98f00b204e9800998ecf8427e", "sample_1_ampcombi.log:md5,d41d8cd98f00b204e9800998ecf8427e", "sample_1_ampcombi.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", - "sample_1_diamond_matches.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + "sample_1_mmseqs_matches.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" ] ] ], @@ -53,18 +56,27 @@ { "id": "sample_1" }, - "*.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + [ + "ref_DB:md5,d41d8cd98f00b204e9800998ecf8427e", + "ref_DB.dbtype:md5,d41d8cd98f00b204e9800998ecf8427e", + "ref_DB.index:md5,d41d8cd98f00b204e9800998ecf8427e", + "ref_DB.lookup:md5,d41d8cd98f00b204e9800998ecf8427e", + "ref_DB.source:md5,d41d8cd98f00b204e9800998ecf8427e", + "ref_DB_h:md5,d41d8cd98f00b204e9800998ecf8427e", + 
"ref_DB_h.dbtype:md5,d41d8cd98f00b204e9800998ecf8427e", + "ref_DB_h.index:md5,d41d8cd98f00b204e9800998ecf8427e" + ] ] ], "11": [ - "versions.yml:md5,f32ab4ba79e66feba755b78d7d7a1f36" + "versions.yml:md5,09f086e07825d96816d792d73eee90ca" ], "2": [ [ { "id": "sample_1" }, - "sample_1_diamond_matches.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + "sample_1_mmseqs_matches.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], "3": [ @@ -105,9 +117,18 @@ "id": "sample_1" }, [ - "*.clean.fasta:md5,d41d8cd98f00b204e9800998ecf8427e", - "*.dmnd:md5,d41d8cd98f00b204e9800998ecf8427e", - "*.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + "*.fasta:md5,d41d8cd98f00b204e9800998ecf8427e", + "*.txt:md5,d41d8cd98f00b204e9800998ecf8427e", + [ + "ref_DB:md5,d41d8cd98f00b204e9800998ecf8427e", + "ref_DB.dbtype:md5,d41d8cd98f00b204e9800998ecf8427e", + "ref_DB.index:md5,d41d8cd98f00b204e9800998ecf8427e", + "ref_DB.lookup:md5,d41d8cd98f00b204e9800998ecf8427e", + "ref_DB.source:md5,d41d8cd98f00b204e9800998ecf8427e", + "ref_DB_h:md5,d41d8cd98f00b204e9800998ecf8427e", + "ref_DB_h.dbtype:md5,d41d8cd98f00b204e9800998ecf8427e", + "ref_DB_h.index:md5,d41d8cd98f00b204e9800998ecf8427e" + ] ] ] ], @@ -116,7 +137,7 @@ { "id": "sample_1" }, - "*.dmnd:md5,d41d8cd98f00b204e9800998ecf8427e" + "*.txt:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], "9": [ @@ -124,7 +145,7 @@ { "id": "sample_1" }, - "*.clean.fasta:md5,d41d8cd98f00b204e9800998ecf8427e" + "*.fasta:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], "contig_gbks": [ @@ -137,56 +158,82 @@ ] ] ], - "faa": [ + "db": [ [ { "id": "sample_1" }, - "sample_1_amp.faa:md5,d41d8cd98f00b204e9800998ecf8427e" + [ + "*.fasta:md5,d41d8cd98f00b204e9800998ecf8427e", + "*.txt:md5,d41d8cd98f00b204e9800998ecf8427e", + [ + "ref_DB:md5,d41d8cd98f00b204e9800998ecf8427e", + "ref_DB.dbtype:md5,d41d8cd98f00b204e9800998ecf8427e", + "ref_DB.index:md5,d41d8cd98f00b204e9800998ecf8427e", + "ref_DB.lookup:md5,d41d8cd98f00b204e9800998ecf8427e", + "ref_DB.source:md5,d41d8cd98f00b204e9800998ecf8427e", + 
"ref_DB_h:md5,d41d8cd98f00b204e9800998ecf8427e", + "ref_DB_h.dbtype:md5,d41d8cd98f00b204e9800998ecf8427e", + "ref_DB_h.index:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] ] ], - "full_log": [ + "db_fasta": [ [ { "id": "sample_1" }, - "Ampcombi_parse_tables.log:md5,d41d8cd98f00b204e9800998ecf8427e" + "*.fasta:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], - "results_db": [ + "db_mmseqs": [ [ { "id": "sample_1" }, [ - "*.clean.fasta:md5,d41d8cd98f00b204e9800998ecf8427e", - "*.dmnd:md5,d41d8cd98f00b204e9800998ecf8427e", - "*.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + "ref_DB:md5,d41d8cd98f00b204e9800998ecf8427e", + "ref_DB.dbtype:md5,d41d8cd98f00b204e9800998ecf8427e", + "ref_DB.index:md5,d41d8cd98f00b204e9800998ecf8427e", + "ref_DB.lookup:md5,d41d8cd98f00b204e9800998ecf8427e", + "ref_DB.source:md5,d41d8cd98f00b204e9800998ecf8427e", + "ref_DB_h:md5,d41d8cd98f00b204e9800998ecf8427e", + "ref_DB_h.dbtype:md5,d41d8cd98f00b204e9800998ecf8427e", + "ref_DB_h.index:md5,d41d8cd98f00b204e9800998ecf8427e" ] ] ], - "results_db_dmnd": [ + "db_tsv": [ + [ + { + "id": "sample_1" + }, + "sample_1_mmseqs_matches.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "db_txt": [ [ { "id": "sample_1" }, - "*.dmnd:md5,d41d8cd98f00b204e9800998ecf8427e" + "*.txt:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], - "results_db_fasta": [ + "faa": [ [ { "id": "sample_1" }, - "*.clean.fasta:md5,d41d8cd98f00b204e9800998ecf8427e" + "sample_1_amp.faa:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], - "results_db_tsv": [ + "full_log": [ [ { "id": "sample_1" }, - "*.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + "Ampcombi_parse_tables.log:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], "sample_dir": [ @@ -201,7 +248,7 @@ "sample_1_amp.faa:md5,d41d8cd98f00b204e9800998ecf8427e", "sample_1_ampcombi.log:md5,d41d8cd98f00b204e9800998ecf8427e", "sample_1_ampcombi.tsv:md5,d41d8cd98f00b204e9800998ecf8427e", - "sample_1_diamond_matches.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + 
"sample_1_mmseqs_matches.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" ] ] ], @@ -221,23 +268,15 @@ "sample_1_ampcombi.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" ] ], - "txt": [ - [ - { - "id": "sample_1" - }, - "sample_1_diamond_matches.txt:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], "versions": [ - "versions.yml:md5,f32ab4ba79e66feba755b78d7d7a1f36" + "versions.yml:md5,09f086e07825d96816d792d73eee90ca" ] } ], "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" + "nf-test": "0.9.2", + "nextflow": "24.10.2" }, - "timestamp": "2024-04-24T12:05:34.675308615" + "timestamp": "2024-12-05T13:03:22.741430379" } } \ No newline at end of file diff --git a/modules/nf-core/ampcombi2/parsetables/tests/nextflow.config b/modules/nf-core/ampcombi2/parsetables/tests/nextflow.config index d39b0509..75396b7d 100644 --- a/modules/nf-core/ampcombi2/parsetables/tests/nextflow.config +++ b/modules/nf-core/ampcombi2/parsetables/tests/nextflow.config @@ -12,7 +12,8 @@ process { "--hmmsearch_file 'candidates.txt'", "--ampgram_file '.tsv'", "--amptransformer_file '.txt'", - "--log true" + "--log true", + "--interproscan_filter 'nonsense'" ].join(' ') ext.prefix = "sample_1" diff --git a/modules/nf-core/ampcombi2/parsetables/tests/tags.yml b/modules/nf-core/ampcombi2/parsetables/tests/tags.yml deleted file mode 100644 index b56b0468..00000000 --- a/modules/nf-core/ampcombi2/parsetables/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -ampcombi2/parsetables: - - "modules/nf-core/ampcombi2/parsetables/**" diff --git a/nextflow.config b/nextflow.config index 49f88337..623c1d36 100644 --- a/nextflow.config +++ b/nextflow.config @@ -119,13 +119,14 @@ params { amp_hmmsearch_savetargets = false amp_hmmsearch_savedomains = false + amp_ampcombi_db_id = 'DRAMP' amp_ampcombi_db = null amp_ampcombi_parsetables_cutoff = 0.6 amp_ampcombi_parsetables_ampir = '.ampir.tsv' amp_ampcombi_parsetables_amplify = '.amplify.tsv' amp_ampcombi_parsetables_macrel = '.macrel.prediction' amp_ampcombi_parsetables_hmmsearch 
= '.hmmer_hmmsearch.txt' - amp_ampcombi_parsetables_aalength = 100 + amp_ampcombi_parsetables_aalength = 120 amp_ampcombi_parsetables_dbevalue = 5 amp_ampcombi_parsetables_hmmevalue = 0.06 amp_ampcombi_parsetables_windowstopcodon = 60 diff --git a/nextflow_schema.json b/nextflow_schema.json index 6b56c8f1..b1a7bf06 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -626,11 +626,18 @@ "description": "Antimicrobial peptides parsing, filtering, and annotating submodule of AMPcombi2. More info: https://github.com/Darcy220606/AMPcombi", "default": "", "properties": { + "amp_ampcombi_db_id": { + "type": "string", + "description": "The name of the database used to classify the AMPs.", + "help_text": "AMPcombi can use three different AMP databases to classify the recovered AMPs. These can either be: \n\n- [DRAMP database](http://dramp.cpu-bioinfor.org/downloads/): Only general AMPs are downloaded and filtered to remove any entry that has an instance of non amino acid residues in their sequence.\n\n- [APD](https://aps.unmc.edu/): Only experimentally validated AMPs are present.\n\n- [UniRef100](https://academic.oup.com/bioinformatics/article/23/10/1282/197795): Combines a more general protein dataset including curated and non curated AMPs. Helpful for identifying the clusters to remove any potential false positives. Beware: If the thresholds for AMPcombi are not strict enough, alignment with this database can take a long time. \n\nBy default this is set to 'DRAMP'. 
Other valid options include 'APD' or 'UniRef100'.\n\nFor more information check the AMPcombi [documentation](https://ampcombi.readthedocs.io/en/main/usage.html#parse-tables).", + "fa_icon": "fas fa-address-book", + "default": "DRAMP", + "enum": ["DRAMP", "APD", "UniRef100"] + }, "amp_ampcombi_db": { "type": "string", - "description": "Path to AMPcombi reference database directory (DRAMP).", - "help_text": "AMPcombi uses the 'general AMPs' dataset of the [DRAMP database](http://dramp.cpu-bioinfor.org/downloads/) for taxonomic classification. If you have a local version of it, you can provide the path to the directory(!) that contains the following reference database files:\n1. fasta file with `.fasta` file extension\n2. the corresponding table with with functional and taxonomic classifications in `.tsv` file extension.\n\nThe contents of the directory should have files such as `*.dmnd` and `*.fasta` in the top level.\n\nFor more information check the AMPcombi [documentation](https://github.com/Darcy220606/AMPcombi).", - "fa_icon": "fas fa-address-book" + "description": "The path to the folder containing the reference database files.", + "help_text": "The path to the folder containing the reference database files (`*.fasta` and `*.tsv`); a fasta file and the corresponding table with structural, functional and if reported taxonomic classifications. AMPcombi will then generate the corresponding `mmseqs2` directory, in which all binary files are prepared for the downstream alignment of the recovered AMPs with [MMseqs2](https://github.com/soedinglab/MMseqs2). 
These can also be provided by the user by setting up an mmseqs2 compatible database using `mmseqs createdb *.fasta` in a directory called `mmseqs2`.\n\nExample file structure for the reference database supplied by the user:\n\n```bash\namp_DRAMP_database/\n\u251c\u2500\u2500 general_amps_2024_11_13.fasta\n\u251c\u2500\u2500 general_amps_2024_11_13.txt\n\u2514\u2500\u2500 mmseqs2\n \u251c\u2500\u2500 ref_DB\n \u251c\u2500\u2500 ref_DB.dbtype\n \u251c\u2500\u2500 ref_DB_h\n \u251c\u2500\u2500 ref_DB_h.dbtype\n \u251c\u2500\u2500 ref_DB_h.index\n \u251c\u2500\u2500 ref_DB.index\n \u251c\u2500\u2500 ref_DB.lookup\n \u2514\u2500\u2500 ref_DB.source\n```\n\nFor more information check the AMPcombi [documentation](https://ampcombi.readthedocs.io/en/main/usage.html#parse-tables)." }, "amp_ampcombi_parsetables_cutoff": { "type": "number", @@ -641,7 +648,7 @@ }, "amp_ampcombi_parsetables_aalength": { "type": "integer", - "default": 100, + "default": 120, "description": "Filter out all amino acid fragments shorter than this number.", "help_text": "Any AMP hit that does not satisfy this length cut-off will be removed from the final AMPcombi2 summary table.\n\n> Modifies tool parameter(s):\n> - AMPCOMBI: `--aminoacid_length`", "fa_icon": "fas fa-ruler-horizontal" diff --git a/subworkflows/local/amp.nf b/subworkflows/local/amp.nf index 88f75393..293692a1 100644 --- a/subworkflows/local/amp.nf +++ b/subworkflows/local/amp.nf @@ -6,7 +6,7 @@ include { MACREL_CONTIGS } from '.. 
include { HMMER_HMMSEARCH as AMP_HMMER_HMMSEARCH } from '../../modules/nf-core/hmmer/hmmsearch/main' include { AMPLIFY_PREDICT } from '../../modules/nf-core/amplify/predict/main' include { AMPIR } from '../../modules/nf-core/ampir/main' -include { DRAMP_DOWNLOAD } from '../../modules/local/dramp_download' +include { AMP_DATABASE_DOWNLOAD } from '../../modules/local/amp_database_download' include { AMPCOMBI2_PARSETABLES } from '../../modules/nf-core/ampcombi2/parsetables' include { AMPCOMBI2_COMPLETE } from '../../modules/nf-core/ampcombi2/complete' include { AMPCOMBI2_CLUSTER } from '../../modules/nf-core/ampcombi2/cluster' @@ -111,13 +111,13 @@ workflow AMP { } if ( params.amp_ampcombi_db != null ) { - AMPCOMBI2_PARSETABLES ( ch_input_for_ampcombi.input, ch_input_for_ampcombi.faa, ch_input_for_ampcombi.gbk, params.amp_ampcombi_db ) - } else { - DRAMP_DOWNLOAD() - ch_versions = ch_versions.mix( DRAMP_DOWNLOAD.out.versions ) - ch_ampcombi_input_db = DRAMP_DOWNLOAD.out.db - AMPCOMBI2_PARSETABLES ( ch_input_for_ampcombi.input, ch_input_for_ampcombi.faa, ch_input_for_ampcombi.gbk, ch_ampcombi_input_db ) - } + AMPCOMBI2_PARSETABLES ( ch_input_for_ampcombi.input, ch_input_for_ampcombi.faa, ch_input_for_ampcombi.gbk, params.amp_ampcombi_db_id, params.amp_ampcombi_db, [] ) + } else { + AMP_DATABASE_DOWNLOAD( params.amp_ampcombi_db_id ) + ch_versions = ch_versions.mix( AMP_DATABASE_DOWNLOAD.out.versions ) + ch_ampcombi_input_db = AMP_DATABASE_DOWNLOAD.out.db + AMPCOMBI2_PARSETABLES ( ch_input_for_ampcombi.input, ch_input_for_ampcombi.faa, ch_input_for_ampcombi.gbk, params.amp_ampcombi_db_id, ch_ampcombi_input_db, [] ) + } ch_versions = ch_versions.mix( AMPCOMBI2_PARSETABLES.out.versions ) ch_ampcombi_summaries = AMPCOMBI2_PARSETABLES.out.tsv.map{ it[1] }.collect()