From b00ebb6cb81bce9989f34db969d934585208c5eb Mon Sep 17 00:00:00 2001
From: Toni Hermoso Pulido
Date: Tue, 30 Apr 2024 09:58:21 +0200
Subject: [PATCH] Blastdbcmd new module (#5482)

* starting blastdbcmd

* carry on

* Making it work with simple entry version

* Upgrade to make entry_batch work

* Upgrade to make it work with tests

* adding missing tag

* Removed versions to make it pass the tests in Github

* Make it work with versions and so

* Update modules/nf-core/blast/blastdbcmd/meta.yml

Co-authored-by: James A. Fellows Yates

* Update modules/nf-core/blast/blastdbcmd/meta.yml

Co-authored-by: James A. Fellows Yates

* Move module into two and upgrade according to comments

* upgrade with outfmt forced

* Turn back into one module

* Update modules/nf-core/blast/blastdbcmd/main.nf

Co-authored-by: James A. Fellows Yates

* update tags and stub test

* fix stub

* making it work more widely

* editorcheck error

* addressing comments

* Update modules/nf-core/blast/blastdbcmd/meta.yml

Co-authored-by: James A. Fellows Yates

---------

Co-authored-by: James A. Fellows Yates
---
Review note: main.nf was revised during review (database basename lookup,
identifier quoting, -outfmt detection, assertion message) and meta.yml had a
flag name corrected, so the blob ids on the "index" lines of those two files
no longer match the contents below.

 .../nf-core/blast/blastdbcmd/environment.yml |   7 +
 modules/nf-core/blast/blastdbcmd/main.nf     |  77 +++++++++
 modules/nf-core/blast/blastdbcmd/meta.yml    |  61 +++++++
 .../blast/blastdbcmd/tests/main.nf.test      | 122 ++++++++++++++
 .../blast/blastdbcmd/tests/main.nf.test.snap | 158 ++++++++++++++++++
 .../blast/blastdbcmd/tests/nextflow.config   |   8 +
 .../blastdbcmd/tests/nextflow.txt.config     |   5 +
 .../nf-core/blast/blastdbcmd/tests/tags.yml  |   2 +
 tests/config/test_data.config                |   2 +-
 9 files changed, 441 insertions(+), 1 deletion(-)
 create mode 100644 modules/nf-core/blast/blastdbcmd/environment.yml
 create mode 100644 modules/nf-core/blast/blastdbcmd/main.nf
 create mode 100644 modules/nf-core/blast/blastdbcmd/meta.yml
 create mode 100644 modules/nf-core/blast/blastdbcmd/tests/main.nf.test
 create mode 100644 modules/nf-core/blast/blastdbcmd/tests/main.nf.test.snap
 create mode 100644 modules/nf-core/blast/blastdbcmd/tests/nextflow.config
 create mode 100644 modules/nf-core/blast/blastdbcmd/tests/nextflow.txt.config
 create mode 100644 modules/nf-core/blast/blastdbcmd/tests/tags.yml

diff --git a/modules/nf-core/blast/blastdbcmd/environment.yml b/modules/nf-core/blast/blastdbcmd/environment.yml
new file mode 100644
index 00000000000..b90a0b74587
--- /dev/null
+++ b/modules/nf-core/blast/blastdbcmd/environment.yml
@@ -0,0 +1,7 @@
+name: "blast_blastdbcmd"
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - "bioconda::blast=2.15.0"
diff --git a/modules/nf-core/blast/blastdbcmd/main.nf b/modules/nf-core/blast/blastdbcmd/main.nf
new file mode 100644
index 00000000000..b7541e9f9c9
--- /dev/null
+++ b/modules/nf-core/blast/blastdbcmd/main.nf
@@ -0,0 +1,77 @@
+process BLAST_BLASTDBCMD {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/blast:2.15.0--pl5321h6f7f691_1':
+        'biocontainers/blast:2.15.0--pl5321h6f7f691_1' }"
+
+    input:
+    tuple val(meta) , val(entry), path(entry_batch)
+    tuple val(meta2), path(db)
+
+    output:
+    tuple val(meta), path("*.fasta"), optional: true, emit: fasta
+    tuple val(meta), path("*.txt")  , optional: true, emit: text
+    path "versions.yml"             , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    // Exactly one of `entry` / `entry_batch` is required; this fires when both or neither are set
+    assert (!entry && entry_batch) || (entry && !entry_batch) : "ERROR: You must provide exactly one of entry or entry_batch (not both, not neither)"
+    // Single-quote the identifier so IDs with shell metacharacters (e.g. gi|123|ref|X) reach blastdbcmd verbatim
+    def input = entry ? "-entry '${entry}'" : "-entry_batch ${entry_batch}"
+    // FASTA is the default output; an explicit `-outfmt %f` (bare or quoted) is still FASTA, anything else is text
+    def fasta_outfmt = (args =~ /-outfmt\s+(?:%f|"%f"|'%f')(?:\s|\z)/).find()
+    def extension = args.contains("-outfmt") && !fasta_outfmt ? "txt" : "fasta"
+    """
+    # Resolve the database basename from the staged index files. Alias files (.nal/.pal) are tried
+    # first and only the first hit is kept, so a multi-volume database yields a single -db value.
+    DB=""
+    for ext in nal nhr pal phr
+    do
+        DB=\$(find -L ./ -name "*.\$ext" | sort | head -n 1)
+        if test -n "\$DB"
+        then
+            DB="\${DB%.\$ext}"
+            break
+        fi
+    done
+    if test -z "\$DB"
+    then
+        echo "ERROR: no BLAST database (*.nal, *.nhr, *.pal or *.phr) found in the staged db input" >&2
+        exit 1
+    fi
+
+    blastdbcmd \\
+        -db "\$DB" \\
+        ${args} \\
+        -out ${prefix}.${extension} \\
+        ${input}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        blast: \$(blastdbcmd -version 2>&1 | head -n1 | sed 's/^.*blastdbcmd: //; s/ .*\$//')
+    END_VERSIONS
+    """
+
+    stub:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    // Same extension rule as the script block so the stub emits on the same output channel
+    def fasta_outfmt = (args =~ /-outfmt\s+(?:%f|"%f"|'%f')(?:\s|\z)/).find()
+    def extension = args.contains("-outfmt") && !fasta_outfmt ? "txt" : "fasta"
+    """
+    touch ${prefix}.${extension}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        blast: \$(blastdbcmd -version 2>&1 | head -n1 | sed 's/^.*blastdbcmd: //; s/ .*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/blast/blastdbcmd/meta.yml b/modules/nf-core/blast/blastdbcmd/meta.yml
new file mode 100644
index 00000000000..1c2f90e1ab8
--- /dev/null
+++ b/modules/nf-core/blast/blastdbcmd/meta.yml
@@ -0,0 +1,61 @@
+name: blast_blastdbcmd
+description: Retrieve entries from a BLAST database
+keywords:
+  - fasta
+  - blast
+  - database
+  - retrieval
+  - identifier
+tools:
+  - blast:
+      description: |
+        BLAST finds regions of similarity between biological sequences.
+      homepage: https://blast.ncbi.nlm.nih.gov/Blast.cgi
+      documentation: https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=Blastdocs
+      doi: 10.1016/S0022-2836(05)80360-2
+      licence: ["US-Government-Work"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - entry:
+      type: string
+      description: Entry identifier of sequence in database. It cannot be used along with entry_batch
+  - entry_batch:
+      type: file
+      description: |
+        File with a list of entry identifiers of sequences in database (one identifier per line). It cannot be used along with entry
+  - meta2:
+      type: map
+      description: |
+        Groovy Map containing db information
+        e.g. [ id:'test2', single_end:false ]
+  - db:
+      type: file
+      description: Input BLAST-indexed database
+      pattern: "*.{fa.*,fasta.*}"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - fasta:
+      type: file
+      description: Output fasta file (default format)
+      pattern: "*.{fasta}"
+  - text:
+      type: file
+      description: |
+        Output text file (generic format if fasta not used, i.e. `-outfmt` is supplied to `ext.args`)
+      pattern: "*.{txt}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@toniher"
+maintainers:
+  - "@toniher"
diff --git a/modules/nf-core/blast/blastdbcmd/tests/main.nf.test b/modules/nf-core/blast/blastdbcmd/tests/main.nf.test
new file mode 100644
index 00000000000..26d0397ac19
--- /dev/null
+++ b/modules/nf-core/blast/blastdbcmd/tests/main.nf.test
@@ -0,0 +1,122 @@
+nextflow_process {
+
+    name "Test Process BLAST_BLASTDBCMD"
+    script "../main.nf"
+    process "BLAST_BLASTDBCMD"
+    config "./nextflow.config"
+    tag "modules"
+    tag "modules_nfcore"
+    tag "blast"
+    tag "blast/blastdbcmd"
+    tag "blast/makeblastdb"
+
+    setup {
+        run("BLAST_MAKEBLASTDB") {
+            script "../../makeblastdb/main.nf"
+            process {
+                """
+                input[0] = [ [id:'test2'], file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/proteome.fasta', checkIfExists: true) ]
+                """
+            }
+        }
+    }
+
+
+    test("Should query with a protein identifier against a FASTA DB") {
+
+        when {
+            params {
+                outdir = "$outputDir"
+            }
+            process {
+                """
+                input[0] = [ [id: 'test'], 'ENSSASP00005000002.1', [] ]
+                input[1] = BLAST_MAKEBLASTDB.out.db
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+
+    }
+
+    test("Should query with a protein identifier against a FASTA DB - stub") {
+
+        options '-stub'
+
+        when {
+            params {
+                outdir = "$outputDir"
+            }
+            process {
+                """
+                input[0] = [ [id: 'test'], 'ENSSASP00005000002.1', [] ]
+                input[1] = BLAST_MAKEBLASTDB.out.db
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+
+    }
+
+    test("Should query with a file containing a list of protein identifiers against a FASTA DB") {
+
+        when {
+            params {
+                outdir = "$outputDir"
+            }
+            process {
+                """
+                input[0] = [ [id:'test'], '', file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/db/blast/proteome.list', checkIfExists: true) ]
+                input[1] = BLAST_MAKEBLASTDB.out.db
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+
+    }
+
+    test("Should query with a file containing a list of protein identifiers against a FASTA DB - text file") {
+
+        config "./nextflow.txt.config"
+
+        when {
+            params {
+                outdir = "$outputDir"
+            }
+            process {
+
+                """
+                input[0] = [ [id:'test'], '', file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/db/blast/proteome.list', checkIfExists: true) ]
+                input[1] = BLAST_MAKEBLASTDB.out.db
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+
+    }
+
+}
\ No newline at end of file
diff --git a/modules/nf-core/blast/blastdbcmd/tests/main.nf.test.snap b/modules/nf-core/blast/blastdbcmd/tests/main.nf.test.snap
new file mode 100644
index 00000000000..bcd57489a4b
--- /dev/null
+++ b/modules/nf-core/blast/blastdbcmd/tests/main.nf.test.snap
@@ -0,0 +1,158 @@
+{
+    "Should query with a file containing a list of protein identifiers against a FASTA DB": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "test.fasta:md5,0b209b6d43b3d5a160944227d3eb660b"
+                    ]
+                ],
+                "1": [
+                    
+                ],
+                "2": [
+                    "versions.yml:md5,f57a6202dd1899e5081abda557352926"
+                ],
+                "fasta": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "test.fasta:md5,0b209b6d43b3d5a160944227d3eb660b"
+                    ]
+                ],
+                "text": [
+                    
+                ],
+                "versions": [
+                    "versions.yml:md5,f57a6202dd1899e5081abda557352926"
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.8.4",
+            "nextflow": "23.10.1"
+        },
+        "timestamp": "2024-04-26T17:58:26.342873297"
+    },
+    "Should query with a file containing a list of protein identifiers against a FASTA DB - text file": {
+        "content": [
+            {
+                "0": [
+                    
+                ],
+                "1": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "test.txt:md5,357bc4aac41b649ad48b756ae93943ac"
+                    ]
+                ],
+                "2": [
+                    "versions.yml:md5,f57a6202dd1899e5081abda557352926"
+                ],
+                "fasta": [
+                    
+                ],
+                "text": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "test.txt:md5,357bc4aac41b649ad48b756ae93943ac"
+                    ]
+                ],
+                "versions": [
+                    "versions.yml:md5,f57a6202dd1899e5081abda557352926"
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.8.4",
+            "nextflow": "23.10.1"
+        },
+        "timestamp": "2024-04-26T17:58:37.353763379"
+    },
+    "Should query with a protein identifier against a FASTA DB": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "test.fasta:md5,11a6e8a5cb36e439e6209e8dfe94656a"
+                    ]
+                ],
+                "1": [
+                    
+                ],
+                "2": [
+                    "versions.yml:md5,f57a6202dd1899e5081abda557352926"
+                ],
+                "fasta": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "test.fasta:md5,11a6e8a5cb36e439e6209e8dfe94656a"
+                    ]
+                ],
+                "text": [
+                    
+                ],
+                "versions": [
+                    "versions.yml:md5,f57a6202dd1899e5081abda557352926"
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.8.4",
+            "nextflow": "23.10.1"
+        },
+        "timestamp": "2024-04-26T17:58:04.333482407"
+    },
+    "Should query with a protein identifier against a FASTA DB - stub": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "test.fasta:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "1": [
+                    
+                ],
+                "2": [
+                    "versions.yml:md5,f57a6202dd1899e5081abda557352926"
+                ],
+                "fasta": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "test.fasta:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "text": [
+                    
+                ],
+                "versions": [
+                    "versions.yml:md5,f57a6202dd1899e5081abda557352926"
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.8.4",
+            "nextflow": "23.10.1"
+        },
+        "timestamp": "2024-04-26T17:58:14.318938272"
+    }
+}
\ No newline at end of file
diff --git a/modules/nf-core/blast/blastdbcmd/tests/nextflow.config b/modules/nf-core/blast/blastdbcmd/tests/nextflow.config
new file mode 100644
index 00000000000..51127b00980
--- /dev/null
+++ b/modules/nf-core/blast/blastdbcmd/tests/nextflow.config
@@ -0,0 +1,8 @@
+process {
+    withName: BLAST_MAKEBLASTDB {
+        ext.args = '-dbtype prot -parse_seqids'
+    }
+    withName: BLAST_BLASTDBCMD {
+        ext.args = '-dbtype prot'
+    }
+}
diff --git a/modules/nf-core/blast/blastdbcmd/tests/nextflow.txt.config b/modules/nf-core/blast/blastdbcmd/tests/nextflow.txt.config
new file mode 100644
index 00000000000..b522244378f
--- /dev/null
+++ b/modules/nf-core/blast/blastdbcmd/tests/nextflow.txt.config
@@ -0,0 +1,5 @@
+process {
+    withName: BLAST_BLASTDBCMD {
+        ext.args = '-dbtype prot -outfmt "%a %l"'
+    }
+}
diff --git a/modules/nf-core/blast/blastdbcmd/tests/tags.yml b/modules/nf-core/blast/blastdbcmd/tests/tags.yml
new file mode 100644
index 00000000000..d0a6c8419f5
--- /dev/null
+++ b/modules/nf-core/blast/blastdbcmd/tests/tags.yml
@@ -0,0 +1,2 @@
+blast/blastdbcmd:
+  - "modules/nf-core/blast/blastdbcmd/**"
diff --git a/tests/config/test_data.config b/tests/config/test_data.config
index bf4936592d6..a616fb4aac2 100644
--- a/tests/config/test_data.config
+++ b/tests/config/test_data.config
@@ -227,7 +227,7 @@ params {
                     test_scATAC_2_fastq_gz = "${params.test_data_base}/data/genomics/homo_sapiens/10xgenomics/cellranger-atac/test_scATAC_S1_L001_R2_001.fastq.gz"
                     test_scATAC_3_fastq_gz = "${params.test_data_base}/data/genomics/homo_sapiens/10xgenomics/cellranger-atac/test_scATAC_S1_L001_R3_001.fastq.gz"
                     test_scATAC_I_fastq_gz = "${params.test_data_base}/data/genomics/homo_sapiens/10xgenomics/cellranger-atac/test_scATAC_S1_L001_I1_001.fastq.gz"
-                    
+
                     test_10x_matrix_rna_raw_h5 = "${params.test_data_base}/data/genomics/homo_sapiens/10xgenomics/cellranger/hashing_demultiplexing/438-21-raw_feature_bc_matrix.h5"
                     test_10x_matrix_hto_csv = "${params.test_data_base}/data/genomics/homo_sapiens/10xgenomics/cellranger/hashing_demultiplexing/438_21_raw_HTO.csv"
                 }