From 088a94ded0c78f955c8bb4c79f5d71f1d07c2bcd Mon Sep 17 00:00:00 2001 From: Ramprasad Neethiraj <20065894+ramprasadn@users.noreply.github.com> Date: Tue, 16 Jan 2024 19:46:27 +0100 Subject: [PATCH 1/9] update vep --- ...ces_v1.txt => variant_consequences_v2.txt} | 8 +- conf/modules/annotate_genome_snvs.config | 9 +- conf/modules/annotate_mt_snvs.config | 9 +- .../annotate_structural_variants.config | 4 +- conf/test.config | 9 +- conf/test_one_sample.config | 9 +- modules.json | 5 + .../nf-core/ensemblvep/vep/environment.yml | 7 ++ modules/nf-core/ensemblvep/vep/main.nf | 71 ++++++++++++++ modules/nf-core/ensemblvep/vep/meta.yml | 92 +++++++++++++++++++ nextflow_schema.json | 13 ++- subworkflows/local/annotate_genome_snvs.nf | 14 ++- subworkflows/local/annotate_mt_snvs.nf | 33 ++++--- .../local/annotate_structural_variants.nf | 30 +++--- workflows/raredisease.nf | 25 ++++- 15 files changed, 273 insertions(+), 65 deletions(-) rename assets/{variant_consequences_v1.txt => variant_consequences_v2.txt} (95%) create mode 100644 modules/nf-core/ensemblvep/vep/environment.yml create mode 100644 modules/nf-core/ensemblvep/vep/main.nf create mode 100644 modules/nf-core/ensemblvep/vep/meta.yml diff --git a/assets/variant_consequences_v1.txt b/assets/variant_consequences_v2.txt similarity index 95% rename from assets/variant_consequences_v1.txt rename to assets/variant_consequences_v2.txt index 0893a8b9..effe32b1 100644 --- a/assets/variant_consequences_v1.txt +++ b/assets/variant_consequences_v2.txt @@ -6,12 +6,14 @@ frameshift_variant stop_lost start_lost transcript_amplification +feature_elongation +feature_truncation inframe_insertion inframe_deletion missense_variant protein_altering_variant -splice_region_variant splice_donor_5th_base_variant +splice_region_variant splice_donor_region_variant splice_polypyrimidine_tract_variant incomplete_terminal_codon_variant @@ -26,6 +28,7 @@ non_coding_transcript_exon_variant intron_variant NMD_transcript_variant 
non_coding_transcript_variant +coding_transcript_variant upstream_gene_variant downstream_gene_variant TFBS_ablation @@ -33,7 +36,6 @@ TFBS_amplification TF_binding_site_variant regulatory_region_ablation regulatory_region_amplification -feature_elongation regulatory_region_variant -feature_truncation intergenic_variant +sequence_variant diff --git a/conf/modules/annotate_genome_snvs.config b/conf/modules/annotate_genome_snvs.config index 6697c498..5b1e9928 100644 --- a/conf/modules/annotate_genome_snvs.config +++ b/conf/modules/annotate_genome_snvs.config @@ -79,16 +79,15 @@ process { ext.prefix = { "${vcf.simpleName}_rohann_vcfanno_filter_vep" } ext.args = [ '--dir_plugins vep_cache/Plugins', - '--plugin LoFtool,vep_cache/LoFtool_scores.txt', - '--plugin pLI,vep_cache/pLI_values_107.txt', - '--plugin SpliceAI,snv=vep_cache/spliceai_21_scores_raw_snv_-v1.3-.vcf.gz,indel=vep_cache/spliceai_21_scores_raw_snv_-v1.3-.vcf.gz', - '--plugin MaxEntScan,vep_cache/fordownload,SWA,NCSS', + '--plugin LoFtool,LoFtool_scores.txt', + '--plugin pLI,pLI_values_107.txt', + '--plugin SpliceAI,snv=spliceai_21_scores_raw_snv_-v1.3-.vcf.gz,indel=spliceai_21_scores_raw_snv_-v1.3-.vcf.gz', '--distance 5000', '--buffer_size 20000', '--format vcf --max_sv_size 248956422', '--appris --biotype --cache --canonical --ccds --compress_output bgzip', '--domains --exclude_predicted --force_overwrite', - '--hgvs --humdiv --no_progress --no_stats --numbers', + '--hgvs --humdiv --no_progress --numbers', '--merged --polyphen p --protein --offline --regulatory --sift p --symbol --tsl', '--uniprot --vcf' ].join(' ') diff --git a/conf/modules/annotate_mt_snvs.config b/conf/modules/annotate_mt_snvs.config index 391a3e71..f0e46836 100644 --- a/conf/modules/annotate_mt_snvs.config +++ b/conf/modules/annotate_mt_snvs.config @@ -20,16 +20,15 @@ process { withName: '.*ANNOTATE_MT_SNVS:ENSEMBLVEP_MT' { ext.args = [ '--dir_plugins vep_cache/Plugins', - '--plugin LoFtool,vep_cache/LoFtool_scores.txt', - '--plugin 
pLI,vep_cache/pLI_values_107.txt', - '--plugin SpliceAI,snv=vep_cache/spliceai_21_scores_raw_snv_-v1.3-.vcf.gz,indel=vep_cache/spliceai_21_scores_raw_snv_-v1.3-.vcf.gz', - '--plugin MaxEntScan,vep_cache/fordownload,SWA,NCSS', + '--plugin LoFtool,LoFtool_scores.txt', + '--plugin pLI,pLI_values_107.txt', + '--plugin SpliceAI,snv=spliceai_21_scores_raw_snv_-v1.3-.vcf.gz,indel=spliceai_21_scores_raw_snv_-v1.3-.vcf.gz', '--distance 0', '--buffer_size 20000', '--format vcf --fork 4 --max_sv_size 248956422', '--appris --biotype --cache --canonical --ccds --compress_output bgzip', '--domains --exclude_predicted --force_overwrite', - '--hgvs --humdiv --no_progress --no_stats --numbers', + '--hgvs --humdiv --no_progress --numbers', '--merged --polyphen p --protein --offline --regulatory --sift p --symbol --tsl --vcf', '--uniprot' ].join(' ') diff --git a/conf/modules/annotate_structural_variants.config b/conf/modules/annotate_structural_variants.config index b2ee6218..9f8f5f19 100644 --- a/conf/modules/annotate_structural_variants.config +++ b/conf/modules/annotate_structural_variants.config @@ -46,12 +46,12 @@ process { ext.args = [ '--dir_cache vep_cache', '--dir_plugins vep_cache/Plugins', - '--plugin pLI,vep_cache/pLI_values_107.txt', + '--plugin pLI,pLI_values_107.txt', '--appris --biotype --buffer_size 100 --canonical --cache --ccds', '--compress_output bgzip --distance 5000 --domains', '--exclude_predicted --force_overwrite --format vcf', '--fork 4 --hgvs --humdiv --max_sv_size 248956422 --merged', - '--no_progress --no_stats --numbers --per_gene --polyphen p', + '--no_progress --numbers --per_gene --polyphen p', '--protein --offline --regulatory --sift p', '--symbol --tsl --uniprot --vcf' ].join(' ') diff --git a/conf/test.config b/conf/test.config index d43f319b..5a5fdc39 100644 --- a/conf/test.config +++ b/conf/test.config @@ -56,11 +56,6 @@ params { vcfanno_toml = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/vcfanno_config.toml" 
vep_cache = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/vep_cache_and_plugins.tar.gz" vep_filters = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/hgnc.txt" - vep_cache_version = 107 -} - -process { - withName: '.*FILTERVEP.*' { - container = "docker.io/ensemblorg/ensembl-vep:release_107.0" - } + vep_cache_version = 110 + vep_plugin_files = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/vep_files.csv" } diff --git a/conf/test_one_sample.config b/conf/test_one_sample.config index de8436c8..82795453 100644 --- a/conf/test_one_sample.config +++ b/conf/test_one_sample.config @@ -56,11 +56,6 @@ params { vcfanno_toml = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/vcfanno_config.toml" vep_cache = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/vep_cache_and_plugins.tar.gz" vep_filters = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/hgnc.txt" - vep_cache_version = 107 -} - -process { - withName: '.*FILTERVEP.*' { - container = "docker.io/ensemblorg/ensembl-vep:release_107.0" - } + vep_cache_version = 110 + vep_plugin_files = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/vep_files.csv" } diff --git a/modules.json b/modules.json index 9a1324fd..eac809df 100644 --- a/modules.json +++ b/modules.json @@ -115,6 +115,11 @@ "git_sha": "29984d70aea47d06f0062a1785d76c357dd40ea9", "installed_by": ["modules"] }, + "ensemblvep/vep": { + "branch": "master", + "git_sha": "214d575774c172062924ad3564b4f66655600730", + "installed_by": ["modules"] + }, "expansionhunter": { "branch": "master", "git_sha": "0260e5d22372eae434816d6970dedf3f5adc0053", diff --git a/modules/nf-core/ensemblvep/vep/environment.yml b/modules/nf-core/ensemblvep/vep/environment.yml new file mode 100644 index 00000000..7a127746 --- /dev/null +++ b/modules/nf-core/ensemblvep/vep/environment.yml 
@@ -0,0 +1,7 @@ +name: ensemblvep_vep +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::ensembl-vep=110.0 diff --git a/modules/nf-core/ensemblvep/vep/main.nf b/modules/nf-core/ensemblvep/vep/main.nf new file mode 100644 index 00000000..3a2b7423 --- /dev/null +++ b/modules/nf-core/ensemblvep/vep/main.nf @@ -0,0 +1,71 @@ +process ENSEMBLVEP_VEP { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ensembl-vep:110.0--pl5321h2a3209d_0' : + 'biocontainers/ensembl-vep:110.0--pl5321h2a3209d_0' }" + + input: + tuple val(meta), path(vcf), path(custom_extra_files) + val genome + val species + val cache_version + path cache + tuple val(meta2), path(fasta) + path extra_files + + output: + tuple val(meta), path("*.vcf.gz") , optional:true, emit: vcf + tuple val(meta), path("*.tab.gz") , optional:true, emit: tab + tuple val(meta), path("*.json.gz") , optional:true, emit: json + path "*.summary.html" , emit: report + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def file_extension = args.contains("--vcf") ? 'vcf' : args.contains("--json")? 'json' : args.contains("--tab")? 'tab' : 'vcf' + def compress_cmd = args.contains("--compress_output") ? '' : '--compress_output bgzip' + def prefix = task.ext.prefix ?: "${meta.id}" + def dir_cache = cache ? "\${PWD}/${cache}" : "/.vep" + def reference = fasta ? 
"--fasta $fasta" : "" + """ + vep \\ + -i $vcf \\ + -o ${prefix}.${file_extension}.gz \\ + $args \\ + $compress_cmd \\ + $reference \\ + --assembly $genome \\ + --species $species \\ + --cache \\ + --cache_version $cache_version \\ + --dir_cache $dir_cache \\ + --fork $task.cpus \\ + --stats_file ${prefix}.summary.html \\ + + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ensemblvep: \$( echo \$(vep --help 2>&1) | sed 's/^.*Versions:.*ensembl-vep : //;s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.vcf.gz + touch ${prefix}.tab.gz + touch ${prefix}.json.gz + touch ${prefix}.summary.html + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ensemblvep: \$( echo \$(vep --help 2>&1) | sed 's/^.*Versions:.*ensembl-vep : //;s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/ensemblvep/vep/meta.yml b/modules/nf-core/ensemblvep/vep/meta.yml new file mode 100644 index 00000000..d8ff8d14 --- /dev/null +++ b/modules/nf-core/ensemblvep/vep/meta.yml @@ -0,0 +1,92 @@ +name: ensemblvep_vep +description: Ensembl Variant Effect Predictor (VEP). The output-file-format is controlled through `task.ext.args`. +keywords: + - annotation + - vcf + - json + - tab +tools: + - ensemblvep: + description: | + VEP determines the effect of your variants (SNPs, insertions, deletions, CNVs + or structural variants) on genes, transcripts, and protein sequence, as well as regulatory regions. + homepage: https://www.ensembl.org/info/docs/tools/vep/index.html + documentation: https://www.ensembl.org/info/docs/tools/vep/script/index.html + licence: ["Apache-2.0"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - vcf: + type: file + description: | + vcf to annotate + - custom_extra_files: + type: file + description: | + extra sample-specific files to be used with the `--custom` flag to be configured with ext.args + (optional) + - genome: + type: string + description: | + which genome to annotate with + - species: + type: string + description: | + which species to annotate with + - cache_version: + type: integer + description: | + which version of the cache to annotate with + - cache: + type: file + description: | + path to VEP cache (optional) + - meta2: + type: map + description: | + Groovy Map containing fasta reference information + e.g. [ id:'test' ] + - fasta: + type: file + description: | + reference FASTA file (optional) + pattern: "*.{fasta,fa}" + - extra_files: + type: file + description: | + path to file(s) needed for plugins (optional) +output: + - vcf: + type: file + description: | + annotated vcf (optional) + pattern: "*.ann.vcf.gz" + - tab: + type: file + description: | + tab file with annotated variants (optional) + pattern: "*.ann.tab.gz" + - json: + type: file + description: | + json file with annotated variants (optional) + pattern: "*.ann.json.gz" + - report: + type: file + description: VEP report file + pattern: "*.html" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@maxulysse" + - "@matthdsm" + - "@nvnieuwk" +maintainers: + - "@maxulysse" + - "@matthdsm" + - "@nvnieuwk" diff --git a/nextflow_schema.json b/nextflow_schema.json index 1b1b7641..d29a9320 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -374,6 +374,15 @@ "help_text": "If no directory path is passed, vcf files will not be annotated by vep.", "fa_icon": "fas fa-folder-open" }, + "vep_plugin_files": { + "type": "string", + "exists": true, + "format": "file-path", + "description": "Databases used by both named and custom plugins to annotate variants.", + "fa_icon": "fas 
fa-file-csv", + "help_text": "Path to a CSV file with a `vep_files` header that lists the absolute paths to the databases and their indices used by VEP's named and custom plugins. One line per resource.", + "mimetype": "text/csv" + }, "vep_filters": { "type": "string", "exists": true, @@ -557,10 +566,10 @@ "properties": { "vep_cache_version": { "type": "integer", - "default": 107, + "default": 110, "description": "Specify the version of the VEP cache provided to the `--vep_cache` option.", "fa_icon": "fas fa-align-center", - "enum": [107] + "enum": [107, 110] } } }, diff --git a/subworkflows/local/annotate_genome_snvs.nf b/subworkflows/local/annotate_genome_snvs.nf index 334b7d4b..291d3acd 100644 --- a/subworkflows/local/annotate_genome_snvs.nf +++ b/subworkflows/local/annotate_genome_snvs.nf @@ -11,7 +11,7 @@ include { UPD as UPD_SITES } from '../../modules/nf-core/up include { UPD as UPD_REGIONS } from '../../modules/nf-core/upd/main' include { CHROMOGRAPH as CHROMOGRAPH_SITES } from '../../modules/nf-core/chromograph/main' include { CHROMOGRAPH as CHROMOGRAPH_REGIONS } from '../../modules/nf-core/chromograph/main' -include { ENSEMBLVEP as ENSEMBLVEP_SNV } from '../../modules/local/ensemblvep/main' +include { ENSEMBLVEP_VEP as ENSEMBLVEP_SNV } from '../../modules/nf-core/ensemblvep/vep/main' include { TABIX_BGZIPTABIX as ZIP_TABIX_ROHCALL } from '../../modules/nf-core/tabix/bgziptabix/main' include { TABIX_BGZIPTABIX as ZIP_TABIX_VCFANNO } from '../../modules/nf-core/tabix/bgziptabix/main' include { TABIX_TABIX as TABIX_VEP } from '../../modules/nf-core/tabix/tabix/main' @@ -36,6 +36,7 @@ workflow ANNOTATE_GENOME_SNVS { ch_genome_fasta // channel: [mandatory] [ val(meta), path(fasta) ] ch_gnomad_af // channel: [optional] [ path(tab), path(tbi) ] ch_split_intervals // channel: [mandatory] [ path(intervals) ] + ch_vep_extra_files // channel: [mandatory] [ path(files) ] main: ch_cadd_vcf = Channel.empty() @@ -115,20 +116,23 @@ workflow 
ANNOTATE_GENOME_SNVS { } .set { ch_for_mix } - ch_vep_in = ch_for_mix.selvar.mix(ch_for_mix.cadd) + ch_for_mix.selvar.mix(ch_for_mix.cadd) + .map { meta, vcf -> return [meta, vcf, []]} + .set { ch_vep_in } + // Annotating with ensembl Vep ENSEMBLVEP_SNV( ch_vep_in, - ch_genome_fasta, val_vep_genome, "homo_sapiens", val_vep_cache_version, ch_vep_cache, - [] + ch_genome_fasta, + ch_vep_extra_files ) - ENSEMBLVEP_SNV.out.vcf_gz + ENSEMBLVEP_SNV.out.vcf .map { meta, vcf -> [meta - meta.subMap('scatterid'), vcf] } .set { ch_vep_out } diff --git a/subworkflows/local/annotate_mt_snvs.nf b/subworkflows/local/annotate_mt_snvs.nf index 8f7c24eb..e1ed903a 100644 --- a/subworkflows/local/annotate_mt_snvs.nf +++ b/subworkflows/local/annotate_mt_snvs.nf @@ -2,13 +2,13 @@ // Annotate MT // -include { TABIX_TABIX as TABIX_TABIX_MT } from '../../modules/nf-core/tabix/tabix/main' -include { ENSEMBLVEP as ENSEMBLVEP_MT } from '../../modules/local/ensemblvep/main' -include { HAPLOGREP2_CLASSIFY as HAPLOGREP2_CLASSIFY_MT } from '../../modules/nf-core/haplogrep2/classify/main' -include { VCFANNO as VCFANNO_MT } from '../../modules/nf-core/vcfanno/main' -include { ANNOTATE_CADD } from './annotation/annotate_cadd' -include { TABIX_BGZIPTABIX as ZIP_TABIX_HMTNOTE } from '../../modules/nf-core/tabix/bgziptabix/main' -include { HMTNOTE_ANNOTATE } from '../../modules/nf-core/hmtnote/annotate/main' +include { TABIX_TABIX as TABIX_TABIX_MT } from '../../modules/nf-core/tabix/tabix/main' +include { ENSEMBLVEP_VEP as ENSEMBLVEP_MT } from '../../modules/nf-core/ensemblvep/vep/main' +include { HAPLOGREP2_CLASSIFY as HAPLOGREP2_CLASSIFY_MT } from '../../modules/nf-core/haplogrep2/classify/main' +include { VCFANNO as VCFANNO_MT } from '../../modules/nf-core/vcfanno/main' +include { ANNOTATE_CADD } from './annotation/annotate_cadd' +include { TABIX_BGZIPTABIX as ZIP_TABIX_HMTNOTE } from '../../modules/nf-core/tabix/bgziptabix/main' +include { HMTNOTE_ANNOTATE } from 
'../../modules/nf-core/hmtnote/annotate/main' workflow ANNOTATE_MT_SNVS { take: @@ -22,6 +22,8 @@ workflow ANNOTATE_MT_SNVS { val_vep_genome // string: [mandatory] GRCh37 or GRCh38 val_vep_cache_version // string: [mandatory] 107 ch_vep_cache // channel: [mandatory] [ path(cache) ] + ch_vep_cache // channel: [mandatory] [ path(cache) ] + ch_vep_extra_files // channel: [mandatory] [ path(files) ] main: ch_cadd_vcf = Channel.empty() @@ -49,22 +51,27 @@ workflow ANNOTATE_MT_SNVS { return [it[0], it[2]] } .set { ch_for_mix } - ch_vep_in = ch_for_mix.merged.mix(ch_for_mix.cadd) + + ch_for_mix.merged.mix(ch_for_mix.cadd) + .tap { ch_haplogrep_in } + .map { meta, vcf -> return [meta, vcf, []]} + .set { ch_vep_in } + // Annotating with ensembl Vep ENSEMBLVEP_MT( ch_vep_in, - ch_genome_fasta, val_vep_genome, "homo_sapiens", val_vep_cache_version, ch_vep_cache, - [] + ch_genome_fasta, + ch_vep_extra_files ) // Running vcfanno - TABIX_TABIX_MT(ENSEMBLVEP_MT.out.vcf_gz) - ENSEMBLVEP_MT.out.vcf_gz + TABIX_TABIX_MT(ENSEMBLVEP_MT.out.vcf) + ENSEMBLVEP_MT.out.vcf .join(TABIX_TABIX_MT.out.tbi, failOnMismatch:true, failOnDuplicate:true) .map { meta, vcf, tbi -> return [meta, vcf, tbi, []]} .set { ch_in_vcfanno } @@ -84,7 +91,7 @@ workflow ANNOTATE_MT_SNVS { ch_tbi_out = ZIP_TABIX_HMTNOTE.out.gz_tbi.map{meta, vcf, tbi -> return [meta, tbi] } // Running haplogrep2 - HAPLOGREP2_CLASSIFY_MT(ch_vep_in, "vcf.gz") + HAPLOGREP2_CLASSIFY_MT(ch_haplogrep_in, "vcf.gz") ch_versions = ch_versions.mix(ENSEMBLVEP_MT.out.versions) ch_versions = ch_versions.mix(TABIX_TABIX_MT.out.versions) diff --git a/subworkflows/local/annotate_structural_variants.nf b/subworkflows/local/annotate_structural_variants.nf index 6766a73c..d2d42027 100644 --- a/subworkflows/local/annotate_structural_variants.nf +++ b/subworkflows/local/annotate_structural_variants.nf @@ -2,12 +2,12 @@ // A subworkflow to annotate structural variants. 
// -include { SVDB_QUERY as SVDB_QUERY_DB } from '../../modules/nf-core/svdb/query/main' -include { SVDB_QUERY as SVDB_QUERY_BEDPE } from '../../modules/nf-core/svdb/query/main' -include { PICARD_SORTVCF } from '../../modules/nf-core/picard/sortvcf/main' -include { BCFTOOLS_VIEW } from '../../modules/nf-core/bcftools/view/main' -include { ENSEMBLVEP as ENSEMBLVEP_SV } from '../../modules/local/ensemblvep/main' -include { TABIX_TABIX as TABIX_VEP } from '../../modules/nf-core/tabix/tabix/main' +include { SVDB_QUERY as SVDB_QUERY_DB } from '../../modules/nf-core/svdb/query/main' +include { SVDB_QUERY as SVDB_QUERY_BEDPE } from '../../modules/nf-core/svdb/query/main' +include { PICARD_SORTVCF } from '../../modules/nf-core/picard/sortvcf/main' +include { BCFTOOLS_VIEW } from '../../modules/nf-core/bcftools/view/main' +include { ENSEMBLVEP_VEP as ENSEMBLVEP_SV } from '../../modules/nf-core/ensemblvep/vep/main' +include { TABIX_TABIX as TABIX_VEP } from '../../modules/nf-core/tabix/tabix/main' workflow ANNOTATE_STRUCTURAL_VARIANTS { @@ -20,6 +20,7 @@ workflow ANNOTATE_STRUCTURAL_VARIANTS { ch_vep_cache // channel: [mandatory] [ path(cache) ] ch_genome_fasta // channel: [mandatory] [ val(meta), path(fasta) ] ch_genome_dictionary // channel: [mandatory] [ val(meta), path(dict) ] + ch_vep_extra_files // channel: [mandatory] [ path(files) ] main: ch_versions = Channel.empty() @@ -97,18 +98,21 @@ workflow ANNOTATE_STRUCTURAL_VARIANTS { PICARD_SORTVCF.out.vcf.map { meta, vcf -> return [meta,vcf,[]] }.set { ch_sortvcf } BCFTOOLS_VIEW(ch_sortvcf, [], [], []) + .vcf + .map { meta, vcf -> return [meta, vcf, []]} + .set { ch_vep_in } ENSEMBLVEP_SV( - BCFTOOLS_VIEW.out.vcf, - ch_genome_fasta, + ch_vep_in, val_vep_genome, "homo_sapiens", val_vep_cache_version, ch_vep_cache, - [] + ch_genome_fasta, + ch_vep_extra_files ) - TABIX_VEP (ENSEMBLVEP_SV.out.vcf_gz) + TABIX_VEP (ENSEMBLVEP_SV.out.vcf) ch_versions = ch_versions.mix(SVDB_QUERY_DB.out.versions) ch_versions = 
ch_versions.mix(SVDB_QUERY_BEDPE.out.versions) @@ -118,7 +122,7 @@ workflow ANNOTATE_STRUCTURAL_VARIANTS { ch_versions = ch_versions.mix(TABIX_VEP.out.versions) emit: - vcf_ann = ENSEMBLVEP_SV.out.vcf_gz // channel: [ val(meta), path(vcf) ] - tbi = TABIX_VEP.out.tbi // channel: [ val(meta), path(tbi) ] - versions = ch_versions // channel: [ path(versions.yml) ] + vcf_ann = ENSEMBLVEP_SV.out.vcf // channel: [ val(meta), path(vcf) ] + tbi = TABIX_VEP.out.tbi // channel: [ val(meta), path(tbi) ] + versions = ch_versions // channel: [ path(versions.yml) ] } diff --git a/workflows/raredisease.nf b/workflows/raredisease.nf index dac92ee9..2227303c 100644 --- a/workflows/raredisease.nf +++ b/workflows/raredisease.nf @@ -275,7 +275,7 @@ workflow RAREDISEASE { ch_target_intervals = ch_references.target_intervals ch_variant_catalog = params.variant_catalog ? Channel.fromPath(params.variant_catalog).map { it -> [[id:it[0].simpleName],it]}.collect() : Channel.value([[],[]]) - ch_variant_consequences = Channel.fromPath("$projectDir/assets/variant_consequences_v1.txt", checkIfExists: true).collect() + ch_variant_consequences = Channel.fromPath("$projectDir/assets/variant_consequences_v2.txt", checkIfExists: true).collect() ch_vcfanno_resources = params.vcfanno_resources ? Channel.fromPath(params.vcfanno_resources).splitText().map{it -> it.trim()}.collect() : Channel.value([]) ch_vcf2cytosure_blacklist = params.vcf2cytosure_blacklist ? Channel.fromPath(params.vcf2cytosure_blacklist).collect() @@ -286,6 +286,8 @@ workflow RAREDISEASE { : Channel.value([]) ch_vep_cache = ( params.vep_cache && params.vep_cache.endsWith("tar.gz") ) ? ch_references.vep_resources : ( params.vep_cache ? Channel.fromPath(params.vep_cache).collect() : Channel.value([]) ) + ch_vep_extra_files_unsplit = params.vep_plugin_files ? Channel.fromPath(params.vep_plugin_files).collect() + : Channel.value([]) ch_vep_filters = params.vep_filters ? 
Channel.fromPath(params.vep_filters).collect() : Channel.value([]) ch_versions = ch_versions.mix(ch_references.versions) @@ -298,11 +300,25 @@ workflow RAREDISEASE { ch_svcaller_priority = Channel.value(["tiddit", "manta", "gcnvcaller", "cnvnator"]) } + // Read and store paths in the vep_plugin_files file + ch_vep_extra_files_unsplit.splitCsv ( header:true ) + .map { row -> + f = file(row.vep_files[0]) + if(f.isFile() || f.isDirectory()){ + return [f] + } else { + error("\nVep database file ${f} does not exist.") + } + } + .collect() + .set {ch_vep_extra_files} + // Input QC if (!params.skip_fastqc) { FASTQC (ch_reads) ch_versions = ch_versions.mix(FASTQC.out.versions.first()) } + // CREATE CHROMOSOME BED AND INTERVALS SCATTER_GENOME ( ch_genome_dictionary, @@ -425,7 +441,8 @@ workflow RAREDISEASE { params.vep_cache_version, ch_vep_cache, ch_genome_fasta, - ch_genome_dictionary + ch_genome_dictionary, + ch_vep_extra_files ).set {ch_sv_annotate} ch_versions = ch_versions.mix(ch_sv_annotate.versions) @@ -472,7 +489,8 @@ workflow RAREDISEASE { ch_vep_cache, ch_genome_fasta, ch_gnomad_af, - ch_scatter_split_intervals + ch_scatter_split_intervals, + ch_vep_extra_files ).set {ch_snv_annotate} ch_versions = ch_versions.mix(ch_snv_annotate.versions) @@ -519,6 +537,7 @@ workflow RAREDISEASE { params.genome, params.vep_cache_version, ch_vep_cache, + ch_vep_extra_files ).set {ch_mt_annotate} ch_versions = ch_versions.mix(ch_mt_annotate.versions) From 9434630d71cdb4dac710e7c03935cef0faccd2b7 Mon Sep 17 00:00:00 2001 From: Ramprasad Neethiraj <20065894+ramprasadn@users.noreply.github.com> Date: Tue, 16 Jan 2024 20:05:41 +0100 Subject: [PATCH 2/9] update usage --- docs/usage.md | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 34776196..6735eb07 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -225,7 +225,7 @@ The mandatory and optional parameters for each category are tabulated below. 
| vcfanno_resources2 | vcfanno_lua | | vcfanno_toml3 | vep_filters8 | | vep_cache_version | cadd_resources9 | -| vep_cache4 | | +| vep_cache4 | vep_plugin_files10 | | gnomad_af5 | | | score_config_snv6 | | @@ -233,7 +233,7 @@ The mandatory and optional parameters for each category are tabulated below. 2Path to VCF files and their indices used by vcfanno. Sample file [here](https://github.com/nf-core/test-datasets/blob/raredisease/reference/vcfanno_resources.txt).
3Path to a vcfanno configuration file. Sample file [here](https://github.com/nf-core/test-datasets/blob/raredisease/reference/vcfanno_config.toml).
4 VEP caches can be downloaded [here](https://www.ensembl.org/info/docs/tools/vep/script/vep_cache.html#cache). -VEP plugins and associated files may be installed in the cache directory, and the plugin pLI is mandatory to install. +VEP plugins may be installed in the cache directory, and the plugin pLI is mandatory to install. To supply files required by VEP plugins, use the `vep_plugin_files` parameter. See example cache [here](https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/vep_cache_and_plugins.tar.gz).
5 GnomAD VCF files can be downloaded from [here](https://gnomad.broadinstitute.org/downloads). The option `gnomad_af` expects a tab-delimited file with no header and the following columns: `CHROM POS REF_ALLELE ALT_ALLELE AF`. Sample file [here](https://github.com/nf-core/test-datasets/blob/raredisease/reference/gnomad_reformated.tab.gz).
@@ -241,6 +241,7 @@ no header and the following columns: `CHROM POS REF_ALLELE ALT_ALLELE AF`. Sampl 7Used by GENMOD while modeling the variants. Contains a list of loci that show [reduced penetrance](https://medlineplus.gov/genetics/understanding/inheritance/penetranceexpressivity/) in people. Sample file [here](https://github.com/nf-core/test-datasets/blob/raredisease/reference/reduced_penetrance.tsv).
8 This file contains a list of candidate genes (with [HGNC](https://www.genenames.org/) IDs) that is used to split the variants into canditate variants and research variants. Research variants contain all the variants, while candidate variants are a subset of research variants and are associated with candidate genes. Sample file [here](https://github.com/nf-core/test-datasets/blob/raredisease/reference/hgnc.txt). Not required if --skip_vep_filter is set to true.
9Path to a folder containing cadd annotations. Equivalent of the data/annotations/ folder described [here](https://github.com/kircherlab/CADD-scripts/#manual-installation), and it is used to calculate CADD scores for small indels.
+10A CSV file that describes the files used by VEP's named and custom plugins. Sample file [here](https://github.com/nf-core/test-datasets/blob/raredisease/reference/vep_files.csv).
> NB: We use CADD only to annotate small indels. To annotate SNVs with precomputed CADD scores, pass the file containing CADD scores as a resource to vcfanno instead. Files containing the precomputed CADD scores for SNVs can be downloaded from [here](https://cadd.gs.washington.edu/download) (description: "All possible SNVs of GRCh3<7/8>/hg3<7/8>") @@ -251,22 +252,22 @@ no header and the following columns: `CHROM POS REF_ALLELE ALT_ALLELE AF`. Sampl | genome | reduced_penetrance | | svdb_query_dbs/svdb_query_bedpedbs1 | | | vep_cache_version | vep_filters | -| vep_cache | | +| vep_cache | vep_plugin_files | | score_config_sv | | 1 A CSV file that describes the databases (VCFs or BEDPEs) used by SVDB for annotating structural variants. Sample file [here](https://github.com/nf-core/test-datasets/blob/raredisease/reference/svdb_querydb_files.csv). Information about the column headers can be found [here](https://github.com/J35P312/SVDB#Query). ##### 9. Mitochondrial annotation -| Mandatory | Optional | -| ----------------- | ----------- | -| genome | vep_filters | -| mito_name | | -| vcfanno_resources | | -| vcfanno_toml | | -| vep_cache_version | | -| vep_cache | | -| score_config_mt | | +| Mandatory | Optional | +| ----------------- | ---------------- | +| genome | vep_filters | +| mito_name | vep_plugin_files | +| vcfanno_resources | | +| vcfanno_toml | | +| vep_cache_version | | +| vep_cache | | +| score_config_mt | | #### Run the pipeline From dc4e176116c288a09b735ea7f1416d7711d71793 Mon Sep 17 00:00:00 2001 From: Ramprasad Neethiraj <20065894+ramprasadn@users.noreply.github.com> Date: Tue, 16 Jan 2024 20:21:44 +0100 Subject: [PATCH 3/9] update changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0878e5c8..dda4ef6f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - GATK CNVCaller uses segments instead of 
intervals, filters out "reference" segments between the calls, and fixes a bug with how `ch_readcount_intervals` was handled [#472](https://github.com/nf-core/raredisease/pull/472) - bwa aligner [#474](https://github.com/nf-core/raredisease/pull/474) - Add FOUND_IN tag, which mentions the variant caller that found the mutation, in the INFO column of the vcf files [#471](https://github.com/nf-core/raredisease/pull/471) +- A new parameter `vep_plugin_files` to supply files required by vep plugins [#482](https://github.com/nf-core/raredisease/pull/482) ### `Changed` @@ -42,6 +43,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Changed the name of the parameter from `skip_cnv_calling` to `skip_germlinecnvcaller` [#435](https://github.com/nf-core/raredisease/pull/435) - Check SVDB query input files for existence and correct format [#476](https://github.com/nf-core/raredisease/pull/476) - Change hardcoded platform value to params.platform in align_MT.config [#475](https://github.com/nf-core/raredisease/pull/475) +- Installed the nf-core version of ensemblvep/vep module [#482](https://github.com/nf-core/raredisease/pull/482) ### `Fixed` From de9ff795de76bedf89cec51310dff3124b43cdf9 Mon Sep 17 00:00:00 2001 From: Ramprasad Neethiraj <20065894+ramprasadn@users.noreply.github.com> Date: Tue, 16 Jan 2024 20:30:50 +0100 Subject: [PATCH 4/9] fix lint error --- nextflow.config | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nextflow.config b/nextflow.config index a6551e77..0ff60379 100644 --- a/nextflow.config +++ b/nextflow.config @@ -39,6 +39,9 @@ params { cadd_resources = null platform = 'illumina' + // Annotation + vep_cache_version = 110 + // Bam_qc ngsbits_samplegender_method = 'xy' From 3ee9fed33c5c14af78c88b7e30f76d51f70d9e61 Mon Sep 17 00:00:00 2001 From: Ramprasad Neethiraj <20065894+ramprasadn@users.noreply.github.com> Date: Tue, 16 Jan 2024 20:36:42 +0100 Subject: [PATCH 5/9] fix lint error --- main.nf | 2 +- 
nextflow.config | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/main.nf b/main.nf index 0ebf4e51..d87f9e02 100644 --- a/main.nf +++ b/main.nf @@ -54,7 +54,7 @@ params.vcfanno_toml = WorkflowMain.getGenomeAttribute(params, params.vcfanno_lua = WorkflowMain.getGenomeAttribute(params, 'vcfanno_lua') params.vep_cache = WorkflowMain.getGenomeAttribute(params, 'vep_cache') params.vep_cache_version = WorkflowMain.getGenomeAttribute(params, 'vep_cache_version') - +params.vep_plugin_files = WorkflowMain.getGenomeAttribute(params, 'vep_plugin_files') /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ VALIDATE & PRINT PARAMETER SUMMARY diff --git a/nextflow.config b/nextflow.config index 0ff60379..a6551e77 100644 --- a/nextflow.config +++ b/nextflow.config @@ -39,9 +39,6 @@ params { cadd_resources = null platform = 'illumina' - // Annotation - vep_cache_version = 110 - // Bam_qc ngsbits_samplegender_method = 'xy' From 76fb0d438d800c9be00d721d08fea322709ff2e0 Mon Sep 17 00:00:00 2001 From: Ramprasad Neethiraj <20065894+ramprasadn@users.noreply.github.com> Date: Sat, 20 Jan 2024 10:36:53 +0100 Subject: [PATCH 6/9] merge me --- conf/modules/annotate_mobile_elements.config | 4 +- conf/test.config | 2 +- conf/test_one_sample.config | 2 +- modules/local/ensemblvep/main.nf | 80 ------------------- modules/local/ensemblvep/meta.yml | 73 ----------------- .../local/annotate_mobile_elements.nf | 27 ++++--- workflows/raredisease.nf | 3 +- 7 files changed, 21 insertions(+), 170 deletions(-) delete mode 100644 modules/local/ensemblvep/main.nf delete mode 100644 modules/local/ensemblvep/meta.yml diff --git a/conf/modules/annotate_mobile_elements.config b/conf/modules/annotate_mobile_elements.config index 0e04095c..442652a3 100644 --- a/conf/modules/annotate_mobile_elements.config +++ b/conf/modules/annotate_mobile_elements.config @@ -40,12 +40,12 @@ process { ext.args = { [ '--dir_cache vep_cache', '--dir_plugins 
vep_cache/Plugins', - '--plugin pLI,vep_cache/pLI_values_107.txt', + '--plugin pLI,pLI_values_107.txt', '--appris --biotype --buffer_size 100 --canonical --cache --ccds', '--compress_output bgzip --distance 5000 --domains', '--exclude_predicted --force_overwrite --format vcf', '--fork 4 --hgvs --humdiv --max_sv_size 248956422 --merged', - '--no_progress --no_stats --numbers --per_gene --polyphen p', + '--no_progress --numbers --per_gene --polyphen p', '--protein --offline --regulatory --sift p', '--symbol --tsl --uniprot --vcf' ].join(' ') } diff --git a/conf/test.config b/conf/test.config index 1347680d..fd2f873e 100644 --- a/conf/test.config +++ b/conf/test.config @@ -57,6 +57,6 @@ params { vcfanno_toml = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/vcfanno_config.toml" vep_cache = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/vep_cache_and_plugins.tar.gz" vep_filters = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/hgnc.txt" - vep_cache_version = 110 + vep_cache_version = 107 vep_plugin_files = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/vep_files.csv" } diff --git a/conf/test_one_sample.config b/conf/test_one_sample.config index 12eb9f39..d521a8a3 100644 --- a/conf/test_one_sample.config +++ b/conf/test_one_sample.config @@ -57,6 +57,6 @@ params { vcfanno_toml = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/vcfanno_config.toml" vep_cache = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/vep_cache_and_plugins.tar.gz" vep_filters = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/hgnc.txt" - vep_cache_version = 110 + vep_cache_version = 107 vep_plugin_files = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/vep_files.csv" } diff --git a/modules/local/ensemblvep/main.nf b/modules/local/ensemblvep/main.nf 
deleted file mode 100644 index 81d4191f..00000000 --- a/modules/local/ensemblvep/main.nf +++ /dev/null @@ -1,80 +0,0 @@ -process ENSEMBLVEP { - tag "$meta.id" - label 'process_medium' - - // Exit if running this module with -profile conda / -profile mamba - if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { - error("Local VEP module does not support Conda. Please use Docker / Singularity / Podman instead.") - } - - container "docker.io/ensemblorg/ensembl-vep:release_107.0" - - input: - tuple val(meta), path(vcf) - tuple val(meta2), path(fasta) - val genome - val species - val cache_version - path cache - path extra_files - - output: - tuple val(meta), path("*.vcf") , optional:true, emit: vcf - tuple val(meta), path("*.tab") , optional:true, emit: tab - tuple val(meta), path("*.json") , optional:true, emit: json - tuple val(meta), path("*.vcf.gz") , optional:true, emit: vcf_gz - tuple val(meta), path("*.tab.gz") , optional:true, emit: tab_gz - tuple val(meta), path("*.json.gz"), optional:true, emit: json_gz - path "*.summary.html" , optional:true, emit: report - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def file_extension = args.contains("--vcf") ? 'vcf' : args.contains("--json")? 'json' : args.contains("--tab")? 'tab' : 'vcf' - def compress_out = args.contains("--compress_output") ? '.gz' : '' - def prefix = task.ext.prefix ?: "${meta.id}" - def stats_file = args.contains("--no_stats") ? '' : "--stats_file ${prefix}.summary.html" - def dir_cache = cache ? "\${PWD}/${cache}" : "/.vep" - def reference = fasta ? 
"--fasta $fasta" : "" - - """ - vep \\ - -i $vcf \\ - -o ${prefix}.${file_extension}${compress_out} \\ - $args \\ - $reference \\ - --assembly $genome \\ - --species $species \\ - --cache \\ - --cache_version $cache_version \\ - --dir_cache $dir_cache \\ - --fork $task.cpus \\ - ${stats_file} - - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - ensemblvep: \$( echo \$(vep --help 2>&1) | sed 's/^.*Versions:.*ensembl-vep : //;s/ .*\$//') - END_VERSIONS - """ - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - """ - touch ${prefix}.vcf - touch ${prefix}.tab - touch ${prefix}.json - touch ${prefix}.vcf.gz - touch ${prefix}.tab.gz - touch ${prefix}.json.gz - touch ${prefix}.summary.html - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - ensemblvep: \$( echo \$(vep --help 2>&1) | sed 's/^.*Versions:.*ensembl-vep : //;s/ .*\$//') - END_VERSIONS - """ -} diff --git a/modules/local/ensemblvep/meta.yml b/modules/local/ensemblvep/meta.yml deleted file mode 100644 index a4dde8a6..00000000 --- a/modules/local/ensemblvep/meta.yml +++ /dev/null @@ -1,73 +0,0 @@ -name: ENSEMBLVEP -description: Ensembl Variant Effect Predictor (VEP). The output-file-format is controlled through `task.ext.args`. -keywords: - - annotation -tools: - - ensemblvep: - description: | - VEP determines the effect of your variants (SNPs, insertions, deletions, CNVs - or structural variants) on genes, transcripts, and protein sequence, as well as regulatory regions. - homepage: https://www.ensembl.org/info/docs/tools/vep/index.html - documentation: https://www.ensembl.org/info/docs/tools/vep/script/index.html - licence: ["Apache-2.0"] -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - vcf: - type: file - description: | - vcf to annotate - - genome: - type: value - description: | - which genome to annotate with - - species: - type: value - description: | - which species to annotate with - - cache_version: - type: value - description: | - which version of the cache to annotate with - - cache: - type: file - description: | - path to VEP cache (optional) - - fasta: - type: file - description: | - reference FASTA file (optional) - pattern: "*.{fasta,fa}" - - extra_files: - type: tuple - description: | - path to file(s) needed for plugins (optional) -output: - - vcf: - type: file - description: | - annotated vcf (optional) - pattern: "*.ann.vcf" - - tab: - type: file - description: | - tab file with annotated variants (optional) - pattern: "*.ann.tab" - - json: - type: file - description: | - json file with annotated variants (optional) - pattern: "*.ann.json" - - report: - type: file - description: VEP report file - pattern: "*.html" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - "@maxulysse" diff --git a/subworkflows/local/annotate_mobile_elements.nf b/subworkflows/local/annotate_mobile_elements.nf index 04b15ad2..dc6247bf 100644 --- a/subworkflows/local/annotate_mobile_elements.nf +++ b/subworkflows/local/annotate_mobile_elements.nf @@ -2,13 +2,12 @@ // A subworkflow to annotate structural variants. 
// -include { SVDB_QUERY as SVDB_QUERY_DB } from '../../modules/nf-core/svdb/query/main' -include { PICARD_SORTVCF } from '../../modules/nf-core/picard/sortvcf/main' -include { ENSEMBLVEP as ENSEMBLVEP_ME } from '../../modules/local/ensemblvep/main' -include { ENSEMBLVEP_FILTERVEP as FILTERVEP_ME } from '../../modules/nf-core/ensemblvep/filtervep' -include { BCFTOOLS_VIEW as BCFTOOLS_VIEW_FILTER } from '../../modules/nf-core/bcftools/view/main' -include { TABIX_BGZIPTABIX as BGZIP_TABIX_ME } from '../../modules/nf-core/tabix/bgziptabix/main' - +include { SVDB_QUERY as SVDB_QUERY_DB } from '../../modules/nf-core/svdb/query/main' +include { PICARD_SORTVCF } from '../../modules/nf-core/picard/sortvcf/main' +include { ENSEMBLVEP_VEP as ENSEMBLVEP_ME } from '../../modules/nf-core/ensemblvep/vep/main' +include { ENSEMBLVEP_FILTERVEP as FILTERVEP_ME } from '../../modules/nf-core/ensemblvep/filtervep' +include { BCFTOOLS_VIEW as BCFTOOLS_VIEW_FILTER } from '../../modules/nf-core/bcftools/view/main' +include { TABIX_BGZIPTABIX as BGZIP_TABIX_ME } from '../../modules/nf-core/tabix/bgziptabix/main' include { ANNOTATE_CSQ_PLI as ANNOTATE_CSQ_PLI_ME } from '../../subworkflows/local/annotate_consequence_pli.nf' workflow ANNOTATE_MOBILE_ELEMENTS { @@ -23,6 +22,7 @@ workflow ANNOTATE_MOBILE_ELEMENTS { ch_vep_filters // channel: [mandatory] [ path(vep_filter) ] val_vep_genome // string: [mandatory] GRCh37 or GRCh38 val_vep_cache_version // string: [mandatory] default: 107 + ch_vep_extra_files // channel: [mandatory] [ path(files) ] main: ch_versions = Channel.empty() @@ -54,18 +54,21 @@ workflow ANNOTATE_MOBILE_ELEMENTS { ch_genome_fasta, ch_genome_dictionary ) + .vcf + .map { meta, vcf -> return [meta, vcf, []]} + .set { ch_vep_in } ENSEMBLVEP_ME( - PICARD_SORTVCF.out.vcf, - ch_genome_fasta, + ch_vep_in, val_vep_genome, "homo_sapiens", val_vep_cache_version, ch_vep_cache, - [] + ch_genome_fasta, + ch_vep_extra_files ) - ENSEMBLVEP_ME.out.vcf_gz + ENSEMBLVEP_ME.out.vcf .map { meta, 
vcf -> [ meta, vcf, [] ] } @@ -76,7 +79,7 @@ workflow ANNOTATE_MOBILE_ELEMENTS { BCFTOOLS_VIEW_FILTER.out.vcf .multiMap { meta, vcf -> clinical: [ meta + [ set: "clinical" ], vcf ] - research: [ meta + [ set: "research" ], vcf ] + research: [ meta + [ set: "research" ], vcf ] } .set { ch_clin_research_vcf } diff --git a/workflows/raredisease.nf b/workflows/raredisease.nf index e5781c17..0f2a4fe1 100644 --- a/workflows/raredisease.nf +++ b/workflows/raredisease.nf @@ -647,7 +647,8 @@ workflow RAREDISEASE { ch_variant_consequences, ch_vep_filters, params.genome, - params.vep_cache_version + params.vep_cache_version, + ch_vep_extra_files ) ch_versions = ch_versions.mix(ANNOTATE_MOBILE_ELEMENTS.out.versions) } From 4c6851e0b85cf9ee58e43a1767f47a935da4d243 Mon Sep 17 00:00:00 2001 From: Ramprasad Neethiraj <20065894+ramprasadn@users.noreply.github.com> Date: Mon, 22 Jan 2024 10:01:31 +0100 Subject: [PATCH 7/9] update module --- modules.json | 2 +- modules/nf-core/ensemblvep/vep/main.nf | 5 +- .../nf-core/ensemblvep/vep/tests/main.nf.test | 102 ++++++++++++++++++ .../ensemblvep/vep/tests/nextflow.config | 13 +++ .../ensemblvep/vep/tests/tab.gz.config | 5 + modules/nf-core/ensemblvep/vep/tests/tags.yml | 2 + .../nf-core/ensemblvep/vep/tests/vcf.config | 5 + 7 files changed, 130 insertions(+), 4 deletions(-) create mode 100644 modules/nf-core/ensemblvep/vep/tests/main.nf.test create mode 100644 modules/nf-core/ensemblvep/vep/tests/nextflow.config create mode 100644 modules/nf-core/ensemblvep/vep/tests/tab.gz.config create mode 100644 modules/nf-core/ensemblvep/vep/tests/tags.yml create mode 100644 modules/nf-core/ensemblvep/vep/tests/vcf.config diff --git a/modules.json b/modules.json index eac809df..03000a35 100644 --- a/modules.json +++ b/modules.json @@ -117,7 +117,7 @@ }, "ensemblvep/vep": { "branch": "master", - "git_sha": "214d575774c172062924ad3564b4f66655600730", + "git_sha": "76a0696a60c41c57fc5f6040ac31b11ce5d4d8dd", "installed_by": ["modules"] }, 
"expansionhunter": { diff --git a/modules/nf-core/ensemblvep/vep/main.nf b/modules/nf-core/ensemblvep/vep/main.nf index 3a2b7423..a7fc5ad1 100644 --- a/modules/nf-core/ensemblvep/vep/main.nf +++ b/modules/nf-core/ensemblvep/vep/main.nf @@ -20,7 +20,7 @@ process ENSEMBLVEP_VEP { tuple val(meta), path("*.vcf.gz") , optional:true, emit: vcf tuple val(meta), path("*.tab.gz") , optional:true, emit: tab tuple val(meta), path("*.json.gz") , optional:true, emit: json - path "*.summary.html" , emit: report + path "*.summary.html" , optional:true, emit: report path "versions.yml" , emit: versions when: @@ -45,8 +45,7 @@ process ENSEMBLVEP_VEP { --cache \\ --cache_version $cache_version \\ --dir_cache $dir_cache \\ - --fork $task.cpus \\ - --stats_file ${prefix}.summary.html \\ + --fork $task.cpus cat <<-END_VERSIONS > versions.yml diff --git a/modules/nf-core/ensemblvep/vep/tests/main.nf.test b/modules/nf-core/ensemblvep/vep/tests/main.nf.test new file mode 100644 index 00000000..f072dcab --- /dev/null +++ b/modules/nf-core/ensemblvep/vep/tests/main.nf.test @@ -0,0 +1,102 @@ +nextflow_process { + + name "Test Process ENSEMBLVEP_VEP" + script "modules/nf-core/ensemblvep/vep/main.nf" + process "ENSEMBLVEP_VEP" + config "./nextflow.config" + tag "modules" + tag "modules_nfcore" + tag "ensemblvep" + tag "ensemblvep/vep" + tag "ensemblvep/download" + + + test("test_ensemblvep_vep_fasta_vcf") { + + config "./vcf.config" + + setup { + run("ENSEMBLVEP_DOWNLOAD") { + script "../../download/main.nf" + process { + """ + input[0] = Channel.of([[id:"${params.vep_cache_version}_${params.vep_genome}"], params.vep_genome, params.vep_species, params.vep_cache_version]) + """ + } + } + } + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test' ], // meta map + file(params.test_data['sarscov2']['illumina']['test_vcf'], checkIfExists: true), + [] + ]) + input[1] = params.vep_genome + input[2] = params.vep_species + input[3] = params.vep_cache_version + input[4] = 
ENSEMBLVEP_DOWNLOAD.out.cache.map{ meta, cache -> [cache] } + input[5] = Channel.value([ + [id:"fasta"], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ]) + input[6] = [] + """ + } + } + + then { + assertAll( + {assert process.success}, + {assert path(process.out.vcf.get(0).get(1)).linesGzip.contains("##fileformat=VCFv4.2")} + ) + } + + } + + test("test_ensemblvep_vep_fasta_tab_gz") { + + config "./tab.gz.config" + + setup { + run("ENSEMBLVEP_DOWNLOAD") { + script "../../download/main.nf" + process { + """ + input[0] = Channel.of([[id:"${params.vep_cache_version}_${params.vep_genome}"], params.vep_genome, params.vep_species, params.vep_cache_version]) + """ + } + } + } + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test' ], // meta map + file(params.test_data['sarscov2']['illumina']['test_vcf'], checkIfExists: true), + [] + ]) + input[1] = params.vep_genome + input[2] = params.vep_species + input[3] = params.vep_cache_version + input[4] = ENSEMBLVEP_DOWNLOAD.out.cache.map{ meta, cache -> [cache] } + input[5] = Channel.value([ + [id:"fasta"], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ]) + input[6] = [] + """ + } + } + + then { + assertAll( + {assert process.success}, + {assert path(process.out.tab.get(0).get(1)).linesGzip.contains("## ENSEMBL VARIANT EFFECT PREDICTOR v110.0")} + ) + } + } +} diff --git a/modules/nf-core/ensemblvep/vep/tests/nextflow.config b/modules/nf-core/ensemblvep/vep/tests/nextflow.config new file mode 100644 index 00000000..cfaef733 --- /dev/null +++ b/modules/nf-core/ensemblvep/vep/tests/nextflow.config @@ -0,0 +1,13 @@ +params { + vep_cache_version = "110" + vep_genome = "WBcel235" + vep_species = "caenorhabditis_elegans" +} + +process { + + withName: ENSEMBLVEP_DOWNLOAD { + ext.args = '--AUTO c --CONVERT --NO_BIOPERL --NO_HTSLIB --NO_TEST --NO_UPDATE' + } + +} diff --git a/modules/nf-core/ensemblvep/vep/tests/tab.gz.config 
b/modules/nf-core/ensemblvep/vep/tests/tab.gz.config new file mode 100644 index 00000000..40eb03e5 --- /dev/null +++ b/modules/nf-core/ensemblvep/vep/tests/tab.gz.config @@ -0,0 +1,5 @@ +process { + withName: ENSEMBLVEP_VEP { + ext.args = '--tab --compress_output bgzip' + } +} diff --git a/modules/nf-core/ensemblvep/vep/tests/tags.yml b/modules/nf-core/ensemblvep/vep/tests/tags.yml new file mode 100644 index 00000000..4aa4aa45 --- /dev/null +++ b/modules/nf-core/ensemblvep/vep/tests/tags.yml @@ -0,0 +1,2 @@ +ensemblvep/vep: + - "modules/nf-core/ensemblvep/vep/**" diff --git a/modules/nf-core/ensemblvep/vep/tests/vcf.config b/modules/nf-core/ensemblvep/vep/tests/vcf.config new file mode 100644 index 00000000..ad8955a3 --- /dev/null +++ b/modules/nf-core/ensemblvep/vep/tests/vcf.config @@ -0,0 +1,5 @@ +process { + withName: ENSEMBLVEP_VEP { + ext.args = '--vcf' + } +} From 1b1d6a125bef3a2179f35c5fb93494f54cfc43c4 Mon Sep 17 00:00:00 2001 From: Ramprasad Neethiraj <20065894+ramprasadn@users.noreply.github.com> Date: Mon, 22 Jan 2024 15:04:12 +0100 Subject: [PATCH 8/9] review suggestions --- assets/vep_plugin_files_schema.json | 19 +++++++++++++++++++ nextflow_schema.json | 4 +++- 2 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 assets/vep_plugin_files_schema.json diff --git a/assets/vep_plugin_files_schema.json b/assets/vep_plugin_files_schema.json new file mode 100644 index 00000000..6f728a7b --- /dev/null +++ b/assets/vep_plugin_files_schema.json @@ -0,0 +1,19 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema", + "$id": "https://raw.githubusercontent.com/nf-core/raredisease/master/assets/vep_plugin_files_schema.json", + "title": "Schema for VEP plugin files and their indices", + "description": "Schema for VEP plugin files and their indices", + "type": "array", + "items": { + "type": "object", + "properties": { + "vep_files": { + "type": "string", + "format": "file-path", + "exists": true, + "errorMessage": "Path to vep plugin
files and their indices" + } + }, + "required": ["vep_files"] + } +} diff --git a/nextflow_schema.json b/nextflow_schema.json index 64056370..e1d952f1 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -390,7 +390,9 @@ "description": "Databases used by both named and custom plugins to annotate variants.", "fa_icon": "fas fa-file-csv", "help_text": "Path to a file containing the absolute paths to databases and their indices used by VEP's custom and named plugins resources defined within the vcfanno toml file. One line per resource.", - "mimetype": "text/csv" + "pattern": "^\\S+\\.csv$", + "mimetype": "text/csv", + "schema": "assets/vep_plugin_files_schema.json" }, "vep_filters": { "type": "string", From f1b75d4f8cceaae9968907946f968a721cf18191 Mon Sep 17 00:00:00 2001 From: Ramprasad Neethiraj <20065894+ramprasadn@users.noreply.github.com> Date: Mon, 22 Jan 2024 19:36:54 +0530 Subject: [PATCH 9/9] Apply suggestions from code review Co-authored-by: Anders Jemt --- subworkflows/local/annotate_genome_snvs.nf | 2 +- subworkflows/local/annotate_mobile_elements.nf | 2 +- subworkflows/local/annotate_mt_snvs.nf | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/subworkflows/local/annotate_genome_snvs.nf b/subworkflows/local/annotate_genome_snvs.nf index 291d3acd..9e3d74d6 100644 --- a/subworkflows/local/annotate_genome_snvs.nf +++ b/subworkflows/local/annotate_genome_snvs.nf @@ -117,7 +117,7 @@ workflow ANNOTATE_GENOME_SNVS { .set { ch_for_mix } ch_for_mix.selvar.mix(ch_for_mix.cadd) - .map { meta, vcf -> return [meta, vcf, []]} + .map { meta, vcf -> return [meta, vcf, []] } .set { ch_vep_in } diff --git a/subworkflows/local/annotate_mobile_elements.nf b/subworkflows/local/annotate_mobile_elements.nf index dc6247bf..265ccce0 100644 --- a/subworkflows/local/annotate_mobile_elements.nf +++ b/subworkflows/local/annotate_mobile_elements.nf @@ -55,7 +55,7 @@ workflow ANNOTATE_MOBILE_ELEMENTS { ch_genome_dictionary ) .vcf - .map { meta, vcf -> 
return [meta, vcf, []]} + .map { meta, vcf -> return [meta, vcf, []] } .set { ch_vep_in } ENSEMBLVEP_ME( diff --git a/subworkflows/local/annotate_mt_snvs.nf b/subworkflows/local/annotate_mt_snvs.nf index e1ed903a..e7b8ae6a 100644 --- a/subworkflows/local/annotate_mt_snvs.nf +++ b/subworkflows/local/annotate_mt_snvs.nf @@ -54,7 +54,7 @@ workflow ANNOTATE_MT_SNVS { ch_for_mix.merged.mix(ch_for_mix.cadd) .tap { ch_haplogrep_in } - .map { meta, vcf -> return [meta, vcf, []]} + .map { meta, vcf -> return [meta, vcf, []] } .set { ch_vep_in }