diff --git a/CHANGELOG.md b/CHANGELOG.md index db07d7eb..06a3851f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - New workflow for annotating mobile elements [#483](https://github.com/nf-core/raredisease/pull/483) - Added a functionality to subsample mitochondrial alignment, and a new parameter `skip_mt_subsample` to skip the subworkflow [#508](https://github.com/nf-core/raredisease/pull/508). - Chromograph to plot coverage across chromosomes [#507](https://github.com/nf-core/raredisease/pull/507) +- Added two new parameters `variant_consequences_snv` and `variant_consequences_sv` to supply variant consequence files for annotating SNVs and SVs. [#509](https://github.com/nf-core/raredisease/pull/509) ### `Changed` diff --git a/assets/variant_consequences_v2.txt b/assets/variant_consequences_v2.txt deleted file mode 100644 index effe32b1..00000000 --- a/assets/variant_consequences_v2.txt +++ /dev/null @@ -1,41 +0,0 @@ -transcript_ablation -splice_acceptor_variant -splice_donor_variant -stop_gained -frameshift_variant -stop_lost -start_lost -transcript_amplification -feature_elongation -feature_truncation -inframe_insertion -inframe_deletion -missense_variant -protein_altering_variant -splice_donor_5th_base_variant -splice_region_variant -splice_donor_region_variant -splice_polypyrimidine_tract_variant -incomplete_terminal_codon_variant -start_retained_variant -stop_retained_variant -synonymous_variant -coding_sequence_variant -mature_miRNA_variant -5_prime_UTR_variant -3_prime_UTR_variant -non_coding_transcript_exon_variant -intron_variant -NMD_transcript_variant -non_coding_transcript_variant -coding_transcript_variant -upstream_gene_variant -downstream_gene_variant -TFBS_ablation -TFBS_amplification -TF_binding_site_variant -regulatory_region_ablation -regulatory_region_amplification -regulatory_region_variant -intergenic_variant -sequence_variant diff --git 
a/conf/test.config b/conf/test.config index fd2f873e..75e7a92f 100644 --- a/conf/test.config +++ b/conf/test.config @@ -43,7 +43,7 @@ params { intervals_y = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/targetY.interval_list" known_dbsnp = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/dbsnp_-138-.vcf.gz" ml_model = "https://s3.amazonaws.com/sentieon-release/other/SentieonDNAscopeModel1.0.model" - mobile_element_references = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/mobile_element_references.tsv" + mobile_element_references = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/mobile_element_references.tsv" mobile_element_svdb_annotations = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/svdb_querydb_files.csv" reduced_penetrance = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/reduced_penetrance.tsv" score_config_mt = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/rank_model_snv.ini" @@ -55,6 +55,8 @@ params { vcfanno_lua = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/vcfanno_functions.lua" vcfanno_resources = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/vcfanno_resources.txt" vcfanno_toml = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/vcfanno_config.toml" + variant_consequences_snv = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/variant_consequences_v2.txt" + variant_consequences_sv = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/variant_consequences_v2.txt" vep_cache = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/vep_cache_and_plugins.tar.gz" vep_filters = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/hgnc.txt" 
vep_cache_version = 107 diff --git a/conf/test_one_sample.config b/conf/test_one_sample.config index d521a8a3..f54448f8 100644 --- a/conf/test_one_sample.config +++ b/conf/test_one_sample.config @@ -43,7 +43,7 @@ params { intervals_y = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/targetY.interval_list" known_dbsnp = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/dbsnp_-138-.vcf.gz" ml_model = "https://s3.amazonaws.com/sentieon-release/other/SentieonDNAscopeModel1.0.model" - mobile_element_references = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/mobile_element_references.tsv" + mobile_element_references = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/mobile_element_references.tsv" mobile_element_svdb_annotations = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/svdb_querydb_files.csv" reduced_penetrance = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/reduced_penetrance.tsv" score_config_mt = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/rank_model_snv.ini" @@ -55,6 +55,8 @@ params { vcfanno_lua = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/vcfanno_functions.lua" vcfanno_resources = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/vcfanno_resources.txt" vcfanno_toml = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/vcfanno_config.toml" + variant_consequences_snv = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/variant_consequences_v2.txt" + variant_consequences_sv = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/variant_consequences_v2.txt" vep_cache = "https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/vep_cache_and_plugins.tar.gz" vep_filters = 
"https://raw.githubusercontent.com/nf-core/test-datasets/raredisease/reference/hgnc.txt" vep_cache_version = 107 diff --git a/docs/usage.md b/docs/usage.md index 8f1ec601..f75baed9 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -221,15 +221,16 @@ The mandatory and optional parameters for each category are tabulated below. ##### 7. SNV annotation & Ranking -| Mandatory | Optional | -| ----------------------------- | ------------------------------ | -| genome1 | reduced_penetrance7 | -| vcfanno_resources2 | vcfanno_lua | -| vcfanno_toml3 | vep_filters8 | -| vep_cache_version | cadd_resources9 | -| vep_cache4 | vep_plugin_files10 | -| gnomad_af5 | | -| score_config_snv6 | | +| Mandatory | Optional | +| ------------------------------------ | ------------------------------ | +| genome1 | reduced_penetrance8 | +| vcfanno_resources2 | vcfanno_lua | +| vcfanno_toml3 | vep_filters9 | +| vep_cache_version | cadd_resources10 | +| vep_cache4 | vep_plugin_files11 | +| gnomad_af5 | | +| score_config_snv6 | | +| variant_consequences_snv7 | | 1Genome version is used by VEP. You have the option to choose between GRCh37 and GRCh38.
2Path to VCF files and their indices used by vcfanno. Sample file [here](https://github.com/nf-core/test-datasets/blob/raredisease/reference/vcfanno_resources.txt).
@@ -240,10 +241,11 @@ See example cache [here](https://raw.githubusercontent.com/nf-core/test-datasets 5 GnomAD VCF files can be downloaded from [here](https://gnomad.broadinstitute.org/downloads). The option `gnomad_af` expects a tab-delimited file with no header and the following columns: `CHROM POS REF_ALLELE ALT_ALLELE AF`. Sample file [here](https://github.com/nf-core/test-datasets/blob/raredisease/reference/gnomad_reformated.tab.gz).
6Used by GENMOD for ranking the variants. Sample file [here](https://github.com/nf-core/test-datasets/blob/raredisease/reference/rank_model_snv.ini).
-7Used by GENMOD while modeling the variants. Contains a list of loci that show [reduced penetrance](https://medlineplus.gov/genetics/understanding/inheritance/penetranceexpressivity/) in people. Sample file [here](https://github.com/nf-core/test-datasets/blob/raredisease/reference/reduced_penetrance.tsv).
-8 This file contains a list of candidate genes (with [HGNC](https://www.genenames.org/) IDs) that is used to split the variants into canditate variants and research variants. Research variants contain all the variants, while candidate variants are a subset of research variants and are associated with candidate genes. Sample file [here](https://github.com/nf-core/test-datasets/blob/raredisease/reference/hgnc.txt). Not required if --skip_vep_filter is set to true.
-9Path to a folder containing cadd annotations. Equivalent of the data/annotations/ folder described [here](https://github.com/kircherlab/CADD-scripts/#manual-installation), and it is used to calculate CADD scores for small indels.
-10A CSV file that describes the files used by VEP's named and custom plugins. Sample file [here](https://github.com/nf-core/test-datasets/blob/raredisease/reference/vep_files.csv).
+7File containing list of SO terms listed in the order of severity from most severe to least severe for annotating genomic and mitochondrial SNVs. Sample file [here](https://github.com/nf-core/test-datasets/blob/raredisease/reference/variant_consequences_v2.txt). You can learn more about these terms [here](https://grch37.ensembl.org/info/genome/variation/prediction/predicted_data.html). +8Used by GENMOD while modeling the variants. Contains a list of loci that show [reduced penetrance](https://medlineplus.gov/genetics/understanding/inheritance/penetranceexpressivity/) in people. Sample file [here](https://github.com/nf-core/test-datasets/blob/raredisease/reference/reduced_penetrance.tsv).
+9 This file contains a list of candidate genes (with [HGNC](https://www.genenames.org/) IDs) that is used to split the variants into candidate variants and research variants. Research variants contain all the variants, while candidate variants are a subset of research variants and are associated with candidate genes. Sample file [here](https://github.com/nf-core/test-datasets/blob/raredisease/reference/hgnc.txt). Not required if --skip_vep_filter is set to true.
+10Path to a folder containing cadd annotations. Equivalent of the data/annotations/ folder described [here](https://github.com/kircherlab/CADD-scripts/#manual-installation), and it is used to calculate CADD scores for small indels.
+11A CSV file that describes the files used by VEP's named and custom plugins. Sample file [here](https://github.com/nf-core/test-datasets/blob/raredisease/reference/vep_files.csv).
> NB: We use CADD only to annotate small indels. To annotate SNVs with precomputed CADD scores, pass the file containing CADD scores as a resource to vcfanno instead. Files containing the precomputed CADD scores for SNVs can be downloaded from [here](https://cadd.gs.washington.edu/download) (description: "All possible SNVs of GRCh3<7/8>/hg3<7/8>") @@ -256,20 +258,23 @@ no header and the following columns: `CHROM POS REF_ALLELE ALT_ALLELE AF`. Sampl | vep_cache_version | vep_filters | | vep_cache | vep_plugin_files | | score_config_sv | | +| variant_consequences_sv2 | | 1 A CSV file that describes the databases (VCFs or BEDPEs) used by SVDB for annotating structural variants. Sample file [here](https://github.com/nf-core/test-datasets/blob/raredisease/reference/svdb_querydb_files.csv). Information about the column headers can be found [here](https://github.com/J35P312/SVDB#Query). +2 File containing list of SO terms listed in the order of severity from most severe to least severe for annotating genomic SVs. Sample file [here](https://github.com/nf-core/test-datasets/blob/raredisease/reference/variant_consequences_v2.txt). You can learn more about these terms [here](https://grch37.ensembl.org/info/genome/variation/prediction/predicted_data.html). ##### 9. Mitochondrial annotation -| Mandatory | Optional | -| ----------------- | ---------------- | -| genome | vep_filters | -| mito_name | vep_plugin_files | -| vcfanno_resources | | -| vcfanno_toml | | -| vep_cache_version | | -| vep_cache | | -| score_config_mt | | +| Mandatory | Optional | +| ------------------------ | ---------------- | +| genome | vep_filters | +| mito_name | vep_plugin_files | +| vcfanno_resources | | +| vcfanno_toml | | +| vep_cache_version | | +| vep_cache | | +| score_config_mt | | +| variant_consequences_snv | | ##### 10. Mobile element annotation @@ -279,6 +284,7 @@ no header and the following columns: `CHROM POS REF_ALLELE ALT_ALLELE AF`. 
Sampl | mobile_element_svdb_annotations1 | | | vep_cache_version | | | vep_cache | | +| variant_consequences_sv | | 1 A CSV file that describes the databases (VCFs) used by SVDB for annotating mobile elements with allele frequencies. Sample file [here](https://github.com/nf-core/test-datasets/blob/raredisease/reference/svdb_querydb_files.csv). diff --git a/main.nf b/main.nf index 72b342a3..6c38b158 100644 --- a/main.nf +++ b/main.nf @@ -47,6 +47,8 @@ params.sdf = WorkflowMain.getGenomeAttribute(params, params.svdb_query_dbs = WorkflowMain.getGenomeAttribute(params, 'svdb_query_dbs') params.target_bed = WorkflowMain.getGenomeAttribute(params, 'target_bed') params.variant_catalog = WorkflowMain.getGenomeAttribute(params, 'variant_catalog') +params.variant_consequences_snv = WorkflowMain.getGenomeAttribute(params, 'variant_consequences_snv') +params.variant_consequences_sv = WorkflowMain.getGenomeAttribute(params, 'variant_consequences_sv') params.vep_filters = WorkflowMain.getGenomeAttribute(params, 'vep_filters') params.vcf2cytosure_blacklist = WorkflowMain.getGenomeAttribute(params, 'vcf2cytosure_blacklist') params.vcfanno_resources = WorkflowMain.getGenomeAttribute(params, 'vcfanno_resources') diff --git a/nextflow_schema.json b/nextflow_schema.json index 79045f6d..a5b71b42 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -625,6 +625,18 @@ "fa_icon": "fas fa-user-cog", "description": "Options used to facilitate the annotation of the variants.", "properties": { + "variant_consequences_snv": { + "type": "string", + "description": "File containing list of SO terms listed in the order of severity from most severe to least severe for annotating genomic and mitochondrial SNVs.", + "help_text": "For more information check https://grch37.ensembl.org/info/genome/variation/prediction/predicted_data.html", + "fa_icon": "fas fa-file-csv" + }, + "variant_consequences_sv": { + "type": "string", + "description": "File containing list of SO terms listed in the 
order of severity from most severe to least severe for annotating genomic SVs.", + "help_text": "For more information check https://grch37.ensembl.org/info/genome/variation/prediction/predicted_data.html", + "fa_icon": "fas fa-file-csv" + }, "vep_cache_version": { "type": "integer", "default": 110, diff --git a/workflows/raredisease.nf b/workflows/raredisease.nf index 650ca446..69b7f031 100644 --- a/workflows/raredisease.nf +++ b/workflows/raredisease.nf @@ -40,11 +40,11 @@ if (params.run_rtgvcfeval) { if (!params.skip_snv_annotation) { mandatoryParams += ["genome", "vcfanno_resources", "vcfanno_toml", "vep_cache", "vep_cache_version", - "gnomad_af", "score_config_snv"] + "gnomad_af", "score_config_snv", "variant_consequences_snv"] } if (!params.skip_sv_annotation) { - mandatoryParams += ["genome", "vep_cache", "vep_cache_version", "score_config_sv"] + mandatoryParams += ["genome", "vep_cache", "vep_cache_version", "score_config_sv", "variant_consequences_sv"] if (!params.svdb_query_bedpedbs && !params.svdb_query_dbs) { println("params.svdb_query_bedpedbs or params.svdb_query_dbs should be set.") missingParamsCount += 1 @@ -52,7 +52,7 @@ if (!params.skip_sv_annotation) { } if (!params.skip_mt_annotation) { - mandatoryParams += ["genome", "mito_name", "vcfanno_resources", "vcfanno_toml", "vep_cache_version", "vep_cache"] + mandatoryParams += ["genome", "mito_name", "vcfanno_resources", "vcfanno_toml", "vep_cache_version", "vep_cache", "variant_consequences_snv"] } if (params.analysis_type.equals("wes")) { @@ -72,7 +72,7 @@ if (!params.skip_vep_filter) { } if (!params.skip_me_annotation) { - mandatoryParams += ["mobile_element_svdb_annotations"] + mandatoryParams += ["mobile_element_svdb_annotations", "variant_consequences_sv"] } for (param in mandatoryParams.unique()) { @@ -288,7 +288,10 @@ workflow RAREDISEASE { ch_target_intervals = ch_references.target_intervals ch_variant_catalog = params.variant_catalog ? 
Channel.fromPath(params.variant_catalog).map { it -> [[id:it[0].simpleName],it]}.collect() : Channel.value([[],[]]) - ch_variant_consequences = Channel.fromPath("$projectDir/assets/variant_consequences_v2.txt", checkIfExists: true).collect() + ch_variant_consequences_snv = params.variant_consequences_snv ? Channel.fromPath(params.variant_consequences_snv).collect() + : Channel.value([]) + ch_variant_consequences_sv = params.variant_consequences_sv ? Channel.fromPath(params.variant_consequences_sv).collect() + : Channel.value([]) ch_vcfanno_resources = params.vcfanno_resources ? Channel.fromPath(params.vcfanno_resources).splitText().map{it -> it.trim()}.collect() : Channel.value([]) ch_vcf2cytosure_blacklist = params.vcf2cytosure_blacklist ? Channel.fromPath(params.vcf2cytosure_blacklist).collect() @@ -490,7 +493,7 @@ workflow RAREDISEASE { ANN_CSQ_PLI_SV ( GENERATE_CLINICAL_SET_SV.out.vcf, - ch_variant_consequences + ch_variant_consequences_sv ) ch_versions = ch_versions.mix(ANN_CSQ_PLI_SV.out.versions) @@ -535,7 +538,7 @@ workflow RAREDISEASE { ANN_CSQ_PLI_SNV ( GENERATE_CLINICAL_SET_SNV.out.vcf, - ch_variant_consequences + ch_variant_consequences_snv ) ch_versions = ch_versions.mix(ANN_CSQ_PLI_SNV.out.versions) @@ -577,7 +580,7 @@ workflow RAREDISEASE { ANN_CSQ_PLI_MT( GENERATE_CLINICAL_SET_MT.out.vcf, - ch_variant_consequences + ch_variant_consequences_snv ) ch_versions = ch_versions.mix(ANN_CSQ_PLI_MT.out.versions) @@ -663,7 +666,7 @@ workflow RAREDISEASE { ch_genome_fasta, ch_genome_dictionary, ch_vep_cache, - ch_variant_consequences, + ch_variant_consequences_sv, ch_vep_filters, params.genome, params.vep_cache_version,