diff --git a/conf/igenomes.config b/conf/igenomes.config index f3e5ece2c0..81057a53fd 100644 --- a/conf/igenomes.config +++ b/conf/igenomes.config @@ -21,14 +21,19 @@ params { chr_dir = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Sequence/Chromosomes" dbsnp = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/dbsnp_138.b37.vcf" dbsnp_tbi = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/dbsnp_138.b37.vcf.idx" + dbsnp_vqsr = '--resource:dbsnp,known=false,training=true,truth=false,prior=2.0 dbsnp_138.b37.vcf' dict = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.dict" fasta = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.fasta" fasta_fai = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.fasta.fai" germline_resource = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GermlineResource/gnomAD.r2.1.1.GRCh37.PASS.AC.AF.only.vcf.gz" germline_resource_tbi = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GermlineResource/gnomAD.r2.1.1.GRCh37.PASS.AC.AF.only.vcf.gz.tbi" intervals = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/intervals/wgs_calling_regions_Sarek.list" + known_snps = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/1000G_phase1.snps.high_confidence.b37.vcf" + known_snps_tbi = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/1000G_phase1.snps.high_confidence.b37.vcf.idx" + known_snps_vqsr = '--resource:1000G,known=false,training=true,truth=true,prior=10.0 1000G_phase1.snps.high_confidence.b37.vcf' known_indels = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.vcf" known_indels_tbi = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.vcf.idx" + known_indels_vqsr = '--resource:1000G,known=false,training=true,truth=true,prior=10.0 1000G_phase1.indels.b37.vcf --resource:mills,known=false,training=true,truth=true,prior=10.0 Mills_and_1000G_gold_standard.indels.b37.vcf' mappability = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/Control-FREEC/out100m2_hg19.gem" snpeff_db = 'GRCh37.87' snpeff_genome = 'GRCh37' @@ -50,14 +55,19 @@ params { chr_dir = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Sequence/Chromosomes" dbsnp = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/dbsnp_146.hg38.vcf.gz" dbsnp_tbi = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/dbsnp_146.hg38.vcf.gz.tbi" + dbsnp_vqsr = '--resource:dbsnp,known=false,training=true,truth=false,prior=2.0 dbsnp_146.hg38.vcf.gz' dict = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.dict" fasta = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta" fasta_fai = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta.fai" germline_resource = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GermlineResource/gnomAD.r2.1.1.GRCh38.PASS.AC.AF.only.vcf.gz" germline_resource_tbi = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GermlineResource/gnomAD.r2.1.1.GRCh38.PASS.AC.AF.only.vcf.gz.tbi" intervals = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/intervals/wgs_calling_regions.hg38.bed" + known_snps = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/1000G_omni2.5.hg38.vcf.gz" + known_snps_tbi = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/1000G_omni2.5.hg38.vcf.gz.tbi" + known_snps_vqsr = '--resource:1000G,known=false,training=true,truth=true,prior=10.0 1000G_omni2.5.hg38.vcf.gz' known_indels = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz" known_indels_tbi = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz.tbi" + known_indels_vqsr = '--resource:gatk,known=false,training=true,truth=true,prior=10.0 Homo_sapiens_assembly38.known_indels.vcf.gz --resource:mills,known=false,training=true,truth=true,prior=10.0 Mills_and_1000G_gold_standard.indels.hg38.vcf.gz' mappability = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/Control-FREEC/out100m2_hg38.gem" pon = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/1000g_pon.hg38.vcf.gz" pon_tbi = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/1000g_pon.hg38.vcf.gz.tbi" diff --git a/conf/modules.config b/conf/modules.config index dee1400f98..7500d8f199 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -601,14 +601,12 @@ process{ saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } + withName: 'FREEBAYES' { //To make sure no naming conflicts ensure with module BCFTOOLS_SORT & the naming being correct in the output folder ext.prefix = { meta.num_intervals <= 1 ? "${meta.id}" : "${meta.id}.${target_bed.simpleName}" } ext.args = '--min-alternate-fraction 0.1 --min-mapping-quality 1' ext.when = { params.tools && params.tools.split(',').contains('freebayes') } - publishDir = [ - enabled: false - ] } withName: 'BCFTOOLS_SORT' { @@ -644,17 +642,14 @@ process{ ext.prefix = { meta.num_intervals <= 1 ? ( params.joint_germline ? "${meta.id}.haplotypecaller.g" : "${meta.id}.haplotypecaller" ) : ( params.joint_germline ? "${meta.id}.haplotypecaller.${intervals.simpleName}.g" :"${meta.id}.haplotypecaller.${intervals.simpleName}" ) } ext.when = { params.tools && params.tools.split(',').contains('haplotypecaller') } publishDir = [ + enabled: !params.joint_germline, mode: params.publish_dir_mode, path: { "${params.outdir}/variant_calling/"}, pattern: "*{vcf.gz,vcf.gz.tbi}", saveAs: { meta.num_intervals > 1 ? null : "haplotypecaller/${meta.id}/${it}" } ] } - withName: 'CNNSCOREVARIANTS' { - publishDir = [ - enabled: false - ] - } + withName: 'FILTERVARIANTTRANCHES' { ext.prefix = {"${meta.id}.haplotypecaller"} ext.args = { "--info-key CNN_1D" } @@ -665,12 +660,57 @@ process{ ] } - withName: 'GENOTYPEGVCFS' { - ext.prefix = {"${meta.id}.haplotypecaller"} - ext.when = { params.tools && params.tools.split(',').contains('haplotypecaller') && params.joint_germline} + withName: 'GATK4_GENOMICSDBIMPORT' { + ext.prefix = { meta.num_intervals > 1 ? meta.intervals_name : "joint_interval" } + ext.when = { params.tools && params.tools.split(',').contains('haplotypecaller') && params.joint_germline && !params.no_intervals} + } + + withName: 'GATK4_GENOTYPEGVCFS' { + ext.prefix = { meta.num_intervals > 1 ? meta.intervals_name : "joint_germline" } + } + + withName: 'MERGE_GENOTYPEGVCFS' { + ext.prefix = "joint_germline" publishDir = [ mode: params.publish_dir_mode, - path: { "${params.outdir}/variant_calling/haplotypecaller/${meta.id}/"}, + path: { "${params.outdir}/variant_calling/haplotypecaller/joint_variant_calling/" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + pattern: "*{vcf.gz,vcf.gz.tbi}" + ] + } + + withName: 'VARIANTRECALIBRATOR_INDEL' { + ext.prefix = { "${meta.id}_INDEL" } + ext.args = "-an QD -an MQRankSum -an ReadPosRankSum -an FS -an SOR -an DP -mode INDEL" + publishDir = [ + enabled: false + ] + } + + withName: 'VARIANTRECALIBRATOR_SNP' { + ext.prefix = { "${meta.id}_SNP" } + ext.args = "-an QD -an MQ -an MQRankSum -an ReadPosRankSum -an FS -an SOR -mode SNP" + publishDir = [ + enabled: false + ] + } + + withName: 'GATK4_APPLYVQSR_SNP'{ + ext.prefix = { "${meta.id}_SNP" } + ext.args = '--truth-sensitivity-filter-level 99.9 -mode SNP' + } + + withName: 'GATK4_APPLYVQSR_INDEL'{ + ext.prefix = { "${meta.id}_INDEL" } + ext.args = '--truth-sensitivity-filter-level 99.9 -mode INDEL' + } + + withName: 'MERGE_VQSR' { + ext.prefix = "joint_germline_recalibrated" + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/haplotypecaller/joint_variant_calling/"}, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, pattern: "*{vcf.gz,vcf.gz.tbi}" ] } diff --git a/main.nf b/main.nf index 8bdfb18d76..049c9c186b 100644 --- a/main.nf +++ b/main.nf @@ -26,7 +26,6 @@ nextflow.enable.dsl = 2 GENOME PARAMETER VALUES ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ - params.ascat_alleles = WorkflowMain.getGenomeAttribute(params, 'ascat_alleles') params.ascat_genome = WorkflowMain.getGenomeAttribute(params, 'ascat_genome') params.ascat_loci = WorkflowMain.getGenomeAttribute(params, 'ascat_loci') @@ -37,6 +36,7 @@ params.bwamem2 = WorkflowMain.getGenomeAttribute(params, 'bwamem2' params.chr_dir = WorkflowMain.getGenomeAttribute(params, 'chr_dir') params.dbsnp = WorkflowMain.getGenomeAttribute(params, 'dbsnp') params.dbsnp_tbi = WorkflowMain.getGenomeAttribute(params, 'dbsnp_tbi') +params.dbsnp_vqsr = WorkflowMain.getGenomeAttribute(params, 'dbsnp_vqsr') params.dict = WorkflowMain.getGenomeAttribute(params, 'dict') params.dragmap = WorkflowMain.getGenomeAttribute(params, 'dragmap') params.fasta = WorkflowMain.getGenomeAttribute(params, 'fasta') @@ -44,8 +44,12 @@ params.fasta_fai = WorkflowMain.getGenomeAttribute(params, 'fasta_fa params.germline_resource = WorkflowMain.getGenomeAttribute(params, 'germline_resource') params.germline_resource_tbi = WorkflowMain.getGenomeAttribute(params, 'germline_resource_tbi') params.intervals = WorkflowMain.getGenomeAttribute(params, 'intervals') +params.known_snps = WorkflowMain.getGenomeAttribute(params, 'known_snps') +params.known_snps_tbi = WorkflowMain.getGenomeAttribute(params, 'known_snps_tbi') +params.known_snps_vqsr = WorkflowMain.getGenomeAttribute(params, 'known_snps_vqsr') params.known_indels = WorkflowMain.getGenomeAttribute(params, 'known_indels') params.known_indels_tbi = WorkflowMain.getGenomeAttribute(params, 'known_indels_tbi') +params.known_indels_vqsr = WorkflowMain.getGenomeAttribute(params, 'known_indels_vqsr') params.mappability = WorkflowMain.getGenomeAttribute(params, 'mappability') params.pon = WorkflowMain.getGenomeAttribute(params, 'pon') params.pon_tbi = WorkflowMain.getGenomeAttribute(params, 'pon_tbi') diff --git a/modules.json b/modules.json index b7572bf83a..ca1ec33538 100644 --- a/modules.json +++ b/modules.json @@ -160,7 +160,7 @@ "git_sha": "169b2b96c1167f89ab07127b7057c1d90a6996c7" }, "gatk4/variantrecalibrator": { - "git_sha": "169b2b96c1167f89ab07127b7057c1d90a6996c7" + "git_sha": "edfe28a5e0088b66ee92e7c58186059f9b5e62d5" }, "manta/germline": { "git_sha": "ffedf09b6e84b479c9c901274f74bb33f3777243" diff --git a/modules/nf-core/modules/gatk4/variantrecalibrator/main.nf b/modules/nf-core/modules/gatk4/variantrecalibrator/main.nf index 120aeade7a..961e60d8ef 100644 --- a/modules/nf-core/modules/gatk4/variantrecalibrator/main.nf +++ b/modules/nf-core/modules/gatk4/variantrecalibrator/main.nf @@ -8,8 +8,10 @@ process GATK4_VARIANTRECALIBRATOR { 'quay.io/biocontainers/gatk4:4.2.6.1--hdfd78af_0' }" input: - tuple val(meta), path(vcf), path(tbi) - tuple path(vcfs), path(tbis), val(labels) + tuple val(meta), path(vcf), path(tbi) // input vcf and tbi of variants to recalibrate + path resource_vcf // resource vcf + path resource_tbi // resource tbi + val labels // string (or list of strings) containing dedicated resource labels already formatted with '--resource:' tag path fasta path fai path dict @@ -28,7 +30,7 @@ process GATK4_VARIANTRECALIBRATOR { def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" def reference_command = fasta ? "--reference $fasta " : '' - def resource_command = labels.collect{"--resource:$it"}.join(' ') + def labels_command = labels.join(' ') def avail_mem = 3 if (!task.memory) { @@ -42,8 +44,8 @@ process GATK4_VARIANTRECALIBRATOR { --output ${prefix}.recal \\ --tranches-file ${prefix}.tranches \\ $reference_command \\ - $resource_command \\ --tmp-dir . \\ + $labels_command \\ $args cat <<-END_VERSIONS > versions.yml diff --git a/modules/nf-core/modules/gatk4/variantrecalibrator/meta.yml b/modules/nf-core/modules/gatk4/variantrecalibrator/meta.yml index afe33d7a7a..6ed3a40e2c 100644 --- a/modules/nf-core/modules/gatk4/variantrecalibrator/meta.yml +++ b/modules/nf-core/modules/gatk4/variantrecalibrator/meta.yml @@ -33,6 +33,17 @@ input: type: file description: tbi file matching with -vcf pattern: "*.vcf.gz.tbi" + - resource_vcf: + type: file + description: all resource vcf files that are used with the corresponding '--resource' label + pattern: "*.vcf.gz" + - resource_tbi: + type: file + description: all resource tbi files that are used with the corresponding '--resource' label + pattern: "*.vcf.gz.tbi" + - labels: + type: string + description: necessary arguments for GATK VariantRecalibrator. Specified to directly match the resources provided. More information can be found at https://gatk.broadinstitute.org/hc/en-us/articles/5358906115227-VariantRecalibrator - fasta: type: file description: The reference fasta file @@ -45,34 +56,6 @@ input: type: file description: GATK sequence dictionary pattern: "*.dict" - - allelespecific: - type: boolean - description: specify whether to use allele specific annotations - pattern: "{true,false}" - - resvcfs: - type: list - description: resource files to be used as truth, training and known sites resources, this imports the files into the module, file names are specified again in the resource_labels to be called via the command. - pattern: "*/hapmap_3.3.hg38_chr21.vcf.gz" - - restbis: - type: list - description: tbis for the corresponding vcfs files to be used as truth, training and known resources. - pattern: "*/hapmap_3.3.hg38_chr21.vcf.gz.tbi" - - reslabels: - type: list - description: labels for the resource files to be used as truth, training and known sites resources, label should include an identifier,which kind of resource(s) it is, prior value and name of the file. - pattern: "hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.hg38_chr21.vcf.gz" - - annotation: - type: list - description: specify which annotations should be used for calculations. - pattern: "['QD', 'MQ', 'FS', 'SOR']" - - mode: - type: string - description: specifies which recalibration mode to employ (SNP is default, BOTH is intended for testing only) - pattern: "{SNP,INDEL,BOTH}" - - rscript: - type: boolean - description: specify whether to generate rscript.plot output file - pattern: "{true,false}" output: - recal: type: file @@ -96,3 +79,4 @@ output: pattern: "*.versions.yml" authors: - "@GCJMackenzie" + - "@nickhsmith" diff --git a/nextflow.config b/nextflow.config index 3bb4aaf870..2ff9f8653b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -64,6 +64,8 @@ params { cf_window = null // by default we are not using this in Control-FREEC ignore_soft_clipped_bases = false // no --dont-use-soft-clipped-bases for GATK Mutect2 wes = false // Set to true, if data is exome/targeted sequencing data. Used to use correct models in various variant callers + joint_germline = false // g.vcf & joint germline calling are not run by default if HaplotypeCaller is selected + // Annotation vep_out_format = 'vcf' diff --git a/nextflow_schema.json b/nextflow_schema.json index 6812c92df2..beb67e0361 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -232,6 +232,12 @@ "description": "If true, skips germline variant calling for matched normal to tumor sample. Normal samples without matched tumor will still be processed through germline variant calling tools.", "help_text": "This can speed up computation for somatic variant calling with matched normal samples. If false, all normal samples are processed as well through the germline variantcalling tools. If true, only somatic variant calling is done." }, + "joint_germline": { + "type": "boolean", + "fa_icon": "fas fa-toolbox", + "description": "Turn on the joint germline variant calling for GATK haplotypecaller", + "help_text": "Uses all normal germline samples (as designated by `status` in the input csv) in the joint germline variant calling process." + }, "ascat_min_base_qual": { "type": "number", "default": 20, @@ -554,6 +560,11 @@ "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\n\n> **NB** If none provided, will be generated automatically from the dbsnp file. Combine with `--save_reference` to save for future runs.", "hidden": true }, + "dbsnp_vqsr": { + "type": "string", + "fa_icon": "fas fa-copy", + "description": "label string for VariantRecalibration (haplotypecaller joint variant calling)" + }, "dict": { "type": "string", "fa_icon": "fas fa-file", @@ -611,6 +622,27 @@ "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\n\n> **NB** If none provided, will be generated automatically from the known index file, if provided. Combine with `--save_reference` to save for future runs.", "hidden": true }, + "known_indels_vqsr": { + "type": "string", + "fa_icon": "fas fa-copy", + "description": "If you use AWS iGenomes, this has already been set for you appropriately.\n\n1st label string for VariantRecalibration (haplotypecaller joint variant calling)" + }, + "known_snps": { + "type": "string", + "fa_icon": "fas fa-copy", + "description": "If you use AWS iGenomes, this has already been set for you appropriately.\n\nPath to known snps file." + }, + "known_snps_tbi": { + "type": "string", + "fa_icon": "fas fa-copy", + "description": "Path to known snps file snps.", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\n\n> **NB** If none provided, will be generated automatically from the known index file, if provided. Combine with `--save_reference` to save for future runs." + }, + "known_snps_vqsr": { + "type": "string", + "fa_icon": "fas fa-copy", + "description": "If you use AWS iGenomes, this has already been set for you appropriately.\n\nlabel string for VariantRecalibration (haplotypecaller joint variant calling)" + }, "mappability": { "type": "string", "fa_icon": "fas fa-file", diff --git a/subworkflows/local/germline_variant_calling.nf b/subworkflows/local/germline_variant_calling.nf index 57fdf708a1..101c45448d 100644 --- a/subworkflows/local/germline_variant_calling.nf +++ b/subworkflows/local/germline_variant_calling.nf @@ -23,11 +23,12 @@ workflow GERMLINE_VARIANT_CALLING { fasta_fai // channel: [mandatory] fasta_fai intervals // channel: [mandatory] intervals/target regions intervals_bed_gz_tbi // channel: [mandatory] intervals/target regions index zipped and indexed - intervals_bed_combined // channel: [mandatory] intervals/target regions in one file unzipped, [] if no_intervals + intervals_bed_combined // channel: [mandatory] intervals/target regions in one file unzipped intervals_bed_combined_haplotypec // channel: [mandatory] intervals/target regions in one file unzipped, no_intervals.bed if no_intervals - known_sites - known_sites_tbi - // joint_germline // val: true/false on whether to run joint_germline calling, only works in combination with haplotypecaller at the moment + known_sites_indels + known_sites_indels_tbi + known_sites_snps + known_sites_snps_tbi main: @@ -151,19 +152,34 @@ workflow GERMLINE_VARIANT_CALLING { if (tools.split(',').contains('haplotypecaller')){ cram_recalibrated_intervals_haplotypecaller = cram_recalibrated_intervals .map{ meta, cram, crai, intervals -> - [meta, cram, crai, intervals, []] - } - RUN_HAPLOTYPECALLER( - cram_recalibrated_intervals_haplotypecaller, - fasta, - fasta_fai, - dict, - dbsnp, - dbsnp_tbi, - intervals_bed_combined_haplotypec, - known_sites, - known_sites_tbi - ) + + intervals_name = meta.num_intervals == 0 ? "no_interval" : intervals.simpleName + new_meta = params.joint_germline ? [ + data_type:meta.data_type, + id:meta.sample, + intervals_name:intervals_name, + num_intervals:meta.num_intervals, + patient:meta.patient, + sample:meta.sample, + sex:meta.sex, + status:meta.status + ] + : meta + + [new_meta, cram, crai, intervals, []] + } + + RUN_HAPLOTYPECALLER(cram_recalibrated_intervals_haplotypecaller, + fasta, + fasta_fai, + dict, + dbsnp, + dbsnp_tbi, + known_sites_indels, + known_sites_indels_tbi, + known_sites_snps, + known_sites_snps_tbi, + intervals_bed_combined_haplotypec) haplotypecaller_vcf = RUN_HAPLOTYPECALLER.out.filtered_vcf ch_versions = ch_versions.mix(RUN_HAPLOTYPECALLER.out.versions) diff --git a/subworkflows/local/prepare_genome.nf b/subworkflows/local/prepare_genome.nf index f01e0c221d..58d140aff1 100644 --- a/subworkflows/local/prepare_genome.nf +++ b/subworkflows/local/prepare_genome.nf @@ -17,6 +17,7 @@ include { SAMTOOLS_FAIDX } from '../../modules/nf-core/m include { TABIX_TABIX as TABIX_DBSNP } from '../../modules/nf-core/modules/tabix/tabix/main' include { TABIX_TABIX as TABIX_GERMLINE_RESOURCE } from '../../modules/nf-core/modules/tabix/tabix/main' include { TABIX_TABIX as TABIX_KNOWN_INDELS } from '../../modules/nf-core/modules/tabix/tabix/main' +include { TABIX_TABIX as TABIX_KNOWN_SNPS } from '../../modules/nf-core/modules/tabix/tabix/main' include { TABIX_TABIX as TABIX_PON } from '../../modules/nf-core/modules/tabix/tabix/main' include { UNTAR as UNTAR_CHR_DIR } from '../../modules/nf-core/modules/untar/main' include { UNZIP as UNZIP_ALLELES } from '../../modules/nf-core/modules/unzip/main' @@ -36,6 +37,7 @@ workflow PREPARE_GENOME { fasta_fai // channel: [optional] fasta_fai germline_resource // channel: [optional] germline_resource known_indels // channel: [optional] known_indels + known_snps pon // channel: [optional] pon @@ -57,6 +59,7 @@ workflow PREPARE_GENOME { // outputs are collected to maintain a single channel for relevant TBI files TABIX_DBSNP(dbsnp.flatten().map{ it -> [[id:it.baseName], it] }) TABIX_GERMLINE_RESOURCE(germline_resource.flatten().map{ it -> [[id:it.baseName], it] }) + TABIX_KNOWN_SNPS( known_snps.flatten().map{ it -> [[id:it.baseName], it] } ) TABIX_KNOWN_INDELS( known_indels.flatten().map{ it -> [[id:it.baseName], it] } ) TABIX_PON(pon.flatten().map{ it -> [[id:it.baseName], it] }) @@ -103,6 +106,7 @@ workflow PREPARE_GENOME { ch_versions = ch_versions.mix(MSISENSORPRO_SCAN.out.versions) ch_versions = ch_versions.mix(TABIX_DBSNP.out.versions) ch_versions = ch_versions.mix(TABIX_GERMLINE_RESOURCE.out.versions) + ch_versions = ch_versions.mix(TABIX_KNOWN_SNPS.out.versions) ch_versions = ch_versions.mix(TABIX_KNOWN_INDELS.out.versions) ch_versions = ch_versions.mix(TABIX_PON.out.versions) @@ -114,6 +118,7 @@ workflow PREPARE_GENOME { dict = GATK4_CREATESEQUENCEDICTIONARY.out.dict // path: genome.fasta.dict fasta_fai = SAMTOOLS_FAIDX.out.fai.map{ meta, fai -> [fai] } // path: genome.fasta.fai germline_resource_tbi = TABIX_GERMLINE_RESOURCE.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: germline_resource.vcf.gz.tbi + known_snps_tbi = TABIX_KNOWN_SNPS.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: {known_indels*}.vcf.gz.tbi known_indels_tbi = TABIX_KNOWN_INDELS.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: {known_indels*}.vcf.gz.tbi msisensorpro_scan = MSISENSORPRO_SCAN.out.list.map{ meta, list -> [list] } // path: genome_msi.list pon_tbi = TABIX_PON.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: pon.vcf.gz.tbi diff --git a/subworkflows/nf-core/gatk4/joint_germline_variant_calling/main.nf b/subworkflows/nf-core/gatk4/joint_germline_variant_calling/main.nf index 68b1a770db..4beab66c3a 100644 --- a/subworkflows/nf-core/gatk4/joint_germline_variant_calling/main.nf +++ b/subworkflows/nf-core/gatk4/joint_germline_variant_calling/main.nf @@ -1,92 +1,200 @@ // -// Run GATK haplotypecaller for all input samples, merge them with genomicsdbimport, perform joint genotyping with genotypeGVCFS and recalibrate with variantrecalibrator & applyvqsr. -// - -include { GATK4_HAPLOTYPECALLER as HAPLOTYPECALLER } from '../../../../modules/nf-core/modules/gatk4/haplotypecaller/main' -include { GATK4_GENOMICSDBIMPORT as GENOMICSDBIMPORT } from '../../../../modules/nf-core/modules/gatk4/genomicsdbimport/main' -include { GATK4_GENOTYPEGVCFS as GENOTYPEGVCFS } from '../../../../modules/nf-core/modules/gatk4/genotypegvcfs/main' -include { GATK4_VARIANTRECALIBRATOR as VARIANTRECALIBRATOR } from '../../../../modules/nf-core/modules/gatk4/variantrecalibrator/main' -include { GATK4_APPLYVQSR as APPLYVQSR } from '../../../../modules/nf-core/modules/gatk4/applyvqsr/main' +// merge samples with genomicsdbimport, perform joint genotyping with genotypeGVCFS +include { BCFTOOLS_SORT } from '../../../../modules/nf-core/modules/bcftools/sort/main' +include { TABIX_TABIX as TABIX } from '../../../../modules/nf-core/modules/tabix/tabix/main' +include { GATK4_GENOMICSDBIMPORT } from '../../../../modules/nf-core/modules/gatk4/genomicsdbimport/main' +include { GATK4_GENOTYPEGVCFS } from '../../../../modules/nf-core/modules/gatk4/genotypegvcfs/main' +include { GATK4_MERGEVCFS as MERGE_GENOTYPEGVCFS } from '../../../../modules/nf-core/modules/gatk4/mergevcfs/main' +include { GATK4_MERGEVCFS as MERGE_VQSR } from '../../../../modules/nf-core/modules/gatk4/mergevcfs/main' +include { GATK4_VARIANTRECALIBRATOR as VARIANTRECALIBRATOR_SNP } from '../../../../modules/nf-core/modules/gatk4/variantrecalibrator/main' +include { GATK4_APPLYVQSR as GATK4_APPLYVQSR_SNP } from '../../../../modules/nf-core/modules/gatk4/applyvqsr/main' +include { GATK4_APPLYVQSR as GATK4_APPLYVQSR_INDEL } from '../../../../modules/nf-core/modules/gatk4/applyvqsr/main' +include { GATK4_VARIANTRECALIBRATOR as VARIANTRECALIBRATOR_INDEL} from '../../../../modules/nf-core/modules/gatk4/variantrecalibrator/main' workflow GATK_JOINT_GERMLINE_VARIANT_CALLING { take: - input // channel: [ val(meta), [ input ], [ input_index ], [] ] - fasta // channel: /path/to/reference/fasta - fai // channel: /path/to/reference/fasta/index - dict // channel: /path/to/reference/fasta/dictionary - sites // channel: /path/to/known/sites/file - sites_index // channel: /path/to/known/sites/index - allelespecific // channel: true/false run allelespecific mode of vqsr modules - resources // channel: [[resource, vcfs, forvariantrecal], [resource, tbis, forvariantrecal], [resource, labels, forvariantrecal]] - annotation // channel: [annotations, to, use, for, variantrecal, filtering] - mode // channel: which mode to run variantrecal: SNP/INDEL/BOTH - create_rscript // channel: true/false whether to generate rscript plots in variantrecal - truthsensitivity // channel: 0-100.0 truthsensitivity cutoff for applyvqsr + input // channel: [ val(meta), [ input ], [ input_index ], interval, [], []] + fasta // channel: /path/to/reference/fasta + fai // channel: /path/to/reference/fasta/index + dict // channel: /path/to/reference/fasta/dictionary + dbsnp + dbsnp_tbi + dbsnp_vqsr + resource_indels_vcf + resource_indels_tbi + known_indels_vqsr + resource_snps_vcf + resource_snps_tbi + known_snps_vqsr main: - ch_versions = Channel.empty() + ch_versions = Channel.empty() + + // + //Map input for GenomicsDBImport. + //rename based on num_intervals, group all samples by their interval_name/interval_file and restructure for channel + //group by 0,3 to avoid a list of metas and make sure that any intervals + // + gendb_input = input.map{ + meta, gvcf, tbi, intervals-> + new_meta = [ + id: "joint_variant_calling", + intervals_name: meta.intervals_name, + num_intervals: meta.num_intervals + ] + + [ new_meta, gvcf, tbi, intervals ] + + }.groupTuple(by:[0,3]).map{ new_meta, gvcf, tbi, intervals -> + [ new_meta, gvcf, tbi, intervals, [], [] ] + } // //Convert all sample vcfs into a genomicsdb workspace using genomicsdbimport. // - // gendb_input = .combine([interval_file]).combine(['']).combine([dict]) + GATK4_GENOMICSDBIMPORT ( gendb_input, false, false, false ) - // GENOMICSDBIMPORT ( gendb_input, false, false, false ) - // ch_versions = ch_versions.mix(GENOMICSDBIMPORT.out.versions) + genotype_input = GATK4_GENOMICSDBIMPORT.out.genomicsdb.map{ + meta, genomicsdb -> + [meta, genomicsdb, [], [], []] + } // //Joint genotyping performed using GenotypeGVCFs + //Sort vcfs called by interval within each VCF // - // ch_genotype_in = GENOMICSDBIMPORT.out.genomicsdb.collect() - // //[] is a placeholder for the input where the vcf tbi file would be passed in for non-genomicsdb workspace runs, which is not necessary for this workflow as it uses genomicsdb workspaces. - // ch_genotype_in.add([]) - // GENOTYPEGVCFS ( ch_genotype_in, fasta, fai, dict, sites, sites_index ) - // ch_versions = ch_versions.mix(GENOTYPEGVCFS.out.versions) + vcfs = GATK4_GENOTYPEGVCFS ( genotype_input, fasta, fai, dict, dbsnp, dbsnp_tbi).vcf + + BCFTOOLS_SORT(vcfs) + vcfs_sorted_input = BCFTOOLS_SORT.out.vcf.branch{ + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + } + + vcfs_sorted_input_no_intervals = vcfs_sorted_input.no_intervals.map{ meta , vcf -> + + [[ id: "joint_variant_calling", + num_intervals: meta.num_intervals, + patient: "all_samples", + variantcaller: "haplotypecaller" + ] , vcf ] + } + + // Index vcf files if no scatter/gather by intervals + TABIX(vcfs_sorted_input_no_intervals) + + //Merge scatter/gather vcfs & index + //Rework meta for variantscalled.csv and annotation tools + MERGE_GENOTYPEGVCFS(vcfs_sorted_input.intervals.map{meta, vcf -> + [ + [ + id: "joint_variant_calling", + num_intervals: meta.num_intervals, + patient: "all_samples", + variantcaller: "haplotypecaller", + ], + vcf] + }.groupTuple() + ,dict) + + vqsr_input = Channel.empty().mix( + MERGE_GENOTYPEGVCFS.out.vcf.join(MERGE_GENOTYPEGVCFS.out.tbi), + vcfs_sorted_input_no_intervals.join(TABIX.out.tbi) + ) + + // Group resource labels for SNP and INDEL + snp_resource_labels = Channel.empty().mix(known_snps_vqsr,dbsnp_vqsr).collect() + indel_resource_labels = Channel.empty().mix(known_indels_vqsr,dbsnp_vqsr).collect() - // // setting run_vqsr to false skips the VQSR process, for if user does not wish to perform VQSR, - // // or want to run the hard filtering recommended by gatk best practices for runs with a low number of samples instead. - // if (run_vqsr) { - // // - // //Perform first step in VQSR using VariantRecalibrator - // // - // ch_gvcf = GENOTYPEGVCFS.out.vcf.collect() - // ch_gtbi = GENOTYPEGVCFS.out.tbi.collect() - // ch_vrecal_in = ch_gvcf.combine(ch_gtbi, by: 0) + // + //Recalibrate SNP and INDEL separately. + // + VARIANTRECALIBRATOR_SNP( + vqsr_input, + resource_snps_vcf, + resource_snps_tbi, + snp_resource_labels, + fasta, + fai, + dict) + + VARIANTRECALIBRATOR_INDEL( + vqsr_input, + resource_indels_vcf, + resource_indels_tbi, + indel_resource_labels, + fasta, + fai, + dict) - // VARIANTRECALIBRATOR ( ch_vrecal_in, fasta, fai, dict, allelespecific, resources, annotation, mode, create_rscript ) + // + //Prepare SNP and INDEL separately for ApplyVQSR + // - // ch_versions = ch_versions.mix(VARIANTRECALIBRATOR.out.versions) + //Join results of variant recalibration into a single channel tuple + //Rework meta for variantscalled.csv and annotation tools + vqsr_input_snp = vqsr_input.join( VARIANTRECALIBRATOR_SNP.out.recal) + .join( VARIANTRECALIBRATOR_SNP.out.idx) + .join( VARIANTRECALIBRATOR_SNP.out.tranches) + .map{ meta, vcf, tbi, recal, index, tranche -> + + new_meta = [ + id: "recalibrated_joint_variant_calling", + num_intervals: meta.num_intervals, + patient: "all_samples", + variantcaller: "haplotypecaller", + ] + + [new_meta, vcf, tbi, recal, index, tranche] + } + + //Join results of variant recalibration into a single channel tuple + //Rework meta for variantscalled.csv and annotation tools + vqsr_input_indel = vqsr_input.join( VARIANTRECALIBRATOR_INDEL.out.recal).join( + VARIANTRECALIBRATOR_INDEL.out.idx).join( + VARIANTRECALIBRATOR_INDEL.out.tranches).map{ meta, vcf, tbi, recal, index, tranche -> + + new_meta = [ + id: "recalibrated_joint_variant_calling", + num_intervals: meta.num_intervals, + patient: "all_samples", + variantcaller: "haplotypecaller" + ] + + [new_meta, vcf, tbi, recal, index, tranche] + } + + GATK4_APPLYVQSR_SNP(vqsr_input_snp, + fasta, + fai, + dict ) + + GATK4_APPLYVQSR_INDEL(vqsr_input_indel, + fasta, + fai, + dict ) + + vqsr_snp_vcf = GATK4_APPLYVQSR_SNP.out.vcf + vqsr_indel_vcf = GATK4_APPLYVQSR_INDEL.out.vcf - // // - // //Perform second step in VQSR using ApplyVQSR - // // - // ch_recal = VARIANTRECALIBRATOR.out.recal.collect() - // ch_idx = VARIANTRECALIBRATOR.out.idx.collect() - // ch_tranches = VARIANTRECALIBRATOR.out.tranches.collect() - // ch_vqsr_in = ch_vrecal_in.combine(ch_recal, by: 0).combine(ch_idx, by: 0).combine(ch_tranches, by: 0) + // + //Merge VQSR outputs into final VCF + // + MERGE_VQSR( + vqsr_snp_vcf.mix(vqsr_indel_vcf).groupTuple(), + dict + ) - // APPLYVQSR ( ch_vqsr_in, fasta, fai, dict, allelespecific, truthsensitivity, mode ) + ch_versions = ch_versions.mix(GATK4_GENOMICSDBIMPORT.out.versions) + ch_versions = ch_versions.mix(GATK4_GENOTYPEGVCFS.out.versions) + ch_versions = ch_versions.mix(VARIANTRECALIBRATOR_SNP.out.versions) + ch_versions = ch_versions.mix(GATK4_APPLYVQSR_SNP.out.versions) - // ch_versions = ch_versions.mix(APPLYVQSR.out.versions) - // } emit: - // haplotc_vcf = run_haplotc ? HAPLOTYPECALLER.out.vcf.collect() : [] // channel: [ val(meta), [ vcf ] ] - // haplotc_index = run_haplotc ? HAPLOTYPECALLER.out.tbi.collect() : [] // channel: [ val(meta), [ tbi ] ] - - // genomicsdb = GENOMICSDBIMPORT.out.genomicsdb.collect() // channel: [ val(meta), [ genomicsdb ] ] - - // genotype_vcf = GENOTYPEGVCFS.out.vcf.collect() // channel: [ val(meta), [ vcf ] ] - // genotype_index = GENOTYPEGVCFS.out.vcf.collect() // channel: [ val(meta), [ tbi ] ] - - // recal_file = run_vqsr ? VARIANTRECALIBRATOR.out.recal.collect() : [] // channel: [ val(meta), [ recal ] ] - // recal_index = run_vqsr ? VARIANTRECALIBRATOR.out.idx.collect() : [] // channel: [ val(meta), [ idx ] ] - // recal_tranches = run_vqsr ? VARIANTRECALIBRATOR.out.tranches.collect() : [] // channel: [ val(meta), [ tranches ] ] - - // vqsr_vcf = run_vqsr ? APPLYVQSR.out.vcf.collect() : [] // channel: [ val(meta), [ vcf ] ] - // vqsr_index = run_vqsr ? APPLYVQSR.out.tbi.collect() : [] // channel: [ val(meta), [ tbi ] ] - - versions = ch_versions // channel: [ versions.yml ] + versions = ch_versions // channel: [ versions.yml ] + genotype_vcf = Channel.empty().mix( vcfs_sorted_input_no_intervals, MERGE_GENOTYPEGVCFS.out.vcf, MERGE_VQSR.out.vcf) // channel: [ val(meta), [ vcf ] ] + genotype_index = Channel.empty().mix( TABIX.out.tbi, MERGE_GENOTYPEGVCFS.out.tbi, MERGE_VQSR.out.tbi) // channel: [ val(meta), [ tbi ] ] } diff --git a/subworkflows/nf-core/variantcalling/haplotypecaller/main.nf b/subworkflows/nf-core/variantcalling/haplotypecaller/main.nf index c41371fbb0..f2d948fc67 100644 --- a/subworkflows/nf-core/variantcalling/haplotypecaller/main.nf +++ b/subworkflows/nf-core/variantcalling/haplotypecaller/main.nf @@ -1,5 +1,4 @@ include { GATK4_MERGEVCFS as MERGE_HAPLOTYPECALLER } from '../../../../modules/nf-core/modules/gatk4/mergevcfs/main' -include { GATK4_GENOTYPEGVCFS as GENOTYPEGVCFS } from '../../../../modules/nf-core/modules/gatk4/genotypegvcfs/main' include { GATK4_HAPLOTYPECALLER as HAPLOTYPECALLER } from '../../../../modules/nf-core/modules/gatk4/haplotypecaller/main' include { GATK_JOINT_GERMLINE_VARIANT_CALLING as JOINT_GERMLINE } from '../../../../subworkflows/nf-core/gatk4/joint_germline_variant_calling/main' include { GATK_SINGLE_SAMPLE_GERMLINE_VARIANT_CALLING as SINGLE_SAMPLE } from '../../../../subworkflows/nf-core/gatk4/single_sample_germline_variant_calling/main' @@ -12,9 +11,11 @@ workflow RUN_HAPLOTYPECALLER { dict // channel: [mandatory] dbsnp // channel: [] dbsnp_tbi + known_sites_indels + known_sites_indels_tbi + known_sites_snps + known_sites_snps_tbi intervals_bed_combined // channel: [mandatory] intervals/target regions in one file unzipped, no_intervals.bed if no_intervals - known_sites - known_sites_tbi // channel: [optional] main: @@ -41,9 +42,41 @@ workflow RUN_HAPLOTYPECALLER { no_intervals: it[0].num_intervals <= 1 }.set{haplotypecaller_tbi_branch} - // Only when using intervals - MERGE_HAPLOTYPECALLER( - haplotypecaller_vcf_branch.intervals + if (params.joint_germline) { + // merge vcf and tbis + genotype_gvcf_to_call = Channel.empty().mix(HAPLOTYPECALLER.out.vcf + .join(HAPLOTYPECALLER.out.tbi) + .join(cram).map{ meta, vcf, tbi, cram, crai, intervals, dragstr_model -> + [ meta, vcf, tbi, intervals ] + }) + // make channels from labels + dbsnp_vqsr = params.dbsnp_vqsr ? Channel.value(params.dbsnp_vqsr) : Channel.empty() + known_indels_vqsr = params.known_indels_vqsr ? Channel.value(params.known_indels_vqsr) : Channel.empty() + known_snps_vqsr = params.known_snps_vqsr ? Channel.value(params.known_snps_vqsr) : Channel.empty() + + + JOINT_GERMLINE( + genotype_gvcf_to_call, + fasta, + fasta_fai, + dict, + dbsnp, + dbsnp_tbi, + dbsnp_vqsr, + known_sites_indels, + known_sites_indels_tbi, + known_indels_vqsr, + known_sites_snps, + known_sites_snps_tbi, + known_snps_vqsr) + + filtered_vcf = JOINT_GERMLINE.out.genotype_vcf + ch_versions = ch_versions.mix(JOINT_GERMLINE.out.versions) + } else { + + // Only when using intervals + MERGE_HAPLOTYPECALLER( + haplotypecaller_vcf_branch.intervals .map{ meta, vcf -> new_meta = [ @@ -55,73 +88,32 @@ workflow RUN_HAPLOTYPECALLER { status: meta.status ] - [groupKey(new_meta, new_meta.num_intervals), vcf] - }.groupTuple(), - dict) - - haplotypecaller_vcf = Channel.empty().mix( - MERGE_HAPLOTYPECALLER.out.vcf, - haplotypecaller_vcf_branch.no_intervals) - - haplotypecaller_tbi = Channel.empty().mix( - MERGE_HAPLOTYPECALLER.out.tbi, - haplotypecaller_tbi_branch.no_intervals) - - if (params.joint_germline) { - - // haplotypecaller_vcf_list = haplotypecaller_vcf.toList() - // haplotypecaller_tbi_list = haplotypecaller_vcf.toList() - - // joint_germline_vcf_tbi = [ [id: "joint_germline"], - // haplotypecaller_vcf_list, - // haplotypecaller_tbi_list ] - // GATK_JOINT_GERMLINE_VARIANT_CALLING( - // joint_germline_vcf_tbi, - // fasta, - // fasta_fai, - // intervals, - // dict, - // dbsnp, - // dbsnp_tbi, - // allelespecific? - // resources? - // annotation? - // "BOTH", - // true, - // truthsensitivity -> parameter or module? - // ) - - // filtered_vcf = JOINT_GERMLINE.out.vcf - // ch_versions = ch_versions.mix(GATK_JOINT_GERMLINE_VARIANT_CALLING.out.versions) - } else { - SINGLE_SAMPLE( - haplotypecaller_vcf.join(haplotypecaller_tbi), - fasta, - fasta_fai, - dict, - intervals_bed_combined, - known_sites, - known_sites_tbi - ) - - filtered_vcf = SINGLE_SAMPLE.out.filtered_vcf.map{ meta, vcf -> - [[ - id: meta.sample, - num_intervals: meta.num_intervals, - patient: meta.patient, - sample: meta.sample, - sex: meta.sex, - status: meta.status, - variantcaller: "haplotypecaller" - ], - vcf]} - ch_versions = ch_versions.mix(SINGLE_SAMPLE.out.versions) + [groupKey(new_meta, new_meta.num_intervals), vcf] + }.groupTuple(), + dict) + + haplotypecaller_vcf = Channel.empty().mix( + MERGE_HAPLOTYPECALLER.out.vcf, + haplotypecaller_vcf_branch.no_intervals) + + haplotypecaller_tbi = Channel.empty().mix( + MERGE_HAPLOTYPECALLER.out.tbi, + haplotypecaller_tbi_branch.no_intervals) + + SINGLE_SAMPLE(haplotypecaller_vcf.join(haplotypecaller_tbi), + fasta, + fasta_fai, + dict, + intervals_bed_combined, + known_sites_indels.concat(known_sites_snps).flatten().unique().collect(), + known_sites_indels_tbi.concat(known_sites_snps_tbi).flatten().unique().collect()) + + filtered_vcf = SINGLE_SAMPLE.out.filtered_vcf.map{ meta, vcf-> [[patient:meta.patient, sample:meta.sample, status:meta.status, sex:meta.sex, id:meta.sample, num_intervals:meta.num_intervals, variantcaller:"haplotypecaller"], vcf]} + ch_versions = ch_versions.mix( SINGLE_SAMPLE.out.versions, + HAPLOTYPECALLER.out.versions, + MERGE_HAPLOTYPECALLER.out.versions) } - - ch_versions = ch_versions.mix(MERGE_HAPLOTYPECALLER.out.versions) - ch_versions = ch_versions.mix(HAPLOTYPECALLER.out.versions) - emit: versions = ch_versions filtered_vcf diff --git a/tests/csv/3.0/mapped_joint_bam.csv b/tests/csv/3.0/mapped_joint_bam.csv new file mode 100644 index 0000000000..89ebfdf77f --- /dev/null +++ b/tests/csv/3.0/mapped_joint_bam.csv @@ -0,0 +1,3 @@ +patient,status,sample,bam,bai +test,0,test,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam.bai +test1,0,test1,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/bam/test2.paired_end.sorted.bam,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/bam/test2.paired_end.sorted.bam.bai diff --git a/tests/test_tools.yml b/tests/test_tools.yml index f839a0e2de..f3b74b8f00 100644 --- a/tests/test_tools.yml +++ b/tests/test_tools.yml @@ -471,34 +471,23 @@ - path: results/variant_calling/haplotypecaller/test/test.haplotypecaller.vcf.gz - path: results/variant_calling/haplotypecaller/test/test.haplotypecaller.vcf.gz.tbi -# - name: Run joint germline variant calling with haplotypecaller -# command: nextflow run main.nf -profile test,tools_germline,docker --tools haplotypecaller --joint_germline -# tags: -# - germline -# - gvcf -# - haplotypecaller -# - variant_calling -# files: -# - path: results/variant_calling/haplotypecaller/sample1/sample1.haplotypecaller.g.vcf.gz -# - path: results/variant_calling/haplotypecaller/sample1/sample1.haplotypecaller.g.vcf.gz.tbi -# - path: results/variant_calling/haplotypecaller/sample1/sample1.haplotypecaller.vcf.gz -# - path: results/variant_calling/haplotypecaller/sample1/sample1.haplotypecaller.vcf.gz.tbi -# - path: results/csv/variantcalled.csv - -# - name: Run joint germline variant calling with haplotypecaller without intervals -# command: nextflow run main.nf -profile test,tools_germline,docker --tools haplotypecaller --joint_germline --no_intervals -# tags: -# - germline -# - gvcf -# - haplotypecaller -# - no_intervals -# - variant_calling -# files: -# - path: results/variant_calling/haplotypecaller/sample1/sample1.haplotypecaller.g.vcf.gz -# - path: results/variant_calling/haplotypecaller/sample1/sample1.haplotypecaller.g.vcf.gz.tbi -# - path: results/variant_calling/haplotypecaller/sample1/sample1.haplotypecaller.vcf.gz -# - path: results/variant_calling/haplotypecaller/sample1/sample1.haplotypecaller.vcf.gz.tbi -# - path: results/csv/variantcalled.csv +- name: Run joint germline variant calling with haplotypecaller + command: nextflow run main.nf -profile test,targeted,docker --input ./tests/csv/3.0/mapped_joint_bam.csv --tools haplotypecaller --joint_germline true --step variant_calling + tags: + - germline + - haplotypecaller + - variant_calling + files: + - path: results/csv/variantcalled.csv + - path: results/multiqc + - path: results/preprocessing/recalibrated/test/test.recal.cram + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + - path: results/reports/bcftools/haplotypecaller/joint_variant_calling/joint_germline.bcftools_stats.txt + - path: results/reports/vcftools/haplotypecaller/joint_variant_calling/joint_germline.FILTER.summary + - path: results/reports/vcftools/haplotypecaller/joint_variant_calling/joint_germline.TsTv.count + - path: results/reports/vcftools/haplotypecaller/joint_variant_calling/joint_germline.TsTv.qual + - path: results/variant_calling/haplotypecaller/joint_variant_calling/joint_germline.vcf.gz + - path: results/variant_calling/haplotypecaller/joint_variant_calling/joint_germline.vcf.gz.tbi - name: Run variant calling on germline sample with manta command: nextflow run main.nf -profile test,tools_germline,docker --tools manta diff --git a/workflows/sarek.nf b/workflows/sarek.nf index ed1a4f397f..06089af9d6 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -30,6 +30,8 @@ def checkPathParamList = [ params.germline_resource_tbi, params.input, params.intervals, + params.known_snps, + params.known_snps_tbi, params.known_indels, params.known_indels_tbi, params.mappability, @@ -108,8 +110,12 @@ if(!params.dbsnp && !params.known_indels){ } if(params.tools && params.tools.split(',').contains('haplotypecaller')){ log.warn "If Haplotypecaller is specified, without `--dbsnp` or `--known_indels no filtering will be done. For filtering, please provide at least one of `--dbsnp` or `--known_indels`.\nFor more information see FilterVariantTranches (single-sample, default): https://gatk.broadinstitute.org/hc/en-us/articles/5358928898971-FilterVariantTranches\nFor more information see VariantRecalibration (--joint_germline): https://gatk.broadinstitute.org/hc/en-us/articles/5358906115227-VariantRecalibrator\nFor more information on GATK Best practice germline variant calling: https://gatk.broadinstitute.org/hc/en-us/articles/360035535932-Germline-short-variant-discovery-SNPs-Indels-" + } } +if (params.joint_germline && (!params.dbsnp || !params.known_indels || !params.known_snps || params.no_intervals)){ + log.warn "If Haplotypecaller is specified, without `--dbsnp`, `--known_snps`, `--known_indels` or the associated resource labels (ie `known_snps_vqsr`), no variant recalibration will be done. For recalibration you must provide all of these resources.\nFor more information see VariantRecalibration: https://gatk.broadinstitute.org/hc/en-us/articles/5358906115227-VariantRecalibrator \nJoint germline variant calling also requires intervals in order to genotype the samples. As a result, if `--no_intervals` is set to `true` the joint germline variant calling will not be performed." +} // Fails when missing tools for variant_calling or annotate if ((params.step == 'variant_calling' || params.step == 'annotate') && !params.tools) { @@ -147,10 +153,12 @@ ascat_loci_gc = params.ascat_loci_gc ? Channel.fromPath(params.ascat_l ascat_loci_rt = params.ascat_loci_rt ? Channel.fromPath(params.ascat_loci_rt).collect() : Channel.value([]) chr_dir = params.chr_dir ? Channel.fromPath(params.chr_dir).collect() : Channel.value([]) dbsnp = params.dbsnp ? Channel.fromPath(params.dbsnp).collect() : Channel.value([]) +known_snps = params.known_snps ? Channel.fromPath(params.known_snps).collect() : Channel.value([]) fasta = params.fasta ? Channel.fromPath(params.fasta).collect() : Channel.empty() fasta_fai = params.fasta_fai ? Channel.fromPath(params.fasta_fai).collect() : Channel.empty() germline_resource = params.germline_resource ? Channel.fromPath(params.germline_resource).collect() : Channel.value([]) //Mutec2 does not require a germline resource, so set to optional input known_indels = params.known_indels ? Channel.fromPath(params.known_indels).collect() : Channel.value([]) +known_snps = params.known_snps ? Channel.fromPath(params.known_snps).collect() : Channel.value([]) mappability = params.mappability ? Channel.fromPath(params.mappability).collect() : Channel.value([]) pon = params.pon ? Channel.fromPath(params.pon).collect() : Channel.value([]) //PON is optional for Mutect2 (but highly recommended) @@ -312,6 +320,7 @@ workflow SAREK { fasta_fai, germline_resource, known_indels, + known_snps, pon) // Gather built indices or get them from the params @@ -326,6 +335,7 @@ workflow SAREK { gc_file = PREPARE_GENOME.out.gc_file germline_resource_tbi = params.germline_resource ? params.germline_resource_tbi ? Channel.fromPath(params.germline_resource_tbi).collect() : PREPARE_GENOME.out.germline_resource_tbi : [] known_indels_tbi = params.known_indels ? params.known_indels_tbi ? Channel.fromPath(params.known_indels_tbi).collect() : PREPARE_GENOME.out.known_indels_tbi : Channel.value([]) + known_snps_tbi = params.known_snps ? params.known_snps_tbi ? Channel.fromPath(params.known_snps_tbi).collect() : PREPARE_GENOME.out.known_snps_tbi : Channel.value([]) loci_files = PREPARE_GENOME.out.loci_files pon_tbi = params.pon ? params.pon_tbi ? Channel.fromPath(params.pon_tbi).collect() : PREPARE_GENOME.out.pon_tbi : [] msisensorpro_scan = PREPARE_GENOME.out.msisensorpro_scan @@ -336,10 +346,13 @@ workflow SAREK { params.aligner == "bwa-mem2" ? bwamem2 : dragmap - // known_sites is made by grouping both the dbsnp and the known indels ressources + // known_sites is made by grouping both the dbsnp and the known snps/indels resources // Which can either or both be optional - known_sites = dbsnp.concat(known_indels).collect() - known_sites_tbi = dbsnp_tbi.concat(known_indels_tbi).collect() + known_sites_indels = dbsnp.concat(known_indels).collect() + known_sites_indels_tbi = dbsnp_tbi.concat(known_indels_tbi).collect() + + known_sites_snps = dbsnp.concat(known_snps).collect() + known_sites_snps_tbi = dbsnp_tbi.concat(known_snps_tbi).collect() // Build intervals if needed PREPARE_INTERVALS(fasta_fai) @@ -678,8 +691,8 @@ workflow SAREK { fasta, fasta_fai, intervals, - known_sites, - known_sites_tbi) + known_sites_indels, + known_sites_indels_tbi) ch_table_bqsr_spark = PREPARE_RECALIBRATION_SPARK.out.table_bqsr @@ -693,8 +706,8 @@ workflow SAREK { fasta, fasta_fai, intervals, - known_sites, - known_sites_tbi) + known_sites_indels, + known_sites_indels_tbi) ch_table_bqsr_no_spark = PREPARE_RECALIBRATION.out.table_bqsr @@ -909,9 +922,10 @@ workflow SAREK { intervals_bed_gz_tbi, intervals_bed_combined, // [] if no_intervals, else interval_bed_combined.bed PREPARE_INTERVALS.out.intervals_bed_combined, // no_intervals.bed if no intervals, else interval_bed_combined.bed; Channel operations possible - known_sites, - known_sites_tbi) - // params.joint_germline) + known_sites_indels, + known_sites_indels_tbi, + known_sites_snps, + known_sites_snps_tbi) // TUMOR ONLY VARIANT CALLING TUMOR_ONLY_VARIANT_CALLING(