Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DRAFT] Add NGSCheckMate in as part of a cram sampleQC subworkflow #1252

Merged
merged 29 commits into from
Nov 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
94ca07d
Add NGSCheckMate
SPPearce Sep 24, 2023
21d38a2
Update CHANGELOG
SPPearce Sep 24, 2023
2e72d1c
Update tools list for sampleqc
SPPearce Sep 25, 2023
cee8614
Update CHANGELOG.md
SPPearce Sep 29, 2023
9c48c87
Merge branch 'dev' into ngscheckmate
SPPearce Sep 29, 2023
a2496de
Update conf/igenomes.config
SPPearce Sep 29, 2023
05f681e
Update conf/igenomes.config
SPPearce Sep 29, 2023
9d9105c
Update CHANGELOG.md
SPPearce Sep 29, 2023
2ab1e4d
Update conf/igenomes.config
SPPearce Sep 29, 2023
ac649e0
Update conf/igenomes.config
SPPearce Sep 29, 2023
a0e5f43
Add tests, swap to ngscheckmate
SPPearce Sep 30, 2023
2c95984
Merge branch 'dev' into ngscheckmate
SPPearce Sep 30, 2023
97e1fa6
Fix NGSCheckMate test name
SPPearce Oct 1, 2023
64273f3
Merge remote-tracking branch 'refs/remotes/origin/ngscheckmate' into …
SPPearce Oct 1, 2023
ad97096
Update test
SPPearce Oct 1, 2023
7378fe9
Update output path and docs
SPPearce Oct 2, 2023
d37c24e
Change ngscheckmate publishdir
SPPearce Oct 2, 2023
4e73bdb
Update tests/config/tags.yml
SPPearce Oct 2, 2023
8f59abc
Merge branch 'dev' into ngscheckmate
maxulysse Oct 11, 2023
3acaef9
Fix merge conflict
SPPearce Nov 7, 2023
e1706a3
Apply code review suggestions, fix channel to mpileup
SPPearce Nov 7, 2023
01ae112
Swap around bed location in confs
SPPearce Nov 7, 2023
13df981
Swap to modules test-data
SPPearce Nov 7, 2023
32227d3
Apply suggestions from code review
SPPearce Nov 7, 2023
10185fa
Add getGenomeAttribute check
SPPearce Nov 7, 2023
ac87102
Update conf/test.config
SPPearce Nov 7, 2023
cf1e116
Move blank line
SPPearce Nov 7, 2023
5f25439
Add to somatic full test
SPPearce Nov 7, 2023
4e55d5d
Merge remote-tracking branch 'origin/dev' into ngscheckmate
SPPearce Nov 7, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- [#1113](https://github.com/nf-core/sarek/pull/1113) - Adding CNVkit genemetrics module
- [#1193](https://github.com/nf-core/sarek/pull/1193) - Adding support for Sentieon's DnaScope for germline variant-calling including joint-germline
- [#1252](https://github.com/nf-core/sarek/pull/1252) - Added NGSCheckMate tool for checking that samples come from the same individual
- [#1271](https://github.com/nf-core/sarek/pull/1271) - Back to dev
- [#1290](https://github.com/nf-core/sarek/pull/1290) - Add nf-test for whole pipeline.

Expand Down
4 changes: 4 additions & 0 deletions conf/igenomes.config
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ params {
known_indels_tbi = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.vcf.gz.tbi"
known_indels_vqsr = '--resource:1000G,known=false,training=true,truth=true,prior=10.0 1000G_phase1.indels.b37.vcf.gz --resource:mills,known=false,training=true,truth=true,prior=10.0 Mills_and_1000G_gold_standard.indels.b37.vcf.gz'
mappability = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/Control-FREEC/out100m2_hg19.gem"
ngscheckmate_bed = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/NGSCheckMate/SNP_GRCh37_hg19_wChr.bed"
snpeff_db = 87
snpeff_genome = 'GRCh37'
vep_cache_version = 110
Expand Down Expand Up @@ -68,6 +69,7 @@ params {
known_indels_tbi = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz.tbi"
known_indels_vqsr = '--resource:gatk,known=false,training=true,truth=true,prior=10.0 Homo_sapiens_assembly38.known_indels.vcf.gz --resource:mills,known=false,training=true,truth=true,prior=10.0 Mills_and_1000G_gold_standard.indels.hg38.vcf.gz'
mappability = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/Control-FREEC/out100m2_hg38.gem"
ngscheckmate_bed = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/NGSCheckMate/SNP_GRCh38_hg38_wChr.bed"
pon = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/1000g_pon.hg38.vcf.gz"
pon_tbi = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/1000g_pon.hg38.vcf.gz.tbi"
snpeff_db = 105
Expand All @@ -79,6 +81,7 @@ params {
'Ensembl.GRCh37' {
bwa = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/BWAIndex/version0.6.0/"
fasta = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/WholeGenomeFasta/genome.fa"
ngscheckmate_bed = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/NGSCheckMate/SNP_GRCh37_hg19_woChr.bed"
readme = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/README.txt"
snpeff_db = 87
snpeff_genome = 'GRCh37'
Expand All @@ -89,6 +92,7 @@ params {
'NCBI.GRCh38' {
bwa = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/version0.6.0/"
fasta = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/WholeGenomeFasta/genome.fa"
ngscheckmate_bed ="${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Annotation/NGSCheckMate/SNP_GRCh38_hg38_wChr.bed"
snpeff_db = 105
snpeff_genome = 'GRCh38'
vep_cache_version = 110
Expand Down
26 changes: 26 additions & 0 deletions conf/modules/ngscheckmate.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
process {
// Settings for the BCFTOOLS_MPILEUP process inside the BAM_NGSCHECKMATE subworkflow
// (the selector is a regex matched against the fully-qualified process name).
withName: ".*BAM_NGSCHECKMATE:BCFTOOLS_MPILEUP" {
maxulysse marked this conversation as resolved.
Show resolved Hide resolved

// Publish outputs matching the glob below into <outdir>/reports/ngscheckmate/vcfs.
publishDir = [
mode: params.publish_dir_mode,
path: { "${params.outdir}/reports/ngscheckmate/vcfs" },
pattern: "*{vcf.gz}"
]
// Output file prefix: the sample id from the meta map followed by ".ngscheckmate".
ext.prefix = { "${meta.id}.ngscheckmate" }
// True only when `--tools` is set and its comma-separated list contains 'ngscheckmate'.
ext.when = { params.tools && params.tools.split(',').contains('ngscheckmate') }
// NOTE(review): args2/args3 are presumably forwarded by the module to its second and
// third bcftools commands (call / view) -- confirm against the module's main.nf.
ext.args2 = '--no-version --ploidy 1 -c'
ext.args3 = '--no-version'
}

// Settings for the NGSCHECKMATE_NCM process inside the BAM_NGSCHECKMATE subworkflow
// (the selector is a regex matched against the fully-qualified process name).
withName: ".*BAM_NGSCHECKMATE:NGSCHECKMATE_NCM" {
    // Extra command-line argument supplied to the module via task.ext.args.
    ext.args = '-V'

    // Publish the process outputs into <outdir>/reports/ngscheckmate/.
    // The "*" pattern alone would also match the module's versions.yml, so saveAs
    // returns null for that file to keep it out of the reports directory.
    publishDir = [
        mode: params.publish_dir_mode,
        path: { "${params.outdir}/reports/ngscheckmate/" },
        pattern: "*",
        saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
    ]
}

}
1 change: 1 addition & 0 deletions conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ params {
bcftools_annotations = "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/illumina/vcf/test2.vcf.gz"
bcftools_annotations_index = "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/illumina/vcf/test2.vcf.gz.tbi"
bcftools_header_lines = "${projectDir}/tests/config/bcfann_test_header.txt"
ngscheckmate_bed = "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/chr21/germlineresources/SNP_GRCh38_hg38_wChr.bed"

SPPearce marked this conversation as resolved.
Show resolved Hide resolved
// default params
split_fastq = 0 // no FASTQ splitting
Expand Down
1 change: 1 addition & 0 deletions conf/test/cache.config
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ params {
vep_cache_version = 110
vep_genome = 'WBcel235'
vep_species = 'caenorhabditis_elegans'
ngscheckmate_bed = params.test_data['homo_sapiens']['genome']['ngscheckmate_bed']

// default params
split_fastq = 0 // no FASTQ splitting
Expand Down
2 changes: 1 addition & 1 deletion conf/test_full.config
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ params {
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/sarek/testdata/csv/HCC1395_WXS_somatic_full_test.csv'

// Other params
tools = 'strelka,mutect2,freebayes,ascat,manta,cnvkit,tiddit,controlfreec,vep'
tools = 'strelka,mutect2,freebayes,ascat,manta,cnvkit,tiddit,controlfreec,vep,ngscheckmate'
split_fastq = 20000000
intervals = 's3://ngi-igenomes/test-data/sarek/S07604624_Padded_Agilent_SureSelectXT_allexons_V6_UTR.bed'
wes = true
Expand Down
20 changes: 20 additions & 0 deletions docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
- [FastQC](#fastqc)
- [FastP](#fastp)
- [Mosdepth](#mosdepth)
- [NGSCheckMate](#ngscheckmate)
- [GATK MarkDuplicates reports](#gatk-markduplicates-reports)
- [Sentieon Dedup reports](#sentieon-dedup-reports)
- [samtools stats](#samtools-stats)
Expand Down Expand Up @@ -985,6 +986,25 @@ Plots will show:
- CSI index for per-base depth for targeted data, per-window (500bp) depth of WGS
</details>

#### NGSCheckMate

[NGSCheckMate](https://github.com/parklab/NGSCheckMate) is a tool for determining whether samples come from the same genetic individual, using a set of commonly heterozygous SNPs. This enables the detection of sample mislabelling events. The output includes a text file indicating whether samples have matched or not according to the algorithm, as well as a dendrogram visualising these results.

<details markdown="1">
<summary>Output files for all samples</summary>

**Output directory: `{outdir}/reports/ngscheckmate/`**

- `ngscheckmate_all.txt`
- Tab delimited text file listing all the comparisons made, whether they were considered as a match, with the correlation and a normalised depth.
- `ngscheckmate_matched.txt`
- Tab delimited text file listing only the comparisons that were considered to match, with the correlation and a normalised depth.
- `ngscheckmate_output_corr_matrix.txt`
- Tab delimited text file containing a matrix of all correlations for all comparisons made.
- `vcfs/<sample>.ngscheckmate.vcf.gz`
- Set of vcf files for each sample. Contains calls for the set of SNP positions used to calculate sample relatedness.
</details>

#### GATK MarkDuplicates reports

More information in the [GATK MarkDuplicates section](#gatk-markduplicates)
Expand Down
1 change: 1 addition & 0 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ params.known_indels = WorkflowMain.getGenomeAttribute(params, 'known_in
params.known_indels_tbi = WorkflowMain.getGenomeAttribute(params, 'known_indels_tbi')
params.known_indels_vqsr = WorkflowMain.getGenomeAttribute(params, 'known_indels_vqsr')
params.mappability = WorkflowMain.getGenomeAttribute(params, 'mappability')
params.ngscheckmate_bed = WorkflowMain.getGenomeAttribute(params, 'ngscheckmate_bed')
params.pon = WorkflowMain.getGenomeAttribute(params, 'pon')
params.pon_tbi = WorkflowMain.getGenomeAttribute(params, 'pon_tbi')
params.snpeff_db = WorkflowMain.getGenomeAttribute(params, 'snpeff_db')
Expand Down
12 changes: 11 additions & 1 deletion modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
"bcftools/mpileup": {
"branch": "master",
"git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a",
"installed_by": ["modules"]
"installed_by": ["bam_ngscheckmate", "modules"]
},
"bcftools/sort": {
"branch": "master",
Expand Down Expand Up @@ -334,6 +334,11 @@
"git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a",
"installed_by": ["modules"]
},
"ngscheckmate/ncm": {
"branch": "master",
"git_sha": "32d6725f584ebf460de39b7c1c53a29d5384d697",
"installed_by": ["bam_ngscheckmate"]
},
"samblaster": {
"branch": "master",
"git_sha": "8fc1d24c710ebe1d5de0f2447ec9439fd3d9d66a",
Expand Down Expand Up @@ -483,6 +488,11 @@
},
"subworkflows": {
"nf-core": {
"bam_ngscheckmate": {
"branch": "master",
"git_sha": "32d6725f584ebf460de39b7c1c53a29d5384d697",
"installed_by": ["subworkflows"]
},
"vcf_annotate_ensemblvep": {
"branch": "master",
"git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f",
Expand Down
64 changes: 64 additions & 0 deletions modules/nf-core/ngscheckmate/ncm/main.nf

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

77 changes: 77 additions & 0 deletions modules/nf-core/ngscheckmate/ncm/meta.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,9 @@ includeConfig 'conf/modules/recalibrate.config'
includeConfig 'conf/modules/trimming.config'
includeConfig 'conf/modules/umi.config'

//ngscheckmate
includeConfig 'conf/modules/ngscheckmate.config'

// variant calling
includeConfig 'conf/modules/ascat.config'
includeConfig 'conf/modules/cnvkit.config'
Expand All @@ -393,7 +396,6 @@ includeConfig 'conf/modules/sentieon_haplotyper.config'
includeConfig 'conf/modules/sentieon_haplotyper_joint_germline.config'
includeConfig 'conf/modules/strelka.config'
includeConfig 'conf/modules/tiddit.config'

includeConfig 'conf/modules/post_variant_calling.config'

//annotate
Expand Down
9 changes: 8 additions & 1 deletion nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,8 @@
"fa_icon": "fas fa-toolbox",
"description": "Tools to use for duplicate marking, variant calling and/or for annotation.",
"help_text": "Multiple tools separated with commas.\n\n**Variant Calling:**\n\nGermline variant calling can currently be performed with the following variant callers:\n- SNPs/Indels: DeepVariant, FreeBayes, GATK HaplotypeCaller, mpileup, Sentieon Haplotyper, Strelka\n- Structural Variants: Manta, TIDDIT\n- Copy-number: CNVKit\n\nTumor-only somatic variant calling can currently be performed with the following variant callers:\n- SNPs/Indels: FreeBayes, mpileup, Mutect2, Strelka\n- Structural Variants: Manta, TIDDIT\n- Copy-number: CNVKit, ControlFREEC\n\nSomatic variant calling can currently only be performed with the following variant callers:\n- SNPs/Indels: FreeBayes, Mutect2, Strelka2\n- Structural variants: Manta, TIDDIT\n- Copy-Number: ASCAT, CNVKit, Control-FREEC\n- Microsatellite Instability: MSIsensorpro\n\n> **NB** Mutect2 for somatic variant calling cannot be combined with `--no_intervals`\n\n**Annotation:**\n \n- snpEff, VEP, merge (both consecutively).\n\n> **NB** As Sarek will use bgzip and tabix to compress and index VCF files annotated, it expects VCF files to be sorted when starting from `--step annotate`.",
"pattern": "^((ascat|bcfann|cnvkit|controlfreec|deepvariant|freebayes|haplotypecaller|sentieon_dnascope|sentieon_haplotyper|manta|merge|mpileup|msisensorpro|mutect2|sentieon_dedup|snpeff|strelka|tiddit|vep)?,?)*(?<!,)$"

"pattern": "^((ascat|bcfann|cnvkit|controlfreec|deepvariant|freebayes|haplotypecaller|sentieon_dnascope|sentieon_haplotyper|manta|merge|mpileup|msisensorpro|mutect2|ngscheckmate|sentieon_dedup|snpeff|strelka|tiddit|vep)?,?)*(?<!,)$"
},
"skip_tools": {
"type": "string",
Expand Down Expand Up @@ -752,6 +753,12 @@
"hidden": true,
"help_text": "If you use AWS iGenomes, this has already been set for you appropriately."
},
"ngscheckmate_bed": {
"type": "string",
"fa_icon": "fas fa-file",
"description": "Path to SNP bed file for sample checking with NGSCheckMate",
"help_text": "If you use AWS iGenomes, this has already been set for you appropriately."
},
"snpeff_db": {
"type": "string",
"fa_icon": "fas fa-database",
Expand Down
30 changes: 30 additions & 0 deletions subworkflows/local/cram_sampleqc/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
include { BAM_NGSCHECKMATE } from '../../../subworkflows/nf-core/bam_ngscheckmate/main'

//
// CRAM_SAMPLEQC: run the BAM_NGSCHECKMATE subworkflow on the CRAM files and
// re-emit its outputs together with the collected versions channel.
//
workflow CRAM_SAMPLEQC {

    take:
    ch_cram          // channel: [ val(meta), cram, crai ]
    ngscheckmate_bed // channel: [ ngscheckmate_bed ]
    fasta            // channel: [ fasta ]

    main:

    // Pair each reference input with a fixed meta map
    ch_ngscheckmate_bed = ngscheckmate_bed.map { bed_file -> [[id: "ngscheckmate"], bed_file] }
    ch_fasta            = fasta.map { genome_fasta -> [[id: "genome"], genome_fasta] }

    // Drop the index element so only [ meta, cram ] is passed on
    ch_cram_no_index = ch_cram.map { meta, cram, crai -> [meta, cram] }

    BAM_NGSCHECKMATE(ch_cram_no_index, ch_ngscheckmate_bed, ch_fasta)

    // Only the first emitted versions item is kept
    ch_versions = Channel.empty().mix(BAM_NGSCHECKMATE.out.versions.first())

    emit:
    corr_matrix = BAM_NGSCHECKMATE.out.corr_matrix // channel: [ meta, corr_matrix ]
    matched     = BAM_NGSCHECKMATE.out.matched     // channel: [ meta, matched ]
    all         = BAM_NGSCHECKMATE.out.all         // channel: [ meta, all ]
    vcf         = BAM_NGSCHECKMATE.out.vcf         // channel: [ meta, vcf ]
    pdf         = BAM_NGSCHECKMATE.out.pdf         // channel: [ meta, pdf ]

    versions    = ch_versions                      // channel: [ versions.yml ]
}

Loading