Merge pull request #635 from szilvajuhos/master

To process targeted sequencing with a target BED
SciLifeLab · Sep 21, 2018 · 1539230 · 1539230
2 parents 06fa728 + 60da87c
commit 1539230
Show file tree

Hide file tree

Showing 16 changed files with 318 additions and 138 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -11,14 +11,12 @@ env:
   global:
     - NXF_VER=0.31.0 SGT_VER=2.5.1
   matrix:
-    - CE=singularity TEST=SOMATIC
     - CE=docker      TEST=SOMATIC
     - CE=docker      TEST=ANNOTATEVEP
-    - CE=singularity TEST=ANNOTATESNPEFF
     - CE=docker      TEST=ANNOTATESNPEFF
-    - CE=singularity TEST=GERMLINE
     - CE=docker      TEST=GERMLINE
 
+
 install:
   # Install Nextflow (and Singularity if needed)
   - "./scripts/install.sh --engine $CE"

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -12,10 +12,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 -   [#613](https://github.com/SciLifeLab/Sarek/pull/613) - Add Issue Templates (bug report and feature request)
 -   [#614](https://github.com/SciLifeLab/Sarek/pull/614) - Add PR Template
 -   [#615](https://github.com/SciLifeLab/Sarek/pull/615) - Add presentation
--   [#615](https://github.com/SciLifeLab/Sarek/pull/615) - Update documentation
+-   [#616](https://github.com/SciLifeLab/Sarek/pull/616) - Update documentation
 -   [#620](https://github.com/SciLifeLab/Sarek/pull/620) - Add `tmp/` to `.gitignore`
 -   [#625](https://github.com/SciLifeLab/Sarek/pull/625) - Add [`pathfindr`](https://github.com/NBISweden/pathfindr) as a submodule
 -   [#639](https://github.com/SciLifeLab/Sarek/pull/639) - Add a complete example analysis to docs
+-   [#635](https://github.com/SciLifeLab/Sarek/pull/635) - To process targeted sequencing with a target BED
 -   [#640](https://github.com/SciLifeLab/Sarek/pull/640), [#642](https://github.com/SciLifeLab/Sarek/pull/642) - Add helper script for changing version number
 
 ### `Changed`
@@ -31,7 +32,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 -   [#637](https://github.com/SciLifeLab/Sarek/pull/637) - Update tool version gathering
 -   [#638](https://github.com/SciLifeLab/Sarek/pull/638) - Use correct `.simg` extension for Singularity images
 -   [#639](https://github.com/SciLifeLab/Sarek/pull/639) - Smaller refactoring of the docs
--   [#640](https://github.com/SciLifeLab/Sarek/pull/640) - Update RELEASE_CHECKLIST
+-   [#640](https://github.com/SciLifeLab/Sarek/pull/640) - Update RELEASE\_CHECKLIST
 -   [#642](https://github.com/SciLifeLab/Sarek/pull/642) - Update conda channel order priorities
 -   [#642](https://github.com/SciLifeLab/Sarek/pull/642) - MultiQC 1.5 -> 1.6
 -   [#642](https://github.com/SciLifeLab/Sarek/pull/642) - Qualimap 2.2.2a -> 2.2.2b

diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # [![Sarek](https://raw.githubusercontent.com/SciLifeLab/Sarek/master/docs/images/Sarek_logo.png "Sarek")](http://sarek.scilifelab.se/)
 
-#### An open-source analysis pipeline to detect germline or somatic variants from whole genome sequencing
+#### An open-source analysis pipeline to detect germline or somatic variants from whole genome or targeted sequencing
 
 [![Nextflow version][nextflow-badge]][nextflow-link]
 [![Travis build status][travis-badge]][travis-link]

diff --git a/Sarek-data b/Sarek-data
diff --git a/annotate.nf b/annotate.nf
@@ -216,7 +216,7 @@ process RunVEP {
   finalannotator = annotator == "snpeff" ? 'merge' : 'vep'
   genome = params.genome == 'smallGRCh37' ? 'GRCh37' : params.genome
   """
-  vep --dir /opt/vep/.vep/ \
+  /opt/vep/src/ensembl-vep/vep --dir /opt/vep/.vep/  \
   -i ${vcf} \
   -o ${vcf.simpleName}_VEP.ann.vcf \
   --assembly ${genome} \

diff --git a/bin/concatenateVCFs.sh b/bin/concatenateVCFs.sh
@@ -0,0 +1,85 @@
+#!/usr/bin/env bash
+# this script concatenates all VCFs that are in the local directory: the 
+# purpose is to make a single VCF from all the VCFs that were created from different intervals
+
+usage() { echo "Usage: $0 [-i genome_index_file] [-o output.file.no.gz.extension] <-t target.bed> <-c cpus>" 1>&2; exit 1; }
+
+while getopts "i:c:o:t:" p; do
+	case "${p}" in
+		i)
+			genomeIndex=${OPTARG}
+			;;
+		c)
+			cpus=${OPTARG}
+			;;
+		o)
+			outputFile=${OPTARG}
+			;;
+		t)
+			targetBED=${OPTARG}
+			;;
+		*)
+			usage
+			;;
+	esac
+done
+shift $((OPTIND-1))
+
+if [ -z ${genomeIndex} ]; then echo "Missing index file "; usage; fi
+if [ -z ${cpus} ]; then echo "No CPUs defined: setting to 1"; cpus=1; fi
+if [ -z ${outputFile} ]; then echo "Missing output file name"; usage; fi
+
+set -euo pipefail
+
+# first make a header from one of the VCF intervals
+# get rid of interval information only from the GATK command-line, but leave the rest
+FIRSTVCF=$(ls *.vcf | head -n 1)
+sed -n '/^[^#]/q;p' $FIRSTVCF | \
+awk '!/GATKCommandLine/{print}/GATKCommandLine/{for(i=1;i<=NF;i++){if($i!~/intervals=/ && $i !~ /out=/){printf("%s ",$i)}}printf("\n")}' \
+> header
+
+# Get list of contigs from the FASTA index (.fai). We cannot use the ##contig
+# header in the VCF as it is optional (FreeBayes does not save it, for example)
+CONTIGS=($(cut -f1 ${genomeIndex}))
+
+# concatenate VCFs in the correct order
+(
+  cat header
+
+  for chr in "${CONTIGS[@]}"; do
+    # Skip if globbing would not match any file to avoid errors such as
+    # "ls: cannot access chr3_*.vcf: No such file or directory" when chr3
+    # was not processed.
+    pattern="${chr}_*.vcf"
+    if ! compgen -G "${pattern}" > /dev/null; then continue; fi
+
+    # ls -v sorts by numeric value ("version"), which means that chr1_100_
+    # is sorted *after* chr1_99_.
+    for vcf in $(ls -v ${pattern}); do
+      # Determine length of header.
+      # The 'q' command makes sed exit when it sees the first non-header
+      # line, which avoids reading in the entire file.
+      L=$(sed -n '/^[^#]/q;p' ${vcf} | wc -l)
+
+      # Then print all non-header lines. Since tail is very fast (nearly as
+      # fast as cat), this is way more efficient than using a single sed,
+      # awk or grep command.
+      tail -n +$((L+1)) ${vcf}
+    done
+  done
+) | bgzip -@${cpus} > rawcalls.vcf.gz
+tabix rawcalls.vcf.gz
+
+set +u
+
+# now we have the concatenated VCF file, check for WES/panel targets, and generate a subset if there is a BED provided
+echo "target is $targetBED"
+if [ ! -z ${targetBED+x} ]; then
+	echo "Selecting subset..."
+	bcftools isec --targets-file ${targetBED} rawcalls.vcf.gz | bgzip -@${cpus} > ${outputFile}.gz
+	tabix ${outputFile}.gz
+else
+	# simply rename the raw calls as WGS results
+	for f in rawcalls*; do mv -v $f ${outputFile}${f#rawcalls.vcf}; done
+fi
+
diff --git a/conf/base.config b/conf/base.config
@@ -33,6 +33,7 @@ params {
   step = 'mapping' // Default step is mapping
   strelkaBP = false // Don't use Manta's candidate indels as input to Strelka
   tag = 'latest' // Default tag is latest, to be overwritten by --tag <version>
+  targetBED = false // no targets by default
   test = false // Not testing by default
   verbose = false // Enable for more verbose information
   version = '2.1.0' // Workflow version

diff --git a/docs/USE_CASES.md b/docs/USE_CASES.md
@@ -164,7 +164,16 @@ SUBJECT_ID  XX    0    SAMPLEIDN    /samples/SAMPLEIDN.bam    /samples/SAMPLEIDN
 SUBJECT_ID  XX    1    SAMPLEIDT    /samples/SAMPLEIDT.bam    /samples/SAMPLEIDT.bai
 SUBJECT_ID  XX    1    SAMPLEIDR    /samples/SAMPLEIDR.bam    /samples/SAMPLEIDR.bai
 ```
-
 If you want to restart a previous run of the pipeline, you may not have a recalibrated BAM file.
 This is the case if HaplotypeCaller was the only tool (recalibration is done on-the-fly with HaplotypeCaller to improve performance and save space).
 In this case, you need to start with `--step=recalibrate` (see previous section).
+
+## Processing targeted (whole exome or panel) sequencing data
+
+The recommended flow for thrgeted sequencing data is to use the whole genome workflow as it is, but also provide a BED file containing targets for variant calling. 
+The Strelka part of the workflow will pick up these intervals, and activate the `--exome` flag to process deeper coverage. It is adviced to pad the variant calling 
+regions (exons or the target) to some extent before submitting to the workflow. To add the target BED file configure the flow like:
+
+```bash
+nextflow run SciLifeLab/Sarek/germlineVC.nf --tools haplotypecaller,strelka,mutect2 --targetBED targets.bed --sample my_panel.tsv
+```
diff --git a/germlineVC.nf b/germlineVC.nf
@@ -363,7 +363,8 @@ process ConcatVCF {
     file(genomeIndex) from Channel.value(referenceMap.genomeIndex)
 
   output:
-    set variantCaller, idPatient, idSampleNormal, idSampleTumor, file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcfConcatenated
+		// we have this funny *_* pattern to avoid copying the raw calls to publishdir
+    set variantCaller, idPatient, idSampleNormal, idSampleTumor, file("*_*.vcf.gz"), file("*_*.vcf.gz.tbi") into vcfConcatenated
 
 
   when: ( 'haplotypecaller' in tools || 'mutect2' in tools || 'freebayes' in tools ) && !params.onlyQC
@@ -373,47 +374,14 @@ process ConcatVCF {
   else if (variantCaller == 'gvcf-hc') outputFile = "haplotypecaller_${idSampleNormal}.g.vcf"
   else outputFile = "${variantCaller}_${idSampleTumor}_vs_${idSampleNormal}.vcf"
 
-  """
-	set -euo pipefail
-  # first make a header from one of the VCF intervals
-  # get rid of interval information only from the GATK command-line, but leave the rest
-  FIRSTVCF=\$(ls *.vcf | head -n 1)
-  sed -n '/^[^#]/q;p' \$FIRSTVCF | \
-  awk '!/GATKCommandLine/{print}/GATKCommandLine/{for(i=1;i<=NF;i++){if(\$i!~/intervals=/ && \$i !~ /out=/){printf("%s ",\$i)}}printf("\\n")}' \
-  > header
-
-  # Get list of contigs from the FASTA index (.fai). We cannot use the ##contig
-  # header in the VCF as it is optional (FreeBayes does not save it, for example)
-  CONTIGS=(\$(cut -f1 ${genomeIndex}))
-
-  # concatenate VCFs in the correct order
-  (
-    cat header
-
-    for chr in "\${CONTIGS[@]}"; do
-      # Skip if globbing would not match any file to avoid errors such as
-      # "ls: cannot access chr3_*.vcf: No such file or directory" when chr3
-      # was not processed.
-      pattern="\${chr}_*.vcf"
-      if ! compgen -G "\${pattern}" > /dev/null; then continue; fi
-
-      # ls -v sorts by numeric value ("version"), which means that chr1_100_
-      # is sorted *after* chr1_99_.
-      for vcf in \$(ls -v \${pattern}); do
-        # Determine length of header.
-        # The 'q' command makes sed exit when it sees the first non-header
-        # line, which avoids reading in the entire file.
-        L=\$(sed -n '/^[^#]/q;p' \${vcf} | wc -l)
-
-        # Then print all non-header lines. Since tail is very fast (nearly as
-        # fast as cat), this is way more efficient than using a single sed,
-        # awk or grep command.
-        tail -n +\$((L+1)) \${vcf}
-      done
-    done
-  ) | bgzip > ${outputFile}.gz
-  tabix ${outputFile}.gz
-  """
+	if(params.targetBED)		// targeted
+		concatOptions = "-i ${genomeIndex} -c ${task.cpus} -o ${outputFile} -t ${params.targetBED}"
+	else										// WGS
+		concatOptions = "-i ${genomeIndex} -c ${task.cpus} -o ${outputFile} "
+
+	"""
+	concatenateVCFs.sh ${concatOptions}
+	"""
 }
 
 if (params.verbose) vcfConcatenated = vcfConcatenated.view {
@@ -441,23 +409,32 @@ process RunSingleStrelka {
   when: 'strelka' in tools && !params.onlyQC
 
   script:
-  """
-  configureStrelkaGermlineWorkflow.py \
-  --bam ${bam} \
-  --referenceFasta ${genomeFile} \
-  --runDir Strelka
-
-  python Strelka/runWorkflow.py -m local -j ${task.cpus}
-
-  mv Strelka/results/variants/genome.*.vcf.gz \
-    Strelka_${idSample}_genome.vcf.gz
-  mv Strelka/results/variants/genome.*.vcf.gz.tbi \
-    Strelka_${idSample}_genome.vcf.gz.tbi
-  mv Strelka/results/variants/variants.vcf.gz \
-    Strelka_${idSample}_variants.vcf.gz
-  mv Strelka/results/variants/variants.vcf.gz.tbi \
-    Strelka_${idSample}_variants.vcf.gz.tbi
-  """
+	"""
+	if [ ! -s "${params.targetBED}" ]; then
+		# do WGS
+		configureStrelkaGermlineWorkflow.py \
+		--bam ${bam} \
+		--referenceFasta ${genomeFile} \
+		--runDir Strelka
+	else
+		# WES or targeted
+		bgzip --threads ${task.cpus} -c ${params.targetBED} > call_targets.bed.gz
+		tabix call_targets.bed.gz
+		configureStrelkaGermlineWorkflow.py \
+		--bam ${bam} \
+		--referenceFasta ${genomeFile} \
+		--exome \
+		--callRegions call_targets.bed.gz \
+		--runDir Strelka
+	fi
+
+	# always run this part
+		python Strelka/runWorkflow.py -m local -j ${task.cpus}
+		mv Strelka/results/variants/genome.*.vcf.gz Strelka_${idSample}_genome.vcf.gz
+		mv Strelka/results/variants/genome.*.vcf.gz.tbi Strelka_${idSample}_genome.vcf.gz.tbi
+		mv Strelka/results/variants/variants.vcf.gz Strelka_${idSample}_variants.vcf.gz
+		mv Strelka/results/variants/variants.vcf.gz.tbi Strelka_${idSample}_variants.vcf.gz.tbi
+	"""
 }
 
 if (params.verbose) singleStrelkaOutput = singleStrelkaOutput.view {
@@ -675,6 +652,7 @@ def minimalInformationMessage() {
   log.info "TSV file    : " + tsvFile
   log.info "Genome      : " + params.genome
   log.info "Genome_base : " + params.genome_base
+  log.info "Target BED  : " + params.targetBED
   log.info "Tools       : " + tools.join(', ')
   log.info "Containers"
   if (params.repository != "") log.info "  Repository   : " + params.repository

diff --git a/lib/QC.groovy b/lib/QC.groovy
@@ -60,7 +60,7 @@ class QC {
 // Get VEP version
   static def getVersionVEP() {
     """
-    vep --help > v_vep.txt
+    /opt/vep/src/ensembl-vep/vep --help > v_vep.txt
     """
   }
 }
diff --git a/lib/SarekUtils.groovy b/lib/SarekUtils.groovy
@@ -88,6 +88,8 @@ class SarekUtils {
       'strelka-BP',
       'strelkaBP',
       'tag',
+      'target-BED',
+      'targetBED',
       'test',
       'tools',
       'total-memory',

diff --git a/scripts/test.sh b/scripts/test.sh
@@ -8,6 +8,7 @@ PROFILE=singularity
 SAMPLE=Sarek-data/testdata/tsv/tiny.tsv
 TEST=ALL
 TRAVIS=${TRAVIS:-false}
+CPUS=2
 
 TMPDIR=`pwd`/tmp
 mkdir -p $TMPDIR
@@ -51,14 +52,18 @@ do
     BUILD=true
     shift # past value
     ;;
+    -c|--cpus)
+    CPUS=$2
+    shift # past value
+    ;;
     *) # unknown option
     shift # past argument
     ;;
   esac
 done
 
 function run_wrapper() {
-  ./scripts/wrapper.sh $@ --profile $PROFILE --genome $GENOME --genomeBase $PWD/References/$GENOME --verbose
+  ./scripts/wrapper.sh $@ --profile $PROFILE --genome $GENOME --genomeBase $PWD/References/$GENOME --verbose --cpus ${CPUS}
 }
 
 function clean_repo() {
@@ -84,18 +89,23 @@ then
   fi
 fi
 
+
 if [[ ALL,GERMLINE =~ $TEST ]]
 then
-  run_wrapper --germline --sampleDir Sarek-data/testdata/tiny/normal --variantCalling --tools HaplotypeCaller
-  run_wrapper --germline --step recalibrate --noReports
-  clean_repo
+	# Added Strelka to germline test (no Strelka best practices test for this small data) and not asking for reports
+	run_wrapper --germline --sampleDir Sarek-data/testdata/tiny/normal --variantCalling --tools HaplotypeCaller,Strelka --noReports
+	run_wrapper --germline --sampleDir Sarek-data/testdata/tiny/normal --variantCalling --tools HaplotypeCaller,Strelka --bed `pwd`/Sarek-data/testdata/target.bed --noReports
+	run_wrapper --germline --step recalibrate --noReports
+	clean_repo
 fi
 
 if [[ ALL,SOMATIC =~ $TEST ]]
 then
-  run_wrapper --somatic --sample Sarek-data/testdata/tsv/tiny-manta.tsv --variantCalling --tools FreeBayes,HaplotypeCaller,Manta,Mutect2 --noReports
-  run_wrapper --somatic --sample Sarek-data/testdata/tsv/tiny-manta.tsv --variantCalling --tools Manta,Strelka --noReports --strelkaBP
-  clean_repo
+	run_wrapper --somatic --sample Sarek-data/testdata/tsv/tiny-manta.tsv --variantCalling --tools FreeBayes,HaplotypeCaller,Manta,Mutect2 --noReports
+	run_wrapper --somatic --sample Sarek-data/testdata/tsv/tiny-manta.tsv --variantCalling --tools Manta,Strelka --noReports --strelkaBP
+	# Disabling targeted somatic as it is practically the same as the germline, and takes aaaages
+	#run_wrapper --somatic --sample Sarek-data/testdata/tsv/tiny.tsv --variantCalling --tools Mutect2,Strelka --bed `pwd`/Sarek-data/testdata/target.bed
+	clean_repo
 fi
 
 if [[ ALL,ANNOTATEALL,ANNOTATESNPEFF,ANNOTATEVEP =~ $TEST ]]
@@ -115,7 +125,7 @@ then
   clean_repo
 fi
 
-if [[ ALL,BUILDCONTAINERS =~ $TEST ]] && [[ $PROFILE == docker ]]
+if [[ BUILDCONTAINERS =~ $TEST ]] && [[ $PROFILE == docker ]]
 then
   ./scripts/do_all.sh --genome $GENOME
 fi