Merge pull request #146 from lconde-ucl/dev

Fix issue #135: mutect2 fails with --no_intervals option
nf-core · Mar 6, 2020 · b27014f · b27014f
2 parents 6e7c9f0 + 9c45c36
commit b27014f
Show file tree

Hide file tree

Showing 3 changed files with 68 additions and 49 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -100,7 +100,7 @@ jobs:
           docker pull nfcore/sarek:dev
           docker tag nfcore/sarek:dev nfcore/sarek:dev
       - name: Run test for minimal genomes
-        run: nextflow run . -profile test,docker --skipQC all --verbose --genome ${{ matrix.genome }} ${{ matrix.intervals }} --tools Manta,mpileup,Strelka
+        run: nextflow run . -profile test,docker --skipQC all --verbose --genome ${{ matrix.genome }} ${{ matrix.intervals }} --tools Manta,mpileup,Strelka,FreeBayes
 
   profile:
     env:
@@ -146,4 +146,4 @@ jobs:
           docker pull nfcore/sarek:dev
           docker tag nfcore/sarek:dev nfcore/sarek:dev
       - name: Run ${{ matrix.tool }} test
-        run: nextflow run . -profile test_tool,docker --verbose --tools ${{ matrix.tool }}
+        run: nextflow run . -profile test_tool,docker --verbose --tools ${{ matrix.tool }}
diff --git a/bin/concatenateVCFs.sh b/bin/concatenateVCFs.sh
@@ -4,7 +4,7 @@ set -euo pipefail
 # This script concatenates all VCF files that are in the local directory,
 # that were created from different intervals to make a single final VCF
 
-usage() { echo "Usage: $0 [-i genome_index_file] [-o output.file.no.gz.extension] <-t target.bed> <-c cpus>" 1>&2; exit 1; }
+usage() { echo "Usage: $0 [-i genome_index_file] [-o output.file.no.gz.extension] <-t target.bed> <-c cpus> <-n>" 1>&2; exit 1; }
 
 while [[ $# -gt 0 ]]
 do
@@ -30,6 +30,10 @@ do
 			shift # past argument
 	    shift # past value
 			;;
+		-n)
+			noInt=1
+			shift # past argument
+			;;
 		*)
 			usage
 			shift # past argument
@@ -41,45 +45,54 @@ if [ -z ${genomeIndex} ]; then echo "Missing index file "; usage; fi
 if [ -z ${cpus} ]; then echo "No CPUs defined: setting to 1"; cpus=1; fi
 if [ -z ${outputFile} ]; then echo "Missing output file name"; usage; fi
 
-# First make a header from one of the VCF
-# Remove interval information from the GATK command-line, but leave the rest
-FIRSTVCF=$(ls *.vcf | head -n 1)
-sed -n '/^[^#]/q;p' $FIRSTVCF | \
-awk '!/GATKCommandLine/{print}/GATKCommandLine/{for(i=1;i<=NF;i++){if($i!~/intervals=/ && $i !~ /out=/){printf("%s ",$i)}}printf("\n")}' \
-> header
-
-# Get list of contigs from the FASTA index (.fai)
-# ##contig header in the VCF cannot be used as it is optional (FreeBayes does not save it, for example)
-
-CONTIGS=($(cut -f1 ${genomeIndex}))
-
-# Concatenate VCFs in the correct order
-(
-  cat header
 
-  for chr in "${CONTIGS[@]}"; do
-    # Skip if globbing would not match any file to avoid errors such as
-    # "ls: cannot access chr3_*.vcf: No such file or directory" when chr3
-    # was not processed.
-    pattern="${chr}_*.vcf"
-    if ! compgen -G "${pattern}" > /dev/null; then continue; fi
-
-    # ls -v sorts by numeric value ("version"), which means that chr1_100_
-    # is sorted *after* chr1_99_.
-    for vcf in $(ls -v ${pattern}); do
-      # Determine length of header.
-      # The 'q' command makes sed exit when it sees the first non-header
-      # line, which avoids reading in the entire file.
-      L=$(sed -n '/^[^#]/q;p' ${vcf} | wc -l)
-
-      # Then print all non-header lines. Since tail is very fast (nearly as
-      # fast as cat), this is way more efficient than using a single sed,
-      # awk or grep command.
-      tail -n +$((L+1)) ${vcf}
-    done
-  done
-) | bgzip -@${cpus} > rawcalls.vcf.gz
-tabix rawcalls.vcf.gz
+if [ -z ${noInt+x} ] 
+then
+	# First make a header from one of the VCF
+	# Remove interval information from the GATK command-line, but leave the rest
+	FIRSTVCF=$(ls *.vcf | head -n 1)
+	sed -n '/^[^#]/q;p' $FIRSTVCF | \
+	awk '!/GATKCommandLine/{print}/GATKCommandLine/{for(i=1;i<=NF;i++){if($i!~/intervals=/ && $i !~ /out=/){printf("%s ",$i)}}printf("\n")}' \
+	> header
+
+	# Get list of contigs from the FASTA index (.fai)
+	# ##contig header in the VCF cannot be used as it is optional (FreeBayes does not save it, for example)
+
+	CONTIGS=($(cut -f1 ${genomeIndex}))
+
+	# Concatenate VCFs in the correct order
+	(
+	  cat header
+
+	  for chr in "${CONTIGS[@]}"; do
+	    # Skip if globbing would not match any file to avoid errors such as
+	    # "ls: cannot access chr3_*.vcf: No such file or directory" when chr3
+	    # was not processed.
+	    pattern="${chr}_*.vcf"
+	    if ! compgen -G "${pattern}" > /dev/null; then continue; fi
+
+	    # ls -v sorts by numeric value ("version"), which means that chr1_100_
+	    # is sorted *after* chr1_99_.
+	    for vcf in $(ls -v ${pattern}); do
+	      # Determine length of header.
+	      # The 'q' command makes sed exit when it sees the first non-header
+	      # line, which avoids reading in the entire file.
+	      L=$(sed -n '/^[^#]/q;p' ${vcf} | wc -l)
+
+	      # Then print all non-header lines. Since tail is very fast (nearly as
+	      # fast as cat), this is way more efficient than using a single sed,
+	      # awk or grep command.
+	      tail -n +$((L+1)) ${vcf}
+	    done
+	  done
+	) | bgzip -@${cpus} > rawcalls.vcf.gz
+	tabix rawcalls.vcf.gz
+else
+        VCF=$(ls no_intervals*.vcf)
+        cp $VCF rawcalls.vcf 
+        bgzip -@${cpus} rawcalls.vcf
+        tabix rawcalls.vcf.gz
+fi
 
 set +u
 

diff --git a/main.nf b/main.nf
@@ -1755,12 +1755,13 @@ process HaplotypeCaller {
     when: 'haplotypecaller' in tools
 
     script:
+    intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}"
     """
     gatk --java-options "-Xmx${task.memory.toGiga()}g -Xms6000m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \
         HaplotypeCaller \
         -R ${fasta} \
         -I ${bam} \
-        -L ${intervalBed} \
+        ${intervalsOptions} \
         -D ${dbsnp} \
         -O ${intervalBed.baseName}_${idSample}.g.vcf \
         -ERC GVCF
@@ -1792,6 +1793,7 @@ process GenotypeGVCFs {
 
     script:
     // Using -L is important for speed and we have to index the interval files also
+    intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}"
     """
     gatk --java-options -Xmx${task.memory.toGiga()}g \
         IndexFeatureFile \
@@ -1800,7 +1802,7 @@ process GenotypeGVCFs {
     gatk --java-options -Xmx${task.memory.toGiga()}g \
         GenotypeGVCFs \
         -R ${fasta} \
-        -L ${intervalBed} \
+        ${intervalsOptions} \
         -D ${dbsnp} \
         -V ${gvcf} \
         -O ${intervalBed.baseName}_${idSample}.vcf
@@ -2090,6 +2092,7 @@ process FreeBayes {
     when: 'freebayes' in tools
 
     script:
+    intervalsOptions = params.no_intervals ? "" : "-t ${intervalBed}"
     """
     freebayes \
         -f ${fasta} \
@@ -2101,7 +2104,7 @@ process FreeBayes {
         --min-alternate-fraction 0.03 \
         --min-repeat-entropy 1 \
         --min-alternate-count 2 \
-        -t ${intervalBed} \
+        ${intervalsOptions} \
         ${bamTumor} \
         ${bamNormal} > ${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf
     """
@@ -2137,14 +2140,15 @@ process Mutect2 {
     // please make a panel-of-normals, using at least 40 samples
     // https://gatkforums.broadinstitute.org/gatk/discussion/11136/how-to-call-somatic-mutations-using-gatk4-mutect2
     PON = params.pon ? "--panel-of-normals ${pon}" : ""
+    intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}"
     """
     # Get raw calls
     gatk --java-options "-Xmx${task.memory.toGiga()}g" \
       Mutect2 \
       -R ${fasta}\
       -I ${bamTumor}  -tumor ${idSampleTumor} \
       -I ${bamNormal} -normal ${idSampleNormal} \
-      -L ${intervalBed} \
+      ${intervalsOptions} \
       --germline-resource ${germlineResource} \
       ${PON} \
       -O ${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf
@@ -2219,8 +2223,9 @@ process ConcatVCF {
     else
           outputFile = "${variantCaller}_${idSample}.vcf"
     options = params.target_bed ? "-t ${targetBED}" : ""
+    intervalsOptions = params.no_intervals ? "-n" : ""
     """
-    concatenateVCFs.sh -i ${fastaFai} -c ${task.cpus} -o ${outputFile} ${options}
+    concatenateVCFs.sh -i ${fastaFai} -c ${task.cpus} -o ${outputFile} ${options} ${intervalsOptions}
     """
 }
 
@@ -2250,12 +2255,13 @@ process PileupSummariesForMutect2 {
     when: 'mutect2' in tools
 
     script:
+    intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}"
     """
     gatk --java-options "-Xmx${task.memory.toGiga()}g" \
         GetPileupSummaries \
         -I ${bamTumor} \
         -V ${germlineResource} \
-        -L ${intervalBed} \
+        ${intervalsOptions} \
         -O ${intervalBed.baseName}_${idSampleTumor}_pileupsummaries.table
     """
 }
@@ -2778,11 +2784,11 @@ mpileupOutTumor = Channel.create()
 mpileupOut
     .choice(mpileupOutTumor, mpileupOutNormal) {statusMap[it[0], it[1]] == 0 ? 1 : 0}
 
-mpileupOut = mpileupOutNormal.combine(mpileupOutTumor)
+mpileupOut = mpileupOutNormal.combine(mpileupOutTumor, by:0)
 
 mpileupOut = mpileupOut.map {
     idPatientNormal, idSampleNormal, mpileupOutNormal,
-    idPatientTumor, idSampleTumor, mpileupOutTumor ->
+    idSampleTumor, mpileupOutTumor ->
     [idPatientNormal, idSampleNormal, idSampleTumor, mpileupOutNormal, mpileupOutTumor]
 }