Skip to content

Commit

Permalink
Merge pull request #146 from lconde-ucl/dev
Browse files Browse the repository at this point in the history
Fix issue #135: mutect2 fails with --no_intervals option
  • Loading branch information
maxulysse authored Mar 6, 2020
2 parents 6e7c9f0 + 9c45c36 commit b27014f
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 49 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ jobs:
docker pull nfcore/sarek:dev
docker tag nfcore/sarek:dev nfcore/sarek:dev
- name: Run test for minimal genomes
run: nextflow run . -profile test,docker --skipQC all --verbose --genome ${{ matrix.genome }} ${{ matrix.intervals }} --tools Manta,mpileup,Strelka
run: nextflow run . -profile test,docker --skipQC all --verbose --genome ${{ matrix.genome }} ${{ matrix.intervals }} --tools Manta,mpileup,Strelka,FreeBayes

profile:
env:
Expand Down Expand Up @@ -146,4 +146,4 @@ jobs:
docker pull nfcore/sarek:dev
docker tag nfcore/sarek:dev nfcore/sarek:dev
- name: Run ${{ matrix.tool }} test
run: nextflow run . -profile test_tool,docker --verbose --tools ${{ matrix.tool }}
run: nextflow run . -profile test_tool,docker --verbose --tools ${{ matrix.tool }}
91 changes: 52 additions & 39 deletions bin/concatenateVCFs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ set -euo pipefail
# This script concatenates all VCF files that are in the local directory,
# that were created from different intervals to make a single final VCF

usage() { echo "Usage: $0 [-i genome_index_file] [-o output.file.no.gz.extension] <-t target.bed> <-c cpus>" 1>&2; exit 1; }
usage() { echo "Usage: $0 [-i genome_index_file] [-o output.file.no.gz.extension] <-t target.bed> <-c cpus> <-n>" 1>&2; exit 1; }

while [[ $# -gt 0 ]]
do
Expand All @@ -30,6 +30,10 @@ do
shift # past argument
shift # past value
;;
-n)
noInt=1
shift # past argument
;;
*)
usage
shift # past argument
Expand All @@ -41,45 +45,54 @@ if [ -z ${genomeIndex} ]; then echo "Missing index file "; usage; fi
if [ -z ${cpus} ]; then echo "No CPUs defined: setting to 1"; cpus=1; fi
if [ -z ${outputFile} ]; then echo "Missing output file name"; usage; fi

# First make a header from one of the VCF
# Remove interval information from the GATK command-line, but leave the rest
FIRSTVCF=$(ls *.vcf | head -n 1)
sed -n '/^[^#]/q;p' $FIRSTVCF | \
awk '!/GATKCommandLine/{print}/GATKCommandLine/{for(i=1;i<=NF;i++){if($i!~/intervals=/ && $i !~ /out=/){printf("%s ",$i)}}printf("\n")}' \
> header

# Get list of contigs from the FASTA index (.fai)
# ##contig header in the VCF cannot be used as it is optional (FreeBayes does not save it, for example)

CONTIGS=($(cut -f1 ${genomeIndex}))

# Concatenate VCFs in the correct order
(
cat header

for chr in "${CONTIGS[@]}"; do
# Skip if globbing would not match any file to avoid errors such as
# "ls: cannot access chr3_*.vcf: No such file or directory" when chr3
# was not processed.
pattern="${chr}_*.vcf"
if ! compgen -G "${pattern}" > /dev/null; then continue; fi

# ls -v sorts by numeric value ("version"), which means that chr1_100_
# is sorted *after* chr1_99_.
for vcf in $(ls -v ${pattern}); do
# Determine length of header.
# The 'q' command makes sed exit when it sees the first non-header
# line, which avoids reading in the entire file.
L=$(sed -n '/^[^#]/q;p' ${vcf} | wc -l)

# Then print all non-header lines. Since tail is very fast (nearly as
# fast as cat), this is way more efficient than using a single sed,
# awk or grep command.
tail -n +$((L+1)) ${vcf}
done
done
) | bgzip -@${cpus} > rawcalls.vcf.gz
tabix rawcalls.vcf.gz
if [ -z ${noInt+x} ]
then
# First make a header from one of the VCF
# Remove interval information from the GATK command-line, but leave the rest
FIRSTVCF=$(ls *.vcf | head -n 1)
sed -n '/^[^#]/q;p' $FIRSTVCF | \
awk '!/GATKCommandLine/{print}/GATKCommandLine/{for(i=1;i<=NF;i++){if($i!~/intervals=/ && $i !~ /out=/){printf("%s ",$i)}}printf("\n")}' \
> header

# Get list of contigs from the FASTA index (.fai)
# ##contig header in the VCF cannot be used as it is optional (FreeBayes does not save it, for example)

CONTIGS=($(cut -f1 ${genomeIndex}))

# Concatenate VCFs in the correct order
(
cat header

for chr in "${CONTIGS[@]}"; do
# Skip if globbing would not match any file to avoid errors such as
# "ls: cannot access chr3_*.vcf: No such file or directory" when chr3
# was not processed.
pattern="${chr}_*.vcf"
if ! compgen -G "${pattern}" > /dev/null; then continue; fi

# ls -v sorts by numeric value ("version"), which means that chr1_100_
# is sorted *after* chr1_99_.
for vcf in $(ls -v ${pattern}); do
# Determine length of header.
# The 'q' command makes sed exit when it sees the first non-header
# line, which avoids reading in the entire file.
L=$(sed -n '/^[^#]/q;p' ${vcf} | wc -l)

# Then print all non-header lines. Since tail is very fast (nearly as
# fast as cat), this is way more efficient than using a single sed,
# awk or grep command.
tail -n +$((L+1)) ${vcf}
done
done
) | bgzip -@${cpus} > rawcalls.vcf.gz
tabix rawcalls.vcf.gz
else
VCF=$(ls no_intervals*.vcf)
cp $VCF rawcalls.vcf
bgzip -@${cpus} rawcalls.vcf
tabix rawcalls.vcf.gz
fi

set +u

Expand Down
22 changes: 14 additions & 8 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -1755,12 +1755,13 @@ process HaplotypeCaller {
when: 'haplotypecaller' in tools

script:
intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}"
"""
gatk --java-options "-Xmx${task.memory.toGiga()}g -Xms6000m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \
HaplotypeCaller \
-R ${fasta} \
-I ${bam} \
-L ${intervalBed} \
${intervalsOptions} \
-D ${dbsnp} \
-O ${intervalBed.baseName}_${idSample}.g.vcf \
-ERC GVCF
Expand Down Expand Up @@ -1792,6 +1793,7 @@ process GenotypeGVCFs {

script:
// Using -L is important for speed and we have to index the interval files also
intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}"
"""
gatk --java-options -Xmx${task.memory.toGiga()}g \
IndexFeatureFile \
Expand All @@ -1800,7 +1802,7 @@ process GenotypeGVCFs {
gatk --java-options -Xmx${task.memory.toGiga()}g \
GenotypeGVCFs \
-R ${fasta} \
-L ${intervalBed} \
${intervalsOptions} \
-D ${dbsnp} \
-V ${gvcf} \
-O ${intervalBed.baseName}_${idSample}.vcf
Expand Down Expand Up @@ -2090,6 +2092,7 @@ process FreeBayes {
when: 'freebayes' in tools

script:
intervalsOptions = params.no_intervals ? "" : "-t ${intervalBed}"
"""
freebayes \
-f ${fasta} \
Expand All @@ -2101,7 +2104,7 @@ process FreeBayes {
--min-alternate-fraction 0.03 \
--min-repeat-entropy 1 \
--min-alternate-count 2 \
-t ${intervalBed} \
${intervalsOptions} \
${bamTumor} \
${bamNormal} > ${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf
"""
Expand Down Expand Up @@ -2137,14 +2140,15 @@ process Mutect2 {
// please make a panel-of-normals, using at least 40 samples
// https://gatkforums.broadinstitute.org/gatk/discussion/11136/how-to-call-somatic-mutations-using-gatk4-mutect2
PON = params.pon ? "--panel-of-normals ${pon}" : ""
intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}"
"""
# Get raw calls
gatk --java-options "-Xmx${task.memory.toGiga()}g" \
Mutect2 \
-R ${fasta}\
-I ${bamTumor} -tumor ${idSampleTumor} \
-I ${bamNormal} -normal ${idSampleNormal} \
-L ${intervalBed} \
${intervalsOptions} \
--germline-resource ${germlineResource} \
${PON} \
-O ${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf
Expand Down Expand Up @@ -2219,8 +2223,9 @@ process ConcatVCF {
else
outputFile = "${variantCaller}_${idSample}.vcf"
options = params.target_bed ? "-t ${targetBED}" : ""
intervalsOptions = params.no_intervals ? "-n" : ""
"""
concatenateVCFs.sh -i ${fastaFai} -c ${task.cpus} -o ${outputFile} ${options}
concatenateVCFs.sh -i ${fastaFai} -c ${task.cpus} -o ${outputFile} ${options} ${intervalsOptions}
"""
}

Expand Down Expand Up @@ -2250,12 +2255,13 @@ process PileupSummariesForMutect2 {
when: 'mutect2' in tools

script:
intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}"
"""
gatk --java-options "-Xmx${task.memory.toGiga()}g" \
GetPileupSummaries \
-I ${bamTumor} \
-V ${germlineResource} \
-L ${intervalBed} \
${intervalsOptions} \
-O ${intervalBed.baseName}_${idSampleTumor}_pileupsummaries.table
"""
}
Expand Down Expand Up @@ -2778,11 +2784,11 @@ mpileupOutTumor = Channel.create()
mpileupOut
.choice(mpileupOutTumor, mpileupOutNormal) {statusMap[it[0], it[1]] == 0 ? 1 : 0}

mpileupOut = mpileupOutNormal.combine(mpileupOutTumor)
mpileupOut = mpileupOutNormal.combine(mpileupOutTumor, by:0)

mpileupOut = mpileupOut.map {
idPatientNormal, idSampleNormal, mpileupOutNormal,
idPatientTumor, idSampleTumor, mpileupOutTumor ->
idSampleTumor, mpileupOutTumor ->
[idPatientNormal, idSampleNormal, idSampleTumor, mpileupOutNormal, mpileupOutTumor]
}

Expand Down

0 comments on commit b27014f

Please sign in to comment.