Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix issue #135: mutect2 fails with --no_intervals option #146

Merged
merged 6 commits into from
Mar 6, 2020
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 52 additions & 39 deletions bin/concatenateVCFs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ set -euo pipefail
# This script concatenates all VCF files that are in the local directory,
# that were created from different intervals to make a single final VCF

usage() { echo "Usage: $0 [-i genome_index_file] [-o output.file.no.gz.extension] <-t target.bed> <-c cpus>" 1>&2; exit 1; }
usage() { echo "Usage: $0 [-i genome_index_file] [-o output.file.no.gz.extension] <-t target.bed> <-c cpus> <-n>" 1>&2; exit 1; }

while [[ $# -gt 0 ]]
do
Expand All @@ -30,6 +30,10 @@ do
shift # past argument
shift # past value
;;
-n)
noInt=1
shift # past argument
;;
*)
usage
shift # past argument
Expand All @@ -41,45 +45,54 @@ if [ -z ${genomeIndex} ]; then echo "Missing index file "; usage; fi
if [ -z ${cpus} ]; then echo "No CPUs defined: setting to 1"; cpus=1; fi
if [ -z ${outputFile} ]; then echo "Missing output file name"; usage; fi

# First make a header from one of the VCF
# Remove interval information from the GATK command-line, but leave the rest
FIRSTVCF=$(ls *.vcf | head -n 1)
sed -n '/^[^#]/q;p' $FIRSTVCF | \
awk '!/GATKCommandLine/{print}/GATKCommandLine/{for(i=1;i<=NF;i++){if($i!~/intervals=/ && $i !~ /out=/){printf("%s ",$i)}}printf("\n")}' \
> header

# Get list of contigs from the FASTA index (.fai)
# ##contig header in the VCF cannot be used as it is optional (FreeBayes does not save it, for example)

CONTIGS=($(cut -f1 ${genomeIndex}))

# Concatenate VCFs in the correct order
(
cat header

for chr in "${CONTIGS[@]}"; do
# Skip if globbing would not match any file to avoid errors such as
# "ls: cannot access chr3_*.vcf: No such file or directory" when chr3
# was not processed.
pattern="${chr}_*.vcf"
if ! compgen -G "${pattern}" > /dev/null; then continue; fi

# ls -v sorts by numeric value ("version"), which means that chr1_100_
# is sorted *after* chr1_99_.
for vcf in $(ls -v ${pattern}); do
# Determine length of header.
# The 'q' command makes sed exit when it sees the first non-header
# line, which avoids reading in the entire file.
L=$(sed -n '/^[^#]/q;p' ${vcf} | wc -l)

# Then print all non-header lines. Since tail is very fast (nearly as
# fast as cat), this is way more efficient than using a single sed,
# awk or grep command.
tail -n +$((L+1)) ${vcf}
done
done
) | bgzip -@${cpus} > rawcalls.vcf.gz
tabix rawcalls.vcf.gz
if [ -z ${noInt+x} ]
then
# First make a header from one of the VCF
# Remove interval information from the GATK command-line, but leave the rest
FIRSTVCF=$(ls *.vcf | head -n 1)
sed -n '/^[^#]/q;p' $FIRSTVCF | \
awk '!/GATKCommandLine/{print}/GATKCommandLine/{for(i=1;i<=NF;i++){if($i!~/intervals=/ && $i !~ /out=/){printf("%s ",$i)}}printf("\n")}' \
> header

# Get list of contigs from the FASTA index (.fai)
# ##contig header in the VCF cannot be used as it is optional (FreeBayes does not save it, for example)

CONTIGS=($(cut -f1 ${genomeIndex}))

# Concatenate VCFs in the correct order
(
cat header

for chr in "${CONTIGS[@]}"; do
# Skip if globbing would not match any file to avoid errors such as
# "ls: cannot access chr3_*.vcf: No such file or directory" when chr3
# was not processed.
pattern="${chr}_*.vcf"
if ! compgen -G "${pattern}" > /dev/null; then continue; fi

# ls -v sorts by numeric value ("version"), which means that chr1_100_
# is sorted *after* chr1_99_.
for vcf in $(ls -v ${pattern}); do
# Determine length of header.
# The 'q' command makes sed exit when it sees the first non-header
# line, which avoids reading in the entire file.
L=$(sed -n '/^[^#]/q;p' ${vcf} | wc -l)

# Then print all non-header lines. Since tail is very fast (nearly as
# fast as cat), this is way more efficient than using a single sed,
# awk or grep command.
tail -n +$((L+1)) ${vcf}
done
done
) | bgzip -@${cpus} > rawcalls.vcf.gz
tabix rawcalls.vcf.gz
else
VCF=$(ls no_intervals*.vcf)
cp $VCF rawcalls.vcf
bgzip -@${cpus} rawcalls.vcf
tabix rawcalls.vcf.gz
fi

set +u

Expand Down
22 changes: 14 additions & 8 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -1755,12 +1755,13 @@ process HaplotypeCaller {
when: 'haplotypecaller' in tools

script:
intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}"
"""
gatk --java-options "-Xmx${task.memory.toGiga()}g -Xms6000m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \
HaplotypeCaller \
-R ${fasta} \
-I ${bam} \
-L ${intervalBed} \
${intervalsOptions} \
-D ${dbsnp} \
-O ${intervalBed.baseName}_${idSample}.g.vcf \
-ERC GVCF
Expand Down Expand Up @@ -1792,6 +1793,7 @@ process GenotypeGVCFs {

script:
// Using -L is important for speed and we have to index the interval files also
intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}"
"""
gatk --java-options -Xmx${task.memory.toGiga()}g \
IndexFeatureFile \
Expand All @@ -1800,7 +1802,7 @@ process GenotypeGVCFs {
gatk --java-options -Xmx${task.memory.toGiga()}g \
GenotypeGVCFs \
-R ${fasta} \
-L ${intervalBed} \
${intervalsOptions} \
-D ${dbsnp} \
-V ${gvcf} \
-O ${intervalBed.baseName}_${idSample}.vcf
Expand Down Expand Up @@ -2090,6 +2092,7 @@ process FreeBayes {
when: 'freebayes' in tools

script:
intervalsOptions = params.no_intervals ? "" : "-t ${intervalBed}"
"""
freebayes \
-f ${fasta} \
Expand All @@ -2101,7 +2104,7 @@ process FreeBayes {
--min-alternate-fraction 0.03 \
--min-repeat-entropy 1 \
--min-alternate-count 2 \
-t ${intervalBed} \
${intervalsOptions} \
${bamTumor} \
${bamNormal} > ${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf
"""
Expand Down Expand Up @@ -2137,14 +2140,15 @@ process Mutect2 {
// please make a panel-of-normals, using at least 40 samples
// https://gatkforums.broadinstitute.org/gatk/discussion/11136/how-to-call-somatic-mutations-using-gatk4-mutect2
PON = params.pon ? "--panel-of-normals ${pon}" : ""
intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}"
"""
# Get raw calls
gatk --java-options "-Xmx${task.memory.toGiga()}g" \
Mutect2 \
-R ${fasta}\
-I ${bamTumor} -tumor ${idSampleTumor} \
-I ${bamNormal} -normal ${idSampleNormal} \
-L ${intervalBed} \
${intervalsOptions} \
--germline-resource ${germlineResource} \
${PON} \
-O ${intervalBed.baseName}_${idSampleTumor}_vs_${idSampleNormal}.vcf
Expand Down Expand Up @@ -2219,8 +2223,9 @@ process ConcatVCF {
else
outputFile = "${variantCaller}_${idSample}.vcf"
options = params.target_bed ? "-t ${targetBED}" : ""
intervalsOptions = params.no_intervals ? "" : "-n"
"""
concatenateVCFs.sh -i ${fastaFai} -c ${task.cpus} -o ${outputFile} ${options}
concatenateVCFs.sh -i ${fastaFai} -c ${task.cpus} -o ${outputFile} ${options} ${intervalsOptions}
"""
}

Expand Down Expand Up @@ -2250,12 +2255,13 @@ process PileupSummariesForMutect2 {
when: 'mutect2' in tools

script:
intervalsOptions = params.no_intervals ? "" : "-L ${intervalBed}"
"""
gatk --java-options "-Xmx${task.memory.toGiga()}g" \
GetPileupSummaries \
-I ${bamTumor} \
-V ${germlineResource} \
-L ${intervalBed} \
${intervalsOptions} \
-O ${intervalBed.baseName}_${idSampleTumor}_pileupsummaries.table
"""
}
Expand Down Expand Up @@ -2778,11 +2784,11 @@ mpileupOutTumor = Channel.create()
mpileupOut
.choice(mpileupOutTumor, mpileupOutNormal) {statusMap[it[0], it[1]] == 0 ? 1 : 0}

mpileupOut = mpileupOutNormal.combine(mpileupOutTumor)
mpileupOut = mpileupOutNormal.combine(mpileupOutTumor, by:0)

mpileupOut = mpileupOut.map {
idPatientNormal, idSampleNormal, mpileupOutNormal,
idPatientTumor, idSampleTumor, mpileupOutTumor ->
idSampleTumor, mpileupOutTumor ->
[idPatientNormal, idSampleNormal, idSampleTumor, mpileupOutNormal, mpileupOutTumor]
}

Expand Down