Skip to content

Commit

Permalink
Realignment filter
Browse files Browse the repository at this point in the history
  • Loading branch information
davidbenjamin committed May 22, 2018
1 parent ec36b13 commit 31c4f06
Show file tree
Hide file tree
Showing 14 changed files with 676 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@

import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.iterators.ReadFilteringIterator;
import org.broadinstitute.hellbender.utils.read.GATKRead;

import java.util.Collections;
import java.util.Iterator;
import java.util.*;

/**
* Wrapper around ReadsDataSource that presents reads overlapping a specific interval to a client,
Expand Down Expand Up @@ -90,6 +90,16 @@ public SimpleInterval getInterval() {
*/
@Override
public Iterator<GATKRead> iterator() {
return iterator(interval);
}

/**
* Get an iterator over the reads of the backing data source over a given interval. Will return an empty iterator if this
* context has no backing source of reads.
*
* @return iterator over the reads in this context
*/
public Iterator<GATKRead> iterator(final SimpleInterval interval) {
// We can't perform a query if we lack either a dataSource or an interval to query on
if ( dataSource == null || interval == null ) {
return Collections.<GATKRead>emptyList().iterator();
Expand All @@ -99,4 +109,5 @@ public Iterator<GATKRead> iterator() {
dataSource.query(interval) :
new ReadFilteringIterator(dataSource.query(interval), readFilter);
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
package org.broadinstitute.hellbender.tools.walkers.realignmentfilter;

import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import org.apache.commons.lang.mutable.MutableInt;
import org.apache.commons.math3.util.Pair;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.ExperimentalFeature;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReadsContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.engine.VariantWalker;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.tools.walkers.mutect.Mutect2Engine;
import org.broadinstitute.hellbender.utils.Trilean;
import org.broadinstitute.hellbender.utils.bwa.BwaMemAlignment;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.ReadUtils;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;
import picard.cmdline.programgroups.VariantFilteringProgramGroup;

import java.io.File;
import java.util.*;

/**
* <p>Filter false positive alignment artifacts from a VCF callset.</p>
*
* <p>
* FilterAlignmentArtifacts identifies alignment artifacts, that is, apparent variants due to reads being mapped to the wrong genomic locus.
* </p>
* <p>
* Alignment artifacts can occur whenever there is sufficient sequence similarity between two or more regions in the genome
* to confuse the alignment algorithm. This can occur when the aligner for whatever reason overestimate how uniquely a read
* maps, thereby assigning it too high of a mapping quality. It can also occur through no fault of the aligner due to gaps in
* the reference, which can also hide the true position to which a read should map. By using a good alignment algorithm
* (the GATK wrapper of BWA-MEM), giving it sensitive settings (which may have been impractically slow for the original
* bam alignment) and mapping to the best available reference we can avoid these pitfalls. The last point is especially important:
* one can (and should) use a BWA-MEM index image corresponding to the best reference, regardless of the reference to which
* the bam was aligned.
* </p>
* <p>
* This tool is featured in the Somatic Short Mutation calling Best Practice Workflow.
* See <a href="https://software.broadinstitute.org/gatk/documentation/article?id=11136">Tutorial#11136</a> for a
* step-by-step description of the workflow and <a href="https://software.broadinstitute.org/gatk/documentation/article?id=11127">Article#11127</a>
* for an overview of what traditional somatic calling entails. For the latest pipeline scripts, see the
* <a href="https://github.com/broadinstitute/gatk/tree/master/scripts/mutect2_wdl">Mutect2 WDL scripts directory</a>.
* </p>
* <p>
* The bam input to this tool should be the reassembly bamout produced by HaplotypeCaller or Mutect2 in the process of generating
* the input callset. The original bam will also work but might fail to filter some indels. The reference passed with the -R argument
* must be the reference to which the input bam was realigned. This does not need to correspond to the reference of the BWA-MEM
* index image. The latter should be derived from the best available reference, for example hg38 in humans as of February 2018.
* </p>
*
* <h3>Usage example</h3>
* <pre>
* gatk FilterAlignmentArtifacts \
* -R hg19.fasta
* -V somatic.vcf.gz \
* -I somatic_bamout.bam \
* --bwa-mem-index-image hg38.index_image \
* -O filtered.vcf.gz
* </pre>
*
*/
@CommandLineProgramProperties(
summary = "Filter alignment artifacts from a vcf callset.",
oneLineSummary = "Filter alignment artifacts from a vcf callset.",
programGroup = VariantFilteringProgramGroup.class
)
@DocumentedFeature
@ExperimentalFeature
public class FilterAlignmentArtifacts extends VariantWalker {
@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
shortName=StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
doc="The output filtered VCF file", optional=false)
private final String outputVcf = null;

public static final int DEFAULT_INDEL_START_TOLERANCE = 5;
public static final String INDEL_START_TOLERANCE_LONG_NAME = "indel-start-tolerance";
@Argument(fullName = INDEL_START_TOLERANCE_LONG_NAME,
doc="Max distance between indel start of aligned read in the bam and the variant in the vcf", optional=true)
private int indelStartTolerance = DEFAULT_INDEL_START_TOLERANCE;

public static final int DEFAULT_FRAGMENT_SIZE = 1000;
public static final String FRAGMENT_SIZE_LONG_NAME = "fragment-size";
@Argument(fullName = FRAGMENT_SIZE_LONG_NAME,
doc="Distance away from variant to look for reads' mates.", optional=true)
private int fragmentSize = DEFAULT_FRAGMENT_SIZE;

public static final int DEFAULT_MAX_FAILED_REALIGNMENTS = 3;
public static final String MAX_FAILED_REALIGNMENTS_LONG_NAME = "max-failed-realignments";
@Argument(fullName = MAX_FAILED_REALIGNMENTS_LONG_NAME,
doc="Maximum number of failed read realignments before a variant is rejected.", optional=true)
private int maxFailedRealignments = DEFAULT_MAX_FAILED_REALIGNMENTS;

public static final int DEFAULT_SUFFICIENT_GOOD_REALIGNMENTS = 2;
public static final String SUFFICIENT_GOOD_REALIGNMENTS_LONG_NAME = "sufficient-good-realignments";
@Argument(fullName = SUFFICIENT_GOOD_REALIGNMENTS_LONG_NAME,
doc="Sufficient number of good read realignments to accept a variant.", optional=true)
private int sufficientGoodRealignments = DEFAULT_SUFFICIENT_GOOD_REALIGNMENTS;


@ArgumentCollection
protected RealignmentArgumentCollection realignmentArgumentCollection = new RealignmentArgumentCollection();

private VariantContextWriter vcfWriter;
private RealignmentEngine realignmentEngine;

@Override
public List<ReadFilter> getDefaultReadFilters() {
return Mutect2Engine.makeStandardMutect2ReadFilters();
}

@Override
public boolean requiresReads() { return true; }

@Override
public void onTraversalStart() {
realignmentEngine = new RealignmentEngine(realignmentArgumentCollection);
vcfWriter = createVCFWriter(new File(outputVcf));

final VCFHeader inputHeader = getHeaderForVariants();
final Set<VCFHeaderLine> headerLines = new HashSet<>(inputHeader.getMetaDataInSortedOrder());
headerLines.add(GATKVCFHeaderLines.getFilterLine(GATKVCFConstants.ALIGNMENT_ARTIFACT_FILTER_NAME));
headerLines.addAll(getDefaultToolVCFHeaderLines());
final VCFHeader vcfHeader = new VCFHeader(headerLines, inputHeader.getGenotypeSamples());
vcfWriter.writeHeader(vcfHeader);
}

@Override
public Object onTraversalSuccess() {
return "SUCCESS";
}

@Override
public void apply(final VariantContext vc, final ReadsContext readsContext, final ReferenceContext refContext, final FeatureContext fc) {
Trilean passesFilter = vc.getNAlleles() == 1 ? Trilean.TRUE : Trilean.UNKNOWN;

final MutableInt failedRealignmentCount = new MutableInt(0);
final MutableInt succeededRealignmentCount = new MutableInt(0);

final Map<GATKRead, GATKRead> mates = realignmentArgumentCollection.dontUseMates ? null
: ReadUtils.getReadToMateMap(readsContext, fragmentSize);

for (final GATKRead read : readsContext) {
if (passesFilter != Trilean.UNKNOWN) {
break;
} else if ( !RealignmentEngine.supportsVariant(read, vc, indelStartTolerance)) {
continue;
}

final RealignmentEngine.RealignmentResult readRealignment = realignmentEngine.realign(read);

// if there's no mate we go by the read realignment
if (mates == null || !mates.containsKey(read)) {
(readRealignment.isGood() ? succeededRealignmentCount : failedRealignmentCount).increment();
} else {
// check whether the pair maps uniquely
final GATKRead mate = mates.get(read);
final RealignmentEngine.RealignmentResult mateRealignment = realignmentEngine.realign(mate);

final List<BwaMemAlignment> readRealignments = readRealignment.getRealignments();
final List<BwaMemAlignment> mateRealignments = mateRealignment.getRealignments();
final List<Pair<BwaMemAlignment, BwaMemAlignment>> plausiblePairs = RealignmentEngine.findPlausiblePairs(readRealignments, mateRealignments, realignmentArgumentCollection.maxReasonableFragmentLength);

if (plausiblePairs.size() <= 1) {
succeededRealignmentCount.increment();
} else {
plausiblePairs.sort(Comparator.comparingInt(pair -> -pairScore(pair)) );
final int scoreDiff = pairScore(plausiblePairs.get(0)) - pairScore(plausiblePairs.get(1));
if (scoreDiff >= realignmentArgumentCollection.minAlignerScoreDifference) {
succeededRealignmentCount.increment();
} else {
failedRealignmentCount.increment();
}
}
}

if (failedRealignmentCount.intValue() > maxFailedRealignments) {
passesFilter = Trilean.FALSE;
} else if (succeededRealignmentCount.intValue() >= sufficientGoodRealignments) {
passesFilter = Trilean.TRUE;
}
}

// if we haven't decided yet due to too few supporting reads, fail if there are more failures than successes
passesFilter = passesFilter != Trilean.UNKNOWN ? passesFilter :
Trilean.of(failedRealignmentCount.intValue() <= succeededRealignmentCount.intValue());

vcfWriter.add(passesFilter == Trilean.TRUE ? vc : new VariantContextBuilder(vc).filter(GATKVCFConstants.ALIGNMENT_ARTIFACT_FILTER_NAME).make());
}

private static int pairScore(final Pair<BwaMemAlignment, BwaMemAlignment> pair) {
return pair.getFirst().getAlignerScore() + pair.getSecond().getAlignerScore();
}

@Override
public void closeTool() {
if ( vcfWriter != null ) {
vcfWriter.close();
}
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
package org.broadinstitute.hellbender.tools.walkers.realignmentfilter;

import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.hellbender.tools.BwaMemIndexImageCreator;

public class RealignmentArgumentCollection {
public static final int DEFAULT_MIN_SEED_LENGTH = 14;
public static final double DEFAULT_DROP_RATIO = 0.2;
public static final double DEFAULT_SEED_SPLIT_FACTOR = 0.5;
public static final int DEFAULT_MAX_REASONABLE_FRAGMENT_LENGTH = 100000;
public static final int DEFAULT_MIN_ALIGNER_SCORE_DIFFERENCE = 20;
public static final int DEFAULT_NUM_REGULAR_CONTIGS = 25;

/**
* BWA-mem index image created by {@link BwaMemIndexImageCreator}
*/
@Argument(fullName = "bwa-mem-index-image", shortName = "index", doc = "BWA-mem index image")
public String bwaMemIndexImage;

/**
* Turn off the default mate-aware realignment
*/
@Argument(fullName = "dont-use-mates", doc = "Realign individual reads without using their mates", optional = true)
public boolean dontUseMates = false;

/**
* Maximum fragment length to be considered a reasonable pair alignment
*/
@Argument(fullName = "max-reasonable-fragment-length", doc = "Maximum fragment length to be considered a reasonable pair alignment", optional = true)
public int maxReasonableFragmentLength = DEFAULT_MAX_REASONABLE_FRAGMENT_LENGTH;

/**
* Minimum difference between best and second-best alignment for a read to be considered well-mapped
*/
@Argument(fullName = "min-aligner-score-difference", doc = "Minimum difference between best and second-best alignment for a read to be considered well-mapped", optional = true)
public int minAlignerScoreDifference = DEFAULT_MIN_ALIGNER_SCORE_DIFFERENCE;

/**
* Number of regular i.e. non-alt contigs
*/
@Argument(fullName = "num-regular-contigs", doc = "Number of regular i.e. non-alt contigs", optional = true)
public int numRegularContigs = DEFAULT_NUM_REGULAR_CONTIGS;

/**
* BWA-mem minimum seed length
*/
@Argument(fullName = "minimum-seed-length", shortName = "min-seed-length", optional = true, doc = "Minimum number of matching bases to seed a MEM")
public int minSeedLength = DEFAULT_MIN_SEED_LENGTH;

/**
* BWA-mem extension drop ratio
*/
@Argument(fullName = "drop-ratio", shortName = "drop-ratio", doc = "Fraction of best MEM extension score below which other extensions are dropped")
public double dropRatio = DEFAULT_DROP_RATIO;

/**
* BWA-mem seed split factor
*/
@Argument(fullName = "seed-split-factor", shortName = "split-factor", doc = "MEMs longer than the minimum seed length times this factor are split and re-seeded.")
public double splitFactor = DEFAULT_SEED_SPLIT_FACTOR;

}
Loading

0 comments on commit 31c4f06

Please sign in to comment.